Unverified Commit 34bf2877 authored by takatost, committed by GitHub

fix: tongyi stream generate not incremental and add qwen max models (#2013)

parent 3ebec8fa
import decimal
-import json
-import logging
import os
from abc import ABC, abstractmethod
from typing import Optional
@@ -12,7 +10,6 @@ from core.model_runtime.entities.model_entities import (AIModelEntity, DefaultPa
                                                        PriceConfig, PriceInfo, PriceType)
from core.model_runtime.errors.invoke import InvokeAuthorizationError, InvokeError
from core.model_runtime.model_providers.__base.tokenizers.gpt2_tokenzier import GPT2Tokenizer
-from pydantic import ValidationError


class AIModel(ABC):
@@ -54,14 +51,16 @@ class AIModel(ABC):
        :param error: model invoke error
        :return: unified error
        """
+        provider_name = self.__class__.__module__.split('.')[-3]
+
        for invoke_error, model_errors in self._invoke_error_mapping.items():
            if isinstance(error, tuple(model_errors)):
                if invoke_error == InvokeAuthorizationError:
-                    return invoke_error(description="Incorrect model credentials provided, please check and try again. ")
+                    return invoke_error(description=f"[{provider_name}] Incorrect model credentials provided, please check and try again. ")
-                return invoke_error(description=f"{invoke_error.description}: {str(error)}")
+                return invoke_error(description=f"[{provider_name}] {invoke_error.description}, {str(error)}")

-        return InvokeError(description=f"Error: {str(error)}")
+        return InvokeError(description=f"[{provider_name}] Error: {str(error)}")

    def get_price(self, model: str, credentials: dict, price_type: PriceType, tokens: int) -> PriceInfo:
        """
...
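Note on the new provider prefix above: model provider modules live under core.model_runtime.model_providers.<provider>..., so the third path segment from the end of the class's module name is the provider directory. A minimal sketch of what the added expression evaluates to (the module string below is illustrative, mirroring the import paths visible in this diff):

# Example module path in the core.model_runtime.model_providers.<provider>.llm layout.
module = 'core.model_runtime.model_providers.tongyi.llm.tongyi'

# Equivalent to self.__class__.__module__.split('.')[-3] in the code above.
provider_name = module.split('.')[-3]
assert provider_name == 'tongyi'

# Error descriptions are now prefixed with the provider, e.g.:
print(f"[{provider_name}] Error: invalid response")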
-from http import HTTPStatus
from typing import Generator, List, Optional, Union

-import dashscope
+from dashscope import get_tokenizer

-from core.model_runtime.entities.llm_entities import LLMResult, LLMResultChunk, LLMResultChunkDelta
+from core.model_runtime.entities.llm_entities import LLMResult, LLMResultChunk, LLMResultChunkDelta, LLMMode
from core.model_runtime.entities.message_entities import (AssistantPromptMessage, PromptMessage, PromptMessageTool,
                                                           SystemPromptMessage, UserPromptMessage)
from core.model_runtime.errors.invoke import (InvokeAuthorizationError, InvokeBadRequestError, InvokeConnectionError,
@@ -51,19 +51,12 @@ class TongyiLargeLanguageModel(LargeLanguageModel):
        :param tools: tools for tool calling
        :return:
        """
-        # transform credentials to kwargs for model instance
-        credentials_kwargs = self._to_credential_kwargs(credentials)
-        response = dashscope.Tokenization.call(
-            model=model,
-            prompt=self._convert_messages_to_prompt(prompt_messages),
-            **credentials_kwargs
-        )
-
-        if response.status_code == HTTPStatus.OK:
-            return response['usage']['input_tokens']
-        else:
-            raise self._invoke_error_mapping[InvokeBadRequestError][0](response['message'])
+        tokenizer = get_tokenizer(model)
+
+        # convert string to token ids
+        tokens = tokenizer.encode(self._convert_messages_to_prompt(prompt_messages))
+
+        return len(tokens)

    def validate_credentials(self, model: str, credentials: dict) -> None:
        """
@@ -119,14 +112,22 @@ class TongyiLargeLanguageModel(LargeLanguageModel):
        params = {
            'model': model,
-            'prompt': self._convert_messages_to_prompt(prompt_messages),
            **model_parameters,
            **credentials_kwargs
        }

+        mode = self.get_model_mode(model, credentials)
+
+        if mode == LLMMode.CHAT:
+            params['messages'] = self._convert_prompt_messages_to_tongyi_messages(prompt_messages)
+        else:
+            params['prompt'] = self._convert_messages_to_prompt(prompt_messages)
+
        if stream:
            responses = stream_generate_with_retry(
                client,
                stream=True,
+                incremental_output=True,
                **params
            )
@@ -267,6 +268,35 @@ class TongyiLargeLanguageModel(LargeLanguageModel):
        # trim off the trailing ' ' that might come from the "Assistant: "
        return text.rstrip()

+    def _convert_prompt_messages_to_tongyi_messages(self, prompt_messages: list[PromptMessage]) -> list[dict]:
+        """
+        Convert prompt messages to tongyi messages
+
+        :param prompt_messages: prompt messages
+        :return: tongyi messages
+        """
+        tongyi_messages = []
+        for prompt_message in prompt_messages:
+            if isinstance(prompt_message, SystemPromptMessage):
+                tongyi_messages.append({
+                    'role': 'system',
+                    'content': prompt_message.content,
+                })
+            elif isinstance(prompt_message, UserPromptMessage):
+                tongyi_messages.append({
+                    'role': 'user',
+                    'content': prompt_message.content,
+                })
+            elif isinstance(prompt_message, AssistantPromptMessage):
+                tongyi_messages.append({
+                    'role': 'assistant',
+                    'content': prompt_message.content,
+                })
+            else:
+                raise ValueError(f"Got unknown type {prompt_message}")
+
+        return tongyi_messages
+
    @property
    def _invoke_error_mapping(self) -> dict[type[InvokeError], list[type[Exception]]]:
        """
...
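Note on the streaming fix above: DashScope's streaming API returns cumulative output by default, i.e. every chunk repeats the full text generated so far, which is why the runtime's stream was not incremental; incremental_output=True switches it to delta chunks. A minimal sketch of the resulting call pattern against the DashScope SDK directly (the Generation.call usage and response field access below are based on DashScope's documented API, not code from this commit):

from dashscope import Generation

responses = Generation.call(
    model='qwen-max',
    messages=[
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': 'Tell me a short story.'},
    ],
    result_format='message',   # chat-style output, matching the new LLMMode.CHAT branch
    stream=True,
    incremental_output=True,   # each chunk carries only newly generated text
)

for response in responses:
    # Without incremental_output=True, this would reprint the whole
    # response so far on every iteration.
    print(response.output.choices[0].message.content, end='', flush=True)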
model: qwen-max-1201
label:
  en_US: qwen-max-1201
model_type: llm
model_properties:
  mode: chat
  context_size: 8192
parameter_rules:
  - name: temperature
    use_template: temperature
    default: 1.0
    min: 0.0
    max: 2.0
    help:
      zh_Hans: 用于控制随机性和多样性的程度。具体来说,temperature值控制了生成文本时对每个候选词的概率分布进行平滑的程度。较高的temperature值会降低概率分布的峰值,使得更多的低概率词被选择,生成结果更加多样化;而较低的temperature值则会增强概率分布的峰值,使得高概率词更容易被选择,生成结果更加确定。
      en_US: Used to control the degree of randomness and diversity. Specifically, the temperature value controls the degree to which the probability distribution of each candidate word is smoothed when generating text. A higher temperature value will reduce the peak value of the probability distribution, allowing more low-probability words to be selected, and the generated results will be more diverse; while a lower temperature value will enhance the peak value of the probability distribution, making it easier for high-probability words to be selected, and the generated results are more certain.
  - name: top_p
    use_template: top_p
    default: 0.8
    help:
      zh_Hans: 生成过程中核采样方法概率阈值,例如,取值为0.8时,仅保留概率加起来大于等于0.8的最可能token的最小集合作为候选集。取值范围为(0,1.0),取值越大,生成的随机性越高;取值越低,生成的确定性越高。
      en_US: The probability threshold of the kernel sampling method during the generation process. For example, when the value is 0.8, only the smallest set of the most likely tokens with a sum of probabilities greater than or equal to 0.8 is retained as the candidate set. The value range is (0,1.0). The larger the value, the higher the randomness generated; the lower the value, the higher the certainty generated.
  - name: max_tokens
    use_template: max_tokens
    default: 1500
    min: 1
    max: 6000
    help:
      zh_Hans: 用于限制模型生成token的数量,max_tokens设置的是生成上限,并不表示一定会生成这么多的token数量。
      en_US: It is used to limit the number of tokens generated by the model. max_tokens sets the upper limit of generation, which does not mean that so many tokens will be generated.
  - name: top_k
    label:
      zh_Hans: 取样数量
      en_US: Top k
    type: int
    help:
      zh_Hans: 生成时,采样候选集的大小。例如,取值为50时,仅将单次生成中得分最高的50个token组成随机采样的候选集。取值越大,生成的随机性越高;取值越小,生成的确定性越高。默认不传递该参数,取值为None或当top_k大于100时,表示不启用top_k策略,此时,仅有top_p策略生效。
      en_US: The size of the sample candidate set when generated. For example, when the value is 50, only the 50 highest-scoring tokens in a single generation form a randomly sampled candidate set. The larger the value, the higher the randomness generated; the smaller the value, the higher the certainty generated. This parameter is not passed by default. When the value is None or top_k is greater than 100, the top_k policy is not enabled, and only the top_p policy takes effect.
    required: false
  - name: seed
    label:
      zh_Hans: 随机种子
      en_US: Random seed
    type: int
    help:
      zh_Hans: 生成时,随机数的种子,用于控制模型生成的随机性。如果使用相同的种子,每次运行生成的结果都将相同;当需要复现模型的生成结果时,可以使用相同的种子。seed参数支持无符号64位整数类型。
      en_US: When generating, the random number seed is used to control the randomness of model generation. If you use the same seed, the results generated by each run will be the same; when you need to reproduce the results of the model, you can use the same seed. The seed parameter supports unsigned 64-bit integer types.
    required: false
  - name: repetition_penalty
    label:
      en_US: Repetition penalty
    type: float
    default: 1.1
    help:
      zh_Hans: 用于控制模型生成时的重复度。提高repetition_penalty时可以降低模型生成的重复度。1.0表示不做惩罚。
      en_US: Used to control the repetition of model generation. Increasing the repetition_penalty can reduce the repetition of model generation. 1.0 means no penalty.
    required: false
model: qwen-max-longcontext
label:
  en_US: qwen-max-longcontext
model_type: llm
model_properties:
  mode: chat
  context_size: 30000
parameter_rules:
  - name: temperature
    use_template: temperature
    default: 1.0
    min: 0.0
    max: 2.0
    help:
      zh_Hans: 用于控制随机性和多样性的程度。具体来说,temperature值控制了生成文本时对每个候选词的概率分布进行平滑的程度。较高的temperature值会降低概率分布的峰值,使得更多的低概率词被选择,生成结果更加多样化;而较低的temperature值则会增强概率分布的峰值,使得高概率词更容易被选择,生成结果更加确定。
      en_US: Used to control the degree of randomness and diversity. Specifically, the temperature value controls the degree to which the probability distribution of each candidate word is smoothed when generating text. A higher temperature value will reduce the peak value of the probability distribution, allowing more low-probability words to be selected, and the generated results will be more diverse; while a lower temperature value will enhance the peak value of the probability distribution, making it easier for high-probability words to be selected, and the generated results are more certain.
  - name: top_p
    use_template: top_p
    default: 0.8
    help:
      zh_Hans: 生成过程中核采样方法概率阈值,例如,取值为0.8时,仅保留概率加起来大于等于0.8的最可能token的最小集合作为候选集。取值范围为(0,1.0),取值越大,生成的随机性越高;取值越低,生成的确定性越高。
      en_US: The probability threshold of the kernel sampling method during the generation process. For example, when the value is 0.8, only the smallest set of the most likely tokens with a sum of probabilities greater than or equal to 0.8 is retained as the candidate set. The value range is (0,1.0). The larger the value, the higher the randomness generated; the lower the value, the higher the certainty generated.
  - name: max_tokens
    use_template: max_tokens
    default: 2000
    min: 1
    max: 28000
    help:
      zh_Hans: 用于限制模型生成token的数量,max_tokens设置的是生成上限,并不表示一定会生成这么多的token数量。
      en_US: It is used to limit the number of tokens generated by the model. max_tokens sets the upper limit of generation, which does not mean that so many tokens will be generated.
  - name: top_k
    label:
      zh_Hans: 取样数量
      en_US: Top k
    type: int
    help:
      zh_Hans: 生成时,采样候选集的大小。例如,取值为50时,仅将单次生成中得分最高的50个token组成随机采样的候选集。取值越大,生成的随机性越高;取值越小,生成的确定性越高。默认不传递该参数,取值为None或当top_k大于100时,表示不启用top_k策略,此时,仅有top_p策略生效。
      en_US: The size of the sample candidate set when generated. For example, when the value is 50, only the 50 highest-scoring tokens in a single generation form a randomly sampled candidate set. The larger the value, the higher the randomness generated; the smaller the value, the higher the certainty generated. This parameter is not passed by default. When the value is None or top_k is greater than 100, the top_k policy is not enabled, and only the top_p policy takes effect.
    required: false
  - name: seed
    label:
      zh_Hans: 随机种子
      en_US: Random seed
    type: int
    help:
      zh_Hans: 生成时,随机数的种子,用于控制模型生成的随机性。如果使用相同的种子,每次运行生成的结果都将相同;当需要复现模型的生成结果时,可以使用相同的种子。seed参数支持无符号64位整数类型。
      en_US: When generating, the random number seed is used to control the randomness of model generation. If you use the same seed, the results generated by each run will be the same; when you need to reproduce the results of the model, you can use the same seed. The seed parameter supports unsigned 64-bit integer types.
    required: false
  - name: repetition_penalty
    label:
      en_US: Repetition penalty
    type: float
    default: 1.1
    help:
      zh_Hans: 用于控制模型生成时的重复度。提高repetition_penalty时可以降低模型生成的重复度。1.0表示不做惩罚。
      en_US: Used to control the repetition of model generation. Increasing the repetition_penalty can reduce the repetition of model generation. 1.0 means no penalty.
    required: false
model: qwen-max
label:
  en_US: qwen-max
model_type: llm
model_properties:
  mode: chat
  context_size: 8192
parameter_rules:
  - name: temperature
    use_template: temperature
    default: 1.0
    min: 0.0
    max: 2.0
    help:
      zh_Hans: 用于控制随机性和多样性的程度。具体来说,temperature值控制了生成文本时对每个候选词的概率分布进行平滑的程度。较高的temperature值会降低概率分布的峰值,使得更多的低概率词被选择,生成结果更加多样化;而较低的temperature值则会增强概率分布的峰值,使得高概率词更容易被选择,生成结果更加确定。
      en_US: Used to control the degree of randomness and diversity. Specifically, the temperature value controls the degree to which the probability distribution of each candidate word is smoothed when generating text. A higher temperature value will reduce the peak value of the probability distribution, allowing more low-probability words to be selected, and the generated results will be more diverse; while a lower temperature value will enhance the peak value of the probability distribution, making it easier for high-probability words to be selected, and the generated results are more certain.
  - name: top_p
    use_template: top_p
    default: 0.8
    help:
      zh_Hans: 生成过程中核采样方法概率阈值,例如,取值为0.8时,仅保留概率加起来大于等于0.8的最可能token的最小集合作为候选集。取值范围为(0,1.0),取值越大,生成的随机性越高;取值越低,生成的确定性越高。
      en_US: The probability threshold of the kernel sampling method during the generation process. For example, when the value is 0.8, only the smallest set of the most likely tokens with a sum of probabilities greater than or equal to 0.8 is retained as the candidate set. The value range is (0,1.0). The larger the value, the higher the randomness generated; the lower the value, the higher the certainty generated.
  - name: max_tokens
    use_template: max_tokens
    default: 1500
    min: 1
    max: 6000
    help:
      zh_Hans: 用于限制模型生成token的数量,max_tokens设置的是生成上限,并不表示一定会生成这么多的token数量。
      en_US: It is used to limit the number of tokens generated by the model. max_tokens sets the upper limit of generation, which does not mean that so many tokens will be generated.
  - name: top_k
    label:
      zh_Hans: 取样数量
      en_US: Top k
    type: int
    help:
      zh_Hans: 生成时,采样候选集的大小。例如,取值为50时,仅将单次生成中得分最高的50个token组成随机采样的候选集。取值越大,生成的随机性越高;取值越小,生成的确定性越高。默认不传递该参数,取值为None或当top_k大于100时,表示不启用top_k策略,此时,仅有top_p策略生效。
      en_US: The size of the sample candidate set when generated. For example, when the value is 50, only the 50 highest-scoring tokens in a single generation form a randomly sampled candidate set. The larger the value, the higher the randomness generated; the smaller the value, the higher the certainty generated. This parameter is not passed by default. When the value is None or top_k is greater than 100, the top_k policy is not enabled, and only the top_p policy takes effect.
    required: false
  - name: seed
    label:
      zh_Hans: 随机种子
      en_US: Random seed
    type: int
    help:
      zh_Hans: 生成时,随机数的种子,用于控制模型生成的随机性。如果使用相同的种子,每次运行生成的结果都将相同;当需要复现模型的生成结果时,可以使用相同的种子。seed参数支持无符号64位整数类型。
      en_US: When generating, the random number seed is used to control the randomness of model generation. If you use the same seed, the results generated by each run will be the same; when you need to reproduce the results of the model, you can use the same seed. The seed parameter supports unsigned 64-bit integer types.
    required: false
  - name: repetition_penalty
    label:
      en_US: Repetition penalty
    type: float
    default: 1.1
    help:
      zh_Hans: 用于控制模型生成时的重复度。提高repetition_penalty时可以降低模型生成的重复度。1.0表示不做惩罚。
      en_US: Used to control the repetition of model generation. Increasing the repetition_penalty can reduce the repetition of model generation. 1.0 means no penalty.
    required: false
@@ -24,7 +24,7 @@ parameter_rules:
    use_template: max_tokens
    default: 2000
    min: 1
-    max: 2000
+    max: 30000
    help:
      zh_Hans: 用于限制模型生成token的数量,max_tokens设置的是生成上限,并不表示一定会生成这么多的token数量。
      en_US: It is used to limit the number of tokens generated by the model. max_tokens sets the upper limit of generation, which does not mean that so many tokens will be generated.
@@ -42,10 +42,9 @@ parameter_rules:
      zh_Hans: 随机种子
      en_US: Random seed
    type: int
-    default: 1234
    help:
-      zh_Hans: 生成时,随机数的种子,用于控制模型生成的随机性。如果使用相同的种子,每次运行生成的结果都将相同;当需要复现模型的生成结果时,可以使用相同的种子。seed参数支持无符号64位整数类型。默认值 1234。
+      zh_Hans: 生成时,随机数的种子,用于控制模型生成的随机性。如果使用相同的种子,每次运行生成的结果都将相同;当需要复现模型的生成结果时,可以使用相同的种子。seed参数支持无符号64位整数类型。
-      en_US: When generating, the random number seed is used to control the randomness of model generation. If you use the same seed, the results generated by each run will be the same; when you need to reproduce the results of the model, you can use the same seed. The seed parameter supports unsigned 64-bit integer types. Default value 1234.
+      en_US: When generating, the random number seed is used to control the randomness of model generation. If you use the same seed, the results generated by each run will be the same; when you need to reproduce the results of the model, you can use the same seed. The seed parameter supports unsigned 64-bit integer types.
    required: false
  - name: repetition_penalty
    label:
@@ -55,3 +54,8 @@ parameter_rules:
    help:
      zh_Hans: 用于控制模型生成时的重复度。提高repetition_penalty时可以降低模型生成的重复度。1.0表示不做惩罚。
      en_US: Used to control the repetition of model generation. Increasing the repetition_penalty can reduce the repetition of model generation. 1.0 means no punishment.
+pricing:
+  input: '0.02'
+  output: '0.02'
+  unit: '0.001'
+  currency: RMB
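A quick sanity check on the new pricing block, assuming the runtime charges tokens × unit × price (so unit '0.001' means the listed prices are per 1,000 tokens; this reading of the get_price logic is an assumption, not shown in this diff):

from decimal import Decimal

tokens = 1000
unit = Decimal('0.001')
input_price = Decimal('0.02')  # RMB per 1,000 input tokens under this reading

# 1,000 input tokens -> 0.02 RMB
print(tokens * unit * input_price)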
@@ -24,7 +24,7 @@ parameter_rules:
    use_template: max_tokens
    default: 1500
    min: 1
-    max: 1500
+    max: 6000
    help:
      zh_Hans: 用于限制模型生成token的数量,max_tokens设置的是生成上限,并不表示一定会生成这么多的token数量。
      en_US: It is used to limit the number of tokens generated by the model. max_tokens sets the upper limit of generation, which does not mean that so many tokens will be generated.
@@ -42,10 +42,9 @@ parameter_rules:
      zh_Hans: 随机种子
      en_US: Random seed
    type: int
-    default: 1234
    help:
-      zh_Hans: 生成时,随机数的种子,用于控制模型生成的随机性。如果使用相同的种子,每次运行生成的结果都将相同;当需要复现模型的生成结果时,可以使用相同的种子。seed参数支持无符号64位整数类型。默认值 1234。
+      zh_Hans: 生成时,随机数的种子,用于控制模型生成的随机性。如果使用相同的种子,每次运行生成的结果都将相同;当需要复现模型的生成结果时,可以使用相同的种子。seed参数支持无符号64位整数类型。
-      en_US: When generating, the random number seed is used to control the randomness of model generation. If you use the same seed, the results generated by each run will be the same; when you need to reproduce the results of the model, you can use the same seed. The seed parameter supports unsigned 64-bit integer types. Default value 1234.
+      en_US: When generating, the random number seed is used to control the randomness of model generation. If you use the same seed, the results generated by each run will be the same; when you need to reproduce the results of the model, you can use the same seed. The seed parameter supports unsigned 64-bit integer types.
    required: false
  - name: repetition_penalty
    label:
@@ -56,3 +55,8 @@ parameter_rules:
      zh_Hans: 用于控制模型生成时的重复度。提高repetition_penalty时可以降低模型生成的重复度。1.0表示不做惩罚。
      en_US: Used to control the repetition of model generation. Increasing the repetition_penalty can reduce the repetition of model generation. 1.0 means no punishment.
    required: false
+pricing:
+  input: '0.008'
+  output: '0.008'
+  unit: '0.001'
+  currency: RMB
@@ -44,7 +44,7 @@ readabilipy==0.2.0
google-search-results==2.4.2
replicate~=0.22.0
websocket-client~=1.7.0
-dashscope~=1.13.5
+dashscope[tokenizer]~=1.14.0
huggingface_hub~=0.16.4
transformers~=4.31.0
pandas==1.5.3
...