fix: tongyi stream generate not incremental and add qwen max models (#2013)

34bf2877 · takatost · GitHub · 3ebec8fa · 34bf2877 · 34bf2877
Unverified Commit 34bf2877 authored Jan 12, 2024 by takatost Committed by GitHub Jan 12, 2024
8 changed files
--- a/api/core/model_runtime/model_providers/__base/ai_model.py
+++ b/api/core/model_runtime/model_providers/__base/ai_model.py
 import decimal
-import json
-import logging
 import os
 from abc import ABC, abstractmethod
 from typing import Optional
@@ -12,7 +10,6 @@ from core.model_runtime.entities.model_entities import (AIModelEntity, DefaultPa
                                                        PriceConfig, PriceInfo, PriceType)
 from core.model_runtime.errors.invoke import InvokeAuthorizationError, InvokeError
 from core.model_runtime.model_providers.__base.tokenizers.gpt2_tokenzier import GPT2Tokenizer
-from pydantic import ValidationError


 class AIModel(ABC):
@@ -54,14 +51,16 @@ class AIModel(ABC):
        :param error: model invoke error
        :return: unified error
        """
+        provider_name = self.__class__.__module__.split('.')[-3]
+
        for invoke_error, model_errors in self._invoke_error_mapping.items():
            if isinstance(error, tuple(model_errors)):
                if invoke_error == InvokeAuthorizationError:
-                    return invoke_error(description="Incorrect model credentials provided, please check and try again. ")
+                    return invoke_error(description=f"[{provider_name}] Incorrect model credentials provided, please check and try again. ")

-                return invoke_error(description=f"{invoke_error.description}: {str(error)}")
+                return invoke_error(description=f"[{provider_name}] {invoke_error.description}, {str(error)}")

-        return InvokeError(description=f"Error: {str(error)}")
+        return InvokeError(description=f"[{provider_name}] Error: {str(error)}")

    def get_price(self, model: str, credentials: dict, price_type: PriceType, tokens: int) -> PriceInfo:
        """

--- a/api/core/model_runtime/model_providers/tongyi/llm/llm.py
+++ b/api/core/model_runtime/model_providers/tongyi/llm/llm.py
-from http import HTTPStatus
 from typing import Generator, List, Optional, Union

-import dashscope
-from core.model_runtime.entities.llm_entities import LLMResult, LLMResultChunk, LLMResultChunkDelta
+from dashscope import get_tokenizer
+
+from core.model_runtime.entities.llm_entities import LLMResult, LLMResultChunk, LLMResultChunkDelta, LLMMode
 from core.model_runtime.entities.message_entities import (AssistantPromptMessage, PromptMessage, PromptMessageTool,
                                                          SystemPromptMessage, UserPromptMessage)
 from core.model_runtime.errors.invoke import (InvokeAuthorizationError, InvokeBadRequestError, InvokeConnectionError,
@@ -51,19 +51,12 @@ class TongyiLargeLanguageModel(LargeLanguageModel):
        :param tools: tools for tool calling
        :return:
        """
-        # transform credentials to kwargs for model instance
-        credentials_kwargs = self._to_credential_kwargs(credentials)
+        tokenizer = get_tokenizer(model)

-        response = dashscope.Tokenization.call(
-            model=model,
-            prompt=self._convert_messages_to_prompt(prompt_messages),
-            **credentials_kwargs
-        )
-        
-        if response.status_code == HTTPStatus.OK:
-            return response['usage']['input_tokens']
-        else:
-            raise self._invoke_error_mapping[InvokeBadRequestError][0](response['message'])
+        # convert string to token ids
+        tokens = tokenizer.encode(self._convert_messages_to_prompt(prompt_messages))
+
+        return len(tokens)

    def validate_credentials(self, model: str, credentials: dict) -> None:
        """
@@ -119,14 +112,22 @@ class TongyiLargeLanguageModel(LargeLanguageModel):

        params = {
            'model': model,
-            'prompt': self._convert_messages_to_prompt(prompt_messages),
            **model_parameters,
            **credentials_kwargs
        }
+
+        mode = self.get_model_mode(model, credentials)
+
+        if mode == LLMMode.CHAT:
+            params['messages'] = self._convert_prompt_messages_to_tongyi_messages(prompt_messages)
+        else:
+            params['prompt'] = self._convert_messages_to_prompt(prompt_messages)
+
        if stream:
            responses = stream_generate_with_retry(
                client, 
                stream=True,
+                incremental_output=True,
                **params
            )

@@ -267,6 +268,35 @@ class TongyiLargeLanguageModel(LargeLanguageModel):
        # trim off the trailing ' ' that might come from the "Assistant: "
        return text.rstrip()

+    def _convert_prompt_messages_to_tongyi_messages(self, prompt_messages: list[PromptMessage]) -> list[dict]:
+        """
+        Build the role/content dict list that Tongyi expects from prompt messages
+
+        :param prompt_messages: prompt messages
+        :return: one {'role', 'content'} dict per prompt message, in input order
+        :raises ValueError: when a message is not a system, user or assistant message
+        """
+        # (message class, tongyi role) pairs, probed in this order for every message
+        role_by_message_type = (
+            (SystemPromptMessage, 'system'),
+            (UserPromptMessage, 'user'),
+            (AssistantPromptMessage, 'assistant'),
+        )
+
+        tongyi_messages = []
+        for prompt_message in prompt_messages:
+            # first matching class wins; None means no supported class matched
+            role = next(
+                (name for message_type, name in role_by_message_type if isinstance(prompt_message, message_type)),
+                None,
+            )
+            if role is None:
+                raise ValueError(f"Got unknown type {prompt_message}")
+
+            tongyi_messages.append({'role': role, 'content': prompt_message.content})
+
+        return tongyi_messages
+
    @property
    def _invoke_error_mapping(self) -> dict[type[InvokeError], list[type[Exception]]]:
        """

--- a/api/core/model_runtime/model_providers/tongyi/llm/qwen-max-1201.yaml
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-max-1201.yaml
+model: qwen-max-1201
+label:
+  en_US: qwen-max-1201
+model_type: llm
+model_properties:
+  mode: chat
+  context_size: 8192
+parameter_rules:
+  - name: temperature
+    use_template: temperature
+    default: 1.0
+    min: 0.0
+    max: 2.0
+    help:
+      zh_Hans: 用于控制随机性和多样性的程度。具体来说，temperature值控制了生成文本时对每个候选词的概率分布进行平滑的程度。较高的temperature值会降低概率分布的峰值，使得更多的低概率词被选择，生成结果更加多样化；而较低的temperature值则会增强概率分布的峰值，使得高概率词更容易被选择，生成结果更加确定。
+      en_US: Used to control the degree of randomness and diversity. Specifically, the temperature value controls the degree to which the probability distribution of each candidate word is smoothed when generating text. A higher temperature value will reduce the peak value of the probability distribution, allowing more low-probability words to be selected, and the generated results will be more diverse; while a lower temperature value will enhance the peak value of the probability distribution, making it easier for high-probability words to be selected, and the generated results will be more deterministic.
+  - name: top_p
+    use_template: top_p
+    default: 0.8
+    help:
+      zh_Hans: 生成过程中核采样方法概率阈值，例如，取值为0.8时，仅保留概率加起来大于等于0.8的最可能token的最小集合作为候选集。取值范围为（0,1.0)，取值越大，生成的随机性越高；取值越低，生成的确定性越高。
+      en_US: The probability threshold of the nucleus sampling method during the generation process. For example, when the value is 0.8, only the smallest set of the most likely tokens with a sum of probabilities greater than or equal to 0.8 is retained as the candidate set. The value range is (0,1.0). The larger the value, the more random the generated output; the smaller the value, the more deterministic the generated output.
+  - name: max_tokens
+    use_template: max_tokens
+    default: 1500
+    min: 1
+    max: 6000
+    help:
+      zh_Hans: 用于限制模型生成token的数量，max_tokens设置的是生成上限，并不表示一定会生成这么多的token数量。
+      en_US: It is used to limit the number of tokens generated by the model. max_tokens sets the upper limit of generation, which does not mean that so many tokens will be generated.
+  - name: top_k
+    label:
+      zh_Hans: 取样数量
+      en_US: Top k
+    type: int
+    help:
+      zh_Hans: 生成时，采样候选集的大小。例如，取值为50时，仅将单次生成中得分最高的50个token组成随机采样的候选集。取值越大，生成的随机性越高；取值越小，生成的确定性越高。默认不传递该参数，取值为None或当top_k大于100时，表示不启用top_k策略，此时，仅有top_p策略生效。
+      en_US: The size of the candidate set sampled from during generation. For example, when the value is 50, only the 50 highest-scoring tokens in a single generation step form the candidate set for random sampling. The larger the value, the more random the generated output; the smaller the value, the more deterministic the generated output. This parameter is not passed by default. When the value is None or top_k is greater than 100, the top_k policy is disabled and only the top_p policy takes effect.
+    required: false
+  - name: seed
+    label:
+      zh_Hans: 随机种子
+      en_US: Random seed
+    type: int
+    help:
+      zh_Hans: 生成时，随机数的种子，用于控制模型生成的随机性。如果使用相同的种子，每次运行生成的结果都将相同；当需要复现模型的生成结果时，可以使用相同的种子。seed参数支持无符号64位整数类型。
+      en_US: When generating, the random number seed is used to control the randomness of model generation. If you use the same seed, the results generated by each run will be the same; when you need to reproduce the results of the model, you can use the same seed. The seed parameter supports unsigned 64-bit integer types.
+    required: false
+  - name: repetition_penalty
+    label:
+      en_US: Repetition penalty
+    type: float
+    default: 1.1
+    help:
+      zh_Hans: 用于控制模型生成时的重复度。提高repetition_penalty时可以降低模型生成的重复度。1.0表示不做惩罚。
+      en_US: Used to control the repetition of model generation. Increasing the repetition_penalty can reduce the repetition of model generation. 1.0 means no punishment.
+    required: false
--- a/api/core/model_runtime/model_providers/tongyi/llm/qwen-max-longcontext.yaml
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-max-longcontext.yaml
+model: qwen-max-longcontext
+label:
+  en_US: qwen-max-longcontext
+model_type: llm
+model_properties:
+  mode: chat
+  context_size: 30000
+parameter_rules:
+  - name: temperature
+    use_template: temperature
+    default: 1.0
+    min: 0.0
+    max: 2.0
+    help:
+      zh_Hans: 用于控制随机性和多样性的程度。具体来说，temperature值控制了生成文本时对每个候选词的概率分布进行平滑的程度。较高的temperature值会降低概率分布的峰值，使得更多的低概率词被选择，生成结果更加多样化；而较低的temperature值则会增强概率分布的峰值，使得高概率词更容易被选择，生成结果更加确定。
+      en_US: Used to control the degree of randomness and diversity. Specifically, the temperature value controls the degree to which the probability distribution of each candidate word is smoothed when generating text. A higher temperature value will reduce the peak value of the probability distribution, allowing more low-probability words to be selected, and the generated results will be more diverse; while a lower temperature value will enhance the peak value of the probability distribution, making it easier for high-probability words to be selected, and the generated results will be more deterministic.
+  - name: top_p
+    use_template: top_p
+    default: 0.8
+    help:
+      zh_Hans: 生成过程中核采样方法概率阈值，例如，取值为0.8时，仅保留概率加起来大于等于0.8的最可能token的最小集合作为候选集。取值范围为（0,1.0)，取值越大，生成的随机性越高；取值越低，生成的确定性越高。
+      en_US: The probability threshold of the nucleus sampling method during the generation process. For example, when the value is 0.8, only the smallest set of the most likely tokens with a sum of probabilities greater than or equal to 0.8 is retained as the candidate set. The value range is (0,1.0). The larger the value, the more random the generated output; the smaller the value, the more deterministic the generated output.
+  - name: max_tokens
+    use_template: max_tokens
+    default: 2000
+    min: 1
+    max: 28000
+    help:
+      zh_Hans: 用于限制模型生成token的数量，max_tokens设置的是生成上限，并不表示一定会生成这么多的token数量。
+      en_US: It is used to limit the number of tokens generated by the model. max_tokens sets the upper limit of generation, which does not mean that so many tokens will be generated.
+  - name: top_k
+    label:
+      zh_Hans: 取样数量
+      en_US: Top k
+    type: int
+    help:
+      zh_Hans: 生成时，采样候选集的大小。例如，取值为50时，仅将单次生成中得分最高的50个token组成随机采样的候选集。取值越大，生成的随机性越高；取值越小，生成的确定性越高。默认不传递该参数，取值为None或当top_k大于100时，表示不启用top_k策略，此时，仅有top_p策略生效。
+      en_US: The size of the candidate set sampled from during generation. For example, when the value is 50, only the 50 highest-scoring tokens in a single generation step form the candidate set for random sampling. The larger the value, the more random the generated output; the smaller the value, the more deterministic the generated output. This parameter is not passed by default. When the value is None or top_k is greater than 100, the top_k policy is disabled and only the top_p policy takes effect.
+    required: false
+  - name: seed
+    label:
+      zh_Hans: 随机种子
+      en_US: Random seed
+    type: int
+    help:
+      zh_Hans: 生成时，随机数的种子，用于控制模型生成的随机性。如果使用相同的种子，每次运行生成的结果都将相同；当需要复现模型的生成结果时，可以使用相同的种子。seed参数支持无符号64位整数类型。
+      en_US: When generating, the random number seed is used to control the randomness of model generation. If you use the same seed, the results generated by each run will be the same; when you need to reproduce the results of the model, you can use the same seed. The seed parameter supports unsigned 64-bit integer types.
+    required: false
+  - name: repetition_penalty
+    label:
+      en_US: Repetition penalty
+    type: float
+    default: 1.1
+    help:
+      zh_Hans: 用于控制模型生成时的重复度。提高repetition_penalty时可以降低模型生成的重复度。1.0表示不做惩罚。
+      en_US: Used to control the repetition of model generation. Increasing the repetition_penalty can reduce the repetition of model generation. 1.0 means no punishment.
+    required: false
--- a/api/core/model_runtime/model_providers/tongyi/llm/qwen-max.yaml
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-max.yaml
+model: qwen-max
+label:
+  en_US: qwen-max
+model_type: llm
+model_properties:
+  mode: chat
+  context_size: 8192
+parameter_rules:
+  - name: temperature
+    use_template: temperature
+    default: 1.0
+    min: 0.0
+    max: 2.0
+    help:
+      zh_Hans: 用于控制随机性和多样性的程度。具体来说，temperature值控制了生成文本时对每个候选词的概率分布进行平滑的程度。较高的temperature值会降低概率分布的峰值，使得更多的低概率词被选择，生成结果更加多样化；而较低的temperature值则会增强概率分布的峰值，使得高概率词更容易被选择，生成结果更加确定。
+      en_US: Used to control the degree of randomness and diversity. Specifically, the temperature value controls the degree to which the probability distribution of each candidate word is smoothed when generating text. A higher temperature value will reduce the peak value of the probability distribution, allowing more low-probability words to be selected, and the generated results will be more diverse; while a lower temperature value will enhance the peak value of the probability distribution, making it easier for high-probability words to be selected, and the generated results will be more deterministic.
+  - name: top_p
+    use_template: top_p
+    default: 0.8
+    help:
+      zh_Hans: 生成过程中核采样方法概率阈值，例如，取值为0.8时，仅保留概率加起来大于等于0.8的最可能token的最小集合作为候选集。取值范围为（0,1.0)，取值越大，生成的随机性越高；取值越低，生成的确定性越高。
+      en_US: The probability threshold of the nucleus sampling method during the generation process. For example, when the value is 0.8, only the smallest set of the most likely tokens with a sum of probabilities greater than or equal to 0.8 is retained as the candidate set. The value range is (0,1.0). The larger the value, the more random the generated output; the smaller the value, the more deterministic the generated output.
+  - name: max_tokens
+    use_template: max_tokens
+    default: 1500
+    min: 1
+    max: 6000
+    help:
+      zh_Hans: 用于限制模型生成token的数量，max_tokens设置的是生成上限，并不表示一定会生成这么多的token数量。
+      en_US: It is used to limit the number of tokens generated by the model. max_tokens sets the upper limit of generation, which does not mean that so many tokens will be generated.
+  - name: top_k
+    label:
+      zh_Hans: 取样数量
+      en_US: Top k
+    type: int
+    help:
+      zh_Hans: 生成时，采样候选集的大小。例如，取值为50时，仅将单次生成中得分最高的50个token组成随机采样的候选集。取值越大，生成的随机性越高；取值越小，生成的确定性越高。默认不传递该参数，取值为None或当top_k大于100时，表示不启用top_k策略，此时，仅有top_p策略生效。
+      en_US: The size of the candidate set sampled from during generation. For example, when the value is 50, only the 50 highest-scoring tokens in a single generation step form the candidate set for random sampling. The larger the value, the more random the generated output; the smaller the value, the more deterministic the generated output. This parameter is not passed by default. When the value is None or top_k is greater than 100, the top_k policy is disabled and only the top_p policy takes effect.
+    required: false
+  - name: seed
+    label:
+      zh_Hans: 随机种子
+      en_US: Random seed
+    type: int
+    help:
+      zh_Hans: 生成时，随机数的种子，用于控制模型生成的随机性。如果使用相同的种子，每次运行生成的结果都将相同；当需要复现模型的生成结果时，可以使用相同的种子。seed参数支持无符号64位整数类型。
+      en_US: When generating, the random number seed is used to control the randomness of model generation. If you use the same seed, the results generated by each run will be the same; when you need to reproduce the results of the model, you can use the same seed. The seed parameter supports unsigned 64-bit integer types.
+    required: false
+  - name: repetition_penalty
+    label:
+      en_US: Repetition penalty
+    type: float
+    default: 1.1
+    help:
+      zh_Hans: 用于控制模型生成时的重复度。提高repetition_penalty时可以降低模型生成的重复度。1.0表示不做惩罚。
+      en_US: Used to control the repetition of model generation. Increasing the repetition_penalty can reduce the repetition of model generation. 1.0 means no punishment.
+    required: false
--- a/api/core/model_runtime/model_providers/tongyi/llm/qwen-plus.yaml
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-plus.yaml
@@ -24,7 +24,7 @@ parameter_rules:
    use_template: max_tokens
    default: 2000
    min: 1
-    max: 2000
+    max: 30000
    help:
      zh_Hans: 用于限制模型生成token的数量，max_tokens设置的是生成上限，并不表示一定会生成这么多的token数量。
      en_US: It is used to limit the number of tokens generated by the model. max_tokens sets the upper limit of generation, which does not mean that so many tokens will be generated.
@@ -42,10 +42,9 @@ parameter_rules:
      zh_Hans: 随机种子
      en_US: Random seed
    type: int
-    default: 1234
    help:
-      zh_Hans: 生成时，随机数的种子，用于控制模型生成的随机性。如果使用相同的种子，每次运行生成的结果都将相同；当需要复现模型的生成结果时，可以使用相同的种子。seed参数支持无符号64位整数类型。默认值 1234。
-      en_US: When generating, the random number seed is used to control the randomness of model generation. If you use the same seed, the results generated by each run will be the same; when you need to reproduce the results of the model, you can use the same seed. The seed parameter supports unsigned 64-bit integer types. Default value 1234.
+      zh_Hans: 生成时，随机数的种子，用于控制模型生成的随机性。如果使用相同的种子，每次运行生成的结果都将相同；当需要复现模型的生成结果时，可以使用相同的种子。seed参数支持无符号64位整数类型。
+      en_US: When generating, the random number seed is used to control the randomness of model generation. If you use the same seed, the results generated by each run will be the same; when you need to reproduce the results of the model, you can use the same seed. The seed parameter supports unsigned 64-bit integer types.
    required: false
  - name: repetition_penalty
    label:
@@ -55,3 +54,8 @@ parameter_rules:
    help:
      zh_Hans: 用于控制模型生成时的重复度。提高repetition_penalty时可以降低模型生成的重复度。1.0表示不做惩罚。
      en_US: Used to control the repetition of model generation. Increasing the repetition_penalty can reduce the repetition of model generation. 1.0 means no punishment.
+pricing:
+  input: '0.02'
+  output: '0.02'
+  unit: '0.001'
+  currency: RMB
--- a/api/core/model_runtime/model_providers/tongyi/llm/qwen-turbo.yaml
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-turbo.yaml
@@ -24,7 +24,7 @@ parameter_rules:
    use_template: max_tokens
    default: 1500
    min: 1
-    max: 1500
+    max: 6000
    help:
      zh_Hans: 用于限制模型生成token的数量，max_tokens设置的是生成上限，并不表示一定会生成这么多的token数量。
      en_US: It is used to limit the number of tokens generated by the model. max_tokens sets the upper limit of generation, which does not mean that so many tokens will be generated.
@@ -42,10 +42,9 @@ parameter_rules:
      zh_Hans: 随机种子
      en_US: Random seed
    type: int
-    default: 1234
    help:
-      zh_Hans: 生成时，随机数的种子，用于控制模型生成的随机性。如果使用相同的种子，每次运行生成的结果都将相同；当需要复现模型的生成结果时，可以使用相同的种子。seed参数支持无符号64位整数类型。默认值 1234。
-      en_US: When generating, the random number seed is used to control the randomness of model generation. If you use the same seed, the results generated by each run will be the same; when you need to reproduce the results of the model, you can use the same seed. The seed parameter supports unsigned 64-bit integer types. Default value 1234.
+      zh_Hans: 生成时，随机数的种子，用于控制模型生成的随机性。如果使用相同的种子，每次运行生成的结果都将相同；当需要复现模型的生成结果时，可以使用相同的种子。seed参数支持无符号64位整数类型。
+      en_US: When generating, the random number seed is used to control the randomness of model generation. If you use the same seed, the results generated by each run will be the same; when you need to reproduce the results of the model, you can use the same seed. The seed parameter supports unsigned 64-bit integer types.
    required: false
  - name: repetition_penalty
    label:
@@ -56,3 +55,8 @@ parameter_rules:
      zh_Hans: 用于控制模型生成时的重复度。提高repetition_penalty时可以降低模型生成的重复度。1.0表示不做惩罚。
      en_US: Used to control the repetition of model generation. Increasing the repetition_penalty can reduce the repetition of model generation. 1.0 means no punishment.
    required: false
+pricing:
+  input: '0.008'
+  output: '0.008'
+  unit: '0.001'
+  currency: RMB
--- a/api/requirements.txt
+++ b/api/requirements.txt
@@ -44,7 +44,7 @@ readabilipy==0.2.0
 google-search-results==2.4.2
 replicate~=0.22.0
 websocket-client~=1.7.0
-dashscope~=1.13.5
+dashscope[tokenizer]~=1.14.0
 huggingface_hub~=0.16.4
 transformers~=4.31.0
 pandas==1.5.3