Unverified Commit 9ae91a2e authored by takatost, committed by GitHub

feat: optimize xinference request max token key and stop reason (#998)

parent 276d3d10
@@ -2,7 +2,6 @@ import json
 from typing import Type

 import requests
-from xinference.client import RESTfulGenerateModelHandle, RESTfulChatModelHandle, RESTfulChatglmCppChatModelHandle

 from core.helper import encrypter
 from core.model_providers.models.embedding.xinference_embedding import XinferenceEmbedding
@@ -73,7 +72,7 @@ class XinferenceProvider(BaseModelProvider):
             top_p=KwargRule[float](min=0, max=1, default=0.7),
             presence_penalty=KwargRule[float](enabled=False),
             frequency_penalty=KwargRule[float](enabled=False),
-            max_tokens=KwargRule[int](alias='max_new_tokens', min=10, max=4000, default=256),
+            max_tokens=KwargRule[int](min=10, max=4000, default=256),
         )
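The removed `alias='max_new_tokens'` is the "max token key" half of this commit: with the alias gone, the `max_tokens` kwarg is sent to Xinference under its own name, the key its generate config expects. A minimal sketch of how an alias rule like this presumably remaps kwargs before the request is sent (`KwargRule` is simplified here, and `to_provider_kwargs` is a hypothetical helper, not Dify's actual code):

```python
# Sketch only: a simplified KwargRule and a hypothetical remapping helper,
# illustrating why dropping alias='max_new_tokens' changes the request key.
from dataclasses import dataclass
from typing import Optional

@dataclass
class KwargRule:
    alias: Optional[str] = None
    min: Optional[int] = None
    max: Optional[int] = None
    default: Optional[int] = None
    enabled: bool = True

def to_provider_kwargs(rules: dict[str, KwargRule], kwargs: dict) -> dict:
    """Rename each kwarg to its provider-specific alias, if one is set."""
    out = {}
    for name, value in kwargs.items():
        rule = rules.get(name)
        if rule is None or not rule.enabled:
            continue
        out[rule.alias or name] = value
    return out

rules = {'max_tokens': KwargRule(min=10, max=4000, default=256)}
# With the alias removed, Xinference receives 'max_tokens' unchanged:
print(to_provider_kwargs(rules, {'max_tokens': 256}))  # {'max_tokens': 256}
```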
@@ -89,13 +89,13 @@ class XinferenceLLM(Xinference):
         return completion

     def _stream_generate(
             self,
             model: Union["RESTfulGenerateModelHandle", "RESTfulChatModelHandle", "RESTfulChatglmCppChatModelHandle"],
             prompt: str,
             run_manager: Optional[CallbackManagerForLLMRun] = None,
-            generate_config: Optional[Union["LlamaCppGenerateConfig", "PytorchGenerateConfig", "ChatglmCppGenerateConfig"]] = None,
+            generate_config: Optional[
+                Union["LlamaCppGenerateConfig", "PytorchGenerateConfig", "ChatglmCppGenerateConfig"]] = None,
     ) -> Generator[str, None, None]:
         """
         Args:
@@ -123,6 +123,10 @@ class XinferenceLLM(Xinference):
                 if choices:
                     choice = choices[0]
                     if isinstance(choice, dict):
+                        if 'finish_reason' in choice and choice['finish_reason'] \
+                                and choice['finish_reason'] in ['stop', 'length']:
+                            break
+
                         if 'text' in choice:
                             token = choice.get("text", "")
                         elif 'delta' in choice and 'content' in choice['delta']:
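The added `finish_reason` check is the "stop reason" half of the commit: once a streamed chunk's first choice reports `'stop'` (natural end) or `'length'` (token limit reached), the generator stops consuming chunks instead of yielding trailing empty tokens. A self-contained sketch of the same pattern (the chunk dicts are illustrative stand-ins, not captured Xinference output):

```python
# Sketch only: the stop-reason check from the diff applied to a stream of
# chunk dicts shaped like the ones _stream_generate iterates over.
from typing import Generator

def stream_tokens(chunks) -> Generator[str, None, None]:
    for chunk in chunks:
        choices = chunk.get("choices", [])
        if not choices:
            continue
        choice = choices[0]
        if isinstance(choice, dict):
            # Stop once the model reports it is done ('stop') or has hit the
            # token limit ('length'), mirroring the added break above.
            if choice.get("finish_reason") in ("stop", "length"):
                break
            if "text" in choice:
                yield choice.get("text", "")
            elif "delta" in choice and "content" in choice["delta"]:
                yield choice["delta"]["content"]

chunks = [
    {"choices": [{"text": "Hello"}]},
    {"choices": [{"text": ", world"}]},
    {"choices": [{"text": "", "finish_reason": "stop"}]},
]
print("".join(stream_tokens(chunks)))  # Hello, world
```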