Unverified commit 6cf93379, authored by takatost, committed by GitHub

fix: split chunks return empty strings (#2197)

parent 8639abec
...@@ -655,6 +655,8 @@ class IndexingRunner: ...@@ -655,6 +655,8 @@ class IndexingRunner:
else: else:
page_content = page_content page_content = page_content
document_node.page_content = page_content document_node.page_content = page_content
if document_node.page_content:
split_documents.append(document_node) split_documents.append(document_node)
all_documents.extend(split_documents) all_documents.extend(split_documents)
# processing qa document # processing qa document
......
import base64 import base64
import copy import copy
import time import time
from typing import Optional, Tuple from typing import Optional, Tuple, Union
import numpy as np import numpy as np
import tiktoken import tiktoken
...@@ -76,7 +76,7 @@ class AzureOpenAITextEmbeddingModel(_CommonAzureOpenAI, TextEmbeddingModel): ...@@ -76,7 +76,7 @@ class AzureOpenAITextEmbeddingModel(_CommonAzureOpenAI, TextEmbeddingModel):
embeddings_batch, embedding_used_tokens = self._embedding_invoke( embeddings_batch, embedding_used_tokens = self._embedding_invoke(
model=model, model=model,
client=client, client=client,
texts=[""], texts="",
extra_model_kwargs=extra_model_kwargs extra_model_kwargs=extra_model_kwargs
) )
...@@ -147,7 +147,7 @@ class AzureOpenAITextEmbeddingModel(_CommonAzureOpenAI, TextEmbeddingModel): ...@@ -147,7 +147,7 @@ class AzureOpenAITextEmbeddingModel(_CommonAzureOpenAI, TextEmbeddingModel):
return ai_model_entity.entity return ai_model_entity.entity
@staticmethod @staticmethod
def _embedding_invoke(model: str, client: AzureOpenAI, texts: list[str], def _embedding_invoke(model: str, client: AzureOpenAI, texts: Union[list[str], str],
extra_model_kwargs: dict) -> Tuple[list[list[float]], int]: extra_model_kwargs: dict) -> Tuple[list[list[float]], int]:
response = client.embeddings.create( response = client.embeddings.create(
input=texts, input=texts,
......
...@@ -76,7 +76,7 @@ class CohereTextEmbeddingModel(TextEmbeddingModel): ...@@ -76,7 +76,7 @@ class CohereTextEmbeddingModel(TextEmbeddingModel):
embeddings_batch, embedding_used_tokens = self._embedding_invoke( embeddings_batch, embedding_used_tokens = self._embedding_invoke(
model=model, model=model,
credentials=credentials, credentials=credentials,
texts=[""] texts=[" "]
) )
used_tokens += embedding_used_tokens used_tokens += embedding_used_tokens
...@@ -131,6 +131,9 @@ class CohereTextEmbeddingModel(TextEmbeddingModel): ...@@ -131,6 +131,9 @@ class CohereTextEmbeddingModel(TextEmbeddingModel):
:param text: text to tokenize :param text: text to tokenize
:return: :return:
""" """
if not text:
return Tokens([], [], {})
# initialize client # initialize client
client = cohere.Client(credentials.get('api_key')) client = cohere.Client(credentials.get('api_key'))
......
import base64 import base64
import time import time
from typing import Optional, Tuple from typing import Optional, Tuple, Union
import numpy as np import numpy as np
import tiktoken import tiktoken
...@@ -89,7 +89,7 @@ class OpenAITextEmbeddingModel(_CommonOpenAI, TextEmbeddingModel): ...@@ -89,7 +89,7 @@ class OpenAITextEmbeddingModel(_CommonOpenAI, TextEmbeddingModel):
embeddings_batch, embedding_used_tokens = self._embedding_invoke( embeddings_batch, embedding_used_tokens = self._embedding_invoke(
model=model, model=model,
client=client, client=client,
texts=[""], texts="",
extra_model_kwargs=extra_model_kwargs extra_model_kwargs=extra_model_kwargs
) )
...@@ -160,7 +160,7 @@ class OpenAITextEmbeddingModel(_CommonOpenAI, TextEmbeddingModel): ...@@ -160,7 +160,7 @@ class OpenAITextEmbeddingModel(_CommonOpenAI, TextEmbeddingModel):
except Exception as ex: except Exception as ex:
raise CredentialsValidateFailedError(str(ex)) raise CredentialsValidateFailedError(str(ex))
def _embedding_invoke(self, model: str, client: OpenAI, texts: list[str], def _embedding_invoke(self, model: str, client: OpenAI, texts: Union[list[str], str],
extra_model_kwargs: dict) -> Tuple[list[list[float]], int]: extra_model_kwargs: dict) -> Tuple[list[list[float]], int]:
""" """
Invoke embedding model Invoke embedding model
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment