Unverified commit 8ba38e8e, authored by Jyong, committed by GitHub

fix overlap and splitter optimization (#2742)

Co-authored-by: jyong <jyong@dify.ai>
parent b1635457
...@@ -52,7 +52,7 @@ class BaseIndexProcessor(ABC): ...@@ -52,7 +52,7 @@ class BaseIndexProcessor(ABC):
character_splitter = FixedRecursiveCharacterTextSplitter.from_encoder( character_splitter = FixedRecursiveCharacterTextSplitter.from_encoder(
chunk_size=segmentation["max_tokens"], chunk_size=segmentation["max_tokens"],
chunk_overlap=0, chunk_overlap=segmentation.get('chunk_overlap', 0),
fixed_separator=separator, fixed_separator=separator,
separators=["\n\n", "。", ".", " ", ""], separators=["\n\n", "。", ".", " ", ""],
embedding_model_instance=embedding_model_instance embedding_model_instance=embedding_model_instance
...@@ -61,7 +61,7 @@ class BaseIndexProcessor(ABC): ...@@ -61,7 +61,7 @@ class BaseIndexProcessor(ABC):
# Automatic segmentation # Automatic segmentation
character_splitter = EnhanceRecursiveCharacterTextSplitter.from_encoder( character_splitter = EnhanceRecursiveCharacterTextSplitter.from_encoder(
chunk_size=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['max_tokens'], chunk_size=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['max_tokens'],
chunk_overlap=0, chunk_overlap=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['chunk_overlap'],
separators=["\n\n", "。", ".", " ", ""], separators=["\n\n", "。", ".", " ", ""],
embedding_model_instance=embedding_model_instance embedding_model_instance=embedding_model_instance
) )
......
...@@ -30,7 +30,7 @@ def _split_text_with_regex( ...@@ -30,7 +30,7 @@ def _split_text_with_regex(
if separator: if separator:
if keep_separator: if keep_separator:
# The parentheses in the pattern keep the delimiters in the result. # The parentheses in the pattern keep the delimiters in the result.
_splits = re.split(f"({separator})", text) _splits = re.split(f"({re.escape(separator)})", text)
splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)] splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]
if len(_splits) % 2 == 0: if len(_splits) % 2 == 0:
splits += _splits[-1:] splits += _splits[-1:]
...@@ -94,7 +94,7 @@ class TextSplitter(BaseDocumentTransformer, ABC): ...@@ -94,7 +94,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):
documents.append(new_doc) documents.append(new_doc)
return documents return documents
def split_documents(self, documents: Iterable[Document]) -> list[Document]: def split_documents(self, documents: Iterable[Document] ) -> list[Document]:
"""Split documents.""" """Split documents."""
texts, metadatas = [], [] texts, metadatas = [], []
for doc in documents: for doc in documents:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment