fix overlap and splitter optimization (#2742)

Co-authored-by: jyong <jyong@dify.ai>

fix overlap and splitter optimization (#2742)
Co-authored-by: jyong <jyong@dify.ai>
8ba38e8e · Jyong · GitHub · b1635457 · 8ba38e8e · 8ba38e8e
Unverified commit 8ba38e8e, authored Mar 07, 2024 by Jyong; committed by GitHub on Mar 07, 2024
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 4 additions and 4 deletions

index_processor_base.py api/core/rag/index_processor/index_processor_base.py +2 -2

text_splitter.py api/core/splitter/text_splitter.py +2 -2

No files found.
--- a/api/core/rag/index_processor/index_processor_base.py
+++ b/api/core/rag/index_processor/index_processor_base.py
@@ -52,7 +52,7 @@ class BaseIndexProcessor(ABC):

            character_splitter = FixedRecursiveCharacterTextSplitter.from_encoder(
                chunk_size=segmentation["max_tokens"],
-                chunk_overlap=0,
+                chunk_overlap=segmentation.get('chunk_overlap', 0),
                fixed_separator=separator,
                separators=["\n\n", "。", ".", " ", ""],
                embedding_model_instance=embedding_model_instance
@@ -61,7 +61,7 @@ class BaseIndexProcessor(ABC):
            # Automatic segmentation
            character_splitter = EnhanceRecursiveCharacterTextSplitter.from_encoder(
                chunk_size=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['max_tokens'],
-                chunk_overlap=0,
+                chunk_overlap=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['chunk_overlap'],
                separators=["\n\n", "。", ".", " ", ""],
                embedding_model_instance=embedding_model_instance
            )

--- a/api/core/splitter/text_splitter.py
+++ b/api/core/splitter/text_splitter.py
@@ -30,7 +30,7 @@ def _split_text_with_regex(
    if separator:
        if keep_separator:
            # The parentheses in the pattern keep the delimiters in the result.
-            _splits = re.split(f"({separator})", text)
+            _splits = re.split(f"({re.escape(separator)})", text)
            splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]
            if len(_splits) % 2 == 0:
                splits += _splits[-1:]
@@ -94,7 +94,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):
                documents.append(new_doc)
        return documents

-    def split_documents(self, documents: Iterable[Document]) -> list[Document]:
+    def split_documents(self, documents: Iterable[Document] ) -> list[Document]:
        """Split documents."""
        texts, metadatas = [], []
        for doc in documents: