Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
D
dify
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ai-tech
dify
Commits
815f794e
Unverified
Commit
815f794e
authored
May 16, 2023
by
John Wang
Committed by
GitHub
May 16, 2023
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
feat: optimize split rule when use custom split segment identifier (#35)
parent
3117619e
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
73 additions
and
6 deletions
+73
-6
fixed_text_splitter.py
api/core/index/spiltter/fixed_text_splitter.py
+68
-0
indexing_runner.py
api/core/indexing_runner.py
+5
-6
No files found.
api/core/index/spiltter/fixed_text_splitter.py
0 → 100644
View file @
815f794e
"""Functionality for splitting text."""
from
__future__
import
annotations
from
typing
import
(
Any
,
List
,
Optional
,
)
from
langchain.text_splitter
import
RecursiveCharacterTextSplitter
class FixedRecursiveCharacterTextSplitter(RecursiveCharacterTextSplitter):
    """Recursive character splitter that cuts on a fixed separator first.

    The incoming text is split up front on ``fixed_separator`` (the
    user-chosen segment identifier).  Any resulting piece whose measured
    length exceeds the configured chunk size is then broken down further
    with the usual cascade of fallback separators.
    """

    def __init__(self, fixed_separator: str = "\n\n",
                 separators: Optional[List[str]] = None,
                 **kwargs: Any):
        """Create a new TextSplitter.

        :param fixed_separator: separator applied once, before any recursion.
        :param separators: ordered fallback separators used for oversized
            pieces; defaults to paragraph, newline, space, then per-character.
        :param kwargs: forwarded to ``RecursiveCharacterTextSplitter``.
        """
        super().__init__(**kwargs)
        self._fixed_separator = fixed_separator
        self._separators = separators or ["\n\n", "\n", " ", ""]

    def split_text(self, text: str) -> List[str]:
        """Split incoming text and return chunks."""
        # An empty fixed separator degenerates to per-character splitting.
        if self._fixed_separator:
            pieces = text.split(self._fixed_separator)
        else:
            pieces = list(text)

        result: List[str] = []
        for piece in pieces:
            if self._length_function(piece) > self._chunk_size:
                # Piece is too long for one chunk: fall back to the cascade.
                result.extend(self.recursive_split_text(piece))
            else:
                result.append(piece)
        return result

    def recursive_split_text(self, text: str) -> List[str]:
        """Split incoming text and return chunks."""
        # Choose the first separator that applies; "" always matches and
        # forces character-level splitting, and is also the final fallback.
        chosen = self._separators[-1]
        for candidate in self._separators:
            if candidate == "" or candidate in text:
                chosen = candidate
                break

        parts = text.split(chosen) if chosen else list(text)

        # Merge runs of small parts into chunks; recurse on oversized ones.
        result: List[str] = []
        pending: List[str] = []  # small parts awaiting a merge
        for part in parts:
            if self._length_function(part) < self._chunk_size:
                pending.append(part)
                continue
            if pending:
                # Flush the accumulated small parts before the big one.
                result.extend(self._merge_splits(pending, chosen))
                pending = []
            result.extend(self.recursive_split_text(part))
        if pending:
            result.extend(self._merge_splits(pending, chosen))
        return result
api/core/indexing_runner.py
View file @
815f794e
...
...
@@ -18,6 +18,7 @@ from core.docstore.dataset_docstore import DatesetDocumentStore
from
core.index.keyword_table_index
import
KeywordTableIndex
from
core.index.readers.html_parser
import
HTMLParser
from
core.index.readers.pdf_parser
import
PDFParser
from
core.index.spiltter.fixed_text_splitter
import
FixedRecursiveCharacterTextSplitter
from
core.index.vector_index
import
VectorIndex
from
core.llm.token_calculator
import
TokenCalculator
from
extensions.ext_database
import
db
...
...
@@ -267,16 +268,14 @@ class IndexingRunner:
raise
ValueError
(
"Custom segment length should be between 50 and 1000."
)
separator
=
segmentation
[
"separator"
]
if
not
separator
:
separators
=
[
"
\n\n
"
,
"。"
,
"."
,
" "
,
""
]
else
:
if
separator
:
separator
=
separator
.
replace
(
'
\\
n'
,
'
\n
'
)
separators
=
[
separator
,
""
]
character_splitter
=
RecursiveCharacterTextSplitter
.
from_tiktoken_encoder
(
character_splitter
=
Fixed
RecursiveCharacterTextSplitter
.
from_tiktoken_encoder
(
chunk_size
=
segmentation
[
"max_tokens"
],
chunk_overlap
=
0
,
separators
=
separators
fixed_separator
=
separator
,
separators
=
[
"
\n\n
"
,
"。"
,
"."
,
" "
,
""
]
)
else
:
# Automatic segmentation
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment