Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
D
dify
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ai-tech
dify
Commits
bacd59ae
Commit
bacd59ae
authored
Jul 29, 2023
by
jyong
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add qa thread control
parent
626c78a6
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
35 additions
and
36 deletions
+35
-36
indexing_runner.py
api/core/indexing_runner.py
+35
-36
No files found.
api/core/indexing_runner.py
View file @
bacd59ae
...
@@ -494,6 +494,7 @@ class IndexingRunner:
...
@@ -494,6 +494,7 @@ class IndexingRunner:
Split the text documents into nodes.
Split the text documents into nodes.
"""
"""
all_documents
=
[]
all_documents
=
[]
all_qa_documents
=
[]
for
text_doc
in
text_docs
:
for
text_doc
in
text_docs
:
# document clean
# document clean
document_text
=
self
.
_document_clean
(
text_doc
.
page_content
,
processing_rule
)
document_text
=
self
.
_document_clean
(
text_doc
.
page_content
,
processing_rule
)
...
@@ -502,41 +503,38 @@ class IndexingRunner:
...
@@ -502,41 +503,38 @@ class IndexingRunner:
# parse document to nodes
# parse document to nodes
documents
=
splitter
.
split_documents
([
text_doc
])
documents
=
splitter
.
split_documents
([
text_doc
])
split_documents
=
[]
split_documents
=
[]
for
document_node
in
documents
:
doc_id
=
str
(
uuid
.
uuid4
())
hash
=
helper
.
generate_text_hash
(
document_node
.
page_content
)
document_node
.
metadata
[
'doc_id'
]
=
doc_id
document_node
.
metadata
[
'doc_hash'
]
=
hash
split_documents
.
append
(
document_node
)
all_documents
.
extend
(
split_documents
)
# processing qa document
if
document_form
==
'qa_model'
:
llm
:
StreamableOpenAI
=
LLMBuilder
.
to_llm
(
llm
:
StreamableOpenAI
=
LLMBuilder
.
to_llm
(
tenant_id
=
tenant_id
,
tenant_id
=
tenant_id
,
model_name
=
'gpt-3.5-turbo'
,
model_name
=
'gpt-3.5-turbo'
,
max_tokens
=
2000
max_tokens
=
2000
)
)
for
i
in
range
(
0
,
len
(
documents
),
10
):
for
i
in
range
(
0
,
len
(
all_
documents
),
10
):
threads
=
[]
threads
=
[]
sub_documents
=
documents
[
i
:
i
+
10
]
sub_documents
=
all_
documents
[
i
:
i
+
10
]
for
doc
in
sub_documents
:
for
doc
in
sub_documents
:
document_format_thread
=
threading
.
Thread
(
target
=
self
.
format_document
,
kwargs
=
{
document_format_thread
=
threading
.
Thread
(
target
=
self
.
format_qa_document
,
kwargs
=
{
'llm'
:
llm
,
'document_node'
:
doc
,
'split_documents'
:
split_documents
,
'llm'
:
llm
,
'document_node'
:
doc
,
'all_qa_documents'
:
all_qa_documents
})
'document_form'
:
document_form
})
threads
.
append
(
document_format_thread
)
threads
.
append
(
document_format_thread
)
document_format_thread
.
start
()
document_format_thread
.
start
()
for
thread
in
threads
:
for
thread
in
threads
:
thread
.
join
()
thread
.
join
()
return
all_qa_documents
all_documents
.
extend
(
split_documents
)
return
all_documents
return
all_documents
def
format_
document
(
self
,
llm
:
StreamableOpenAI
,
document_node
,
split_documents
,
document_form
:
str
):
def
format_
qa_document
(
self
,
llm
:
StreamableOpenAI
,
document_node
,
all_qa_documents
):
format_documents
=
[]
format_documents
=
[]
if
document_node
.
page_content
is
None
or
not
document_node
.
page_content
.
strip
():
if
document_node
.
page_content
is
None
or
not
document_node
.
page_content
.
strip
():
return
format_documents
return
if
document_form
==
'text_model'
:
# text model document
doc_id
=
str
(
uuid
.
uuid4
())
hash
=
helper
.
generate_text_hash
(
document_node
.
page_content
)
document_node
.
metadata
[
'doc_id'
]
=
doc_id
document_node
.
metadata
[
'doc_hash'
]
=
hash
format_documents
.
append
(
document_node
)
elif
document_form
==
'qa_model'
:
try
:
try
:
# qa model document
# qa model document
response
=
LLMGenerator
.
generate_qa_document_sync
(
llm
,
document_node
.
page_content
)
response
=
LLMGenerator
.
generate_qa_document_sync
(
llm
,
document_node
.
page_content
)
...
@@ -553,7 +551,8 @@ class IndexingRunner:
...
@@ -553,7 +551,8 @@ class IndexingRunner:
format_documents
.
extend
(
qa_documents
)
format_documents
.
extend
(
qa_documents
)
except
Exception
as
e
:
except
Exception
as
e
:
logging
.
error
(
str
(
e
))
logging
.
error
(
str
(
e
))
split_documents
.
extend
(
format_documents
)
all_qa_documents
.
extend
(
format_documents
)
def
_split_to_documents_for_estimate
(
self
,
text_docs
:
List
[
Document
],
splitter
:
TextSplitter
,
def
_split_to_documents_for_estimate
(
self
,
text_docs
:
List
[
Document
],
splitter
:
TextSplitter
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment