Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
D
dify
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ai-tech
dify
Commits
174ebb51
Unverified
Commit
174ebb51
authored
Jul 29, 2023
by
Jyong
Committed by
GitHub
Jul 29, 2023
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add qa thread control (#677)
parent
626c78a6
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
35 additions
and
36 deletions
+35
-36
indexing_runner.py
api/core/indexing_runner.py
+35
-36
No files found.
api/core/indexing_runner.py
View file @
174ebb51
...
...
@@ -494,6 +494,7 @@ class IndexingRunner:
Split the text documents into nodes.
"""
all_documents
=
[]
all_qa_documents
=
[]
for
text_doc
in
text_docs
:
# document clean
document_text
=
self
.
_document_clean
(
text_doc
.
page_content
,
processing_rule
)
...
...
@@ -502,58 +503,56 @@ class IndexingRunner:
# parse document to nodes
documents
=
splitter
.
split_documents
([
text_doc
])
split_documents
=
[]
for
document_node
in
documents
:
doc_id
=
str
(
uuid
.
uuid4
())
hash
=
helper
.
generate_text_hash
(
document_node
.
page_content
)
document_node
.
metadata
[
'doc_id'
]
=
doc_id
document_node
.
metadata
[
'doc_hash'
]
=
hash
split_documents
.
append
(
document_node
)
all_documents
.
extend
(
split_documents
)
# processing qa document
if
document_form
==
'qa_model'
:
llm
:
StreamableOpenAI
=
LLMBuilder
.
to_llm
(
tenant_id
=
tenant_id
,
model_name
=
'gpt-3.5-turbo'
,
max_tokens
=
2000
)
for
i
in
range
(
0
,
len
(
documents
),
10
):
for
i
in
range
(
0
,
len
(
all_
documents
),
10
):
threads
=
[]
sub_documents
=
documents
[
i
:
i
+
10
]
sub_documents
=
all_
documents
[
i
:
i
+
10
]
for
doc
in
sub_documents
:
document_format_thread
=
threading
.
Thread
(
target
=
self
.
format_document
,
kwargs
=
{
'llm'
:
llm
,
'document_node'
:
doc
,
'split_documents'
:
split_documents
,
'document_form'
:
document_form
})
document_format_thread
=
threading
.
Thread
(
target
=
self
.
format_qa_document
,
kwargs
=
{
'llm'
:
llm
,
'document_node'
:
doc
,
'all_qa_documents'
:
all_qa_documents
})
threads
.
append
(
document_format_thread
)
document_format_thread
.
start
()
for
thread
in
threads
:
thread
.
join
()
all_documents
.
extend
(
split_documents
)
return
all_qa_documents
return
all_documents
def
format_
document
(
self
,
llm
:
StreamableOpenAI
,
document_node
,
split_documents
,
document_form
:
str
):
def format_qa_document(self, llm: StreamableOpenAI, document_node, all_qa_documents):
    """Generate question/answer documents from one source document node.

    For each QA pair produced by the LLM, builds a ``Document`` whose
    ``page_content`` is the question and whose metadata carries the
    answer plus fresh ``doc_id``/``doc_hash`` values, then appends the
    batch to the shared ``all_qa_documents`` list.

    Designed to run as a worker inside ``threading.Thread``: it reports
    results only by mutating ``all_qa_documents`` (``list.extend`` is a
    single atomic operation under the GIL) and never raises — a failed
    generation is logged and simply contributes no documents.

    :param llm: LLM instance used for synchronous QA generation
    :param document_node: source Document whose page_content is converted
    :param all_qa_documents: shared output list, mutated in place
    """
    format_documents = []
    # Nothing to generate for missing or whitespace-only content;
    # returning early also skips the final extend (no-op either way).
    if document_node.page_content is None or not document_node.page_content.strip():
        return
    try:
        # qa model document
        response = LLMGenerator.generate_qa_document_sync(llm, document_node.page_content)
        document_qa_list = self.format_split_text(response)
        qa_documents = []
        for result in document_qa_list:
            qa_document = Document(page_content=result['question'],
                                   metadata=document_node.metadata.copy())
            doc_id = str(uuid.uuid4())
            # renamed from `hash` to avoid shadowing the builtin
            text_hash = helper.generate_text_hash(result['question'])
            qa_document.metadata['answer'] = result['answer']
            qa_document.metadata['doc_id'] = doc_id
            qa_document.metadata['doc_hash'] = text_hash
            qa_documents.append(qa_document)
        format_documents.extend(qa_documents)
    except Exception:
        # Best-effort by design: an exception must not kill the worker
        # thread. logging.exception (instead of logging.error(str(e)))
        # preserves the traceback for diagnosis.
        logging.exception('Failed to generate qa document')
    all_qa_documents.extend(format_documents)
def
_split_to_documents_for_estimate
(
self
,
text_docs
:
List
[
Document
],
splitter
:
TextSplitter
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment