Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
D
dify
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ai-tech
dify
Commits
9b52050b
Commit
9b52050b
authored
Jul 29, 2023
by
jyong
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'feat/improve-qa-dataset-thread' into deploy/dev

# Conflicts:
#	api/core/indexing_runner.py
parents
b38115b4
bacd59ae
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
42 additions
and
39 deletions
+42
-39
structured_chat.py
api/core/agent/agent/structured_chat.py
+7
-2
indexing_runner.py
api/core/indexing_runner.py
+35
-37
No files found.
api/core/agent/agent/structured_chat.py
View file @
9b52050b
...
...
@@ -9,7 +9,7 @@ from langchain.callbacks.base import BaseCallbackManager
from
langchain.callbacks.manager
import
Callbacks
from
langchain.memory.summary
import
SummarizerMixin
from
langchain.prompts
import
SystemMessagePromptTemplate
,
HumanMessagePromptTemplate
,
ChatPromptTemplate
from
langchain.schema
import
AgentAction
,
AgentFinish
,
AIMessage
,
HumanMessage
from
langchain.schema
import
AgentAction
,
AgentFinish
,
AIMessage
,
HumanMessage
,
OutputParserException
from
langchain.tools
import
BaseTool
from
langchain.agents.structured_chat.prompt
import
PREFIX
,
SUFFIX
...
...
@@ -94,7 +94,12 @@ class AutoSummarizingStructuredChatAgent(StructuredChatAgent, CalcTokenMixin):
full_inputs
=
self
.
summarize_messages
(
intermediate_steps
,
**
kwargs
)
full_output
=
self
.
llm_chain
.
predict
(
callbacks
=
callbacks
,
**
full_inputs
)
return
self
.
output_parser
.
parse
(
full_output
)
try
:
return
self
.
output_parser
.
parse
(
full_output
)
except
OutputParserException
:
return
AgentFinish
({
"output"
:
"I'm sorry, the answer of model is invalid, "
"I don't know how to respond to that."
},
""
)
def
summarize_messages
(
self
,
intermediate_steps
:
List
[
Tuple
[
AgentAction
,
str
]],
**
kwargs
):
if
len
(
intermediate_steps
)
>=
2
:
...
...
api/core/indexing_runner.py
View file @
9b52050b
...
...
@@ -494,6 +494,7 @@ class IndexingRunner:
Split the text documents into nodes.
"""
all_documents
=
[]
all_qa_documents
=
[]
for
text_doc
in
text_docs
:
# document clean
document_text
=
self
.
_document_clean
(
text_doc
.
page_content
,
processing_rule
)
...
...
@@ -502,59 +503,56 @@ class IndexingRunner:
# parse document to nodes
documents
=
splitter
.
split_documents
([
text_doc
])
split_documents
=
[]
for
document_node
in
documents
:
doc_id
=
str
(
uuid
.
uuid4
())
hash
=
helper
.
generate_text_hash
(
document_node
.
page_content
)
document_node
.
metadata
[
'doc_id'
]
=
doc_id
document_node
.
metadata
[
'doc_hash'
]
=
hash
split_documents
.
append
(
document_node
)
all_documents
.
extend
(
split_documents
)
# processing qa document
if
document_form
==
'qa_model'
:
llm
:
StreamableOpenAI
=
LLMBuilder
.
to_llm
(
tenant_id
=
tenant_id
,
model_name
=
'gpt-3.5-turbo'
,
max_tokens
=
2000
)
for
i
in
range
(
0
,
len
(
documents
),
10
):
for
i
in
range
(
0
,
len
(
all_
documents
),
10
):
threads
=
[]
sub_documents
=
documents
[
i
:
i
+
10
]
sub_documents
=
all_
documents
[
i
:
i
+
10
]
for
doc
in
sub_documents
:
document_format_thread
=
threading
.
Thread
(
target
=
self
.
format_document
,
kwargs
=
{
'llm'
:
llm
,
'document_node'
:
doc
,
'split_documents'
:
split_documents
,
'document_form'
:
document_form
})
document_format_thread
=
threading
.
Thread
(
target
=
self
.
format_qa_document
,
kwargs
=
{
'llm'
:
llm
,
'document_node'
:
doc
,
'all_qa_documents'
:
all_qa_documents
})
threads
.
append
(
document_format_thread
)
document_format_thread
.
start
()
for
thread
in
threads
:
thread
.
join
()
all_documents
.
extend
(
split_documents
)
return
all_qa_documents
return
all_documents
def format_document(self, llm: StreamableOpenAI, document_node, split_documents, document_form: str):
    # NOTE(review): the body is only a debug print of the node's raw text —
    # this looks like a leftover stub after the merge (real formatting work
    # now appears to live in format_qa_document). Confirm whether any caller
    # still relies on this, and whether the print should be removed or
    # replaced with logging.
    print(document_node.page_content)
def format_qa_document(self, llm: StreamableOpenAI, document_node, all_qa_documents):
    """Generate Q&A variants of one document node and collect them into
    ``all_qa_documents``.

    Runs as a ``threading.Thread`` target, so it returns ``None`` and
    communicates its result solely by extending the shared
    ``all_qa_documents`` list (a single ``list.extend`` call at the end,
    which is atomic under CPython's GIL — presumably why no lock is used;
    TODO confirm if ported to a free-threaded runtime).

    :param llm: model used by ``LLMGenerator.generate_qa_document_sync``
    :param document_node: source document; its metadata is copied onto each
        generated Q&A document
    :param all_qa_documents: shared output list mutated in place
    """
    format_documents = []
    # Skip empty/whitespace-only nodes — nothing to generate from.
    if document_node.page_content is None or not document_node.page_content.strip():
        return
    try:
        # Ask the model for question/answer pairs, then parse its raw
        # response into a list of {'question': ..., 'answer': ...} dicts.
        response = LLMGenerator.generate_qa_document_sync(llm, document_node.page_content)
        document_qa_list = self.format_split_text(response)
        qa_documents = []
        for result in document_qa_list:
            # The question becomes the indexed content; the answer travels
            # along in metadata together with a fresh id/hash pair.
            qa_document = Document(page_content=result['question'],
                                   metadata=document_node.metadata.copy())
            doc_id = str(uuid.uuid4())
            # renamed from `hash`, which shadowed the builtin
            doc_hash = helper.generate_text_hash(result['question'])
            qa_document.metadata['answer'] = result['answer']
            qa_document.metadata['doc_id'] = doc_id
            qa_document.metadata['doc_hash'] = doc_hash
            qa_documents.append(qa_document)
        format_documents.extend(qa_documents)
    except Exception:
        # Best-effort per node: a failed generation must not kill the worker
        # thread or the whole batch. logging.exception keeps the traceback
        # (logging.error(str(e)) discarded it).
        logging.exception("QA document generation failed")
    all_qa_documents.extend(format_documents)
def
_split_to_documents_for_estimate
(
self
,
text_docs
:
List
[
Document
],
splitter
:
TextSplitter
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment