Commit 90b22d8c, authored Jul 08, 2023 by jyong (ai-tech/dify)

add qa model

parent 561c9cab

Showing 6 changed files with 64 additions and 7 deletions
api/controllers/console/datasets/datasets_document.py   +2  -0
api/core/data_loader/loader/excel.py                     +1  -1
api/core/generator/llm_generator.py                      +19 -3
api/core/indexing_runner.py                              +31 -3
api/core/prompt/prompts.py                               +10 -0
api/services/completion_service.py                       +1  -0
api/controllers/console/datasets/datasets_document.py

@@ -488,6 +488,8 @@ class DocumentBatchIndexingStatusApi(DocumentResource):
                 DocumentSegment.status != 're_segment'
             ).count()
             document.completed_segments = completed_segments
             document.total_segments = total_segments
+            if document.is_paused:
+                document.indexing_status = 'paused'
             documents_status.append(marshal(document, self.document_status_fields))
         data = {
             'data': documents_status
api/core/data_loader/loader/excel.py

@@ -39,7 +39,7 @@ class ExcelLoader(BaseLoader):
                 row_dict = dict(zip(keys, list(map(str, row))))
                 row_dict = {k: v for k, v in row_dict.items() if v}
                 item = ''.join(f'{k}:{v}\n' for k, v in row_dict.items())
-                document = Document(page_content=item)
+                document = Document(page_content=item, metadata={'source': self._file_path})
                 data.append(document)
         return data
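For context, a minimal self-contained sketch of what the loader now produces per worksheet row; the header, row values, and file name below are invented:

import re
from langchain.schema import Document

keys = ['name', 'price', 'stock']   # hypothetical header row
row = ('Widget', 9.99, '')          # hypothetical data row with one empty cell

row_dict = dict(zip(keys, list(map(str, row))))
row_dict = {k: v for k, v in row_dict.items() if v}        # drop empty cells
item = ''.join(f'{k}:{v}\n' for k, v in row_dict.items())  # "key:value" lines
document = Document(page_content=item, metadata={'source': 'pricing.xlsx'})
print(document.page_content)        # -> "name:Widget\nprice:9.99\n"

The metadata change means every Excel-derived Document now records its originating file, matching what the other loaders already do.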
api/core/generator/llm_generator.py

@@ -2,7 +2,7 @@ import logging
 from langchain import PromptTemplate
 from langchain.chat_models.base import BaseChatModel
-from langchain.schema import HumanMessage, OutputParserException, BaseMessage
+from langchain.schema import HumanMessage, OutputParserException, BaseMessage, SystemMessage
 from core.constant import llm_constant
 from core.llm.llm_builder import LLMBuilder

@@ -12,8 +12,8 @@ from core.prompt.output_parser.rule_config_generator import RuleConfigGeneratorO
 from core.prompt.output_parser.suggested_questions_after_answer import SuggestedQuestionsAfterAnswerOutputParser
 from core.prompt.prompt_template import JinjaPromptTemplate, OutLinePromptTemplate
-from core.prompt.prompts import CONVERSATION_TITLE_PROMPT, CONVERSATION_SUMMARY_PROMPT, INTRODUCTION_GENERATE_PROMPT
+from core.prompt.prompts import CONVERSATION_TITLE_PROMPT, CONVERSATION_SUMMARY_PROMPT, INTRODUCTION_GENERATE_PROMPT, \
+    GENERATOR_QA_PROMPT

 # gpt-3.5-turbo works not well
 generate_base_model = 'text-davinci-003'

@@ -171,3 +171,19 @@ class LLMGenerator:
         }
         return rule_config

+    @classmethod
+    def generate_qa_document(cls, tenant_id: str, query):
+        prompt = GENERATOR_QA_PROMPT
+        llm: StreamableOpenAI = LLMBuilder.to_llm(
+            tenant_id=tenant_id,
+            model_name='gpt-3.5-turbo',
+            max_tokens=1000
+        )
+
+        if isinstance(llm, BaseChatModel):
+            prompt = [SystemMessage(content=prompt), HumanMessage(content=query)]
+
+        response = llm.generate([prompt])
+        answer = response.generations[0][0].text
+        return answer.strip()
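The new classmethod builds a chat-style prompt only when the resolved model is a BaseChatModel; since model_name is hard-coded to 'gpt-3.5-turbo', that branch is effectively always taken (for a plain completion model the user text would never be appended to the prompt). A hedged usage sketch, with a made-up tenant id and text chunk:

# Hypothetical usage; the tenant id and chunk text are invented for illustration.
chunk = (
    "PostgreSQL uses multi-version concurrency control (MVCC), so readers "
    "do not block writers and each transaction sees a consistent snapshot."
)

raw = LLMGenerator.generate_qa_document(tenant_id='tenant-1234', query=chunk)

# Per GENERATOR_QA_PROMPT, `raw` should come back shaped like:
#   Q1: What concurrency mechanism does PostgreSQL use?
#   A1: Multi-version concurrency control (MVCC). ...
#   Q2: ...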
api/core/indexing_runner.py

@@ -16,6 +16,7 @@ from core.data_loader.file_extractor import FileExtractor
 from core.data_loader.loader.notion import NotionLoader
 from core.docstore.dataset_docstore import DatesetDocumentStore
 from core.embedding.cached_embedding import CacheEmbedding
+from core.generator.llm_generator import LLMGenerator
 from core.index.index import IndexBuilder
 from core.index.keyword_table_index.keyword_table_index import KeywordTableIndex, KeywordTableConfig
 from core.index.vector_index.vector_index import VectorIndex

@@ -70,12 +71,18 @@ class IndexingRunner:
                 dataset_document=dataset_document,
                 processing_rule=processing_rule
             )

+            new_documents = []
+            for document in documents:
+                response = LLMGenerator.generate_qa_document(dataset.tenant_id, document.page_content)
+                document_qa_list = self.format_split_text(response)
+                for result in document_qa_list:
+                    document = Document(page_content=result['question'], metadata={'source': result['answer']})
+                    new_documents.append(document)
+
             # build index
             self._build_index(
                 dataset=dataset,
                 dataset_document=dataset_document,
-                documents=documents
+                documents=new_documents
             )
         except DocumentIsPausedException:
             raise DocumentIsPausedException('Document paused, document id: {}'.format(dataset_document.id))

@@ -91,6 +98,22 @@ class IndexingRunner:
             dataset_document.stopped_at = datetime.datetime.utcnow()
             db.session.commit()

+    def format_split_text(self, text):
+        regex = r"Q\d+:\s*(.*?)\s*A\d+:\s*([\s\S]*?)(?=Q|$)"  # regex that matches the Q and A parts
+        matches = re.findall(regex, text, re.MULTILINE)  # collect all matches
+        result = []  # holds the final results
+        for match in matches:
+            q = match[0]
+            a = match[1]
+            if q and a:
+                # only keep pairs where both Q and A are present
+                result.append({
+                    "question": q,
+                    "answer": re.sub(r"\n\s*", "\n", a.strip())
+                })
+        return result
+
     def run_in_splitting_status(self, dataset_document: DatasetDocument):
         """Run the indexing process when the index_status is splitting."""
         try:
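format_split_text recovers structured pairs from the model's "Qn:/An:" text. A standalone run of the same regex on invented model output:

import re

# Demo input only; in the runner, `text` is the LLM response.
text = (
    "Q1: What is MVCC?\n"
    "A1: Multi-version concurrency control.\n"
    "Q2: Who benefits?\n"
    "A2: Readers, since they never block writers.\n"
)

regex = r"Q\d+:\s*(.*?)\s*A\d+:\s*([\s\S]*?)(?=Q|$)"
for q, a in re.findall(regex, text, re.MULTILINE):
    print({"question": q, "answer": a})
# {'question': 'What is MVCC?', 'answer': 'Multi-version concurrency control.'}
# {'question': 'Who benefits?', 'answer': 'Readers, since they never block writers.'}

Note that with re.MULTILINE, `$` matches at each line end, so the lazy answer group stops at the first newline; the later re.sub then normalizes whatever whitespace survives.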
@@ -428,7 +451,7 @@ class IndexingRunner:
         return documents

     def _split_to_documents(self, text_docs: List[Document], splitter: TextSplitter,
-                            processing_rule: DatasetProcessRule) -> List[Document]:
+                            processing_rule: DatasetProcessRule, tenant_id) -> List[Document]:
         """
         Split the text documents into nodes.
         """

@@ -446,6 +469,11 @@ class IndexingRunner:
             if document.page_content is None or not document.page_content.strip():
                 continue
+            response = LLMGenerator.generate_qa_document(processing_rule.tenant_id, document.page_content)
+            document_qa_list = self.format_split_text(response)
+            for result in document_qa_list:
+                document = Document(page_content=result['question'], metadata={'source': result['answer']})
+                new_documents.append(document)
             doc_id = str(uuid.uuid4())
             hash = helper.generate_text_hash(document.page_content)
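Note the storage convention used in both indexing paths: the generated question becomes the document body (what gets embedded and matched at query time), while its answer rides along in metadata under the reused 'source' key. A tiny sketch of constructing and reading one such pair back; the values are invented:

from langchain.schema import Document

# Hypothetical QA document as built by the runner.
qa_doc = Document(
    page_content='What is MVCC?',
    metadata={'source': 'Multi-version concurrency control.'}
)

print(qa_doc.page_content)        # the indexed question
print(qa_doc.metadata['source'])  # the stored answer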
api/core/prompt/prompts.py

@@ -43,6 +43,16 @@ SUGGESTED_QUESTIONS_AFTER_ANSWER_INSTRUCTION_PROMPT = (
     "[\"question1\",\"question2\",\"question3\"]\n"
 )

+GENERATOR_QA_PROMPT = (
+    "你是出题人.\n"
+    "用户会发送一段长文本.\n请一步一步思考"
+    'Step1:了解并总结这段文本的主要内容\n'
+    'Step2:这段文本提到了哪些关键信息或概念\n'
+    'Step3:可分解或结合多个信息与概念\n'
+    'Step4:将这些关键信息与概念生成 10 个问题与答案,问题描述清楚并且详细完整,答案详细完整.\n'
+    "按格式回答: Q1:\nA1:\nQ2:\nA2:...\n"
+)

 RULE_CONFIG_GENERATE_TEMPLATE = """Given MY INTENDED AUDIENCES and HOPING TO SOLVE using a language model, please select \
the model prompt that best suits the input.
You will be provided with the prompt, variables, and an opening statement.

The new prompt is in Chinese; roughly translated: "You are a question writer. The user will send a long text. Think step by step. Step 1: understand and summarize the main content of the text. Step 2: identify the key information or concepts it mentions. Step 3: decompose or combine those pieces of information and concepts. Step 4: generate 10 questions and answers from them; questions should be clear, detailed, and complete, and answers detailed and complete. Answer in the format: Q1:\nA1:\nQ2:\nA2:..." This output format is exactly what format_split_text in indexing_runner.py parses.
api/services/completion_service.py

@@ -198,6 +198,7 @@ class CompletionService:
         conversation = db.session.query(Conversation).filter_by(id=conversation.id).first()

         # run
         Completion.generate(
             task_id=generate_task_id,
             app=app_model,