Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
D
dify
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ai-tech
dify
Commits
90b22d8c
Commit
90b22d8c
authored
Jul 08, 2023
by
jyong
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add qa model
parent
561c9cab
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
64 additions
and
7 deletions
+64
-7
datasets_document.py
api/controllers/console/datasets/datasets_document.py
+2
-0
excel.py
api/core/data_loader/loader/excel.py
+1
-1
llm_generator.py
api/core/generator/llm_generator.py
+19
-3
indexing_runner.py
api/core/indexing_runner.py
+31
-3
prompts.py
api/core/prompt/prompts.py
+10
-0
completion_service.py
api/services/completion_service.py
+1
-0
No files found.
api/controllers/console/datasets/datasets_document.py
View file @
90b22d8c
...
@@ -488,6 +488,8 @@ class DocumentBatchIndexingStatusApi(DocumentResource):
...
@@ -488,6 +488,8 @@ class DocumentBatchIndexingStatusApi(DocumentResource):
DocumentSegment
.
status
!=
're_segment'
)
.
count
()
DocumentSegment
.
status
!=
're_segment'
)
.
count
()
document
.
completed_segments
=
completed_segments
document
.
completed_segments
=
completed_segments
document
.
total_segments
=
total_segments
document
.
total_segments
=
total_segments
if
document
.
is_paused
:
document
.
indexing_status
=
'paused'
documents_status
.
append
(
marshal
(
document
,
self
.
document_status_fields
))
documents_status
.
append
(
marshal
(
document
,
self
.
document_status_fields
))
data
=
{
data
=
{
'data'
:
documents_status
'data'
:
documents_status
...
...
api/core/data_loader/loader/excel.py
View file @
90b22d8c
...
@@ -39,7 +39,7 @@ class ExcelLoader(BaseLoader):
...
@@ -39,7 +39,7 @@ class ExcelLoader(BaseLoader):
row_dict
=
dict
(
zip
(
keys
,
list
(
map
(
str
,
row
))))
row_dict
=
dict
(
zip
(
keys
,
list
(
map
(
str
,
row
))))
row_dict
=
{
k
:
v
for
k
,
v
in
row_dict
.
items
()
if
v
}
row_dict
=
{
k
:
v
for
k
,
v
in
row_dict
.
items
()
if
v
}
item
=
''
.
join
(
f
'{k}:{v}
\n
'
for
k
,
v
in
row_dict
.
items
())
item
=
''
.
join
(
f
'{k}:{v}
\n
'
for
k
,
v
in
row_dict
.
items
())
document
=
Document
(
page_content
=
item
)
document
=
Document
(
page_content
=
item
,
metadata
=
{
'source'
:
self
.
_file_path
}
)
data
.
append
(
document
)
data
.
append
(
document
)
return
data
return
data
api/core/generator/llm_generator.py
View file @
90b22d8c
...
@@ -2,7 +2,7 @@ import logging
...
@@ -2,7 +2,7 @@ import logging
from
langchain
import
PromptTemplate
from
langchain
import
PromptTemplate
from
langchain.chat_models.base
import
BaseChatModel
from
langchain.chat_models.base
import
BaseChatModel
from
langchain.schema
import
HumanMessage
,
OutputParserException
,
BaseMessage
from
langchain.schema
import
HumanMessage
,
OutputParserException
,
BaseMessage
,
SystemMessage
from
core.constant
import
llm_constant
from
core.constant
import
llm_constant
from
core.llm.llm_builder
import
LLMBuilder
from
core.llm.llm_builder
import
LLMBuilder
...
@@ -12,8 +12,8 @@ from core.prompt.output_parser.rule_config_generator import RuleConfigGeneratorO
...
@@ -12,8 +12,8 @@ from core.prompt.output_parser.rule_config_generator import RuleConfigGeneratorO
from
core.prompt.output_parser.suggested_questions_after_answer
import
SuggestedQuestionsAfterAnswerOutputParser
from
core.prompt.output_parser.suggested_questions_after_answer
import
SuggestedQuestionsAfterAnswerOutputParser
from
core.prompt.prompt_template
import
JinjaPromptTemplate
,
OutLinePromptTemplate
from
core.prompt.prompt_template
import
JinjaPromptTemplate
,
OutLinePromptTemplate
from
core.prompt.prompts
import
CONVERSATION_TITLE_PROMPT
,
CONVERSATION_SUMMARY_PROMPT
,
INTRODUCTION_GENERATE_PROMPT
from
core.prompt.prompts
import
CONVERSATION_TITLE_PROMPT
,
CONVERSATION_SUMMARY_PROMPT
,
INTRODUCTION_GENERATE_PROMPT
,
\
GENERATOR_QA_PROMPT
# gpt-3.5-turbo does not work well
# gpt-3.5-turbo does not work well
generate_base_model
=
'text-davinci-003'
generate_base_model
=
'text-davinci-003'
...
@@ -171,3 +171,19 @@ class LLMGenerator:
...
@@ -171,3 +171,19 @@ class LLMGenerator:
}
}
return
rule_config
return
rule_config
@classmethod
def generate_qa_document(cls, tenant_id: str, query):
    """Ask the LLM to produce question/answer pairs for a piece of text.

    The instructions come from ``GENERATOR_QA_PROMPT``; ``query`` is the
    text the questions and answers should be derived from.

    Args:
        tenant_id: Tenant identifier forwarded to ``LLMBuilder.to_llm``.
        query: The source text sent to the model.

    Returns:
        The text of the first generation, with surrounding whitespace
        stripped.
    """
    prompt = GENERATOR_QA_PROMPT
    llm: StreamableOpenAI = LLMBuilder.to_llm(
        tenant_id=tenant_id,
        model_name='gpt-3.5-turbo',
        max_tokens=1000
    )

    if isinstance(llm, BaseChatModel):
        # Chat models take a message list: instructions as the system
        # message, the source text as the human message.
        prompt = [SystemMessage(content=prompt), HumanMessage(content=query)]
    else:
        # Completion-style models take a single string. Without appending
        # the query here the model would only ever see the instructions
        # and never the text it is supposed to write questions about.
        prompt = f"{prompt}\n{query}"

    response = llm.generate([prompt])
    answer = response.generations[0][0].text
    return answer.strip()
api/core/indexing_runner.py
View file @
90b22d8c
...
@@ -16,6 +16,7 @@ from core.data_loader.file_extractor import FileExtractor
...
@@ -16,6 +16,7 @@ from core.data_loader.file_extractor import FileExtractor
from
core.data_loader.loader.notion
import
NotionLoader
from
core.data_loader.loader.notion
import
NotionLoader
from
core.docstore.dataset_docstore
import
DatesetDocumentStore
from
core.docstore.dataset_docstore
import
DatesetDocumentStore
from
core.embedding.cached_embedding
import
CacheEmbedding
from
core.embedding.cached_embedding
import
CacheEmbedding
from
core.generator.llm_generator
import
LLMGenerator
from
core.index.index
import
IndexBuilder
from
core.index.index
import
IndexBuilder
from
core.index.keyword_table_index.keyword_table_index
import
KeywordTableIndex
,
KeywordTableConfig
from
core.index.keyword_table_index.keyword_table_index
import
KeywordTableIndex
,
KeywordTableConfig
from
core.index.vector_index.vector_index
import
VectorIndex
from
core.index.vector_index.vector_index
import
VectorIndex
...
@@ -70,12 +71,18 @@ class IndexingRunner:
...
@@ -70,12 +71,18 @@ class IndexingRunner:
dataset_document
=
dataset_document
,
dataset_document
=
dataset_document
,
processing_rule
=
processing_rule
processing_rule
=
processing_rule
)
)
new_documents
=
[]
for
document
in
documents
:
response
=
LLMGenerator
.
generate_qa_document
(
dataset
.
tenant_id
,
document
.
page_content
)
document_qa_list
=
self
.
format_split_text
(
response
)
for
result
in
document_qa_list
:
document
=
Document
(
page_content
=
result
[
'question'
],
metadata
=
{
'source'
:
result
[
'answer'
]})
new_documents
.
append
(
document
)
# build index
# build index
self
.
_build_index
(
self
.
_build_index
(
dataset
=
dataset
,
dataset
=
dataset
,
dataset_document
=
dataset_document
,
dataset_document
=
dataset_document
,
documents
=
documents
documents
=
new_
documents
)
)
except
DocumentIsPausedException
:
except
DocumentIsPausedException
:
raise
DocumentIsPausedException
(
'Document paused, document id: {}'
.
format
(
dataset_document
.
id
))
raise
DocumentIsPausedException
(
'Document paused, document id: {}'
.
format
(
dataset_document
.
id
))
...
@@ -91,6 +98,22 @@ class IndexingRunner:
...
@@ -91,6 +98,22 @@ class IndexingRunner:
dataset_document
.
stopped_at
=
datetime
.
datetime
.
utcnow
()
dataset_document
.
stopped_at
=
datetime
.
datetime
.
utcnow
()
db
.
session
.
commit
()
db
.
session
.
commit
()
def format_split_text(self, text):
    """Parse ``Q<n>: ... A<n>: ...`` formatted text into question/answer dicts.

    Args:
        text: Raw text containing numbered ``Q1:``/``A1:`` style pairs.

    Returns:
        A list of ``{"question": str, "answer": str}`` dicts, in the order
        the pairs appear. Pairs with an empty question or answer are
        skipped. Empty or ``None`` input yields an empty list.
    """
    if not text:
        return []

    # Regular expression matching a Q/A pair. The answer runs until the
    # next "Q<n>:" marker or the end of the text. The previous lookahead
    # `(?=Q|$)` with re.MULTILINE stopped at the first line end or at any
    # capital "Q" inside the answer, truncating multi-line answers.
    regex = r"Q\d+:\s*(.*?)\s*A\d+:\s*([\s\S]*?)(?=Q\d+:|\Z)"
    # Collect every matched (question, answer) tuple.
    matches = re.findall(regex, text)

    # Holds the final result.
    result = []
    for question, answer in matches:
        answer = answer.strip()
        # Only keep the pair when both the question and the answer exist.
        if question and answer:
            result.append({
                "question": question,
                # Collapse each line break plus following indentation
                # into a single newline.
                "answer": re.sub(r"\n\s*", "\n", answer)
            })
    return result
def
run_in_splitting_status
(
self
,
dataset_document
:
DatasetDocument
):
def
run_in_splitting_status
(
self
,
dataset_document
:
DatasetDocument
):
"""Run the indexing process when the index_status is splitting."""
"""Run the indexing process when the index_status is splitting."""
try
:
try
:
...
@@ -428,7 +451,7 @@ class IndexingRunner:
...
@@ -428,7 +451,7 @@ class IndexingRunner:
return
documents
return
documents
def
_split_to_documents
(
self
,
text_docs
:
List
[
Document
],
splitter
:
TextSplitter
,
def
_split_to_documents
(
self
,
text_docs
:
List
[
Document
],
splitter
:
TextSplitter
,
processing_rule
:
DatasetProcessRule
)
->
List
[
Document
]:
processing_rule
:
DatasetProcessRule
,
tenant_id
)
->
List
[
Document
]:
"""
"""
Split the text documents into nodes.
Split the text documents into nodes.
"""
"""
...
@@ -446,6 +469,11 @@ class IndexingRunner:
...
@@ -446,6 +469,11 @@ class IndexingRunner:
if
document
.
page_content
is
None
or
not
document
.
page_content
.
strip
():
if
document
.
page_content
is
None
or
not
document
.
page_content
.
strip
():
continue
continue
response
=
LLMGenerator
.
generate_qa_document
(
processing_rule
.
tenant_id
,
document
.
page_content
)
document_qa_list
=
self
.
format_split_text
(
response
)
for
result
in
document_qa_list
:
document
=
Document
(
page_content
=
result
[
'question'
],
metadata
=
{
'source'
:
result
[
'answer'
]})
new_documents
.
append
(
document
)
doc_id
=
str
(
uuid
.
uuid4
())
doc_id
=
str
(
uuid
.
uuid4
())
hash
=
helper
.
generate_text_hash
(
document
.
page_content
)
hash
=
helper
.
generate_text_hash
(
document
.
page_content
)
...
...
api/core/prompt/prompts.py
View file @
90b22d8c
...
@@ -43,6 +43,16 @@ SUGGESTED_QUESTIONS_AFTER_ANSWER_INSTRUCTION_PROMPT = (
...
@@ -43,6 +43,16 @@ SUGGESTED_QUESTIONS_AFTER_ANSWER_INSTRUCTION_PROMPT = (
"[
\"
question1
\"
,
\"
question2
\"
,
\"
question3
\"
]
\n
"
"[
\"
question1
\"
,
\"
question2
\"
,
\"
question3
\"
]
\n
"
)
)
# System prompt for question/answer generation: tells the model to act as a
# question writer, work through the text step by step, and reply in the
# "Q1:/A1:/Q2:/A2:..." layout.
GENERATOR_QA_PROMPT = (
    "你是出题人.\n"
    # The trailing newline keeps "Step1" on its own line instead of glued
    # onto the end of the preceding sentence.
    "用户会发送一段长文本.\n请一步一步思考\n"
    'Step1:了解并总结这段文本的主要内容\n'
    'Step2:这段文本提到了哪些关键信息或概念\n'
    'Step3:可分解或结合多个信息与概念\n'
    'Step4:将这些关键信息与概念生成 10 个问题与答案,问题描述清楚并且详细完整,答案详细完整.\n'
    "按格式回答: Q1:\nA1:\nQ2:\nA2:...\n"
)
RULE_CONFIG_GENERATE_TEMPLATE
=
"""Given MY INTENDED AUDIENCES and HOPING TO SOLVE using a language model, please select
\
RULE_CONFIG_GENERATE_TEMPLATE
=
"""Given MY INTENDED AUDIENCES and HOPING TO SOLVE using a language model, please select
\
the model prompt that best suits the input.
the model prompt that best suits the input.
You will be provided with the prompt, variables, and an opening statement.
You will be provided with the prompt, variables, and an opening statement.
...
...
api/services/completion_service.py
View file @
90b22d8c
...
@@ -198,6 +198,7 @@ class CompletionService:
...
@@ -198,6 +198,7 @@ class CompletionService:
conversation
=
db
.
session
.
query
(
Conversation
)
.
filter_by
(
id
=
conversation
.
id
)
.
first
()
conversation
=
db
.
session
.
query
(
Conversation
)
.
filter_by
(
id
=
conversation
.
id
)
.
first
()
# run
# run
Completion
.
generate
(
Completion
.
generate
(
task_id
=
generate_task_id
,
task_id
=
generate_task_id
,
app
=
app_model
,
app
=
app_model
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment