Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
D
dify
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ai-tech
dify
Commits
dd9bcd09
Commit
dd9bcd09
authored
Jul 26, 2023
by
jyong
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'feat/milvus-support' into deploy/dev
parents
b8a61cfa
9763fc28
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
47 additions
and
32 deletions
+47
-32
indexing_runner.py
api/core/indexing_runner.py
+46
-29
prompts.py
api/core/prompt/prompts.py
+1
-3
No files found.
api/core/indexing_runner.py
View file @
dd9bcd09
import
concurrent
import
datetime
import
datetime
import
json
import
json
import
logging
import
logging
import
re
import
re
import
time
import
time
import
uuid
import
uuid
from
concurrent.futures
import
ThreadPoolExecutor
,
as_completed
from
typing
import
Optional
,
List
,
cast
from
typing
import
Optional
,
List
,
cast
import
openai
import
openai
from
flask
import
current_app
from
flask
import
current_app
,
Flask
from
flask_login
import
current_user
from
flask_login
import
current_user
from
langchain.embeddings
import
OpenAIEmbeddings
from
langchain.embeddings
import
OpenAIEmbeddings
from
langchain.schema
import
Document
from
langchain.schema
import
Document
...
@@ -229,7 +231,8 @@ class IndexingRunner:
...
@@ -229,7 +231,8 @@ class IndexingRunner:
dataset_document
.
stopped_at
=
datetime
.
datetime
.
utcnow
()
dataset_document
.
stopped_at
=
datetime
.
datetime
.
utcnow
()
db
.
session
.
commit
()
db
.
session
.
commit
()
def
file_indexing_estimate
(
self
,
file_details
:
List
[
UploadFile
],
tmp_processing_rule
:
dict
,
doc_form
:
str
=
None
)
->
dict
:
def
file_indexing_estimate
(
self
,
file_details
:
List
[
UploadFile
],
tmp_processing_rule
:
dict
,
doc_form
:
str
=
None
)
->
dict
:
"""
"""
Estimate the indexing for the document.
Estimate the indexing for the document.
"""
"""
...
@@ -269,7 +272,8 @@ class IndexingRunner:
...
@@ -269,7 +272,8 @@ class IndexingRunner:
return
{
return
{
"total_segments"
:
total_segments
,
"total_segments"
:
total_segments
,
"tokens"
:
total_segments
*
2000
,
"tokens"
:
total_segments
*
2000
,
"total_price"
:
'{:f}'
.
format
(
TokenCalculator
.
get_token_price
(
'gpt-3.5-turbo'
,
total_segments
*
2000
,
'completion'
)),
"total_price"
:
'{:f}'
.
format
(
TokenCalculator
.
get_token_price
(
'gpt-3.5-turbo'
,
total_segments
*
2000
,
'completion'
)),
"currency"
:
TokenCalculator
.
get_currency
(
self
.
embedding_model_name
),
"currency"
:
TokenCalculator
.
get_currency
(
self
.
embedding_model_name
),
"qa_preview"
:
document_qa_list
,
"qa_preview"
:
document_qa_list
,
"preview"
:
preview_texts
"preview"
:
preview_texts
...
@@ -340,7 +344,8 @@ class IndexingRunner:
...
@@ -340,7 +344,8 @@ class IndexingRunner:
return
{
return
{
"total_segments"
:
total_segments
,
"total_segments"
:
total_segments
,
"tokens"
:
total_segments
*
2000
,
"tokens"
:
total_segments
*
2000
,
"total_price"
:
'{:f}'
.
format
(
TokenCalculator
.
get_token_price
(
'gpt-3.5-turbo'
,
total_segments
*
2000
,
'completion'
)),
"total_price"
:
'{:f}'
.
format
(
TokenCalculator
.
get_token_price
(
'gpt-3.5-turbo'
,
total_segments
*
2000
,
'completion'
)),
"currency"
:
TokenCalculator
.
get_currency
(
self
.
embedding_model_name
),
"currency"
:
TokenCalculator
.
get_currency
(
self
.
embedding_model_name
),
"qa_preview"
:
document_qa_list
,
"qa_preview"
:
document_qa_list
,
"preview"
:
preview_texts
"preview"
:
preview_texts
...
@@ -492,32 +497,44 @@ class IndexingRunner:
...
@@ -492,32 +497,44 @@ class IndexingRunner:
documents
=
splitter
.
split_documents
([
text_doc
])
documents
=
splitter
.
split_documents
([
text_doc
])
split_documents
=
[]
split_documents
=
[]
for
document
in
documents
:
if
document
.
page_content
is
None
or
not
document
.
page_content
.
strip
():
def
format_document
(
flask_app
:
Flask
,
document_node
:
Document
)
->
List
[
Document
]:
continue
with
flask_app
.
app_context
():
if
document_form
==
'text_model'
:
print
(
"process:"
+
document_node
.
page_content
)
# text model document
format_documents
=
[]
doc_id
=
str
(
uuid
.
uuid4
())
if
document_node
.
page_content
is
None
or
not
document_node
.
page_content
.
strip
():
hash
=
helper
.
generate_text_hash
(
document
.
page_content
)
return
format_documents
if
document_form
==
'text_model'
:
document
.
metadata
[
'doc_id'
]
=
doc_id
# text model document
document
.
metadata
[
'doc_hash'
]
=
hash
split_documents
.
append
(
document
)
elif
document_form
==
'qa_model'
:
# qa model document
response
=
LLMGenerator
.
generate_qa_document
(
tenant_id
,
document
.
page_content
)
document_qa_list
=
self
.
format_split_text
(
response
)
qa_documents
=
[]
for
result
in
document_qa_list
:
qa_document
=
Document
(
page_content
=
result
[
'question'
],
metadata
=
document
.
metadata
.
copy
())
doc_id
=
str
(
uuid
.
uuid4
())
doc_id
=
str
(
uuid
.
uuid4
())
hash
=
helper
.
generate_text_hash
(
result
[
'question'
])
hash
=
helper
.
generate_text_hash
(
document_node
.
page_content
)
qa_document
.
metadata
[
'answer'
]
=
result
[
'answer'
]
qa_document
.
metadata
[
'doc_id'
]
=
doc_id
document_node
.
metadata
[
'doc_id'
]
=
doc_id
qa_document
.
metadata
[
'doc_hash'
]
=
hash
document_node
.
metadata
[
'doc_hash'
]
=
hash
qa_documents
.
append
(
qa_document
)
split_documents
.
extend
(
qa_documents
)
format_documents
.
append
(
document_node
)
elif
document_form
==
'qa_model'
:
# qa model document
response
=
LLMGenerator
.
generate_qa_document
(
tenant_id
,
document_node
.
page_content
)
document_qa_list
=
self
.
format_split_text
(
response
)
qa_documents
=
[]
for
result
in
document_qa_list
:
qa_document
=
Document
(
page_content
=
result
[
'question'
],
metadata
=
document_node
.
metadata
.
copy
())
doc_id
=
str
(
uuid
.
uuid4
())
hash
=
helper
.
generate_text_hash
(
result
[
'question'
])
qa_document
.
metadata
[
'answer'
]
=
result
[
'answer'
]
qa_document
.
metadata
[
'doc_id'
]
=
doc_id
qa_document
.
metadata
[
'doc_hash'
]
=
hash
qa_documents
.
append
(
qa_document
)
format_documents
.
extend
(
qa_documents
)
return
format_documents
with
ThreadPoolExecutor
()
as
executor
:
future_to_doc
=
{
executor
.
submit
(
format_document
,
current_app
.
_get_current_object
(),
doc
):
doc
for
doc
in
documents
}
for
future
in
concurrent
.
futures
.
as_completed
(
future_to_doc
):
split_documents
.
extend
(
future
.
result
())
all_documents
.
extend
(
split_documents
)
all_documents
.
extend
(
split_documents
)
...
...
api/core/prompt/prompts.py
View file @
dd9bcd09
...
@@ -44,9 +44,7 @@ SUGGESTED_QUESTIONS_AFTER_ANSWER_INSTRUCTION_PROMPT = (
...
@@ -44,9 +44,7 @@ SUGGESTED_QUESTIONS_AFTER_ANSWER_INSTRUCTION_PROMPT = (
)
)
GENERATOR_QA_PROMPT
=
(
GENERATOR_QA_PROMPT
=
(
"You are the questioner.
\n
"
"Please respond according to the language of the user's input text. If the text is in language [A], you must also reply in language [A].
\n
"
"Based on the language of the input text from the user, reply using the same language."
"The user will send a long text.
\n
Please think step by step."
'Step 1: Understand and summarize the main content of this text.
\n
'
'Step 1: Understand and summarize the main content of this text.
\n
'
'Step 2: What key information or concepts are mentioned in this text?
\n
'
'Step 2: What key information or concepts are mentioned in this text?
\n
'
'Step 3: Decompose or combine multiple pieces of information and concepts.
\n
'
'Step 3: Decompose or combine multiple pieces of information and concepts.
\n
'
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment