Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
D
dify
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ai-tech
dify
Commits
dd9bcd09
Commit
dd9bcd09
authored
Jul 26, 2023
by
jyong
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'feat/milvus-support' into deploy/dev
parents
b8a61cfa
9763fc28
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
47 additions
and
32 deletions
+47
-32
indexing_runner.py
api/core/indexing_runner.py
+46
-29
prompts.py
api/core/prompt/prompts.py
+1
-3
No files found.
api/core/indexing_runner.py
View file @
dd9bcd09
import
concurrent
import
datetime
import
json
import
logging
import
re
import
time
import
uuid
from
concurrent.futures
import
ThreadPoolExecutor
,
as_completed
from
typing
import
Optional
,
List
,
cast
import
openai
from
flask
import
current_app
from
flask
import
current_app
,
Flask
from
flask_login
import
current_user
from
langchain.embeddings
import
OpenAIEmbeddings
from
langchain.schema
import
Document
...
...
@@ -229,7 +231,8 @@ class IndexingRunner:
dataset_document
.
stopped_at
=
datetime
.
datetime
.
utcnow
()
db
.
session
.
commit
()
def
file_indexing_estimate
(
self
,
file_details
:
List
[
UploadFile
],
tmp_processing_rule
:
dict
,
doc_form
:
str
=
None
)
->
dict
:
def
file_indexing_estimate
(
self
,
file_details
:
List
[
UploadFile
],
tmp_processing_rule
:
dict
,
doc_form
:
str
=
None
)
->
dict
:
"""
Estimate the indexing for the document.
"""
...
...
@@ -269,7 +272,8 @@ class IndexingRunner:
return
{
"total_segments"
:
total_segments
,
"tokens"
:
total_segments
*
2000
,
"total_price"
:
'{:f}'
.
format
(
TokenCalculator
.
get_token_price
(
'gpt-3.5-turbo'
,
total_segments
*
2000
,
'completion'
)),
"total_price"
:
'{:f}'
.
format
(
TokenCalculator
.
get_token_price
(
'gpt-3.5-turbo'
,
total_segments
*
2000
,
'completion'
)),
"currency"
:
TokenCalculator
.
get_currency
(
self
.
embedding_model_name
),
"qa_preview"
:
document_qa_list
,
"preview"
:
preview_texts
...
...
@@ -340,7 +344,8 @@ class IndexingRunner:
return
{
"total_segments"
:
total_segments
,
"tokens"
:
total_segments
*
2000
,
"total_price"
:
'{:f}'
.
format
(
TokenCalculator
.
get_token_price
(
'gpt-3.5-turbo'
,
total_segments
*
2000
,
'completion'
)),
"total_price"
:
'{:f}'
.
format
(
TokenCalculator
.
get_token_price
(
'gpt-3.5-turbo'
,
total_segments
*
2000
,
'completion'
)),
"currency"
:
TokenCalculator
.
get_currency
(
self
.
embedding_model_name
),
"qa_preview"
:
document_qa_list
,
"preview"
:
preview_texts
...
...
@@ -492,32 +497,44 @@ class IndexingRunner:
documents
=
splitter
.
split_documents
([
text_doc
])
split_documents
=
[]
for
document
in
documents
:
if
document
.
page_content
is
None
or
not
document
.
page_content
.
strip
():
continue
if
document_form
==
'text_model'
:
# text model document
doc_id
=
str
(
uuid
.
uuid4
())
hash
=
helper
.
generate_text_hash
(
document
.
page_content
)
document
.
metadata
[
'doc_id'
]
=
doc_id
document
.
metadata
[
'doc_hash'
]
=
hash
split_documents
.
append
(
document
)
elif
document_form
==
'qa_model'
:
# qa model document
response
=
LLMGenerator
.
generate_qa_document
(
tenant_id
,
document
.
page_content
)
document_qa_list
=
self
.
format_split_text
(
response
)
qa_documents
=
[]
for
result
in
document_qa_list
:
qa_document
=
Document
(
page_content
=
result
[
'question'
],
metadata
=
document
.
metadata
.
copy
())
def
format_document
(
flask_app
:
Flask
,
document_node
:
Document
)
->
List
[
Document
]:
with
flask_app
.
app_context
():
print
(
"process:"
+
document_node
.
page_content
)
format_documents
=
[]
if
document_node
.
page_content
is
None
or
not
document_node
.
page_content
.
strip
():
return
format_documents
if
document_form
==
'text_model'
:
# text model document
doc_id
=
str
(
uuid
.
uuid4
())
hash
=
helper
.
generate_text_hash
(
result
[
'question'
])
qa_document
.
metadata
[
'answer'
]
=
result
[
'answer'
]
qa_document
.
metadata
[
'doc_id'
]
=
doc_id
qa_document
.
metadata
[
'doc_hash'
]
=
hash
qa_documents
.
append
(
qa_document
)
split_documents
.
extend
(
qa_documents
)
hash
=
helper
.
generate_text_hash
(
document_node
.
page_content
)
document_node
.
metadata
[
'doc_id'
]
=
doc_id
document_node
.
metadata
[
'doc_hash'
]
=
hash
format_documents
.
append
(
document_node
)
elif
document_form
==
'qa_model'
:
# qa model document
response
=
LLMGenerator
.
generate_qa_document
(
tenant_id
,
document_node
.
page_content
)
document_qa_list
=
self
.
format_split_text
(
response
)
qa_documents
=
[]
for
result
in
document_qa_list
:
qa_document
=
Document
(
page_content
=
result
[
'question'
],
metadata
=
document_node
.
metadata
.
copy
())
doc_id
=
str
(
uuid
.
uuid4
())
hash
=
helper
.
generate_text_hash
(
result
[
'question'
])
qa_document
.
metadata
[
'answer'
]
=
result
[
'answer'
]
qa_document
.
metadata
[
'doc_id'
]
=
doc_id
qa_document
.
metadata
[
'doc_hash'
]
=
hash
qa_documents
.
append
(
qa_document
)
format_documents
.
extend
(
qa_documents
)
return
format_documents
with
ThreadPoolExecutor
()
as
executor
:
future_to_doc
=
{
executor
.
submit
(
format_document
,
current_app
.
_get_current_object
(),
doc
):
doc
for
doc
in
documents
}
for
future
in
concurrent
.
futures
.
as_completed
(
future_to_doc
):
split_documents
.
extend
(
future
.
result
())
all_documents
.
extend
(
split_documents
)
...
...
api/core/prompt/prompts.py
View file @
dd9bcd09
...
...
@@ -44,9 +44,7 @@ SUGGESTED_QUESTIONS_AFTER_ANSWER_INSTRUCTION_PROMPT = (
)
GENERATOR_QA_PROMPT
=
(
"You are the questioner.
\n
"
"Based on the language of the input text from the user, reply using the same language."
"The user will send a long text.
\n
Please think step by step."
"Please respond according to the language of the user's input text. If the text is in language [A], you must also reply in language [A].
\n
"
'Step 1: Understand and summarize the main content of this text.
\n
'
'Step 2: What key information or concepts are mentioned in this text?
\n
'
'Step 3: Decompose or combine multiple pieces of information and concepts.
\n
'
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment