ai-tech / dify / Commits
Commit 3c37c22e authored Jul 29, 2023 by jyong
add thread pool
parent ca606103
Showing 1 changed file with 14 additions and 19 deletions

api/core/indexing_runner.py    +14  -19
api/core/indexing_runner.py (view file @ 3c37c22e)
-import asyncio
 import concurrent
 import datetime
 import json
...
@@ -8,25 +7,17 @@ import threading
 import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
-from multiprocessing import Process
 from typing import Optional, List, cast

-import openai
-from billiard.pool import Pool
-from flask import current_app, Flask
 from flask_login import current_user
-from langchain.embeddings import OpenAIEmbeddings
 from langchain.schema import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter

 from core.data_loader.file_extractor import FileExtractor
 from core.data_loader.loader.notion import NotionLoader
 from core.docstore.dataset_docstore import DatesetDocumentStore
-from core.embedding.cached_embedding import CacheEmbedding
 from core.generator.llm_generator import LLMGenerator
 from core.index.index import IndexBuilder
-from core.index.keyword_table_index.keyword_table_index import KeywordTableIndex, KeywordTableConfig
-from core.index.vector_index.vector_index import VectorIndex
 from core.llm.error import ProviderTokenNotInitError
 from core.llm.llm_builder import LLMBuilder
 from core.llm.streamable_open_ai import StreamableOpenAI
...
@@ -516,19 +507,23 @@ class IndexingRunner:
                 model_name='gpt-3.5-turbo',
                 max_tokens=2000
             )
-            threads = []
-            for doc in documents:
-                document_format_thread = threading.Thread(target=self.format_document, kwargs={
-                    'llm': llm, 'document_node': doc, 'split_documents': split_documents, 'document_form': document_form})
-                threads.append(document_format_thread)
-                document_format_thread.start()
-            for thread in threads:
-                thread.join()
+            # threads = []
+            # for doc in documents:
+            #     document_format_thread = threading.Thread(target=self.format_document, kwargs={
+            #         'llm': llm, 'document_node': doc, 'split_documents': split_documents, 'document_form': document_form})
+            #     threads.append(document_format_thread)
+            #     document_format_thread.start()
+            # for thread in threads:
+            #     thread.join()
+            with ThreadPoolExecutor() as executor:
+                future_to_doc = {executor.submit(self.format_document, llm, doc, document_form): doc for doc in documents}
+                for future in concurrent.futures.as_completed(future_to_doc):
+                    split_documents.extend(future.result())
             all_documents.extend(split_documents)

         return all_documents

-    def format_document(self, llm: StreamableOpenAI, document_node, split_documents: List, document_form: str):
+    def format_document(self, llm: StreamableOpenAI, document_node, document_form: str):
         print(document_node.page_content)
         format_documents = []
         if document_node.page_content is None or not document_node.page_content.strip():
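The hunk above replaces hand-managed threading.Thread objects (left commented out) with concurrent.futures.ThreadPoolExecutor. A minimal, self-contained sketch of the same submit / as_completed pattern, using hypothetical stand-ins rather than the project's real IndexingRunner, llm, or documents:

    import concurrent.futures
    from concurrent.futures import ThreadPoolExecutor

    def format_document(llm, document_node, document_form):
        # Stand-in worker: return the formatted pieces instead of
        # appending them to a list shared between threads.
        return [f"{document_form}:{document_node}"]

    documents = ["doc-1", "doc-2", "doc-3"]
    split_documents = []

    with ThreadPoolExecutor() as executor:
        # One task per document; the dict maps each future back to its
        # document in case the caller wants it for logging or retries.
        future_to_doc = {
            executor.submit(format_document, "llm", doc, "qa_model"): doc
            for doc in documents
        }
        # as_completed yields futures in completion order, so results are
        # collected as soon as each worker finishes.
        for future in concurrent.futures.as_completed(future_to_doc):
            split_documents.extend(future.result())

    print(split_documents)

Compared with the commented-out threading.Thread version, the executor caps the number of worker threads (min(32, cpu_count + 4) by default in recent Python versions), and future.result() re-raises any exception from the worker instead of letting it disappear inside a bare thread.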
...
@@ -559,7 +554,7 @@ class IndexingRunner:
                 format_documents.extend(qa_documents)
             except Exception as e:
                 logging.error(str(e))

-        split_documents.extend(format_documents)
+        return format_documents

     def _split_to_documents_for_estimate(self, text_docs: List[Document], splitter: TextSplitter,
...
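The other half of the change is in format_document itself: it no longer receives a shared split_documents list to mutate from several threads; the last hunk swaps split_documents.extend(format_documents) for return format_documents, and the caller now gathers results through future.result(). A small sketch of that refactor, with hypothetical names that are not part of the project's API:

    from concurrent.futures import ThreadPoolExecutor, as_completed

    # Before: each worker appended into a list shared across threads.
    def format_document_shared(document_node, split_documents):
        split_documents.append(document_node.upper())

    # After: each worker returns its results; only the caller extends the list.
    def format_document_returning(document_node):
        return [document_node.upper()]

    documents = ["alpha", "beta", "gamma"]
    collected = []
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(format_document_returning, doc) for doc in documents]
        for future in as_completed(futures):
            collected.extend(future.result())  # the list is touched by one thread only

list.append is already thread-safe under CPython's GIL, so the old version was not necessarily broken, but returning results keeps the data flow one-directional and lets worker errors surface through future.result().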