Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
D
dify
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ai-tech
dify
Commits
9eaae770
Unverified
Commit
9eaae770
authored
Jul 29, 2023
by
Jyong
Committed by
GitHub
Jul 29, 2023
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Feat/add thread control (#675)
parent
ca606103
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
13 additions
and
19 deletions
+13
-19
indexing_runner.py
api/core/indexing_runner.py
+13
-19
No files found.
api/core/indexing_runner.py
View file @
9eaae770
import
asyncio
import
concurrent
import
datetime
import
json
...
...
@@ -8,25 +7,17 @@ import threading
import
time
import
uuid
from
concurrent.futures
import
ThreadPoolExecutor
from
multiprocessing
import
Process
from
typing
import
Optional
,
List
,
cast
import
openai
from
billiard.pool
import
Pool
from
flask
import
current_app
,
Flask
from
flask_login
import
current_user
from
langchain.embeddings
import
OpenAIEmbeddings
from
langchain.schema
import
Document
from
langchain.text_splitter
import
RecursiveCharacterTextSplitter
,
TextSplitter
from
core.data_loader.file_extractor
import
FileExtractor
from
core.data_loader.loader.notion
import
NotionLoader
from
core.docstore.dataset_docstore
import
DatesetDocumentStore
from
core.embedding.cached_embedding
import
CacheEmbedding
from
core.generator.llm_generator
import
LLMGenerator
from
core.index.index
import
IndexBuilder
from
core.index.keyword_table_index.keyword_table_index
import
KeywordTableIndex
,
KeywordTableConfig
from
core.index.vector_index.vector_index
import
VectorIndex
from
core.llm.error
import
ProviderTokenNotInitError
from
core.llm.llm_builder
import
LLMBuilder
from
core.llm.streamable_open_ai
import
StreamableOpenAI
...
...
@@ -516,20 +507,23 @@ class IndexingRunner:
model_name
=
'gpt-3.5-turbo'
,
max_tokens
=
2000
)
for
i
in
range
(
0
,
len
(
documents
),
10
):
threads
=
[]
for
doc
in
documents
:
sub_documents
=
documents
[
i
:
i
+
10
]
for
doc
in
sub_documents
:
document_format_thread
=
threading
.
Thread
(
target
=
self
.
format_document
,
kwargs
=
{
'llm'
:
llm
,
'document_node'
:
doc
,
'split_documents'
:
split_documents
,
'document_form'
:
document_form
})
'llm'
:
llm
,
'document_node'
:
doc
,
'split_documents'
:
split_documents
,
'document_form'
:
document_form
})
threads
.
append
(
document_format_thread
)
document_format_thread
.
start
()
for
thread
in
threads
:
thread
.
join
()
all_documents
.
extend
(
split_documents
)
return
all_documents
def
format_document
(
self
,
llm
:
StreamableOpenAI
,
document_node
,
split_documents
:
List
,
document_form
:
str
):
print
(
document_node
.
page_content
)
def
format_document
(
self
,
llm
:
StreamableOpenAI
,
document_node
,
split_documents
,
document_form
:
str
):
format_documents
=
[]
if
document_node
.
page_content
is
None
or
not
document_node
.
page_content
.
strip
():
return
format_documents
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment