Unverified commit 9eaae770, authored by Jyong and committed by GitHub

Feat/add thread control (#675)

parent ca606103
import asyncio
import concurrent import concurrent
import datetime import datetime
import json import json
...@@ -8,25 +7,17 @@ import threading ...@@ -8,25 +7,17 @@ import threading
import time import time
import uuid import uuid
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Process
from typing import Optional, List, cast from typing import Optional, List, cast
import openai
from billiard.pool import Pool
from flask import current_app, Flask
from flask_login import current_user from flask_login import current_user
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
from core.data_loader.file_extractor import FileExtractor from core.data_loader.file_extractor import FileExtractor
from core.data_loader.loader.notion import NotionLoader from core.data_loader.loader.notion import NotionLoader
from core.docstore.dataset_docstore import DatesetDocumentStore from core.docstore.dataset_docstore import DatesetDocumentStore
from core.embedding.cached_embedding import CacheEmbedding
from core.generator.llm_generator import LLMGenerator from core.generator.llm_generator import LLMGenerator
from core.index.index import IndexBuilder from core.index.index import IndexBuilder
from core.index.keyword_table_index.keyword_table_index import KeywordTableIndex, KeywordTableConfig
from core.index.vector_index.vector_index import VectorIndex
from core.llm.error import ProviderTokenNotInitError from core.llm.error import ProviderTokenNotInitError
from core.llm.llm_builder import LLMBuilder from core.llm.llm_builder import LLMBuilder
from core.llm.streamable_open_ai import StreamableOpenAI from core.llm.streamable_open_ai import StreamableOpenAI
...@@ -516,20 +507,23 @@ class IndexingRunner: ...@@ -516,20 +507,23 @@ class IndexingRunner:
model_name='gpt-3.5-turbo', model_name='gpt-3.5-turbo',
max_tokens=2000 max_tokens=2000
) )
threads = [] for i in range(0, len(documents), 10):
for doc in documents: threads = []
document_format_thread = threading.Thread(target=self.format_document, kwargs={ sub_documents = documents[i:i + 10]
'llm': llm, 'document_node': doc, 'split_documents': split_documents, 'document_form': document_form}) for doc in sub_documents:
threads.append(document_format_thread) document_format_thread = threading.Thread(target=self.format_document, kwargs={
document_format_thread.start() 'llm': llm, 'document_node': doc, 'split_documents': split_documents,
for thread in threads: 'document_form': document_form})
thread.join() threads.append(document_format_thread)
document_format_thread.start()
for thread in threads:
thread.join()
all_documents.extend(split_documents) all_documents.extend(split_documents)
return all_documents return all_documents
def format_document(self, llm: StreamableOpenAI, document_node, split_documents: List, document_form: str): def format_document(self, llm: StreamableOpenAI, document_node, split_documents, document_form: str):
print(document_node.page_content)
format_documents = [] format_documents = []
if document_node.page_content is None or not document_node.page_content.strip(): if document_node.page_content is None or not document_node.page_content.strip():
return format_documents return format_documents
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment