Commit dd9bcd09 authored by jyong

Merge branch 'feat/milvus-support' into deploy/dev

parents b8a61cfa 9763fc28
import concurrent
import datetime import datetime
import json import json
import logging import logging
import re import re
import time import time
import uuid import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Optional, List, cast from typing import Optional, List, cast
import openai import openai
from flask import current_app from flask import current_app, Flask
from flask_login import current_user from flask_login import current_user
from langchain.embeddings import OpenAIEmbeddings from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document from langchain.schema import Document
...@@ -229,7 +231,8 @@ class IndexingRunner: ...@@ -229,7 +231,8 @@ class IndexingRunner:
dataset_document.stopped_at = datetime.datetime.utcnow() dataset_document.stopped_at = datetime.datetime.utcnow()
db.session.commit() db.session.commit()
def file_indexing_estimate(self, file_details: List[UploadFile], tmp_processing_rule: dict, doc_form: str = None) -> dict: def file_indexing_estimate(self, file_details: List[UploadFile], tmp_processing_rule: dict,
doc_form: str = None) -> dict:
""" """
Estimate the indexing for the document. Estimate the indexing for the document.
""" """
...@@ -269,7 +272,8 @@ class IndexingRunner: ...@@ -269,7 +272,8 @@ class IndexingRunner:
return { return {
"total_segments": total_segments, "total_segments": total_segments,
"tokens": total_segments * 2000, "tokens": total_segments * 2000,
"total_price": '{:f}'.format(TokenCalculator.get_token_price('gpt-3.5-turbo', total_segments * 2000, 'completion')), "total_price": '{:f}'.format(
TokenCalculator.get_token_price('gpt-3.5-turbo', total_segments * 2000, 'completion')),
"currency": TokenCalculator.get_currency(self.embedding_model_name), "currency": TokenCalculator.get_currency(self.embedding_model_name),
"qa_preview": document_qa_list, "qa_preview": document_qa_list,
"preview": preview_texts "preview": preview_texts
...@@ -340,7 +344,8 @@ class IndexingRunner: ...@@ -340,7 +344,8 @@ class IndexingRunner:
return { return {
"total_segments": total_segments, "total_segments": total_segments,
"tokens": total_segments * 2000, "tokens": total_segments * 2000,
"total_price": '{:f}'.format(TokenCalculator.get_token_price('gpt-3.5-turbo', total_segments * 2000, 'completion')), "total_price": '{:f}'.format(
TokenCalculator.get_token_price('gpt-3.5-turbo', total_segments * 2000, 'completion')),
"currency": TokenCalculator.get_currency(self.embedding_model_name), "currency": TokenCalculator.get_currency(self.embedding_model_name),
"qa_preview": document_qa_list, "qa_preview": document_qa_list,
"preview": preview_texts "preview": preview_texts
...@@ -492,32 +497,44 @@ class IndexingRunner: ...@@ -492,32 +497,44 @@ class IndexingRunner:
documents = splitter.split_documents([text_doc])

split_documents = []

def format_document(flask_app: Flask, document_node: Document) -> List[Document]:
    """
    Format one split document node for indexing, inside a Flask app context.

    Runs on a worker thread, so the application context must be pushed
    explicitly — thread workers do not inherit the request context.

    Returns a list of formatted Documents: the node itself for
    'text_model', or one Document per generated question for 'qa_model'.
    Empty/whitespace-only nodes yield an empty list.
    """
    with flask_app.app_context():
        format_documents = []
        # Skip nodes with no usable text.
        if document_node.page_content is None or not document_node.page_content.strip():
            return format_documents

        if document_form == 'text_model':
            # text model document: tag the node with a doc id and content hash
            doc_id = str(uuid.uuid4())
            # NOTE: named text_hash to avoid shadowing the builtin `hash`
            text_hash = helper.generate_text_hash(document_node.page_content)

            document_node.metadata['doc_id'] = doc_id
            document_node.metadata['doc_hash'] = text_hash

            format_documents.append(document_node)
        elif document_form == 'qa_model':
            # qa model document: generate Q/A pairs from the node's content
            # and emit one Document per question, with the answer in metadata
            response = LLMGenerator.generate_qa_document(tenant_id, document_node.page_content)
            document_qa_list = self.format_split_text(response)
            qa_documents = []
            for result in document_qa_list:
                qa_document = Document(page_content=result['question'],
                                       metadata=document_node.metadata.copy())
                doc_id = str(uuid.uuid4())
                question_hash = helper.generate_text_hash(result['question'])
                qa_document.metadata['answer'] = result['answer']
                qa_document.metadata['doc_id'] = doc_id
                qa_document.metadata['doc_hash'] = question_hash
                qa_documents.append(qa_document)
            format_documents.extend(qa_documents)

        return format_documents

# Fan the (potentially slow, LLM-calling) formatting out across threads.
# Pass the real app object so each worker can push its own app context.
with ThreadPoolExecutor() as executor:
    future_to_doc = {
        executor.submit(format_document, current_app._get_current_object(), doc): doc
        for doc in documents
    }
    # as_completed is imported directly at the top of the file;
    # no need for the concurrent.futures attribute lookup.
    for future in as_completed(future_to_doc):
        split_documents.extend(future.result())

all_documents.extend(split_documents)
......
...@@ -44,9 +44,7 @@ SUGGESTED_QUESTIONS_AFTER_ANSWER_INSTRUCTION_PROMPT = ( ...@@ -44,9 +44,7 @@ SUGGESTED_QUESTIONS_AFTER_ANSWER_INSTRUCTION_PROMPT = (
) )
GENERATOR_QA_PROMPT = ( GENERATOR_QA_PROMPT = (
"You are the questioner.\n" "Please respond according to the language of the user's input text. If the text is in language [A], you must also reply in language [A].\n"
"Based on the language of the input text from the user, reply using the same language."
"The user will send a long text. \nPlease think step by step."
'Step 1: Understand and summarize the main content of this text.\n' 'Step 1: Understand and summarize the main content of this text.\n'
'Step 2: What key information or concepts are mentioned in this text?\n' 'Step 2: What key information or concepts are mentioned in this text?\n'
'Step 3: Decompose or combine multiple pieces of information and concepts.\n' 'Step 3: Decompose or combine multiple pieces of information and concepts.\n'
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment