Commit 90b22d8c authored by jyong

add qa model

parent 561c9cab
...@@ -488,6 +488,8 @@ class DocumentBatchIndexingStatusApi(DocumentResource): ...@@ -488,6 +488,8 @@ class DocumentBatchIndexingStatusApi(DocumentResource):
DocumentSegment.status != 're_segment').count() DocumentSegment.status != 're_segment').count()
document.completed_segments = completed_segments document.completed_segments = completed_segments
document.total_segments = total_segments document.total_segments = total_segments
if document.is_paused:
document.indexing_status = 'paused'
documents_status.append(marshal(document, self.document_status_fields)) documents_status.append(marshal(document, self.document_status_fields))
data = { data = {
'data': documents_status 'data': documents_status
......
...@@ -39,7 +39,7 @@ class ExcelLoader(BaseLoader): ...@@ -39,7 +39,7 @@ class ExcelLoader(BaseLoader):
row_dict = dict(zip(keys, list(map(str, row)))) row_dict = dict(zip(keys, list(map(str, row))))
row_dict = {k: v for k, v in row_dict.items() if v} row_dict = {k: v for k, v in row_dict.items() if v}
item = ''.join(f'{k}:{v}\n' for k, v in row_dict.items()) item = ''.join(f'{k}:{v}\n' for k, v in row_dict.items())
document = Document(page_content=item) document = Document(page_content=item, metadata={'source': self._file_path})
data.append(document) data.append(document)
return data return data
...@@ -2,7 +2,7 @@ import logging ...@@ -2,7 +2,7 @@ import logging
from langchain import PromptTemplate from langchain import PromptTemplate
from langchain.chat_models.base import BaseChatModel from langchain.chat_models.base import BaseChatModel
from langchain.schema import HumanMessage, OutputParserException, BaseMessage from langchain.schema import HumanMessage, OutputParserException, BaseMessage, SystemMessage
from core.constant import llm_constant from core.constant import llm_constant
from core.llm.llm_builder import LLMBuilder from core.llm.llm_builder import LLMBuilder
...@@ -12,8 +12,8 @@ from core.prompt.output_parser.rule_config_generator import RuleConfigGeneratorO ...@@ -12,8 +12,8 @@ from core.prompt.output_parser.rule_config_generator import RuleConfigGeneratorO
from core.prompt.output_parser.suggested_questions_after_answer import SuggestedQuestionsAfterAnswerOutputParser from core.prompt.output_parser.suggested_questions_after_answer import SuggestedQuestionsAfterAnswerOutputParser
from core.prompt.prompt_template import JinjaPromptTemplate, OutLinePromptTemplate from core.prompt.prompt_template import JinjaPromptTemplate, OutLinePromptTemplate
from core.prompt.prompts import CONVERSATION_TITLE_PROMPT, CONVERSATION_SUMMARY_PROMPT, INTRODUCTION_GENERATE_PROMPT from core.prompt.prompts import CONVERSATION_TITLE_PROMPT, CONVERSATION_SUMMARY_PROMPT, INTRODUCTION_GENERATE_PROMPT, \
GENERATOR_QA_PROMPT
# gpt-3.5-turbo works not well # gpt-3.5-turbo works not well
generate_base_model = 'text-davinci-003' generate_base_model = 'text-davinci-003'
...@@ -171,3 +171,19 @@ class LLMGenerator: ...@@ -171,3 +171,19 @@ class LLMGenerator:
} }
return rule_config return rule_config
@classmethod
def generate_qa_document(cls, tenant_id: str, query):
    """Ask the LLM to turn a passage of text into question/answer pairs.

    The instructions come from GENERATOR_QA_PROMPT, which asks the model to
    answer in the "Q1:/A1:/Q2:/A2:..." layout.

    :param tenant_id: tenant identifier forwarded to ``LLMBuilder.to_llm``.
    :param query: the source text the questions and answers are generated from.
    :return: the text of the first generation, with surrounding whitespace
        stripped.
    """
    llm: StreamableOpenAI = LLMBuilder.to_llm(
        tenant_id=tenant_id,
        model_name='gpt-3.5-turbo',
        max_tokens=1000
    )

    if isinstance(llm, BaseChatModel):
        # Chat models take the instructions and the user text as separate messages.
        prompt = [SystemMessage(content=GENERATOR_QA_PROMPT), HumanMessage(content=query)]
    else:
        # A completion model receives one plain string, so the source text has
        # to be appended to the instructions; otherwise it is never sent.
        prompt = f'{GENERATOR_QA_PROMPT}\n{query}'

    response = llm.generate([prompt])
    answer = response.generations[0][0].text
    return answer.strip()
...@@ -16,6 +16,7 @@ from core.data_loader.file_extractor import FileExtractor ...@@ -16,6 +16,7 @@ from core.data_loader.file_extractor import FileExtractor
from core.data_loader.loader.notion import NotionLoader from core.data_loader.loader.notion import NotionLoader
from core.docstore.dataset_docstore import DatesetDocumentStore from core.docstore.dataset_docstore import DatesetDocumentStore
from core.embedding.cached_embedding import CacheEmbedding from core.embedding.cached_embedding import CacheEmbedding
from core.generator.llm_generator import LLMGenerator
from core.index.index import IndexBuilder from core.index.index import IndexBuilder
from core.index.keyword_table_index.keyword_table_index import KeywordTableIndex, KeywordTableConfig from core.index.keyword_table_index.keyword_table_index import KeywordTableIndex, KeywordTableConfig
from core.index.vector_index.vector_index import VectorIndex from core.index.vector_index.vector_index import VectorIndex
...@@ -70,12 +71,18 @@ class IndexingRunner: ...@@ -70,12 +71,18 @@ class IndexingRunner:
dataset_document=dataset_document, dataset_document=dataset_document,
processing_rule=processing_rule processing_rule=processing_rule
) )
new_documents = []
for document in documents:
response = LLMGenerator.generate_qa_document(dataset.tenant_id, document.page_content)
document_qa_list = self.format_split_text(response)
for result in document_qa_list:
document = Document(page_content=result['question'], metadata={'source': result['answer']})
new_documents.append(document)
# build index # build index
self._build_index( self._build_index(
dataset=dataset, dataset=dataset,
dataset_document=dataset_document, dataset_document=dataset_document,
documents=documents documents=new_documents
) )
except DocumentIsPausedException: except DocumentIsPausedException:
raise DocumentIsPausedException('Document paused, document id: {}'.format(dataset_document.id)) raise DocumentIsPausedException('Document paused, document id: {}'.format(dataset_document.id))
...@@ -91,6 +98,22 @@ class IndexingRunner: ...@@ -91,6 +98,22 @@ class IndexingRunner:
dataset_document.stopped_at = datetime.datetime.utcnow() dataset_document.stopped_at = datetime.datetime.utcnow()
db.session.commit() db.session.commit()
def format_split_text(self, text):
    """Parse "Q<n>: ... A<n>: ..." formatted text into question/answer pairs.

    :param text: model output laid out as ``Q1: ...``, ``A1: ...``, ``Q2: ...``.
    :return: a list of ``{"question": str, "answer": str}`` dicts, in the order
        they appear in ``text``. Pairs with an empty question or answer are
        dropped. Empty or ``None`` input yields an empty list.
    """
    if not text:
        return []

    # An answer runs until the next "Q<n>:" marker or the end of the text.
    # The lookahead must use the full marker (a bare "Q" would cut answers at
    # any capital Q), and re.MULTILINE must stay off so "$" only matches at the
    # end of the text rather than at every line end, which would truncate
    # multi-line answers to their first line.
    regex = r"Q\d+:\s*(.*?)\s*A\d+:\s*([\s\S]*?)(?=Q\d+:|$)"
    matches = re.findall(regex, text)

    result = []
    for question, raw_answer in matches:
        answer = raw_answer.strip()
        # Keep the pair only when both the question and the answer are present.
        if question and answer:
            result.append({
                "question": question,
                # Collapse blank lines and leading indentation inside the answer.
                "answer": re.sub(r"\n\s*", "\n", answer)
            })
    return result
def run_in_splitting_status(self, dataset_document: DatasetDocument): def run_in_splitting_status(self, dataset_document: DatasetDocument):
"""Run the indexing process when the index_status is splitting.""" """Run the indexing process when the index_status is splitting."""
try: try:
...@@ -428,7 +451,7 @@ class IndexingRunner: ...@@ -428,7 +451,7 @@ class IndexingRunner:
return documents return documents
def _split_to_documents(self, text_docs: List[Document], splitter: TextSplitter, def _split_to_documents(self, text_docs: List[Document], splitter: TextSplitter,
processing_rule: DatasetProcessRule) -> List[Document]: processing_rule: DatasetProcessRule, tenant_id) -> List[Document]:
""" """
Split the text documents into nodes. Split the text documents into nodes.
""" """
...@@ -446,6 +469,11 @@ class IndexingRunner: ...@@ -446,6 +469,11 @@ class IndexingRunner:
if document.page_content is None or not document.page_content.strip(): if document.page_content is None or not document.page_content.strip():
continue continue
response = LLMGenerator.generate_qa_document(processing_rule.tenant_id, document.page_content)
document_qa_list = self.format_split_text(response)
for result in document_qa_list:
document = Document(page_content=result['question'], metadata={'source': result['answer']})
new_documents.append(document)
doc_id = str(uuid.uuid4()) doc_id = str(uuid.uuid4())
hash = helper.generate_text_hash(document.page_content) hash = helper.generate_text_hash(document.page_content)
......
...@@ -43,6 +43,16 @@ SUGGESTED_QUESTIONS_AFTER_ANSWER_INSTRUCTION_PROMPT = ( ...@@ -43,6 +43,16 @@ SUGGESTED_QUESTIONS_AFTER_ANSWER_INSTRUCTION_PROMPT = (
"[\"question1\",\"question2\",\"question3\"]\n" "[\"question1\",\"question2\",\"question3\"]\n"
) )
# System prompt (in Chinese) that tells the model to act as a question setter:
# summarise the user's long text, pick out its key information and concepts,
# then produce 10 detailed question/answer pairs in the "Q1:/A1:/Q2:/A2:..."
# layout.
GENERATOR_QA_PROMPT = (
    "你是出题人.\n"
    "用户会发送一段长文本.\n"
    # The newline after this sentence keeps "Step1" on its own line instead of
    # fusing it with the "think step by step" instruction.
    "请一步一步思考\n"
    'Step1:了解并总结这段文本的主要内容\n'
    'Step2:这段文本提到了哪些关键信息或概念\n'
    'Step3:可分解或结合多个信息与概念\n'
    'Step4:将这些关键信息与概念生成 10 个问题与答案,问题描述清楚并且详细完整,答案详细完整.\n'
    "按格式回答: Q1:\nA1:\nQ2:\nA2:...\n"
)
RULE_CONFIG_GENERATE_TEMPLATE = """Given MY INTENDED AUDIENCES and HOPING TO SOLVE using a language model, please select \ RULE_CONFIG_GENERATE_TEMPLATE = """Given MY INTENDED AUDIENCES and HOPING TO SOLVE using a language model, please select \
the model prompt that best suits the input. the model prompt that best suits the input.
You will be provided with the prompt, variables, and an opening statement. You will be provided with the prompt, variables, and an opening statement.
......
...@@ -198,6 +198,7 @@ class CompletionService: ...@@ -198,6 +198,7 @@ class CompletionService:
conversation = db.session.query(Conversation).filter_by(id=conversation.id).first() conversation = db.session.query(Conversation).filter_by(id=conversation.id).first()
# run # run
Completion.generate( Completion.generate(
task_id=generate_task_id, task_id=generate_task_id,
app=app_model, app=app_model,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment