Commit 90b22d8c authored by jyong

add qa model

parent 561c9cab
@@ -488,6 +488,8 @@ class DocumentBatchIndexingStatusApi(DocumentResource):
                 DocumentSegment.status != 're_segment').count()
             document.completed_segments = completed_segments
             document.total_segments = total_segments
+            if document.is_paused:
+                document.indexing_status = 'paused'
             documents_status.append(marshal(document, self.document_status_fields))
         data = {
             'data': documents_status
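Aside, for illustration: after this change one entry in the documents_status payload would look roughly like the sketch below (field names follow the handler above; the id and counts are invented).

    # Hypothetical serialized entry, not actual API output:
    {
        'id': '9d5f7043-...',            # document id (made up)
        'completed_segments': 12,
        'total_segments': 40,
        'indexing_status': 'paused',     # overridden whenever document.is_paused is truthy
    }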
@@ -39,7 +39,7 @@ class ExcelLoader(BaseLoader):
                     row_dict = dict(zip(keys, list(map(str, row))))
                     row_dict = {k: v for k, v in row_dict.items() if v}
                     item = ''.join(f'{k}:{v}\n' for k, v in row_dict.items())
-                    document = Document(page_content=item)
+                    document = Document(page_content=item, metadata={'source': self._file_path})
                     data.append(document)
         return data
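For illustration, a standalone sketch of the per-row serialization this loader performs, with a made-up header row and data row standing in for a real workbook:

    # Hypothetical rows; ExcelLoader reads these from an .xlsx file instead.
    keys = ['name', 'price']
    row = ('Widget', '9.99')

    row_dict = dict(zip(keys, list(map(str, row))))
    row_dict = {k: v for k, v in row_dict.items() if v}   # drop empty cells
    item = ''.join(f'{k}:{v}\n' for k, v in row_dict.items())

    assert item == 'name:Widget\nprice:9.99\n'
    # With this commit, each resulting Document also records its origin, e.g.
    # Document(page_content=item, metadata={'source': '/data/products.xlsx'})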
@@ -2,7 +2,7 @@ import logging
 from langchain import PromptTemplate
 from langchain.chat_models.base import BaseChatModel
-from langchain.schema import HumanMessage, OutputParserException, BaseMessage
+from langchain.schema import HumanMessage, OutputParserException, BaseMessage, SystemMessage
 from core.constant import llm_constant
 from core.llm.llm_builder import LLMBuilder
@@ -12,8 +12,8 @@ from core.prompt.output_parser.rule_config_generator import RuleConfigGeneratorOutputParser
 from core.prompt.output_parser.suggested_questions_after_answer import SuggestedQuestionsAfterAnswerOutputParser
 from core.prompt.prompt_template import JinjaPromptTemplate, OutLinePromptTemplate
-from core.prompt.prompts import CONVERSATION_TITLE_PROMPT, CONVERSATION_SUMMARY_PROMPT, INTRODUCTION_GENERATE_PROMPT
+from core.prompt.prompts import CONVERSATION_TITLE_PROMPT, CONVERSATION_SUMMARY_PROMPT, INTRODUCTION_GENERATE_PROMPT, \
+    GENERATOR_QA_PROMPT

 # gpt-3.5-turbo does not work well here
 generate_base_model = 'text-davinci-003'
@@ -171,3 +171,19 @@ class LLMGenerator:
         }
         return rule_config
+
+    @classmethod
+    def generate_qa_document(cls, tenant_id: str, query):
+        prompt = GENERATOR_QA_PROMPT
+
+        llm: StreamableOpenAI = LLMBuilder.to_llm(
+            tenant_id=tenant_id,
+            model_name='gpt-3.5-turbo',
+            max_tokens=1000
+        )
+
+        if isinstance(llm, BaseChatModel):
+            prompt = [SystemMessage(content=prompt), HumanMessage(content=query)]
+
+        response = llm.generate([prompt])
+        answer = response.generations[0][0].text
+        return answer.strip()
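A hedged usage sketch of the new classmethod (the tenant id and input text are invented, and the call assumes a configured Dify backend with valid provider credentials):

    # Hypothetical invocation; returns the model's raw "Q1:/A1:"-formatted reply.
    raw_reply = LLMGenerator.generate_qa_document(
        tenant_id='tenant-0000',
        query='Photosynthesis converts light energy into chemical energy...'
    )
    # The reply is parsed downstream by IndexingRunner.format_split_text (below).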
@@ -16,6 +16,7 @@ from core.data_loader.file_extractor import FileExtractor
 from core.data_loader.loader.notion import NotionLoader
 from core.docstore.dataset_docstore import DatesetDocumentStore
 from core.embedding.cached_embedding import CacheEmbedding
+from core.generator.llm_generator import LLMGenerator
 from core.index.index import IndexBuilder
 from core.index.keyword_table_index.keyword_table_index import KeywordTableIndex, KeywordTableConfig
 from core.index.vector_index.vector_index import VectorIndex
@@ -70,12 +71,18 @@ class IndexingRunner:
                     dataset_document=dataset_document,
                     processing_rule=processing_rule
                 )
+                new_documents = []
+                for document in documents:
+                    response = LLMGenerator.generate_qa_document(dataset.tenant_id, document.page_content)
+                    document_qa_list = self.format_split_text(response)
+                    for result in document_qa_list:
+                        qa_document = Document(page_content=result['question'], metadata={'source': result['answer']})
+                        new_documents.append(qa_document)
                 # build index
                 self._build_index(
                     dataset=dataset,
                     dataset_document=dataset_document,
-                    documents=documents
+                    documents=new_documents
                 )
             except DocumentIsPausedException:
                 raise DocumentIsPausedException('Document paused, document id: {}'.format(dataset_document.id))
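Schematically, the loop above fans each source chunk out into one Document per generated question, with the answer carried in metadata; a sketch with a single invented Q/A pair (note that the 'source' key is repurposed to hold the answer):

    from langchain.schema import Document

    qa_pairs = [{'question': 'What is the capital of France?', 'answer': 'Paris.'}]
    new_documents = [
        # The question becomes the indexed text, so user queries are matched
        # question-to-question; the answer is recovered from metadata at query time.
        Document(page_content=pair['question'], metadata={'source': pair['answer']})
        for pair in qa_pairs
    ]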
@@ -91,6 +98,22 @@ class IndexingRunner:
             dataset_document.stopped_at = datetime.datetime.utcnow()
             db.session.commit()
+
+    def format_split_text(self, text):
+        regex = r"Q\d+:\s*(.*?)\s*A\d+:\s*([\s\S]*?)(?=Q|$)"  # captures each "Qn: ... An: ..." pair
+        matches = re.findall(regex, text, re.MULTILINE)  # all Q/A pairs found in the reply
+        result = []  # final list of question/answer dicts
+        for match in matches:
+            q = match[0]
+            a = match[1]
+            if q and a:
+                # keep a pair only when both the question and the answer are present
+                result.append({
+                    "question": q,
+                    "answer": re.sub(r"\n\s*", "\n", a.strip())
+                })
+        return result
     def run_in_splitting_status(self, dataset_document: DatasetDocument):
         """Run the indexing process when the index_status is splitting."""
         try:
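The Q/A regex in format_split_text can be exercised on its own; a minimal check against a made-up, well-formed model reply:

    import re

    sample = ("Q1: What does ExcelLoader emit?\n"
              "A1: One Document per data row.\n"
              "Q2: Where does the generated answer go?\n"
              "A2: Into the Document's 'source' metadata field.")
    regex = r"Q\d+:\s*(.*?)\s*A\d+:\s*([\s\S]*?)(?=Q|$)"

    pairs = re.findall(regex, sample, re.MULTILINE)
    result = [{"question": q, "answer": re.sub(r"\n\s*", "\n", a.strip())}
              for q, a in pairs if q and a]
    # -> [{'question': 'What does ExcelLoader emit?', 'answer': 'One Document per data row.'}, ...]
    # Caveat: a literal capital "Q" inside an answer ends that match early.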
@@ -428,7 +451,7 @@ class IndexingRunner:
         return documents

     def _split_to_documents(self, text_docs: List[Document], splitter: TextSplitter,
-                            processing_rule: DatasetProcessRule) -> List[Document]:
+                            processing_rule: DatasetProcessRule, tenant_id: str) -> List[Document]:
         """
         Split the text documents into nodes.
         """
@@ -446,6 +469,11 @@ class IndexingRunner:
                 if document.page_content is None or not document.page_content.strip():
                     continue

+                response = LLMGenerator.generate_qa_document(tenant_id, document.page_content)
+                document_qa_list = self.format_split_text(response)
+                for result in document_qa_list:
+                    qa_document = Document(page_content=result['question'], metadata={'source': result['answer']})
+                    new_documents.append(qa_document)
                 doc_id = str(uuid.uuid4())
                 hash = helper.generate_text_hash(document.page_content)
@@ -43,6 +43,16 @@ SUGGESTED_QUESTIONS_AFTER_ANSWER_INSTRUCTION_PROMPT = (
     "[\"question1\",\"question2\",\"question3\"]\n"
 )
+
+GENERATOR_QA_PROMPT = (
+    "You are a question writer.\n"
+    "The user will send you a long text.\nPlease think step by step:\n"
+    "Step 1: Understand and summarize the main content of the text.\n"
+    "Step 2: What key information or concepts does the text mention?\n"
+    "Step 3: Decompose or combine multiple pieces of information and concepts.\n"
+    "Step 4: Generate 10 questions and answers from the key information and concepts; make each question clear, detailed, and complete, and each answer detailed and complete.\n"
+    "Answer in this format: Q1:\nA1:\nQ2:\nA2:...\n"
+)
 RULE_CONFIG_GENERATE_TEMPLATE = """Given MY INTENDED AUDIENCES and HOPING TO SOLVE using a language model, please select \
 the model prompt that best suits the input.
 You will be provided with the prompt, variables, and an opening statement.
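For context, a sketch of how this prompt is consumed by LLMGenerator.generate_qa_document above when a chat model is selected (the prompt constant is abbreviated here and the user text is a placeholder):

    from langchain.schema import SystemMessage, HumanMessage

    GENERATOR_QA_PROMPT = "You are a question writer. ..."  # abbreviated; full text above

    messages = [
        SystemMessage(content=GENERATOR_QA_PROMPT),   # the fixed instructions
        HumanMessage(content='<long source text to turn into Q/A pairs>'),
    ]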
@@ -198,6 +198,7 @@ class CompletionService:
             conversation = db.session.query(Conversation).filter_by(id=conversation.id).first()

         # run
         Completion.generate(
+            task_id=generate_task_id,
             app=app_model,