Commit 90b22d8c authored by jyong

add qa model

parent 561c9cab
@@ -488,6 +488,8 @@ class DocumentBatchIndexingStatusApi(DocumentResource):
                 DocumentSegment.status != 're_segment').count()
             document.completed_segments = completed_segments
             document.total_segments = total_segments
+            if document.is_paused:
+                document.indexing_status = 'paused'
             documents_status.append(marshal(document, self.document_status_fields))
         data = {
             'data': documents_status
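Aside, for illustration: after this change one entry in the documents_status payload would look roughly like the sketch below (field names follow the handler above; the id and counts are invented).

    # Hypothetical serialized entry, not actual API output:
    {
        'id': '9d5f7043-...',            # document id (made up)
        'completed_segments': 12,
        'total_segments': 40,
        'indexing_status': 'paused',     # overridden whenever document.is_paused is truthy
    }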
@@ -39,7 +39,7 @@ class ExcelLoader(BaseLoader):
                     row_dict = dict(zip(keys, list(map(str, row))))
                     row_dict = {k: v for k, v in row_dict.items() if v}
                     item = ''.join(f'{k}:{v}\n' for k, v in row_dict.items())
-                    document = Document(page_content=item)
+                    document = Document(page_content=item, metadata={'source': self._file_path})
                     data.append(document)
         return data
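For illustration, a standalone sketch of the per-row serialization this loader performs, with a made-up header row and data row standing in for a real workbook:

    # Hypothetical rows; ExcelLoader reads these from an .xlsx file instead.
    keys = ['name', 'price']
    row = ('Widget', '9.99')

    row_dict = dict(zip(keys, list(map(str, row))))
    row_dict = {k: v for k, v in row_dict.items() if v}   # drop empty cells
    item = ''.join(f'{k}:{v}\n' for k, v in row_dict.items())

    assert item == 'name:Widget\nprice:9.99\n'
    # With this commit, each resulting Document also records its origin, e.g.
    # Document(page_content=item, metadata={'source': '/data/products.xlsx'})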
@@ -2,7 +2,7 @@ import logging
 from langchain import PromptTemplate
 from langchain.chat_models.base import BaseChatModel
-from langchain.schema import HumanMessage, OutputParserException, BaseMessage
+from langchain.schema import HumanMessage, OutputParserException, BaseMessage, SystemMessage
 from core.constant import llm_constant
 from core.llm.llm_builder import LLMBuilder
@@ -12,8 +12,8 @@ from core.prompt.output_parser.rule_config_generator import RuleConfigGeneratorOutputParser
 from core.prompt.output_parser.suggested_questions_after_answer import SuggestedQuestionsAfterAnswerOutputParser
 from core.prompt.prompt_template import JinjaPromptTemplate, OutLinePromptTemplate
-from core.prompt.prompts import CONVERSATION_TITLE_PROMPT, CONVERSATION_SUMMARY_PROMPT, INTRODUCTION_GENERATE_PROMPT
+from core.prompt.prompts import CONVERSATION_TITLE_PROMPT, CONVERSATION_SUMMARY_PROMPT, INTRODUCTION_GENERATE_PROMPT, \
+    GENERATOR_QA_PROMPT

 # gpt-3.5-turbo does not work well here
 generate_base_model = 'text-davinci-003'
@@ -171,3 +171,19 @@ class LLMGenerator:
         }
         return rule_config
+
+    @classmethod
+    def generate_qa_document(cls, tenant_id: str, query):
+        prompt = GENERATOR_QA_PROMPT
+
+        llm: StreamableOpenAI = LLMBuilder.to_llm(
+            tenant_id=tenant_id,
+            model_name='gpt-3.5-turbo',
+            max_tokens=1000
+        )
+
+        if isinstance(llm, BaseChatModel):
+            prompt = [SystemMessage(content=prompt), HumanMessage(content=query)]
+
+        response = llm.generate([prompt])
+        answer = response.generations[0][0].text
+        return answer.strip()
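A hedged usage sketch of the new classmethod (the tenant id and input text are invented, and the call assumes a configured Dify backend with valid provider credentials):

    # Hypothetical invocation; returns the model's raw "Q1:/A1:"-formatted reply.
    raw_reply = LLMGenerator.generate_qa_document(
        tenant_id='tenant-0000',
        query='Photosynthesis converts light energy into chemical energy...'
    )
    # The reply is parsed downstream by IndexingRunner.format_split_text (below).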
@@ -16,6 +16,7 @@ from core.data_loader.file_extractor import FileExtractor
 from core.data_loader.loader.notion import NotionLoader
 from core.docstore.dataset_docstore import DatesetDocumentStore
 from core.embedding.cached_embedding import CacheEmbedding
+from core.generator.llm_generator import LLMGenerator
 from core.index.index import IndexBuilder
 from core.index.keyword_table_index.keyword_table_index import KeywordTableIndex, KeywordTableConfig
 from core.index.vector_index.vector_index import VectorIndex
@@ -70,12 +71,18 @@ class IndexingRunner:
                     dataset_document=dataset_document,
                     processing_rule=processing_rule
                 )
+                new_documents = []
+                for document in documents:
+                    response = LLMGenerator.generate_qa_document(dataset.tenant_id, document.page_content)
+                    document_qa_list = self.format_split_text(response)
+                    for result in document_qa_list:
+                        qa_document = Document(page_content=result['question'], metadata={'source': result['answer']})
+                        new_documents.append(qa_document)
                 # build index
                 self._build_index(
                     dataset=dataset,
                     dataset_document=dataset_document,
-                    documents=documents
+                    documents=new_documents
                 )
             except DocumentIsPausedException:
                 raise DocumentIsPausedException('Document paused, document id: {}'.format(dataset_document.id))
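Schematically, the loop above fans each source chunk out into one Document per generated question, with the answer carried in metadata; a sketch with a single invented Q/A pair (note that the 'source' key is repurposed to hold the answer):

    from langchain.schema import Document

    qa_pairs = [{'question': 'What is the capital of France?', 'answer': 'Paris.'}]
    new_documents = [
        # The question becomes the indexed text, so user queries are matched
        # question-to-question; the answer is recovered from metadata at query time.
        Document(page_content=pair['question'], metadata={'source': pair['answer']})
        for pair in qa_pairs
    ]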
@@ -91,6 +98,22 @@ class IndexingRunner:
             dataset_document.stopped_at = datetime.datetime.utcnow()
             db.session.commit()
+
+    def format_split_text(self, text):
+        regex = r"Q\d+:\s*(.*?)\s*A\d+:\s*([\s\S]*?)(?=Q|$)"  # captures each "Qn: ... An: ..." pair
+        matches = re.findall(regex, text, re.MULTILINE)  # all Q/A pairs found in the reply
+        result = []  # final list of question/answer dicts
+        for match in matches:
+            q = match[0]
+            a = match[1]
+            if q and a:
+                # keep a pair only when both the question and the answer are present
+                result.append({
+                    "question": q,
+                    "answer": re.sub(r"\n\s*", "\n", a.strip())
+                })
+        return result
     def run_in_splitting_status(self, dataset_document: DatasetDocument):
         """Run the indexing process when the index_status is splitting."""
         try:
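The Q/A regex in format_split_text can be exercised on its own; a minimal check against a made-up, well-formed model reply:

    import re

    sample = ("Q1: What does ExcelLoader emit?\n"
              "A1: One Document per data row.\n"
              "Q2: Where does the generated answer go?\n"
              "A2: Into the Document's 'source' metadata field.")
    regex = r"Q\d+:\s*(.*?)\s*A\d+:\s*([\s\S]*?)(?=Q|$)"

    pairs = re.findall(regex, sample, re.MULTILINE)
    result = [{"question": q, "answer": re.sub(r"\n\s*", "\n", a.strip())}
              for q, a in pairs if q and a]
    # -> [{'question': 'What does ExcelLoader emit?', 'answer': 'One Document per data row.'}, ...]
    # Caveat: a literal capital "Q" inside an answer ends that match early.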
@@ -428,7 +451,7 @@ class IndexingRunner:
         return documents

     def _split_to_documents(self, text_docs: List[Document], splitter: TextSplitter,
-                            processing_rule: DatasetProcessRule) -> List[Document]:
+                            processing_rule: DatasetProcessRule, tenant_id: str) -> List[Document]:
         """
         Split the text documents into nodes.
         """
@@ -446,6 +469,11 @@ class IndexingRunner:
                 if document.page_content is None or not document.page_content.strip():
                     continue

+                response = LLMGenerator.generate_qa_document(tenant_id, document.page_content)
+                document_qa_list = self.format_split_text(response)
+                for result in document_qa_list:
+                    qa_document = Document(page_content=result['question'], metadata={'source': result['answer']})
+                    new_documents.append(qa_document)
                 doc_id = str(uuid.uuid4())
                 hash = helper.generate_text_hash(document.page_content)
@@ -43,6 +43,16 @@ SUGGESTED_QUESTIONS_AFTER_ANSWER_INSTRUCTION_PROMPT = (
     "[\"question1\",\"question2\",\"question3\"]\n"
 )
+
+GENERATOR_QA_PROMPT = (
+    "You are a question writer.\n"
+    "The user will send you a long text.\nPlease think step by step:\n"
+    "Step 1: Understand and summarize the main content of the text.\n"
+    "Step 2: What key information or concepts does the text mention?\n"
+    "Step 3: Decompose or combine multiple pieces of information and concepts.\n"
+    "Step 4: Generate 10 questions and answers from the key information and concepts; make each question clear, detailed, and complete, and each answer detailed and complete.\n"
+    "Answer in this format: Q1:\nA1:\nQ2:\nA2:...\n"
+)
 RULE_CONFIG_GENERATE_TEMPLATE = """Given MY INTENDED AUDIENCES and HOPING TO SOLVE using a language model, please select \
 the model prompt that best suits the input.
 You will be provided with the prompt, variables, and an opening statement.
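For context, a sketch of how this prompt is consumed by LLMGenerator.generate_qa_document above when a chat model is selected (the prompt constant is abbreviated here and the user text is a placeholder):

    from langchain.schema import SystemMessage, HumanMessage

    GENERATOR_QA_PROMPT = "You are a question writer. ..."  # abbreviated; full text above

    messages = [
        SystemMessage(content=GENERATOR_QA_PROMPT),   # the fixed instructions
        HumanMessage(content='<long source text to turn into Q/A pairs>'),
    ]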
@@ -198,6 +198,7 @@ class CompletionService:
             conversation = db.session.query(Conversation).filter_by(id=conversation.id).first()

         # run
         Completion.generate(
+            task_id=generate_task_id,
             app=app_model,