fix clean dataset task

4a45e94b · jyong · 5269b00d · 4a45e94b · 4a45e94b · 4a45e94b
Commit 4a45e94b authored Jul 16, 2023 by jyong
Showing with 88 additions and 2 deletions

app.py api/app.py +13 -0

llm_generator.py api/core/generator/llm_generator.py +2 -1

indexing_runner.py api/core/indexing_runner.py +49 -1

generate_test_task.py api/tasks/generate_test_task.py +24 -0

No files found.
--- a/api/app.py
+++ b/api/app.py
@@ -2,6 +2,10 @@
 import os
 from datetime import datetime

 if not os.environ.get("DEBUG") or os.environ.get("DEBUG").lower() != 'true':
    from gevent import monkey
    monkey.patch_all()
+
+# Keep below monkey.patch_all(): gevent must patch socket/ssl before requests and the task modules load them.
+import requests
+from tasks.generate_test_task import generate_test_task
@@ -199,6 +203,15 @@ def health():
    }), status=200, content_type="application/json")


+@app.route('/test')
+def test():
+    """Debug route: enqueue generate_test_task, probe the OpenAI endpoint, and reply ok."""
+    generate_test_task.delay()
+    # requests never times out by default; bound the probe so a stalled connection cannot pin this worker.
+    res = requests.post('https://api.openai.com/v1/chat/completions', timeout=30)
+    print(res)
+    return Response(json.dumps({'status': 'ok'}), status=200, content_type="application/json")
+
 @app.route('/threads')
 def threads():
    num_threads = threading.active_count()

--- a/api/core/generator/llm_generator.py
+++ b/api/core/generator/llm_generator.py
@@ -27,7 +27,8 @@ class LLMGenerator:
        llm: StreamableOpenAI = LLMBuilder.to_llm(
            tenant_id=tenant_id,
            model_name='gpt-3.5-turbo',
-            max_tokens=50
+            max_tokens=50,
+            timeout=600
        )

        if isinstance(llm, BaseChatModel):

--- a/api/core/indexing_runner.py
+++ b/api/core/indexing_runner.py
@@ -6,6 +6,7 @@ import time
 import uuid
 from typing import Optional, List, cast

+import openai
 from flask import current_app
 from flask_login import current_user
 from langchain.embeddings import OpenAIEmbeddings
@@ -471,9 +472,41 @@ class IndexingRunner:
            for document in documents:
                if document.page_content is None or not document.page_content.strip():
                    continue
-
+                #
                response = LLMGenerator.generate_qa_document(tenant_id, document.page_content)
                document_qa_list = self.format_split_text(response)
+                # CONVERSATION_PROMPT = (
+                #     "你是出题人.\n"
+                #     "用户会发送一段长文本.\n请一步一步思考"
+                #     'Step1：了解并总结这段文本的主要内容\n'
+                #     'Step2：这段文本提到了哪些关键信息或概念\n'
+                #     'Step3：可分解或结合多个信息与概念\n'
+                #     'Step4：将这些关键信息与概念生成 10 个问题与答案，问题描述清楚并且详细完整,答案详细完整.\n'
+                #     "按格式回答: Q1:\nA1:\nQ2:\nA2:...\n"
+                # )
+                # openai.api_key = "<REDACTED>"  # never commit real keys; load from config/env
+                # response = openai.ChatCompletion.create(
+                #     model='gpt-3.5-turbo',
+                #     messages=[
+                #         {
+                #             'role': 'system',
+                #             'content': CONVERSATION_PROMPT
+                #         },
+                #         {
+                #             'role': 'user',
+                #             'content': document.page_content
+                #         }
+                #     ],
+                #     temperature=0,
+                #     stream=False,  # non-streaming: the whole completion arrives in one response
+                #
+                #     n=1,
+                #     top_p=1,
+                #     frequency_penalty=0,
+                #     presence_penalty=0
+                # )
+                # # response = LLMGenerator.generate_qa_document('84b2202c-c359-46b7-a810-bce50feaa4d1', doc.page_content)
+                # document_qa_list = self.format_split_text(response['choices'][0]['message']['content'])
                qa_documents = []
                for result in document_qa_list:
                    document = Document(page_content=result['question'], metadata={'source': result['answer']})
@@ -517,7 +550,22 @@ class IndexingRunner:
                    text = re.sub(pattern, '', text)

        return text
+    def format_split_text(self, text):
+        """Parse LLM output of the form ``Q1: ... A1: ... Q2: ...`` into question/answer pairs.
+
+        :param text: raw completion text containing numbered ``Q<n>:`` / ``A<n>:`` markers.
+        :return: list of ``{"question", "answer"}`` dicts; pairs with an empty side are dropped.
+        """
+        # An answer runs until the next real "Q<n>:" marker or the end of the text. Stopping at a
+        # bare "Q" or at a MULTILINE "$" would cut answers at the first line break or capital Q.
+        regex = r"Q\d+:\s*(.*?)\s*A\d+:\s*([\s\S]*?)(?=Q\d+:|$)"
+        result = []
+        for q, a in re.findall(regex, text):
+            if q and a:
+                # Collapse indentation and blank lines inside the answer to single newlines.
+                result.append({"question": q, "answer": re.sub(r"\n\s*", "\n", a.strip())})

+        return result
    def _build_index(self, dataset: Dataset, dataset_document: DatasetDocument, documents: List[Document]) -> None:
        """
        Build the index for the document.

--- a/api/tasks/generate_test_task.py
+++ b/api/tasks/generate_test_task.py
+import logging
+import time
+
+import click
+import requests
+from celery import shared_task
+
+from core.generator.llm_generator import LLMGenerator
+
+
+@shared_task
+def generate_test_task():
+    """Celery smoke test: ask LLMGenerator for a conversation name and log the latency."""
+    logging.info(click.style('Start generate test', fg='green'))
+    began = time.perf_counter()
+
+    try:
+        name = LLMGenerator.generate_conversation_name('84b2202c-c359-46b7-a810-bce50feaa4d1', 'avb', 'ccc')
+        print(f'answer: {name}')
+        elapsed = time.perf_counter() - began
+        logging.info(click.style('Conversation test, latency: {}'.format(elapsed), fg='green'))
+    except Exception:
+        # Best-effort diagnostic task: record the traceback and swallow the error.
+        logging.exception("generate test failed")