ai-tech / dify · Commits

Commit ceff8edb authored Aug 01, 2023 by jyong
1. Add batch segment import  2. Support segment deletion  3. Support document un_archive  4. QA document language
parent 1cd5d5eb
Showing 10 changed files with 207 additions and 53 deletions (+207 -53)
api/controllers/console/datasets/datasets.py  +5 -2
api/controllers/console/datasets/datasets_document.py  +18 -1
api/controllers/console/datasets/datasets_segments.py  +108 -3
api/core/generator/llm_generator.py  +2 -16
api/core/indexing_runner.py  +39 -17
api/core/prompt/prompts.py  +2 -2
api/models/dataset.py  +1 -0
api/requirements.txt  +2 -1
api/services/dataset_service.py  +26 -7
api/tasks/disable_segment_from_index_task.py  +4 -4
api/controllers/console/datasets/datasets.py

@@ -221,6 +221,7 @@ class DatasetIndexingEstimateApi(Resource):
         parser.add_argument('info_list', type=dict, required=True, nullable=True, location='json')
         parser.add_argument('process_rule', type=dict, required=True, nullable=True, location='json')
         parser.add_argument('doc_form', type=str, default='text_model', required=False, nullable=False, location='json')
+        parser.add_argument('doc_language', type=str, default='English', required=False, nullable=False, location='json')
         args = parser.parse_args()
         # validate args
         DocumentService.estimate_args_validate(args)

@@ -235,12 +236,14 @@ class DatasetIndexingEstimateApi(Resource):
                 raise NotFound("File not found.")

             indexing_runner = IndexingRunner()
-            response = indexing_runner.file_indexing_estimate(file_details, args['process_rule'], args['doc_form'])
+            response = indexing_runner.file_indexing_estimate(file_details, args['process_rule'], args['doc_form'],
+                                                              args['doc_language'])
         elif args['info_list']['data_source_type'] == 'notion_import':
             indexing_runner = IndexingRunner()
             response = indexing_runner.notion_indexing_estimate(args['info_list']['notion_info_list'],
-                                                                args['process_rule'], args['doc_form'])
+                                                                args['process_rule'], args['doc_form'],
+                                                                args['doc_language'])
         else:
             raise ValueError('Data source type not support')
         return response, 200
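The net effect on the estimate endpoint is one new optional doc_language field in the request body. As a rough illustration, a console client could now pass it alongside the existing arguments; the host, auth header, route path, and payload details below are placeholders and assumptions, not part of this commit.

import requests

BASE = "https://dify.example.com/console/api"                   # placeholder host
HEADERS = {"Authorization": "Bearer <console-session-token>"}   # placeholder auth

payload = {
    # 'info_list' and 'process_rule' keep their existing shapes; only the keys
    # declared by the parser above are shown, with illustrative contents.
    "info_list": {"data_source_type": "upload_file", "file_info_list": {"file_ids": ["<file-id>"]}},
    "process_rule": {"mode": "automatic", "rules": {}},
    "doc_form": "qa_model",      # QA generation is what doc_language influences
    "doc_language": "English",   # new argument, defaults to 'English' when omitted
}

# Route assumed from the DatasetIndexingEstimateApi resource name; its registration is not shown in this diff.
resp = requests.post(f"{BASE}/datasets/indexing-estimate", json=payload, headers=HEADERS)
print(resp.json())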
api/controllers/console/datasets/datasets_document.py

@@ -539,7 +539,8 @@ class DocumentIndexingStatusApi(DocumentResource):
         document.completed_segments = completed_segments
         document.total_segments = total_segments
+        if document.is_paused: document.indexing_status = 'paused'
         return marshal(document, self.document_status_fields)

@@ -796,6 +797,22 @@ class DocumentStatusApi(DocumentResource):
             remove_document_from_index_task.delay(document_id)

             return {'result': 'success'}, 200
+        elif action == "un_archive":
+            if not document.archived:
+                raise InvalidActionError('Document is not archived.')
+
+            document.archived = False
+            document.archived_at = None
+            document.archived_by = None
+            document.updated_at = datetime.utcnow()
+            db.session.commit()
+
+            # Set cache to prevent indexing the same document multiple times
+            redis_client.setex(indexing_cache_key, 600, 1)
+
+            add_document_to_index_task.delay(document_id)
+
+            return {'result': 'success'}, 200
         else:
             raise InvalidActionError()
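For the new un_archive branch, a minimal sketch of the call a client might make, assuming DocumentStatusApi is mounted at the usual .../documents/<id>/status/<action> pattern (its registration is not part of this diff); host and auth are placeholders.

import requests

BASE = "https://dify.example.com/console/api"                   # placeholder host
HEADERS = {"Authorization": "Bearer <console-session-token>"}   # placeholder auth
dataset_id, document_id = "<dataset-uuid>", "<document-uuid>"

# Assumed route pattern for DocumentStatusApi; only the 'un_archive' action is new in this commit.
resp = requests.patch(
    f"{BASE}/datasets/{dataset_id}/documents/{document_id}/status/un_archive",
    headers=HEADERS,
)
# On success the archived/archived_at/archived_by fields are cleared,
# add_document_to_index_task is queued, and {'result': 'success'} is returned.
print(resp.json())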
api/controllers/console/datasets/datasets_segments.py

 # -*- coding:utf-8 -*-
+import uuid
 from datetime import datetime

+from flask import request
 from flask_login import login_required, current_user
 from flask_restful import Resource, reqparse, fields, marshal
 from werkzeug.exceptions import NotFound, Forbidden

 import services
 from controllers.console import api
-from controllers.console.datasets.error import InvalidActionError
+from controllers.console.datasets.error import InvalidActionError, NoFileUploadedError, TooManyFilesError
 from controllers.console.setup import setup_required
 from controllers.console.wraps import account_initialization_required
 from extensions.ext_database import db

@@ -17,7 +19,9 @@ from models.dataset import DocumentSegment
 from libs.helper import TimestampField
 from services.dataset_service import DatasetService, DocumentService, SegmentService
 from tasks.enable_segment_to_index_task import enable_segment_to_index_task
-from tasks.remove_segment_from_index_task import remove_segment_from_index_task
+from tasks.disable_segment_from_index_task import disable_segment_from_index_task
+from tasks.batch_create_segment_to_index_task import batch_create_segment_to_index_task
+import pandas as pd

 segment_fields = {
     'id': fields.String,

@@ -197,7 +201,7 @@ class DatasetDocumentSegmentApi(Resource):
             # Set cache to prevent indexing the same segment multiple times
             redis_client.setex(indexing_cache_key, 600, 1)

-            remove_segment_from_index_task.delay(segment.id)
+            disable_segment_from_index_task.delay(segment.id)

             return {'result': 'success'}, 200
         else:

@@ -283,6 +287,104 @@ class DatasetDocumentSegmentUpdateApi(Resource):
             'doc_form': document.doc_form
         }, 200

+    @setup_required
+    @login_required
+    @account_initialization_required
+    def delete(self, dataset_id, document_id, segment_id):
+        # check dataset
+        dataset_id = str(dataset_id)
+        dataset = DatasetService.get_dataset(dataset_id)
+        if not dataset:
+            raise NotFound('Dataset not found.')
+        # check document
+        document_id = str(document_id)
+        document = DocumentService.get_document(dataset_id, document_id)
+        if not document:
+            raise NotFound('Document not found.')
+        # check segment
+        segment_id = str(segment_id)
+        segment = DocumentSegment.query.filter(
+            DocumentSegment.id == str(segment_id),
+            DocumentSegment.tenant_id == current_user.current_tenant_id
+        ).first()
+        if not segment:
+            raise NotFound('Segment not found.')
+        # The role of the current user in the ta table must be admin or owner
+        if current_user.current_tenant.current_role not in ['admin', 'owner']:
+            raise Forbidden()
+        try:
+            DatasetService.check_dataset_permission(dataset, current_user)
+        except services.errors.account.NoPermissionError as e:
+            raise Forbidden(str(e))
+        SegmentService.delete_segment(segment, document, dataset)
+        return {'result': 'success'}, 200
+
+
+class DatasetDocumentSegmentBatchImportApi(Resource):
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def post(self, dataset_id, document_id):
+        # check dataset
+        dataset_id = str(dataset_id)
+        dataset = DatasetService.get_dataset(dataset_id)
+        if not dataset:
+            raise NotFound('Dataset not found.')
+        # check document
+        document_id = str(document_id)
+        document = DocumentService.get_document(dataset_id, document_id)
+        if not document:
+            raise NotFound('Document not found.')
+        # get file from request
+        file = request.files['file']
+        # check file
+        if 'file' not in request.files:
+            raise NoFileUploadedError()
+        if len(request.files) > 1:
+            raise TooManyFilesError()
+        # check file type
+        if not file.filename.endswith('.csv'):
+            raise ValueError("Invalid file type. Only CSV files are allowed")
+        try:
+            # Skip the first row
+            df = pd.read_csv(file)
+            result = []
+            for index, row in df.iterrows():
+                data = {'content': row[0], 'answer': row[1]}
+                result.append(data)
+            if len(result) == 0:
+                raise ValueError("The CSV file is empty.")
+            # async job
+            job_id = str(uuid.uuid4())
+            indexing_cache_key = 'segment_batch_import_{}'.format(str(job_id))
+            # send batch add segments task
+            redis_client.setnx(indexing_cache_key, 'waiting')
+            batch_create_segment_to_index_task.delay(str(job_id), result, dataset_id, document_id,
+                                                     current_user.current_tenant_id, current_user.id)
+        except Exception as e:
+            return {'error': str(e)}, 500
+        return {'job_id': job_id, 'job_status': 'waiting'}, 200
+
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def get(self, job_id):
+        job_id = str(job_id)
+        indexing_cache_key = 'segment_batch_import_{}'.format(job_id)
+        cache_result = redis_client.get(indexing_cache_key)
+        if cache_result is None:
+            raise ValueError("The job is not exist.")
+        return {
+            'job_id': job_id,
+            'job_status': cache_result.decode()
+        }, 200
+

 api.add_resource(DatasetDocumentSegmentListApi,
                  '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/segments')

@@ -292,3 +394,6 @@ api.add_resource(DatasetDocumentSegmentAddApi,
                  '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/segment')
 api.add_resource(DatasetDocumentSegmentUpdateApi,
                  '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/segments/<uuid:segment_id>')
+api.add_resource(DatasetDocumentSegmentBatchImportApi,
+                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/segments/batch_import',
+                 '/datasets/batch_import_status/<uuid:job_id>')
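Both new routes are registered above, so a rough end-to-end sketch of the batch import flow looks like the following; the host and auth header are placeholders, while the CSV layout (header row skipped, first column content, second column answer) follows the pd.read_csv handling in the new post() method.

import io
import requests

BASE = "https://dify.example.com/console/api"                   # placeholder host
HEADERS = {"Authorization": "Bearer <console-session-token>"}   # placeholder auth
dataset_id, document_id = "<dataset-uuid>", "<document-uuid>"

# First row acts as the CSV header and is skipped; column 0 -> 'content', column 1 -> 'answer'.
csv_body = "content,answer\nWhat is a dataset segment?,A chunk of a document used for retrieval.\n"

upload = requests.post(
    f"{BASE}/datasets/{dataset_id}/documents/{document_id}/segments/batch_import",
    headers=HEADERS,
    files={"file": ("segments.csv", io.BytesIO(csv_body.encode("utf-8")), "text/csv")},
)
job_id = upload.json()["job_id"]

# Poll the Redis-backed status key until batch_create_segment_to_index_task moves it past 'waiting'.
status = requests.get(f"{BASE}/datasets/batch_import_status/{job_id}", headers=HEADERS)
print(status.json())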
api/core/generator/llm_generator.py

@@ -188,22 +188,8 @@ class LLMGenerator:
         return rule_config

     @classmethod
-    async def generate_qa_document(cls, llm: StreamableOpenAI, query):
-        prompt = GENERATOR_QA_PROMPT
-
-        if isinstance(llm, BaseChatModel):
-            prompt = [SystemMessage(content=prompt), HumanMessage(content=query)]
-
-        response = llm.generate([prompt])
-
-        answer = response.generations[0][0].text
-        return answer.strip()
-
-    @classmethod
-    def generate_qa_document_sync(cls, llm: StreamableOpenAI, query):
-        prompt = GENERATOR_QA_PROMPT
+    def generate_qa_document_sync(cls, llm: StreamableOpenAI, query: str, document_language: str):
+        prompt = GENERATOR_QA_PROMPT.format(language=document_language)

         if isinstance(llm, BaseChatModel):
             prompt = [SystemMessage(content=prompt), HumanMessage(content=query)]
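The reworked method now takes the target language explicitly instead of asking the model to mirror the input language. A minimal sketch of the prompt assembly, reusing the message classes the module already imports; the llm object itself is supplied by callers such as IndexingRunner and is not constructed here.

from langchain.schema import HumanMessage, SystemMessage

from core.prompt.prompts import GENERATOR_QA_PROMPT


def build_qa_messages(query: str, document_language: str = 'English'):
    # {language} is the placeholder added to GENERATOR_QA_PROMPT in this commit.
    prompt = GENERATOR_QA_PROMPT.format(language=document_language)
    # Chat models get a system/human pair, as generate_qa_document_sync does;
    # completion models would receive the formatted prompt as plain text instead.
    return [SystemMessage(content=prompt), HumanMessage(content=query)]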
api/core/indexing_runner.py

@@ -70,14 +70,6 @@ class IndexingRunner:
                 dataset_document=dataset_document,
                 processing_rule=processing_rule
             )
-
-            # new_documents = []
-            # for document in documents:
-            #     response = LLMGenerator.generate_qa_document(dataset.tenant_id, document.page_content)
-            #     document_qa_list = self.format_split_text(response)
-            #     for result in document_qa_list:
-            #         document = Document(page_content=result['question'], metadata={'source': result['answer']})
-            #         new_documents.append(document)
             # build index
             self._build_index(
                 dataset=dataset,
                 dataset_document=dataset_document,

@@ -228,7 +220,7 @@ class IndexingRunner:
         db.session.commit()

     def file_indexing_estimate(self, file_details: List[UploadFile], tmp_processing_rule: dict,
-                               doc_form: str = None) -> dict:
+                               doc_form: str = None, doc_language: str = 'English') -> dict:
         """
         Estimate the indexing for the document.
         """

@@ -268,7 +260,7 @@ class IndexingRunner:
                 model_name='gpt-3.5-turbo',
                 max_tokens=2000
             )
-            response = LLMGenerator.generate_qa_document_sync(llm, preview_texts[0])
+            response = LLMGenerator.generate_qa_document_sync(llm, preview_texts[0], doc_language)
             document_qa_list = self.format_split_text(response)
             return {
                 "total_segments": total_segments * 20,

@@ -287,7 +279,8 @@ class IndexingRunner:
             "preview": preview_texts
         }

-    def notion_indexing_estimate(self, notion_info_list: list, tmp_processing_rule: dict, doc_form: str = None) -> dict:
+    def notion_indexing_estimate(self, notion_info_list: list, tmp_processing_rule: dict, doc_form: str = None,
+                                 doc_language: str = 'English') -> dict:
         """
         Estimate the indexing for the document.
         """

@@ -345,7 +338,7 @@ class IndexingRunner:
                 model_name='gpt-3.5-turbo',
                 max_tokens=2000
             )
-            response = LLMGenerator.generate_qa_document_sync(llm, preview_texts[0])
+            response = LLMGenerator.generate_qa_document_sync(llm, preview_texts[0], doc_language)
             document_qa_list = self.format_split_text(response)
             return {
                 "total_segments": total_segments * 20,

@@ -452,7 +445,8 @@ class IndexingRunner:
             splitter=splitter,
             processing_rule=processing_rule,
             tenant_id=dataset.tenant_id,
-            document_form=dataset_document.doc_form
+            document_form=dataset_document.doc_form,
+            document_language=dataset_document.doc_language
         )

         # save node to document segment

@@ -489,7 +483,8 @@ class IndexingRunner:
         return documents

     def _split_to_documents(self, text_docs: List[Document], splitter: TextSplitter,
-                            processing_rule: DatasetProcessRule, tenant_id: str, document_form: str) -> List[Document]:
+                            processing_rule: DatasetProcessRule, tenant_id: str,
+                            document_form: str, document_language: str) -> List[Document]:
         """
         Split the text documents into nodes.
         """

@@ -523,7 +518,8 @@ class IndexingRunner:
             sub_documents = all_documents[i:i + 10]
             for doc in sub_documents:
                 document_format_thread = threading.Thread(target=self.format_qa_document, kwargs={
-                    'llm': llm, 'document_node': doc, 'all_qa_documents': all_qa_documents})
+                    'llm': llm, 'document_node': doc, 'all_qa_documents': all_qa_documents,
+                    'document_language': document_language})
                 threads.append(document_format_thread)
                 document_format_thread.start()
         for thread in threads:

@@ -531,13 +527,13 @@ class IndexingRunner:
             return all_qa_documents
         return all_documents

-    def format_qa_document(self, llm: StreamableOpenAI, document_node, all_qa_documents):
+    def format_qa_document(self, llm: StreamableOpenAI, document_node, all_qa_documents, document_language):
         format_documents = []
         if document_node.page_content is None or not document_node.page_content.strip():
             return
         try:
             # qa model document
-            response = LLMGenerator.generate_qa_document_sync(llm, document_node.page_content)
+            response = LLMGenerator.generate_qa_document_sync(llm, document_node.page_content, document_language)
             document_qa_list = self.format_split_text(response)
             qa_documents = []
             for result in document_qa_list:

@@ -716,6 +712,32 @@ class IndexingRunner:
         DocumentSegment.query.filter_by(document_id=dataset_document_id).update(update_params)

         db.session.commit()

+    def batch_add_segments(self, segments: List[DocumentSegment], dataset: Dataset):
+        """
+        Batch add segments index processing
+        """
+        documents = []
+        for segment in segments:
+            document = Document(
+                page_content=segment.content,
+                metadata={
+                    "doc_id": segment.index_node_id,
+                    "doc_hash": segment.index_node_hash,
+                    "document_id": segment.document_id,
+                    "dataset_id": segment.dataset_id,
+                }
+            )
+            documents.append(document)
+        # save vector index
+        index = IndexBuilder.get_index(dataset, 'high_quality')
+        if index:
+            index.add_texts(documents, duplicate_check=True)
+        # save keyword index
+        index = IndexBuilder.get_index(dataset, 'economy')
+        if index:
+            index.add_texts(documents)
+

 class DocumentIsPausedException(Exception):
     pass
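The new batch_add_segments helper is what backs item 1 of the commit message ("batch segment import") on the indexing side. A rough usage sketch follows; the lookup and the list of segment IDs are illustrative only, since the actual caller (batch_create_segment_to_index_task) is not part of this diff.

from typing import List

from core.indexing_runner import IndexingRunner
from models.dataset import Dataset, DocumentSegment


def index_imported_segments(dataset: Dataset, segment_ids: List[str]) -> None:
    # Illustrative lookup; the real task would already hold the freshly created segment rows.
    segments = DocumentSegment.query.filter(DocumentSegment.id.in_(segment_ids)).all()
    # Builds one Document per segment (doc_id/doc_hash/document_id/dataset_id metadata),
    # then writes the vector ('high_quality') and keyword ('economy') indexes in one pass.
    IndexingRunner().batch_add_segments(segments, dataset)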
api/core/prompt/prompts.py

@@ -44,13 +44,13 @@ SUGGESTED_QUESTIONS_AFTER_ANSWER_INSTRUCTION_PROMPT = (
 )

 GENERATOR_QA_PROMPT = (
-    "Please respond according to the language of the user's input text. If the text is in language [A], you must also reply in language [A].\n"
+    'The user will send a long text. Please think step by step.'
     'Step 1: Understand and summarize the main content of this text.\n'
     'Step 2: What key information or concepts are mentioned in this text?\n'
     'Step 3: Decompose or combine multiple pieces of information and concepts.\n'
     'Step 4: Generate 20 questions and answers based on these key information and concepts.'
     'The questions should be clear and detailed, and the answers should be detailed and complete.\n'
-    "Answer in the following format: Q1:\nA1:\nQ2:\nA2:...\n"
+    "Answer must be the language:{language} and in the following format: Q1:\nA1:\nQ2:\nA2:...\n"
 )

 RULE_CONFIG_GENERATE_TEMPLATE = """Given MY INTENDED AUDIENCES and HOPING TO SOLVE using a language model, please select \
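The prompt still asks for the Q1:/A1: layout, now in an explicitly requested language. Purely as an illustration of what consuming that output involves (the repository's own parser, IndexingRunner.format_split_text, is not shown in this diff), a regex-based split might look like this:

import re


def split_qa_pairs(text: str):
    # Matches 'Qn: ... An: ...' blocks produced by the answer format the prompt requests.
    pattern = r"Q\d+:\s*(.*?)\s*A\d+:\s*(.*?)(?=\s*Q\d+:|\Z)"
    return [
        {"question": question.strip(), "answer": answer.strip()}
        for question, answer in re.findall(pattern, text, flags=re.DOTALL)
    ]


print(split_qa_pairs("Q1: What does the text cover?\nA1: The main concepts.\nQ2: ...\nA2: ..."))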
api/models/dataset.py

@@ -208,6 +208,7 @@ class Document(db.Model):
     doc_metadata = db.Column(db.JSON, nullable=True)
     doc_form = db.Column(db.String(
         255), nullable=False, server_default=db.text("'text_model'::character varying"))
+    doc_language = db.Column(db.String(255), nullable=True)

     DATA_SOURCES = ['upload_file', 'notion_import']
api/requirements.txt

@@ -40,4 +40,5 @@ newspaper3k==0.2.8
 google-api-python-client==2.90.0
 wikipedia==1.4.0
 readabilipy==0.2.0
-google-search-results==2.4.2
\ No newline at end of file
+google-search-results==2.4.2
+pandas==1.5.3
\ No newline at end of file
api/services/dataset_service.py

@@ -32,8 +32,9 @@ from tasks.document_indexing_task import document_indexing_task
 from tasks.document_indexing_update_task import document_indexing_update_task
 from tasks.create_segment_to_index_task import create_segment_to_index_task
 from tasks.update_segment_index_task import update_segment_index_task
-from tasks.update_segment_keyword_index_task \
-    import update_segment_keyword_index_task
+from tasks.recover_document_indexing_task import recover_document_indexing_task
+from tasks.update_segment_keyword_index_task import update_segment_keyword_index_task
+from tasks.delete_segment_from_index_task import delete_segment_from_index_task


 class DatasetService:

@@ -373,7 +374,7 @@ class DocumentService:
         indexing_cache_key = 'document_{}_is_paused'.format(document.id)
         redis_client.delete(indexing_cache_key)
         # trigger async task
-        document_indexing_task.delay(document.dataset_id, document.id)
+        recover_document_indexing_task.delay(document.dataset_id, document.id)

     @staticmethod
     def get_documents_position(dataset_id):

@@ -451,6 +452,7 @@ class DocumentService:
                 document = DocumentService.save_document(dataset, dataset_process_rule.id,
                                                          document_data["data_source"]["type"],
                                                          document_data["doc_form"],
+                                                         document_data["doc_language"],
                                                          data_source_info, created_from, position,
                                                          account, file_name, batch)
                 db.session.add(document)

@@ -496,6 +498,7 @@ class DocumentService:
                         document = DocumentService.save_document(dataset, dataset_process_rule.id,
                                                                  document_data["data_source"]["type"],
                                                                  document_data["doc_form"],
+                                                                 document_data["doc_language"],
                                                                  data_source_info, created_from, position,
                                                                  account, page['page_name'], batch)
                         db.session.add(document)

@@ -511,14 +514,15 @@ class DocumentService:
         db.session.commit()

         # trigger async task
-        document_indexing_task.delay(dataset.id, document_ids, document_data['doc_language'])
+        document_indexing_task.delay(dataset.id, document_ids)

         return documents, batch

     @staticmethod
     def save_document(dataset: Dataset, process_rule_id: str, data_source_type: str, document_form: str,
-                      data_source_info: dict, created_from: str, position: int, account: Account, name: str,
-                      batch: str):
+                      document_language: str, data_source_info: dict, created_from: str, position: int,
+                      account: Account, name: str,
+                      batch: str):
         document = Document(
             tenant_id=dataset.tenant_id,
             dataset_id=dataset.id,

@@ -530,7 +534,8 @@ class DocumentService:
             name=name,
             created_from=created_from,
             created_by=account.id,
-            doc_form=document_form
+            doc_form=document_form,
+            doc_language=document_language
         )

         return document

@@ -927,3 +932,17 @@ class SegmentService:
         redis_client.setex(indexing_cache_key, 600, 1)
         update_segment_index_task.delay(segment.id, args['keywords'])
         return segment

+    @classmethod
+    def delete_segment(cls, segment: DocumentSegment, document: Document, dataset: Dataset):
+        indexing_cache_key = 'segment_{}_delete_indexing'.format(segment.id)
+        cache_result = redis_client.get(indexing_cache_key)
+        if cache_result is not None:
+            raise ValueError("Segment is deleting.")
+        # send delete segment index task
+        redis_client.setex(indexing_cache_key, 600, 1)
+        # enabled segment need to delete index
+        if segment.enabled:
+            delete_segment_from_index_task.delay(segment.id, segment.index_node_id, dataset.id, document.id)
+        db.session.delete(segment)
+        db.session.commit()
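SegmentService.delete_segment reuses the same Redis guard this commit applies elsewhere (setex with a 600-second TTL to block duplicate submissions while an async task runs). A standalone sketch of that idiom, with a hypothetical helper name and message, not code from the repository:

from extensions.ext_redis import redis_client


def guard_once(cache_key: str, ttl_seconds: int = 600) -> None:
    # Hypothetical helper illustrating the guard: refuse re-entry while a prior
    # request for the same resource is still within its TTL window.
    if redis_client.get(cache_key) is not None:
        raise ValueError("Operation already in progress.")
    redis_client.setex(cache_key, ttl_seconds, 1)


# e.g. guard_once('segment_{}_delete_indexing'.format(segment.id)) before queueing
# delete_segment_from_index_task, mirroring delete_segment above.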
api/tasks/remove_segment_from_index_task.py → api/tasks/disable_segment_from_index_task.py

@@ -12,14 +12,14 @@ from models.dataset import DocumentSegment
 @shared_task
-def remove_segment_from_index_task(segment_id: str):
+def disable_segment_from_index_task(segment_id: str):
     """
-    Async Remove segment from index
+    Async disable segment from index
     :param segment_id:

-    Usage: remove_segment_from_index.delay(segment_id)
+    Usage: disable_segment_from_index_task.delay(segment_id)
     """
-    logging.info(click.style('Start remove segment from index: {}'.format(segment_id), fg='green'))
+    logging.info(click.style('Start disable segment from index: {}'.format(segment_id), fg='green'))
     start_at = time.perf_counter()

     segment = db.session.query(DocumentSegment).filter(DocumentSegment.id == segment_id).first()