Commit 4c596272 authored by jyong

fix clean dataset task

parent 90b22d8c
import datetime
import re
from os import environ
from uuid import uuid4
import openai
from langchain.document_loaders import WebBaseLoader, UnstructuredFileLoader, TextLoader
from langchain.embeddings import OpenAIEmbeddings, MiniMaxEmbeddings
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from pymilvus import connections, Collection
from pymilvus.orm import utility
from core.data_loader.loader.excel import ExcelLoader
from core.generator.llm_generator import LLMGenerator
from core.index.vector_index.milvus import Milvus
OPENAI_API_KEY = "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" # example: "sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
## Set up environment variables
environ["OPENAI_API_KEY"] = OPENAI_API_KEY
environ["MINIMAX_GROUP_ID"] = "1686736670459291"
environ[
"MINIMAX_API_KEY"] = "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJOYW1lIjoiIiwiU3ViamVjdElEIjoiMTY4NjczNjY3MDQ0NzEyNSIsIlBob25lIjoiTVRVd01UZzBNREU1TlRFPSIsIkdyb3VwSUQiOiIiLCJQYWdlTmFtZSI6IiIsIk1haWwiOiJwYW5wYW5AZGlmeS5haSIsIkNyZWF0ZVRpbWUiOiIiLCJpc3MiOiJtaW5pbWF4In0.i9gRKYmOW3zM8vEcT7lD-Ym-0eE6UUU3vb-gVxpWfSMkdc6ObbRnkP5nYumZJbV9L-yRA00GW6nMWYcWkY3IbDWWFAi-hRmzAtl-orpkz5DxPzjRJbwAPy9snYlqBWYQ4hOQ-53zmA5wgsm0ga5pMpBTN9SCkm7EnBQDEsPEY1m121tuwXe6LhAMjdX0Kic-UI-KTYbDdWGAl6nu8h8lrSHVuEEYA6Lz3VDyJTcYfME-B435vw-x1UXSb5-V-YhMEhIixEO8ezUQXaERq0mErtIQEoZN4r7OeNNGjocsfwiHRiw_EdxbfYUWjpvAytmmekIuL3tfvfhbif-EZc4E5w"
CONVERSATION_PROMPT = (
"你是出题人.\n"
"用户会发送一段长文本.\n请一步一步思考"
'Step1:了解并总结这段文本的主要内容\n'
'Step2:这段文本提到了哪些关键信息或概念\n'
'Step3:可分解或结合多个信息与概念\n'
'Step4:将这些关键信息与概念生成 10 个问题与答案,问题描述清楚并且详细完整,答案详细完整.\n'
"按格式回答: Q1:\nA1:\nQ2:\nA2:...\n"
)
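# (The Chinese prompt above tells the model it is a question setter: read the long text the user
# sends, think step by step — summarize it, extract its key facts and concepts, combine them —
# and generate 10 detailed question/answer pairs, returned in the format "Q1:\nA1:\nQ2:\nA2:...".)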
def test_milvus():
    def format_split_text(text):
        regex = r"Q\d+:\s*(.*?)\s*A\d+:\s*([\s\S]*?)(?=Q|$)"  # regex matching each Q/A pair
        matches = re.findall(regex, text, re.MULTILINE)  # collect all matches
        result = []  # final list of question/answer dicts
        for match in matches:
            q = match[0]
            a = match[1]
            if q and a:
                # only keep pairs where both the question and the answer are present
                result.append({
                    "question": q,
                    "answer": re.sub(r"\n\s*", "\n", a.strip())
                })
        return result
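    # Illustrative example: "Q1: What is Milvus?\nA1: A vector database.\nQ2: ..." is parsed by
    # format_split_text above into [{'question': 'What is Milvus?', 'answer': 'A vector database.'}, ...]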
    # 84b2202c-c359-46b7-a810-bce50feaa4d1
    # Use the WebBaseLoader to load specified web pages into documents
    # loader = WebBaseLoader([
    #     "https://milvus.io/docs/overview.md",
    # ])
    loader = ExcelLoader('/Users/jiangyong/Downloads/xiaoming.xlsx')
    # loader = TextLoader('/Users/jiangyong/Downloads/all.txt', autodetect_encoding=True)
    # loader = UnstructuredFileLoader('/Users/jiangyong/Downloads/douban.xlsx')
    docs = loader.load()

    # Split the documents into smaller chunks
    text_splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
    docs = text_splitter.split_documents(docs)

    new_docs = []
    for doc in docs:
        openai.api_key = "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"
        response = openai.ChatCompletion.create(
            model='gpt-3.5-turbo',
            messages=[
                {
                    'role': 'system',
                    'content': CONVERSATION_PROMPT
                },
                {
                    'role': 'user',
                    'content': doc.page_content
                }
            ],
            temperature=0,
            stream=False,  # no streaming: take the full completion in one response
            n=1,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        # response = LLMGenerator.generate_qa_document('84b2202c-c359-46b7-a810-bce50feaa4d1', doc.page_content)
        results = format_split_text(response['choices'][0]['message']['content'])
        print(results)
        # for result in results:
        #     document = Document(page_content=result['question'], metadata={'source': result['answer']})
        #     new_docs.append(document)
        # new_docs.append(doc)

    # embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
    embeddings = MiniMaxEmbeddings()
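    # MiniMaxEmbeddings reads MINIMAX_GROUP_ID / MINIMAX_API_KEY from the environment variables set at the top of this file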
# cont = connections.connect(
# alias="default",
# user='username',
# password='password',
# host='localhost',
# port='19530'
# )
# chunk_size = 100
# for i in range(0, len(new_docs), chunk_size):
# # check document is paused
# chunk_documents = new_docs[i:i + chunk_size]
# vector_store = Milvus.from_documents(
# chunk_documents,
# collection_name='jytest5',
# embedding=embeddings,
# connection_args={"uri": 'https://in01-706333b4f51fa0b.aws-us-west-2.vectordb.zillizcloud.com:19530',
# 'user': 'db_admin', 'password': 'dify123456!'}
# )
# collection = Collection("jytest4") # Get an existing collection.
# collection.release()
# print(datetime.datetime.utcnow())
# alias = uuid4().hex
# # #connection_args = {"host": 'localhost', "port": '19530'}
# connection_args = {"uri": 'https://in01-91c80c04f4aed06.aws-us-west-2.vectordb.zillizcloud.com:19530',
# 'user': 'db_admin', 'password': 'dify123456!'}
# connections.connect(alias=alias, **connection_args)
# connection = Collection(
# 'jytest10',
# using=alias,
# )
# print(datetime.datetime.utcnow())
# # connection.release()
# query = '阿甘正传'
# search_params = {"metric_type": "IP", "params": {"level": 2}}
# docs = Milvus(embedding_function=embeddings, collection_name='jytest4').similarity_search(query)
# docs = Milvus(embedding_function=embeddings, collection_name='jytest', connection_args={"uri": 'https://in01-706333b4f51fa0b.aws-us-west-2.vectordb.zillizcloud.com:19530',
# 'user': 'db_admin', 'password': 'dify123456!'}).similarity_search(query)
# docs = Milvus(embedding_function=embeddings, collection_name='jytest10', connection_args={"uri": 'https://in01-91c80c04f4aed06.aws-us-west-2.vectordb.zillizcloud.com:19530',
# 'token': '01a3da355f5645fe949b1c6e97339c90b1931b6726094fcac3dd0594ab6312eb4ea314095ca989d7dfc8abfac1092dd1a6d46017', 'db_name':'dify'}).similarity_search(query)
# print(datetime.datetime.utcnow())
# docs = vector_store.similarity_search(query)
# cont = connections.connect(
# alias="default",
# user='username',
# password='password',
# host='localhost',
# port='19530'
# )
# docs = cont.search(query='What is milvus?', search_type='similarity',
# connection_args={"host": 'localhost', "port": '19530'})
# docs = vector_store.similarity_search(query)
# print(docs)
# connections.connect("default",
# uri='https://in01-617651a0cb211be.aws-us-west-2.vectordb.zillizcloud.com:19533',
# user='db_admin',
# password='dify123456!')
#
# # Check if the collection exists
# collection_name = "jytest"
# check_collection = utility.has_collection(collection_name)
# if check_collection:
# drop_result = utility.drop_collection(collection_name)
# print("Success!")
# collection = Collection(name=collection_name)
# collection.
# search_params = {"metric_type": "L2", "params": {"level": 2}}
# results = collection.search('电影排名50',
# anns_field='page_content',
# param=search_params,
# limit=1,
# guarantee_timestamp=1)
# connections.disconnect("default")
import numpy as np
from numpy import average
from sentence_transformers import SentenceTransformer
def test_embdding():
    sentences = ["My name is john"]
    model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
    embeddings = model.encode(sentences)
    for embedding in embeddings:
        print(embedding)
        # L2-normalize the embedding; convert back to an array before the second pass,
        # since tolist() returns a plain Python list that cannot be divided by a float.
        # Normalizing an already-unit vector is a no-op up to floating point error.
        embedding = (embedding / np.linalg.norm(embedding)).tolist()
        print(embedding)
        embedding = (np.array(embedding) / np.linalg.norm(embedding)).tolist()
        print(embedding)
    print(embeddings)
import base64
import binascii
import hashlib
import secrets
from os import environ
import numpy as np
from langchain.embeddings import MiniMaxEmbeddings
from numpy import average
from sentence_transformers import SentenceTransformer
from core.index.vector_index.milvus import Milvus
environ["MINIMAX_GROUP_ID"] = "1686736670459291"
environ["MINIMAX_API_KEY"] = "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJOYW1lIjoiIiwiU3ViamVjdElEIjoiMTY4NjczNjY3MDQ0NzEyNSIsIlBob25lIjoiTVRVd01UZzBNREU1TlRFPSIsIkdyb3VwSUQiOiIiLCJQYWdlTmFtZSI6IiIsIk1haWwiOiJwYW5wYW5AZGlmeS5haSIsIkNyZWF0ZVRpbWUiOiIiLCJpc3MiOiJtaW5pbWF4In0.i9gRKYmOW3zM8vEcT7lD-Ym-0eE6UUU3vb-gVxpWfSMkdc6ObbRnkP5nYumZJbV9L-yRA00GW6nMWYcWkY3IbDWWFAi-hRmzAtl-orpkz5DxPzjRJbwAPy9snYlqBWYQ4hOQ-53zmA5wgsm0ga5pMpBTN9SCkm7EnBQDEsPEY1m121tuwXe6LhAMjdX0Kic-UI-KTYbDdWGAl6nu8h8lrSHVuEEYA6Lz3VDyJTcYfME-B435vw-x1UXSb5-V-YhMEhIixEO8ezUQXaERq0mErtIQEoZN4r7OeNNGjocsfwiHRiw_EdxbfYUWjpvAytmmekIuL3tfvfhbif-EZc4E5w"
def test_query():
    # embeddings = MiniMaxEmbeddings()
    # query = '你对这部电影有什么感悟'
    # # search_params = {"metric_type": "IP", "params": {"level": 2}}
    # # docs = Milvus(embedding_function=embeddings, collection_name='jytest4').similarity_search(query)
    # docs = Milvus(embedding_function=embeddings, collection_name='jytest5',
    #               connection_args={"uri": 'https://in01-706333b4f51fa0b.aws-us-west-2.vectordb.zillizcloud.com:19530',
    #                                'user': 'db_admin', 'password': 'dify123456!'}).similarity_search(query)
    # print(docs)

    def hash_password(password_str, salt_byte):
        # derive a PBKDF2-HMAC-SHA256 digest of the password (10000 iterations), hex-encoded
        dk = hashlib.pbkdf2_hmac('sha256', password_str.encode('utf-8'), salt_byte, 10000)
        return binascii.hexlify(dk)

    # generate a random password salt
    salt = secrets.token_bytes(16)
    base64_salt = base64.b64encode(salt).decode()

    # encrypt the password with the salt
    password_hashed = hash_password('dify123456!', salt)
    base64_password_hashed = base64.b64encode(password_hashed).decode()
    print(base64_password_hashed)
    print('*******************')
    print(base64_salt)
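    # Verification sketch (assuming the salt is stored alongside the hash): recompute and compare, e.g.
    # assert base64.b64encode(hash_password('dify123456!', base64.b64decode(base64_salt))).decode() == base64_password_hashed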
@@ -178,7 +178,7 @@ class LLMGenerator:
         llm: StreamableOpenAI = LLMBuilder.to_llm(
             tenant_id=tenant_id,
             model_name='gpt-3.5-turbo',
-            max_tokens=1000
+            max_tokens=100
         )
         if isinstance(llm, BaseChatModel):
from typing import Optional, cast
import requests
import weaviate
from langchain.embeddings.base import Embeddings
from langchain.schema import Document, BaseRetriever
from langchain.vectorstores import VectorStore
from pydantic import BaseModel, root_validator
from core.index.base import BaseIndex
from core.index.vector_index.base import BaseVectorIndex
from core.vector_store.weaviate_vector_store import WeaviateVectorStore
from models.dataset import Dataset
class MilvusConfig(BaseModel):
    uri: str
    username: Optional[str]
    password: Optional[str]
    batch_size: int = 100

    @root_validator()
    def validate_config(cls, values: dict) -> dict:
        if not values['uri']:
            raise ValueError("config Milvus uri is required")
        return values
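# Example (hypothetical values): a config for a Milvus / Zilliz Cloud endpoint
# config = MilvusConfig(uri='https://<instance>.vectordb.zillizcloud.com:19530',
#                       username='db_admin', password='***', batch_size=100)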
class MilvusVectorIndex(BaseVectorIndex):
    def __init__(self, dataset: Dataset, config: MilvusConfig, embeddings: Embeddings):
        super().__init__(dataset, embeddings)
        self._client = self._init_client(config)

    def _init_client(self, config: MilvusConfig) -> weaviate.Client:
        auth_config = weaviate.auth.AuthApiKey(api_key=config.api_key)

        weaviate.connect.connection.has_grpc = False

        try:
            client = weaviate.Client(
                url=config.endpoint,
                auth_client_secret=auth_config,
                timeout_config=(5, 60),
                startup_period=None
            )
        except requests.exceptions.ConnectionError:
            raise ConnectionError("Vector database connection error")

        client.batch.configure(
            # `batch_size` takes an `int` value to enable auto-batching
            # (`None` is used for manual batching)
            batch_size=config.batch_size,
            # dynamically update the `batch_size` based on import speed
            dynamic=True,
            # `timeout_retries` takes an `int` value to retry on time outs
            timeout_retries=3,
        )

        return client
    def get_type(self) -> str:
        return 'weaviate'

    def get_index_name(self, dataset: Dataset) -> str:
        if self.dataset.index_struct_dict:
            class_prefix: str = self.dataset.index_struct_dict['vector_store']['class_prefix']
            if not class_prefix.endswith('_Node'):
                # original class_prefix
                class_prefix += '_Node'
            return class_prefix

        dataset_id = dataset.id
        return "Vector_index_" + dataset_id.replace("-", "_") + '_Node'

    def to_index_struct(self) -> dict:
        return {
            "type": self.get_type(),
            "vector_store": {"class_prefix": self.get_index_name(self.dataset)}
        }

    def create(self, texts: list[Document], **kwargs) -> BaseIndex:
        uuids = self._get_uuids(texts)
        self._vector_store = WeaviateVectorStore.from_documents(
            texts,
            self._embeddings,
            client=self._client,
            index_name=self.get_index_name(self.dataset),
            uuids=uuids,
            by_text=False
        )

        return self

    def _get_vector_store(self) -> VectorStore:
        """Only for created index."""
        if self._vector_store:
            return self._vector_store

        attributes = ['doc_id', 'dataset_id', 'document_id']
        if self._is_origin():
            attributes = ['doc_id']

        return WeaviateVectorStore(
            client=self._client,
            index_name=self.get_index_name(self.dataset),
            text_key='text',
            embedding=self._embeddings,
            attributes=attributes,
            by_text=False
        )

    def _get_vector_store_class(self) -> type:
        return WeaviateVectorStore

    def delete_by_document_id(self, document_id: str):
        if self._is_origin():
            self.recreate_dataset(self.dataset)
            return

        vector_store = self._get_vector_store()
        vector_store = cast(self._get_vector_store_class(), vector_store)

        vector_store.del_texts({
            "operator": "Equal",
            "path": ["document_id"],
            "valueText": document_id
        })

    def _is_origin(self):
        if self.dataset.index_struct_dict:
            class_prefix: str = self.dataset.index_struct_dict['vector_store']['class_prefix']
            if not class_prefix.endswith('_Node'):
                # original class_prefix
                return True

        return False
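# Construction sketch based on the signatures above (hypothetical objects; note that _init_client
# still expects Weaviate-style `endpoint`/`api_key` fields, so this is illustrative only):
# index = MilvusVectorIndex(dataset=dataset, config=config, embeddings=embeddings)
# index.create(documents)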
import numpy as np
import sklearn.decomposition
import pickle
import time
# Apply 'Algorithm 1' to the ada-002 embeddings to make them isotropic, taken from the paper:
# ALL-BUT-THE-TOP: SIMPLE AND EFFECTIVE POST-PROCESSING FOR WORD REPRESENTATIONS
# Jiaqi Mu, Pramod Viswanath
# This uses Principal Component Analysis (PCA) to 'evenly distribute' the embedding vectors (make them isotropic)
# For more information on PCA, see https://jamesmccaffrey.wordpress.com/2021/07/16/computing-pca-using-numpy-without-scikit/
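# Concretely, for each embedding v the code below computes
#   v' = normalize( (v - mu) - sum_{i=1..D} (u_i . v) * u_i )
# where mu is the mean embedding and u_1..u_D are the top-D principal components.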
# get the file pointer of the pickle containing the embeddings
fp = open('/path/to/your/data/Embedding-Latest.pkl', 'rb')
# the embedding data here is a dict consisting of key / value pairs
# the key is the hash of the message (SHA3-256), the value is the embedding from ada-002 (array of dimension 1536)
# the hash can be used to lookup the original text in a database
E = pickle.load(fp) # load the data into memory
# separate the keys (hashes) and values (embeddings) into separate vectors
K = list(E.keys()) # vector of all the hash values
X = np.array(list(E.values())) # vector of all the embeddings, converted to numpy arrays
# list the total number of embeddings
# this can be truncated if there are too many embeddings to do PCA on
print(f"Total number of embeddings: {len(X)}")
# get dimension of embeddings, used later
Dim = len(X[0])
# flash out the first few embeddings
print("First two embeddings are: ")
print(X[0])
print(f"First embedding length: {len(X[0])}")
print(X[1])
print(f"Second embedding length: {len(X[1])}")
# compute the mean of all the embeddings, and flash the result
mu = np.mean(X, axis=0) # same as mu in paper
print(f"Mean embedding vector: {mu}")
print(f"Mean embedding vector length: {len(mu)}")
# subtract the mean vector from each embedding vector ... vectorized in numpy
X_tilde = X - mu # same as v_tilde(w) in paper
# do the heavy lifting of extracting the principal components
# note that this is a function of the embeddings you currently have here, and this set may grow over time
# therefore the PCA basis vectors may change over time, and your final isotropic embeddings may drift over time
# but the drift should stabilize after you have extracted enough embedding data to characterize the nature of the embedding engine
print(f"Performing PCA on the normalized embeddings ...")
pca = sklearn.decomposition.PCA() # new object
TICK = time.time() # start timer
pca.fit(X_tilde) # do the heavy lifting!
TOCK = time.time() # end timer
DELTA = TOCK - TICK
print(f"PCA finished in {DELTA} seconds ...")
# dimensional reduction stage (the only hyperparameter)
# pick max dimension of PCA components to express embeddings
# in general this is some integer less than or equal to the dimension of your embeddings
# it could be set as a high percentile, say 95th percentile of pca.explained_variance_ratio_
# but just hardcoding a constant here
D = 15 # hyperparameter on dimension (out of 1536 for ada-002), paper recommends D = Dim/100
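# (for ada-002: Dim = 1536, and 1536/100 ≈ 15, hence D = 15)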
# form the set of v_prime(w), which is the final embedding
# this could be vectorized in numpy to speed it up, but coding it directly here in a double for-loop to avoid errors and to be transparent
E_prime = dict() # output dict of the new embeddings
N = len(X_tilde)
N10 = round(N/10)
U = pca.components_ # set of PCA basis vectors, sorted by most significant to least significant
print(f"Shape of full set of PCA componenents {U.shape}")
U = U[0:D,:] # take the top D dimensions (or take them all if D is the size of the embedding vector)
print(f"Shape of downselected PCA componenents {U.shape}")
for ii in range(N):
    v_tilde = X_tilde[ii]
    v = X[ii]
    v_projection = np.zeros(Dim) # start to build the projection
    # project the original embedding onto the PCA basis vectors, use only first D dimensions
    for jj in range(D):
        u_jj = U[jj,:] # vector
        v_jj = np.dot(u_jj,v) # scalar
        v_projection += v_jj*u_jj # vector
    v_prime = v_tilde - v_projection # final embedding vector
    v_prime = v_prime/np.linalg.norm(v_prime) # create unit vector
    E_prime[K[ii]] = v_prime
    if (ii%N10 == 0) or (ii == N-1):
        print(f"Finished with {ii+1} embeddings out of {N} ({round(100*ii/N)}% done)")
# save as new pickle
print("Saving new pickle ...")
embeddingName = '/path/to/your/data/Embedding-Latest-Isotropic.pkl'
with open(embeddingName, 'wb') as f:  # Python 3: open(..., 'wb')
    pickle.dump([E_prime,mu,U], f)
print(embeddingName)
print("Done!")
# When working with live data with a new embedding from ada-002, be sure to transform it first with this function before comparing it
def projectEmbedding(v,mu,U):
    v = np.array(v)
    v_tilde = v - mu
    v_projection = np.zeros(len(v)) # start to build the projection
    # project the original embedding onto the PCA basis vectors, use only first D dimensions
    for u in U:
        v_jj = np.dot(u,v) # scalar
        v_projection += v_jj*u # vector
    v_prime = v_tilde - v_projection # final embedding vector
    v_prime = v_prime/np.linalg.norm(v_prime) # create unit vector
    return v_prime
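# Usage sketch (hypothetical variable name `new_embedding`): reload the saved pickle and
# project a fresh ada-002 embedding before comparing it with the isotropic set
# with open('/path/to/your/data/Embedding-Latest-Isotropic.pkl', 'rb') as f:
#     E_prime, mu, U = pickle.load(f)
# new_embedding_isotropic = projectEmbedding(new_embedding, mu, U)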
@@ -71,18 +71,18 @@ class IndexingRunner:
                     dataset_document=dataset_document,
                     processing_rule=processing_rule
                 )
-                new_documents = []
-                for document in documents:
-                    response = LLMGenerator.generate_qa_document(dataset.tenant_id, document.page_content)
-                    document_qa_list = self.format_split_text(response)
-                    for result in document_qa_list:
-                        document = Document(page_content=result['question'], metadata={'source': result['answer']})
-                        new_documents.append(document)
+                # new_documents = []
+                # for document in documents:
+                #     response = LLMGenerator.generate_qa_document(dataset.tenant_id, document.page_content)
+                #     document_qa_list = self.format_split_text(response)
+                #     for result in document_qa_list:
+                #         document = Document(page_content=result['question'], metadata={'source': result['answer']})
+                #         new_documents.append(document)
                 # build index
                 self._build_index(
                     dataset=dataset,
                     dataset_document=dataset_document,
-                    documents=new_documents
+                    documents=documents
                 )
             except DocumentIsPausedException:
                 raise DocumentIsPausedException('Document paused, document id: {}'.format(dataset_document.id))
@@ -251,7 +251,8 @@ class IndexingRunner:
             documents = self._split_to_documents(
                 text_docs=text_docs,
                 splitter=splitter,
-                processing_rule=processing_rule
+                processing_rule=processing_rule,
+                tenant_id='84b2202c-c359-46b7-a810-bce50feaa4d1'
             )
             total_segments += len(documents)
             for document in documents:
@@ -311,7 +312,8 @@ class IndexingRunner:
             documents = self._split_to_documents(
                 text_docs=documents,
                 splitter=splitter,
-                processing_rule=processing_rule
+                processing_rule=processing_rule,
+                tenant_id='84b2202c-c359-46b7-a810-bce50feaa4d1'
             )
             total_segments += len(documents)
             for document in documents:
@@ -414,7 +416,8 @@ class IndexingRunner:
             documents = self._split_to_documents(
                 text_docs=text_docs,
                 splitter=splitter,
-                processing_rule=processing_rule
+                processing_rule=processing_rule,
+                tenant_id=dataset.tenant_id
             )
             # save node to document segment
@@ -469,18 +472,18 @@ class IndexingRunner:
                 if document.page_content is None or not document.page_content.strip():
                     continue
-                response = LLMGenerator.generate_qa_document(processing_rule.tenant_id, document.page_content)
+                response = LLMGenerator.generate_qa_document(tenant_id, document.page_content)
                 document_qa_list = self.format_split_text(response)
+                qa_documents = []
                 for result in document_qa_list:
                     document = Document(page_content=result['question'], metadata={'source': result['answer']})
-                    new_documents.append(document)
-                doc_id = str(uuid.uuid4())
-                hash = helper.generate_text_hash(document.page_content)
-                document.metadata['doc_id'] = doc_id
-                document.metadata['doc_hash'] = hash
+                    doc_id = str(uuid.uuid4())
+                    hash = helper.generate_text_hash(document.page_content)
-                split_documents.append(document)
+                    document.metadata['doc_id'] = doc_id
+                    document.metadata['doc_hash'] = hash
+                    qa_documents.append(document)
+                split_documents.extend(qa_documents)
             all_documents.extend(split_documents)
@@ -51,6 +51,7 @@ GENERATOR_QA_PROMPT = (
     'Step3:可分解或结合多个信息与概念\n'
     'Step4:将这些关键信息与概念生成 10 个问题与答案,问题描述清楚并且详细完整,答案详细完整.\n'
     "按格式回答: Q1:\nA1:\nQ2:\nA2:...\n"
+    "只输出Step4中的内容"
 )
 RULE_CONFIG_GENERATE_TEMPLATE = """Given MY INTENDED AUDIENCES and HOPING TO SOLVE using a language model, please select \