Commit 4c596272 authored by jyong

fix clean dataset task

parent 90b22d8c
import datetime
import re
from os import environ
from uuid import uuid4
import openai
from langchain.document_loaders import WebBaseLoader, UnstructuredFileLoader, TextLoader
from langchain.embeddings import OpenAIEmbeddings, MiniMaxEmbeddings
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from pymilvus import connections, Collection
from pymilvus.orm import utility
from core.data_loader.loader.excel import ExcelLoader
from core.generator.llm_generator import LLMGenerator
from core.index.vector_index.milvus import Milvus
OPENAI_API_KEY = "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" # example: "sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
## Set up environment variables
environ["OPENAI_API_KEY"] = OPENAI_API_KEY
environ["MINIMAX_GROUP_ID"] = "1686736670459291"
environ[
"MINIMAX_API_KEY"] = "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJOYW1lIjoiIiwiU3ViamVjdElEIjoiMTY4NjczNjY3MDQ0NzEyNSIsIlBob25lIjoiTVRVd01UZzBNREU1TlRFPSIsIkdyb3VwSUQiOiIiLCJQYWdlTmFtZSI6IiIsIk1haWwiOiJwYW5wYW5AZGlmeS5haSIsIkNyZWF0ZVRpbWUiOiIiLCJpc3MiOiJtaW5pbWF4In0.i9gRKYmOW3zM8vEcT7lD-Ym-0eE6UUU3vb-gVxpWfSMkdc6ObbRnkP5nYumZJbV9L-yRA00GW6nMWYcWkY3IbDWWFAi-hRmzAtl-orpkz5DxPzjRJbwAPy9snYlqBWYQ4hOQ-53zmA5wgsm0ga5pMpBTN9SCkm7EnBQDEsPEY1m121tuwXe6LhAMjdX0Kic-UI-KTYbDdWGAl6nu8h8lrSHVuEEYA6Lz3VDyJTcYfME-B435vw-x1UXSb5-V-YhMEhIixEO8ezUQXaERq0mErtIQEoZN4r7OeNNGjocsfwiHRiw_EdxbfYUWjpvAytmmekIuL3tfvfhbif-EZc4E5w"
CONVERSATION_PROMPT = (
"你是出题人.\n"
"用户会发送一段长文本.\n请一步一步思考"
'Step1:了解并总结这段文本的主要内容\n'
'Step2:这段文本提到了哪些关键信息或概念\n'
'Step3:可分解或结合多个信息与概念\n'
'Step4:将这些关键信息与概念生成 10 个问题与答案,问题描述清楚并且详细完整,答案详细完整.\n'
"按格式回答: Q1:\nA1:\nQ2:\nA2:...\n"
)
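# (The Chinese prompt above tells the model it is a question setter: read the long text the user
# sends, think step by step — summarize it, extract its key facts and concepts, combine them —
# and generate 10 detailed question/answer pairs, returned in the format "Q1:\nA1:\nQ2:\nA2:...".)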
def test_milvus():
    def format_split_text(text):
        regex = r"Q\d+:\s*(.*?)\s*A\d+:\s*([\s\S]*?)(?=Q|$)"  # regex matching each Q/A pair
        matches = re.findall(regex, text, re.MULTILINE)  # collect all matches
        result = []  # final list of question/answer dicts
        for match in matches:
            q = match[0]
            a = match[1]
            if q and a:
                # only keep pairs where both the question and the answer are present
                result.append({
                    "question": q,
                    "answer": re.sub(r"\n\s*", "\n", a.strip())
                })
        return result
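    # Illustrative example: "Q1: What is Milvus?\nA1: A vector database.\nQ2: ..." is parsed by
    # format_split_text above into [{'question': 'What is Milvus?', 'answer': 'A vector database.'}, ...]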
    # 84b2202c-c359-46b7-a810-bce50feaa4d1
    # Use the WebBaseLoader to load specified web pages into documents
    # loader = WebBaseLoader([
    #     "https://milvus.io/docs/overview.md",
    # ])
    loader = ExcelLoader('/Users/jiangyong/Downloads/xiaoming.xlsx')
    # loader = TextLoader('/Users/jiangyong/Downloads/all.txt', autodetect_encoding=True)
    # loader = UnstructuredFileLoader('/Users/jiangyong/Downloads/douban.xlsx')
    docs = loader.load()

    # Split the documents into smaller chunks
    text_splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
    docs = text_splitter.split_documents(docs)

    new_docs = []
    for doc in docs:
        openai.api_key = "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"
        response = openai.ChatCompletion.create(
            model='gpt-3.5-turbo',
            messages=[
                {
                    'role': 'system',
                    'content': CONVERSATION_PROMPT
                },
                {
                    'role': 'user',
                    'content': doc.page_content
                }
            ],
            temperature=0,
            stream=False,  # no streaming: take the full completion in one response
            n=1,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        # response = LLMGenerator.generate_qa_document('84b2202c-c359-46b7-a810-bce50feaa4d1', doc.page_content)
        results = format_split_text(response['choices'][0]['message']['content'])
        print(results)
        # for result in results:
        #     document = Document(page_content=result['question'], metadata={'source': result['answer']})
        #     new_docs.append(document)
        # new_docs.append(doc)

    # embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
    embeddings = MiniMaxEmbeddings()
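    # MiniMaxEmbeddings reads MINIMAX_GROUP_ID / MINIMAX_API_KEY from the environment variables set at the top of this file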
# cont = connections.connect(
# alias="default",
# user='username',
# password='password',
# host='localhost',
# port='19530'
# )
# chunk_size = 100
# for i in range(0, len(new_docs), chunk_size):
# # check document is paused
# chunk_documents = new_docs[i:i + chunk_size]
# vector_store = Milvus.from_documents(
# chunk_documents,
# collection_name='jytest5',
# embedding=embeddings,
# connection_args={"uri": 'https://in01-706333b4f51fa0b.aws-us-west-2.vectordb.zillizcloud.com:19530',
# 'user': 'db_admin', 'password': 'dify123456!'}
# )
# collection = Collection("jytest4") # Get an existing collection.
# collection.release()
# print(datetime.datetime.utcnow())
# alias = uuid4().hex
# # #connection_args = {"host": 'localhost', "port": '19530'}
# connection_args = {"uri": 'https://in01-91c80c04f4aed06.aws-us-west-2.vectordb.zillizcloud.com:19530',
# 'user': 'db_admin', 'password': 'dify123456!'}
# connections.connect(alias=alias, **connection_args)
# connection = Collection(
# 'jytest10',
# using=alias,
# )
# print(datetime.datetime.utcnow())
# # connection.release()
# query = '阿甘正传'
# search_params = {"metric_type": "IP", "params": {"level": 2}}
# docs = Milvus(embedding_function=embeddings, collection_name='jytest4').similarity_search(query)
# docs = Milvus(embedding_function=embeddings, collection_name='jytest', connection_args={"uri": 'https://in01-706333b4f51fa0b.aws-us-west-2.vectordb.zillizcloud.com:19530',
# 'user': 'db_admin', 'password': 'dify123456!'}).similarity_search(query)
# docs = Milvus(embedding_function=embeddings, collection_name='jytest10', connection_args={"uri": 'https://in01-91c80c04f4aed06.aws-us-west-2.vectordb.zillizcloud.com:19530',
# 'token': '01a3da355f5645fe949b1c6e97339c90b1931b6726094fcac3dd0594ab6312eb4ea314095ca989d7dfc8abfac1092dd1a6d46017', 'db_name':'dify'}).similarity_search(query)
# print(datetime.datetime.utcnow())
# docs = vector_store.similarity_search(query)
# cont = connections.connect(
# alias="default",
# user='username',
# password='password',
# host='localhost',
# port='19530'
# )
# docs = cont.search(query='What is milvus?', search_type='similarity',
# connection_args={"host": 'localhost', "port": '19530'})
# docs = vector_store.similarity_search(query)
# print(docs)
# connections.connect("default",
# uri='https://in01-617651a0cb211be.aws-us-west-2.vectordb.zillizcloud.com:19533',
# user='db_admin',
# password='dify123456!')
#
# # Check if the collection exists
# collection_name = "jytest"
# check_collection = utility.has_collection(collection_name)
# if check_collection:
# drop_result = utility.drop_collection(collection_name)
# print("Success!")
# collection = Collection(name=collection_name)
# collection.
# search_params = {"metric_type": "L2", "params": {"level": 2}}
# results = collection.search('电影排名50',
# anns_field='page_content',
# param=search_params,
# limit=1,
# guarantee_timestamp=1)
# connections.disconnect("default")
import numpy as np
from numpy import average
from sentence_transformers import SentenceTransformer
def test_embdding():
    sentences = ["My name is john"]
    model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
    embeddings = model.encode(sentences)
    for embedding in embeddings:
        print(embedding)
        # L2-normalize the embedding; convert back to an array before the second pass,
        # since tolist() returns a plain Python list that cannot be divided by a float.
        # Normalizing an already-unit vector is a no-op up to floating point error.
        embedding = (embedding / np.linalg.norm(embedding)).tolist()
        print(embedding)
        embedding = (np.array(embedding) / np.linalg.norm(embedding)).tolist()
        print(embedding)
    print(embeddings)
import base64
import binascii
import hashlib
import secrets
from os import environ
import numpy as np
from langchain.embeddings import MiniMaxEmbeddings
from numpy import average
from sentence_transformers import SentenceTransformer
from core.index.vector_index.milvus import Milvus
environ["MINIMAX_GROUP_ID"] = "1686736670459291"
environ["MINIMAX_API_KEY"] = "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJOYW1lIjoiIiwiU3ViamVjdElEIjoiMTY4NjczNjY3MDQ0NzEyNSIsIlBob25lIjoiTVRVd01UZzBNREU1TlRFPSIsIkdyb3VwSUQiOiIiLCJQYWdlTmFtZSI6IiIsIk1haWwiOiJwYW5wYW5AZGlmeS5haSIsIkNyZWF0ZVRpbWUiOiIiLCJpc3MiOiJtaW5pbWF4In0.i9gRKYmOW3zM8vEcT7lD-Ym-0eE6UUU3vb-gVxpWfSMkdc6ObbRnkP5nYumZJbV9L-yRA00GW6nMWYcWkY3IbDWWFAi-hRmzAtl-orpkz5DxPzjRJbwAPy9snYlqBWYQ4hOQ-53zmA5wgsm0ga5pMpBTN9SCkm7EnBQDEsPEY1m121tuwXe6LhAMjdX0Kic-UI-KTYbDdWGAl6nu8h8lrSHVuEEYA6Lz3VDyJTcYfME-B435vw-x1UXSb5-V-YhMEhIixEO8ezUQXaERq0mErtIQEoZN4r7OeNNGjocsfwiHRiw_EdxbfYUWjpvAytmmekIuL3tfvfhbif-EZc4E5w"
def test_query():
    # embeddings = MiniMaxEmbeddings()
    # query = '你对这部电影有什么感悟'
    # # search_params = {"metric_type": "IP", "params": {"level": 2}}
    # # docs = Milvus(embedding_function=embeddings, collection_name='jytest4').similarity_search(query)
    # docs = Milvus(embedding_function=embeddings, collection_name='jytest5',
    #               connection_args={"uri": 'https://in01-706333b4f51fa0b.aws-us-west-2.vectordb.zillizcloud.com:19530',
    #                                'user': 'db_admin', 'password': 'dify123456!'}).similarity_search(query)
    # print(docs)

    def hash_password(password_str, salt_byte):
        # derive a PBKDF2-HMAC-SHA256 digest of the password (10000 iterations), hex-encoded
        dk = hashlib.pbkdf2_hmac('sha256', password_str.encode('utf-8'), salt_byte, 10000)
        return binascii.hexlify(dk)

    # generate a random password salt
    salt = secrets.token_bytes(16)
    base64_salt = base64.b64encode(salt).decode()

    # encrypt the password with the salt
    password_hashed = hash_password('dify123456!', salt)
    base64_password_hashed = base64.b64encode(password_hashed).decode()
    print(base64_password_hashed)
    print('*******************')
    print(base64_salt)
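    # Verification sketch (assuming the salt is stored alongside the hash): recompute and compare, e.g.
    # assert base64.b64encode(hash_password('dify123456!', base64.b64decode(base64_salt))).decode() == base64_password_hashed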
@@ -178,7 +178,7 @@ class LLMGenerator:
         llm: StreamableOpenAI = LLMBuilder.to_llm(
             tenant_id=tenant_id,
             model_name='gpt-3.5-turbo',
-            max_tokens=1000
+            max_tokens=100
         )
         if isinstance(llm, BaseChatModel):
from typing import Optional, cast
import requests
import weaviate
from langchain.embeddings.base import Embeddings
from langchain.schema import Document, BaseRetriever
from langchain.vectorstores import VectorStore
from pydantic import BaseModel, root_validator
from core.index.base import BaseIndex
from core.index.vector_index.base import BaseVectorIndex
from core.vector_store.weaviate_vector_store import WeaviateVectorStore
from models.dataset import Dataset
class MilvusConfig(BaseModel):
    uri: str
    username: Optional[str]
    password: Optional[str]
    batch_size: int = 100

    @root_validator()
    def validate_config(cls, values: dict) -> dict:
        if not values['uri']:
            raise ValueError("config Milvus uri is required")
        return values
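# Example (hypothetical values): a config for a Milvus / Zilliz Cloud endpoint
# config = MilvusConfig(uri='https://<instance>.vectordb.zillizcloud.com:19530',
#                       username='db_admin', password='***', batch_size=100)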
class MilvusVectorIndex(BaseVectorIndex):
    def __init__(self, dataset: Dataset, config: MilvusConfig, embeddings: Embeddings):
        super().__init__(dataset, embeddings)
        self._client = self._init_client(config)

    def _init_client(self, config: MilvusConfig) -> weaviate.Client:
        auth_config = weaviate.auth.AuthApiKey(api_key=config.api_key)

        weaviate.connect.connection.has_grpc = False

        try:
            client = weaviate.Client(
                url=config.endpoint,
                auth_client_secret=auth_config,
                timeout_config=(5, 60),
                startup_period=None
            )
        except requests.exceptions.ConnectionError:
            raise ConnectionError("Vector database connection error")

        client.batch.configure(
            # `batch_size` takes an `int` value to enable auto-batching
            # (`None` is used for manual batching)
            batch_size=config.batch_size,
            # dynamically update the `batch_size` based on import speed
            dynamic=True,
            # `timeout_retries` takes an `int` value to retry on time outs
            timeout_retries=3,
        )

        return client
    def get_type(self) -> str:
        return 'weaviate'

    def get_index_name(self, dataset: Dataset) -> str:
        if self.dataset.index_struct_dict:
            class_prefix: str = self.dataset.index_struct_dict['vector_store']['class_prefix']
            if not class_prefix.endswith('_Node'):
                # original class_prefix
                class_prefix += '_Node'
            return class_prefix

        dataset_id = dataset.id
        return "Vector_index_" + dataset_id.replace("-", "_") + '_Node'

    def to_index_struct(self) -> dict:
        return {
            "type": self.get_type(),
            "vector_store": {"class_prefix": self.get_index_name(self.dataset)}
        }

    def create(self, texts: list[Document], **kwargs) -> BaseIndex:
        uuids = self._get_uuids(texts)
        self._vector_store = WeaviateVectorStore.from_documents(
            texts,
            self._embeddings,
            client=self._client,
            index_name=self.get_index_name(self.dataset),
            uuids=uuids,
            by_text=False
        )

        return self

    def _get_vector_store(self) -> VectorStore:
        """Only for created index."""
        if self._vector_store:
            return self._vector_store

        attributes = ['doc_id', 'dataset_id', 'document_id']
        if self._is_origin():
            attributes = ['doc_id']

        return WeaviateVectorStore(
            client=self._client,
            index_name=self.get_index_name(self.dataset),
            text_key='text',
            embedding=self._embeddings,
            attributes=attributes,
            by_text=False
        )

    def _get_vector_store_class(self) -> type:
        return WeaviateVectorStore

    def delete_by_document_id(self, document_id: str):
        if self._is_origin():
            self.recreate_dataset(self.dataset)
            return

        vector_store = self._get_vector_store()
        vector_store = cast(self._get_vector_store_class(), vector_store)

        vector_store.del_texts({
            "operator": "Equal",
            "path": ["document_id"],
            "valueText": document_id
        })

    def _is_origin(self):
        if self.dataset.index_struct_dict:
            class_prefix: str = self.dataset.index_struct_dict['vector_store']['class_prefix']
            if not class_prefix.endswith('_Node'):
                # original class_prefix
                return True

        return False
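# Construction sketch based on the signatures above (hypothetical objects; note that _init_client
# still expects Weaviate-style `endpoint`/`api_key` fields, so this is illustrative only):
# index = MilvusVectorIndex(dataset=dataset, config=config, embeddings=embeddings)
# index.create(documents)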
import numpy as np
import sklearn.decomposition
import pickle
import time
# Apply 'Algorithm 1' to the ada-002 embeddings to make them isotropic, taken from the paper:
# ALL-BUT-THE-TOP: SIMPLE AND EFFECTIVE POST-PROCESSING FOR WORD REPRESENTATIONS
# Jiaqi Mu, Pramod Viswanath
# This uses Principal Component Analysis (PCA) to 'evenly distribute' the embedding vectors (make them isotropic)
# For more information on PCA, see https://jamesmccaffrey.wordpress.com/2021/07/16/computing-pca-using-numpy-without-scikit/
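# Concretely, for each embedding v the code below computes
#   v' = normalize( (v - mu) - sum_{i=1..D} (u_i . v) * u_i )
# where mu is the mean embedding and u_1..u_D are the top-D principal components.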
# get the file pointer of the pickle containing the embeddings
fp = open('/path/to/your/data/Embedding-Latest.pkl', 'rb')
# the embedding data here is a dict consisting of key / value pairs
# the key is the hash of the message (SHA3-256), the value is the embedding from ada-002 (array of dimension 1536)
# the hash can be used to lookup the original text in a database
E = pickle.load(fp) # load the data into memory
# separate the keys (hashes) and values (embeddings) into separate vectors
K = list(E.keys()) # vector of all the hash values
X = np.array(list(E.values())) # vector of all the embeddings, converted to numpy arrays
# list the total number of embeddings
# this can be truncated if there are too many embeddings to do PCA on
print(f"Total number of embeddings: {len(X)}")
# get dimension of embeddings, used later
Dim = len(X[0])
# flash out the first few embeddings
print("First two embeddings are: ")
print(X[0])
print(f"First embedding length: {len(X[0])}")
print(X[1])
print(f"Second embedding length: {len(X[1])}")
# compute the mean of all the embeddings, and flash the result
mu = np.mean(X, axis=0) # same as mu in paper
print(f"Mean embedding vector: {mu}")
print(f"Mean embedding vector length: {len(mu)}")
# subtract the mean vector from each embedding vector ... vectorized in numpy
X_tilde = X - mu # same as v_tilde(w) in paper
# do the heavy lifting of extracting the principal components
# note that this is a function of the embeddings you currently have here, and this set may grow over time
# therefore the PCA basis vectors may change over time, and your final isotropic embeddings may drift over time
# but the drift should stabilize after you have extracted enough embedding data to characterize the nature of the embedding engine
print(f"Performing PCA on the normalized embeddings ...")
pca = sklearn.decomposition.PCA() # new object
TICK = time.time() # start timer
pca.fit(X_tilde) # do the heavy lifting!
TOCK = time.time() # end timer
DELTA = TOCK - TICK
print(f"PCA finished in {DELTA} seconds ...")
# dimensional reduction stage (the only hyperparameter)
# pick max dimension of PCA components to express embeddings
# in general this is some integer less than or equal to the dimension of your embeddings
# it could be set as a high percentile, say 95th percentile of pca.explained_variance_ratio_
# but just hardcoding a constant here
D = 15 # hyperparameter on dimension (out of 1536 for ada-002), paper recommends D = Dim/100
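# (for ada-002: Dim = 1536, and 1536/100 ≈ 15, hence D = 15)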
# form the set of v_prime(w), which is the final embedding
# this could be vectorized in numpy to speed it up, but coding it directly here in a double for-loop to avoid errors and to be transparent
E_prime = dict() # output dict of the new embeddings
N = len(X_tilde)
N10 = round(N/10)
U = pca.components_ # set of PCA basis vectors, sorted by most significant to least significant
print(f"Shape of full set of PCA componenents {U.shape}")
U = U[0:D,:] # take the top D dimensions (or take them all if D is the size of the embedding vector)
print(f"Shape of downselected PCA componenents {U.shape}")
for ii in range(N):
    v_tilde = X_tilde[ii]
    v = X[ii]
    v_projection = np.zeros(Dim) # start to build the projection
    # project the original embedding onto the PCA basis vectors, use only first D dimensions
    for jj in range(D):
        u_jj = U[jj,:] # vector
        v_jj = np.dot(u_jj,v) # scalar
        v_projection += v_jj*u_jj # vector
    v_prime = v_tilde - v_projection # final embedding vector
    v_prime = v_prime/np.linalg.norm(v_prime) # create unit vector
    E_prime[K[ii]] = v_prime
    if (ii%N10 == 0) or (ii == N-1):
        print(f"Finished with {ii+1} embeddings out of {N} ({round(100*ii/N)}% done)")
# save as new pickle
print("Saving new pickle ...")
embeddingName = '/path/to/your/data/Embedding-Latest-Isotropic.pkl'
with open(embeddingName, 'wb') as f:  # Python 3: open(..., 'wb')
    pickle.dump([E_prime,mu,U], f)
print(embeddingName)
print("Done!")
# When working with live data with a new embedding from ada-002, be sure to transform it first with this function before comparing it
def projectEmbedding(v,mu,U):
    v = np.array(v)
    v_tilde = v - mu
    v_projection = np.zeros(len(v)) # start to build the projection
    # project the original embedding onto the PCA basis vectors, use only first D dimensions
    for u in U:
        v_jj = np.dot(u,v) # scalar
        v_projection += v_jj*u # vector
    v_prime = v_tilde - v_projection # final embedding vector
    v_prime = v_prime/np.linalg.norm(v_prime) # create unit vector
    return v_prime
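# Usage sketch (hypothetical variable name `new_embedding`): reload the saved pickle and
# project a fresh ada-002 embedding before comparing it with the isotropic set
# with open('/path/to/your/data/Embedding-Latest-Isotropic.pkl', 'rb') as f:
#     E_prime, mu, U = pickle.load(f)
# new_embedding_isotropic = projectEmbedding(new_embedding, mu, U)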
@@ -71,18 +71,18 @@ class IndexingRunner:
                     dataset_document=dataset_document,
                     processing_rule=processing_rule
                 )
-                new_documents = []
-                for document in documents:
-                    response = LLMGenerator.generate_qa_document(dataset.tenant_id, document.page_content)
-                    document_qa_list = self.format_split_text(response)
-                    for result in document_qa_list:
-                        document = Document(page_content=result['question'], metadata={'source': result['answer']})
-                        new_documents.append(document)
+                # new_documents = []
+                # for document in documents:
+                #     response = LLMGenerator.generate_qa_document(dataset.tenant_id, document.page_content)
+                #     document_qa_list = self.format_split_text(response)
+                #     for result in document_qa_list:
+                #         document = Document(page_content=result['question'], metadata={'source': result['answer']})
+                #         new_documents.append(document)
                 # build index
                 self._build_index(
                     dataset=dataset,
                     dataset_document=dataset_document,
-                    documents=new_documents
+                    documents=documents
                 )
             except DocumentIsPausedException:
                 raise DocumentIsPausedException('Document paused, document id: {}'.format(dataset_document.id))
@@ -251,7 +251,8 @@ class IndexingRunner:
             documents = self._split_to_documents(
                 text_docs=text_docs,
                 splitter=splitter,
-                processing_rule=processing_rule
+                processing_rule=processing_rule,
+                tenant_id='84b2202c-c359-46b7-a810-bce50feaa4d1'
             )
             total_segments += len(documents)
             for document in documents:
@@ -311,7 +312,8 @@ class IndexingRunner:
             documents = self._split_to_documents(
                 text_docs=documents,
                 splitter=splitter,
-                processing_rule=processing_rule
+                processing_rule=processing_rule,
+                tenant_id='84b2202c-c359-46b7-a810-bce50feaa4d1'
             )
             total_segments += len(documents)
             for document in documents:
@@ -414,7 +416,8 @@ class IndexingRunner:
             documents = self._split_to_documents(
                 text_docs=text_docs,
                 splitter=splitter,
-                processing_rule=processing_rule
+                processing_rule=processing_rule,
+                tenant_id=dataset.tenant_id
             )
             # save node to document segment
@@ -469,18 +472,18 @@ class IndexingRunner:
                 if document.page_content is None or not document.page_content.strip():
                     continue
-                response = LLMGenerator.generate_qa_document(processing_rule.tenant_id, document.page_content)
+                response = LLMGenerator.generate_qa_document(tenant_id, document.page_content)
                 document_qa_list = self.format_split_text(response)
+                qa_documents = []
                 for result in document_qa_list:
                     document = Document(page_content=result['question'], metadata={'source': result['answer']})
-                    new_documents.append(document)
-                doc_id = str(uuid.uuid4())
-                hash = helper.generate_text_hash(document.page_content)
-                document.metadata['doc_id'] = doc_id
-                document.metadata['doc_hash'] = hash
+                    doc_id = str(uuid.uuid4())
+                    hash = helper.generate_text_hash(document.page_content)
-                split_documents.append(document)
+                    document.metadata['doc_id'] = doc_id
+                    document.metadata['doc_hash'] = hash
+                    qa_documents.append(document)
+                split_documents.extend(qa_documents)
             all_documents.extend(split_documents)
@@ -51,6 +51,7 @@ GENERATOR_QA_PROMPT = (
     'Step3:可分解或结合多个信息与概念\n'
     'Step4:将这些关键信息与概念生成 10 个问题与答案,问题描述清楚并且详细完整,答案详细完整.\n'
     "按格式回答: Q1:\nA1:\nQ2:\nA2:...\n"
+    "只输出Step4中的内容"
 )
 RULE_CONFIG_GENERATE_TEMPLATE = """Given MY INTENDED AUDIENCES and HOPING TO SOLVE using a language model, please select \