Commit 4c596272 authored Jul 12, 2023 by jyong
fix clean dataset task
parent 90b22d8c
Showing 9 changed files with 1329 additions and 20 deletions (+1329, -20)
api/controllers/console/datasets/test.py  +176  -0
api/controllers/console/datasets/test_embedding.py  +18  -0
api/controllers/console/datasets/test_query.py  +39  -0
api/core/generator/llm_generator.py  +1  -1
api/core/index/vector_index/milvus.py  +812  -0
api/core/index/vector_index/milvus_vector_index.py  +137  -0
api/core/index/vector_index/test-embedding.py  +123  -0
api/core/indexing_runner.py  +22  -19
api/core/prompt/prompts.py  +1  -0
api/controllers/console/datasets/test.py
0 → 100644  View file @ 4c596272

import datetime
import re
from os import environ
from uuid import uuid4

import openai
from langchain.document_loaders import WebBaseLoader, UnstructuredFileLoader, TextLoader
from langchain.embeddings import OpenAIEmbeddings, MiniMaxEmbeddings
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from pymilvus import connections, Collection
from pymilvus.orm import utility

from core.data_loader.loader.excel import ExcelLoader
from core.generator.llm_generator import LLMGenerator
from core.index.vector_index.milvus import Milvus

OPENAI_API_KEY = "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"  # example: "sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

## Set up environment variables
environ["OPENAI_API_KEY"] = OPENAI_API_KEY
environ["MINIMAX_GROUP_ID"] = "1686736670459291"
environ["MINIMAX_API_KEY"] = "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJOYW1lIjoiIiwiU3ViamVjdElEIjoiMTY4NjczNjY3MDQ0NzEyNSIsIlBob25lIjoiTVRVd01UZzBNREU1TlRFPSIsIkdyb3VwSUQiOiIiLCJQYWdlTmFtZSI6IiIsIk1haWwiOiJwYW5wYW5AZGlmeS5haSIsIkNyZWF0ZVRpbWUiOiIiLCJpc3MiOiJtaW5pbWF4In0.i9gRKYmOW3zM8vEcT7lD-Ym-0eE6UUU3vb-gVxpWfSMkdc6ObbRnkP5nYumZJbV9L-yRA00GW6nMWYcWkY3IbDWWFAi-hRmzAtl-orpkz5DxPzjRJbwAPy9snYlqBWYQ4hOQ-53zmA5wgsm0ga5pMpBTN9SCkm7EnBQDEsPEY1m121tuwXe6LhAMjdX0Kic-UI-KTYbDdWGAl6nu8h8lrSHVuEEYA6Lz3VDyJTcYfME-B435vw-x1UXSb5-V-YhMEhIixEO8ezUQXaERq0mErtIQEoZN4r7OeNNGjocsfwiHRiw_EdxbfYUWjpvAytmmekIuL3tfvfhbif-EZc4E5w"

CONVERSATION_PROMPT = (
    "你是出题人.\n"
    "用户会发送一段长文本.\n请一步一步思考"
    'Step1:了解并总结这段文本的主要内容\n'
    'Step2:这段文本提到了哪些关键信息或概念\n'
    'Step3:可分解或结合多个信息与概念\n'
    'Step4:将这些关键信息与概念生成 10 个问题与答案,问题描述清楚并且详细完整,答案详细完整.\n'
    "按格式回答: Q1:\nA1:\nQ2:\nA2:...\n"
)


def test_milvus():
    def format_split_text(text):
        regex = r"Q\d+:\s*(.*?)\s*A\d+:\s*([\s\S]*?)(?=Q|$)"  # regular expression that matches each Q/A pair
        matches = re.findall(regex, text, re.MULTILINE)  # collect every match
        result = []  # store the final result
        for match in matches:
            q = match[0]
            a = match[1]
            if q and a:
                # keep the pair only when both the question and the answer are present
                result.append({
                    "question": q,
                    "answer": re.sub(r"\n\s*", "\n", a.strip())
                })

        return result

    # 84b2202c-c359-46b7-a810-bce50feaa4d1
    # Use the WebBaseLoader to load specified web pages into documents
    # loader = WebBaseLoader([
    #     "https://milvus.io/docs/overview.md",
    # ])
    loader = ExcelLoader('/Users/jiangyong/Downloads/xiaoming.xlsx')
    # loader = TextLoader('/Users/jiangyong/Downloads/all.txt', autodetect_encoding=True)
    # loader = UnstructuredFileLoader('/Users/jiangyong/Downloads/douban.xlsx')
    docs = loader.load()
    #
    # # Split the documents into smaller chunks
    text_splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
    #
    docs = text_splitter.split_documents(docs)
    new_docs = []
    for doc in docs:
        openai.api_key = "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"
        response = openai.ChatCompletion.create(
            model='gpt-3.5-turbo',
            messages=[
                {'role': 'system', 'content': CONVERSATION_PROMPT},
                {'role': 'user', 'content': doc.page_content}
            ],
            temperature=0,
            stream=False,
            n=1,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        # response = LLMGenerator.generate_qa_document('84b2202c-c359-46b7-a810-bce50feaa4d1', doc.page_content)
        results = format_split_text(response['choices'][0]['message']['content'])
        print(results)
        # for result in results:
        #     document = Document(page_content=result['question'], metadata={'source': result['answer']})
        #     new_docs.append(document)
        # new_docs.append(doc)

    # embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
    embeddings = MiniMaxEmbeddings()

    # cont = connections.connect(
    #     alias="default",
    #     user='username',
    #     password='password',
    #     host='localhost',
    #     port='19530'
    # )
    # chunk_size = 100
    # for i in range(0, len(new_docs), chunk_size):
    #     # check document is paused
    #     chunk_documents = new_docs[i:i + chunk_size]
    #     vector_store = Milvus.from_documents(
    #         chunk_documents,
    #         collection_name='jytest5',
    #         embedding=embeddings,
    #         connection_args={"uri": 'https://in01-706333b4f51fa0b.aws-us-west-2.vectordb.zillizcloud.com:19530',
    #                          'user': 'db_admin', 'password': 'dify123456!'}
    #     )
    # collection = Collection("jytest4")  # Get an existing collection.
    # collection.release()
    # print(datetime.datetime.utcnow())
    # alias = uuid4().hex
    # # #connection_args = {"host": 'localhost', "port": '19530'}
    # connection_args = {"uri": 'https://in01-91c80c04f4aed06.aws-us-west-2.vectordb.zillizcloud.com:19530',
    #                    'user': 'db_admin', 'password': 'dify123456!'}
    # connections.connect(alias=alias, **connection_args)
    # connection = Collection(
    #     'jytest10',
    #     using=alias,
    # )
    # print(datetime.datetime.utcnow())
    # # connection.release()
    # query = '阿甘正传'
    # search_params = {"metric_type": "IP", "params": {"level": 2}}
    # docs = Milvus(embedding_function=embeddings, collection_name='jytest4').similarity_search(query)
    # docs = Milvus(embedding_function=embeddings, collection_name='jytest',
    #               connection_args={"uri": 'https://in01-706333b4f51fa0b.aws-us-west-2.vectordb.zillizcloud.com:19530',
    #                                'user': 'db_admin', 'password': 'dify123456!'}).similarity_search(query)
    # docs = Milvus(embedding_function=embeddings, collection_name='jytest10',
    #               connection_args={"uri": 'https://in01-91c80c04f4aed06.aws-us-west-2.vectordb.zillizcloud.com:19530',
    #                                'token': '01a3da355f5645fe949b1c6e97339c90b1931b6726094fcac3dd0594ab6312eb4ea314095ca989d7dfc8abfac1092dd1a6d46017',
    #                                'db_name': 'dify'}).similarity_search(query)
    # print(datetime.datetime.utcnow())
    # docs = vector_store.similarity_search(query)
    # cont = connections.connect(
    #     alias="default",
    #     user='username',
    #     password='password',
    #     host='localhost',
    #     port='19530'
    # )
    # docs = cont.search(query='What is milvus?', search_type='similarity',
    #                    connection_args={"host": 'localhost', "port": '19530'})
    # docs = vector_store.similarity_search(query)
    # print(docs)
    # connections.connect("default",
    #                     uri='https://in01-617651a0cb211be.aws-us-west-2.vectordb.zillizcloud.com:19533',
    #                     user='db_admin',
    #                     password='dify123456!')
    #
    # # Check if the collection exists
    # collection_name = "jytest"
    # check_collection = utility.has_collection(collection_name)
    # if check_collection:
    #     drop_result = utility.drop_collection(collection_name)
    #     print("Success!")
    # collection = Collection(name=collection_name)
    # collection.
    # search_params = {"metric_type": "L2", "params": {"level": 2}}
    # results = collection.search('电影排名50',
    #                             anns_field='page_content',
    #                             param=search_params,
    #                             limit=1,
    #                             guarantee_timestamp=1)
    # connections.disconnect("default")
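
For reference, here is a minimal standalone sketch, not part of the commit, of what the format_split_text helper above does with a model reply; the sample text and the expected output shown in the comment are made up for illustration.

import re

sample = "Q1: What is Milvus?\nA1: An open-source vector database.\nQ2: What does the loader read?\nA2: An Excel file."
regex = r"Q\d+:\s*(.*?)\s*A\d+:\s*([\s\S]*?)(?=Q|$)"  # same pattern used by format_split_text
pairs = [
    {"question": q, "answer": re.sub(r"\n\s*", "\n", a.strip())}
    for q, a in re.findall(regex, sample, re.MULTILINE)
    if q and a
]
print(pairs)  # [{'question': 'What is Milvus?', 'answer': 'An open-source vector database.'}, ...]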
api/controllers/console/datasets/test_embedding.py
0 → 100644  View file @ 4c596272

import numpy as np
from numpy import average
from sentence_transformers import SentenceTransformer


def test_embedding():
    sentences = [
        "My name is john"
    ]
    model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
    embeddings = model.encode(sentences)
    for embedding in embeddings:
        print(embedding)
        embedding = (embedding / np.linalg.norm(embedding)).tolist()
        print(embedding)
        embedding = (embedding / np.linalg.norm(embedding)).tolist()
        print(embedding)
    print(embeddings)
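
A side note on the normalization in test_embedding: dividing a vector by its L2 norm makes it unit length, so dividing a second time changes nothing, and dot products between such unit vectors equal cosine similarity. A small sketch, not part of the commit, illustrating this:

import numpy as np

v = np.array([3.0, 4.0])
unit = v / np.linalg.norm(v)                            # unit vector, norm is now 1
print(np.linalg.norm(unit))                             # ~1.0
print(np.allclose(unit, unit / np.linalg.norm(unit)))   # True: re-normalizing is a no-op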
api/controllers/console/datasets/test_query.py
0 → 100644  View file @ 4c596272

import base64
import binascii
import hashlib
import secrets
from os import environ

import numpy as np
from langchain.embeddings import MiniMaxEmbeddings
from numpy import average
from sentence_transformers import SentenceTransformer

from core.index.vector_index.milvus import Milvus

environ["MINIMAX_GROUP_ID"] = "1686736670459291"
environ["MINIMAX_API_KEY"] = "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJOYW1lIjoiIiwiU3ViamVjdElEIjoiMTY4NjczNjY3MDQ0NzEyNSIsIlBob25lIjoiTVRVd01UZzBNREU1TlRFPSIsIkdyb3VwSUQiOiIiLCJQYWdlTmFtZSI6IiIsIk1haWwiOiJwYW5wYW5AZGlmeS5haSIsIkNyZWF0ZVRpbWUiOiIiLCJpc3MiOiJtaW5pbWF4In0.i9gRKYmOW3zM8vEcT7lD-Ym-0eE6UUU3vb-gVxpWfSMkdc6ObbRnkP5nYumZJbV9L-yRA00GW6nMWYcWkY3IbDWWFAi-hRmzAtl-orpkz5DxPzjRJbwAPy9snYlqBWYQ4hOQ-53zmA5wgsm0ga5pMpBTN9SCkm7EnBQDEsPEY1m121tuwXe6LhAMjdX0Kic-UI-KTYbDdWGAl6nu8h8lrSHVuEEYA6Lz3VDyJTcYfME-B435vw-x1UXSb5-V-YhMEhIixEO8ezUQXaERq0mErtIQEoZN4r7OeNNGjocsfwiHRiw_EdxbfYUWjpvAytmmekIuL3tfvfhbif-EZc4E5w"


def test_query():
    # embeddings = MiniMaxEmbeddings()
    # query = '你对这部电影有什么感悟'
    # # search_params = {"metric_type": "IP", "params": {"level": 2}}
    # # docs = Milvus(embedding_function=embeddings, collection_name='jytest4').similarity_search(query)
    # docs = Milvus(embedding_function=embeddings, collection_name='jytest5',
    #               connection_args={"uri": 'https://in01-706333b4f51fa0b.aws-us-west-2.vectordb.zillizcloud.com:19530',
    #                                'user': 'db_admin', 'password': 'dify123456!'}).similarity_search(query)
    # print(docs)

    # generate password salt
    def hash_password(password_str, salt_byte):
        dk = hashlib.pbkdf2_hmac('sha256', password_str.encode('utf-8'), salt_byte, 10000)
        return binascii.hexlify(dk)

    salt = secrets.token_bytes(16)
    base64_salt = base64.b64encode(salt).decode()

    # encrypt password with salt
    password_hashed = hash_password('dify123456!', salt)
    base64_password_hashed = base64.b64encode(password_hashed).decode()
    print(base64_password_hashed)
    print('*******************')
    print(base64_salt)
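
For completeness, a minimal sketch, not part of the commit, of the matching verification step for the PBKDF2 scheme in test_query: re-derive the digest from the stored base64 salt and compare it to the stored hash. The verify_password name is illustrative.

import base64
import binascii
import hashlib
import hmac

def verify_password(password_str, base64_salt, base64_password_hashed):
    salt = base64.b64decode(base64_salt)
    expected = base64.b64decode(base64_password_hashed)  # hex digest bytes, as produced by hash_password
    dk = hashlib.pbkdf2_hmac('sha256', password_str.encode('utf-8'), salt, 10000)
    return hmac.compare_digest(binascii.hexlify(dk), expected)

# verify_password('dify123456!', base64_salt, base64_password_hashed) would return True
# for the values printed by test_query above.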
api/core/generator/llm_generator.py
View file @ 4c596272

...
@@ -178,7 +178,7 @@ class LLMGenerator:
         llm: StreamableOpenAI = LLMBuilder.to_llm(
             tenant_id=tenant_id,
             model_name='gpt-3.5-turbo',
-            max_tokens=1000
+            max_tokens=100
         )

         if isinstance(llm, BaseChatModel):
...
api/core/index/vector_index/milvus.py
0 → 100644  View file @ 4c596272

This diff is collapsed. Click to expand it.
api/core/index/vector_index/milvus_vector_index.py
0 → 100644  View file @ 4c596272

from typing import Optional, cast

import requests
import weaviate
from langchain.embeddings.base import Embeddings
from langchain.schema import Document, BaseRetriever
from langchain.vectorstores import VectorStore
from pydantic import BaseModel, root_validator

from core.index.base import BaseIndex
from core.index.vector_index.base import BaseVectorIndex
from core.vector_store.weaviate_vector_store import WeaviateVectorStore
from models.dataset import Dataset


class MilvusConfig(BaseModel):
    uri: str
    username: Optional[str]
    password: Optional[str]
    batch_size: int = 100

    @root_validator()
    def validate_config(cls, values: dict) -> dict:
        if not values['uri']:
            raise ValueError("config Milvus uri is required")
        return values


class MilvusVectorIndex(BaseVectorIndex):
    def __init__(self, dataset: Dataset, config: MilvusConfig, embeddings: Embeddings):
        super().__init__(dataset, embeddings)
        self._client = self._init_client(config)

    def _init_client(self, config: MilvusConfig) -> weaviate.Client:
        auth_config = weaviate.auth.AuthApiKey(api_key=config.api_key)

        weaviate.connect.connection.has_grpc = False

        try:
            client = weaviate.Client(
                url=config.endpoint,
                auth_client_secret=auth_config,
                timeout_config=(5, 60),
                startup_period=None
            )
        except requests.exceptions.ConnectionError:
            raise ConnectionError("Vector database connection error")

        client.batch.configure(
            # `batch_size` takes an `int` value to enable auto-batching
            # (`None` is used for manual batching)
            batch_size=config.batch_size,
            # dynamically update the `batch_size` based on import speed
            dynamic=True,
            # `timeout_retries` takes an `int` value to retry on time outs
            timeout_retries=3,
        )

        return client

    def get_type(self) -> str:
        return 'weaviate'

    def get_index_name(self, dataset: Dataset) -> str:
        if self.dataset.index_struct_dict:
            class_prefix: str = self.dataset.index_struct_dict['vector_store']['class_prefix']
            if not class_prefix.endswith('_Node'):
                # original class_prefix
                class_prefix += '_Node'

            return class_prefix

        dataset_id = dataset.id
        return "Vector_index_" + dataset_id.replace("-", "_") + '_Node'

    def to_index_struct(self) -> dict:
        return {
            "type": self.get_type(),
            "vector_store": {"class_prefix": self.get_index_name(self.dataset)}
        }

    def create(self, texts: list[Document], **kwargs) -> BaseIndex:
        uuids = self._get_uuids(texts)
        self._vector_store = WeaviateVectorStore.from_documents(
            texts,
            self._embeddings,
            client=self._client,
            index_name=self.get_index_name(self.dataset),
            uuids=uuids,
            by_text=False
        )

        return self

    def _get_vector_store(self) -> VectorStore:
        """Only for created index."""
        if self._vector_store:
            return self._vector_store

        attributes = ['doc_id', 'dataset_id', 'document_id']
        if self._is_origin():
            attributes = ['doc_id']

        return WeaviateVectorStore(
            client=self._client,
            index_name=self.get_index_name(self.dataset),
            text_key='text',
            embedding=self._embeddings,
            attributes=attributes,
            by_text=False
        )

    def _get_vector_store_class(self) -> type:
        return WeaviateVectorStore

    def delete_by_document_id(self, document_id: str):
        if self._is_origin():
            self.recreate_dataset(self.dataset)
            return

        vector_store = self._get_vector_store()
        vector_store = cast(self._get_vector_store_class(), vector_store)

        vector_store.del_texts({
            "operator": "Equal",
            "path": ["document_id"],
            "valueText": document_id
        })

    def _is_origin(self):
        if self.dataset.index_struct_dict:
            class_prefix: str = self.dataset.index_struct_dict['vector_store']['class_prefix']
            if not class_prefix.endswith('_Node'):
                # original class_prefix
                return True

        return False
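
A small sketch, not part of the commit, of how the MilvusConfig validator above behaves; it assumes MilvusConfig is importable from the new module and uses a placeholder URI and credentials.

from pydantic import ValidationError

from core.index.vector_index.milvus_vector_index import MilvusConfig

config = MilvusConfig(uri='https://example-milvus-host:19530', username='db_admin', password='***')
print(config.batch_size)  # 100, the declared default

try:
    MilvusConfig(uri='')  # an empty uri fails validate_config
except ValidationError as e:
    print(e)  # includes "config Milvus uri is required"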
api/core/index/vector_index/test-embedding.py
0 → 100644  View file @ 4c596272

import numpy as np
import sklearn.decomposition
import pickle
import time

# Apply 'Algorithm 1' to the ada-002 embeddings to make them isotropic, taken from the paper:
# ALL-BUT-THE-TOP: SIMPLE AND EFFECTIVE POST-PROCESSING FOR WORD REPRESENTATIONS
# Jiaqi Mu, Pramod Viswanath
# This uses Principal Component Analysis (PCA) to 'evenly distribute' the embedding vectors (make them isotropic)
# For more information on PCA, see https://jamesmccaffrey.wordpress.com/2021/07/16/computing-pca-using-numpy-without-scikit/

# get the file pointer of the pickle containing the embeddings
fp = open('/path/to/your/data/Embedding-Latest.pkl', 'rb')

# the embedding data here is a dict consisting of key / value pairs
# the key is the hash of the message (SHA3-256), the value is the embedding from ada-002 (array of dimension 1536)
# the hash can be used to look up the original text in a database
E = pickle.load(fp)  # load the data into memory

# separate the keys (hashes) and values (embeddings) into separate vectors
K = list(E.keys())  # vector of all the hash values
X = np.array(list(E.values()))  # vector of all the embeddings, converted to numpy arrays

# list the total number of embeddings
# this can be truncated if there are too many embeddings to do PCA on
print(f"Total number of embeddings: {len(X)}")

# get dimension of embeddings, used later
Dim = len(X[0])

# flash out the first few embeddings
print("First two embeddings are: ")
print(X[0])
print(f"First embedding length: {len(X[0])}")
print(X[1])
print(f"Second embedding length: {len(X[1])}")

# compute the mean of all the embeddings, and flash the result
mu = np.mean(X, axis=0)  # same as mu in paper
print(f"Mean embedding vector: {mu}")
print(f"Mean embedding vector length: {len(mu)}")

# subtract the mean vector from each embedding vector ... vectorized in numpy
X_tilde = X - mu  # same as v_tilde(w) in paper

# do the heavy lifting of extracting the principal components
# note that this is a function of the embeddings you currently have here, and this set may grow over time
# therefore the PCA basis vectors may change over time, and your final isotropic embeddings may drift over time
# but the drift should stabilize after you have extracted enough embedding data to characterize the nature of the embedding engine
print("Performing PCA on the normalized embeddings ...")
pca = sklearn.decomposition.PCA()  # new object
TICK = time.time()  # start timer
pca.fit(X_tilde)  # do the heavy lifting!
TOCK = time.time()  # end timer
DELTA = TOCK - TICK

print(f"PCA finished in {DELTA} seconds ...")

# dimensional reduction stage (the only hyperparameter)
# pick max dimension of PCA components to express embeddings
# in general this is some integer less than or equal to the dimension of your embeddings
# it could be set as a high percentile, say 95th percentile of pca.explained_variance_ratio_
# but just hardcoding a constant here
D = 15  # hyperparameter on dimension (out of 1536 for ada-002), paper recommends D = Dim/100

# form the set of v_prime(w), which is the final embedding
# this could be vectorized in numpy to speed it up, but coding it directly here in a double for-loop to avoid errors and to be transparent
E_prime = dict()  # output dict of the new embeddings
N = len(X_tilde)
N10 = round(N / 10)
U = pca.components_  # set of PCA basis vectors, sorted by most significant to least significant
print(f"Shape of full set of PCA components {U.shape}")
U = U[0:D, :]  # take the top D dimensions (or take them all if D is the size of the embedding vector)
print(f"Shape of downselected PCA components {U.shape}")
for ii in range(N):
    v_tilde = X_tilde[ii]
    v = X[ii]
    v_projection = np.zeros(Dim)  # start to build the projection
    # project the original embedding onto the PCA basis vectors, use only first D dimensions
    for jj in range(D):
        u_jj = U[jj, :]  # vector
        v_jj = np.dot(u_jj, v)  # scalar
        v_projection += v_jj * u_jj  # vector
    v_prime = v_tilde - v_projection  # final embedding vector
    v_prime = v_prime / np.linalg.norm(v_prime)  # create unit vector
    E_prime[K[ii]] = v_prime

    if (ii % N10 == 0) or (ii == N - 1):
        print(f"Finished with {ii+1} embeddings out of {N} ({round(100*ii/N)}% done)")

# save as new pickle
print("Saving new pickle ...")
embeddingName = '/path/to/your/data/Embedding-Latest-Isotropic.pkl'
with open(embeddingName, 'wb') as f:  # Python 3: open(..., 'wb')
    pickle.dump([E_prime, mu, U], f)
print(embeddingName)
print("Done!")


# When working with live data with a new embedding from ada-002, be sure to transform it first with this function before comparing it
def projectEmbedding(v, mu, U):
    v = np.array(v)
    v_tilde = v - mu
    v_projection = np.zeros(len(v))  # start to build the projection
    # project the original embedding onto the PCA basis vectors, use only first D dimensions
    for u in U:
        v_jj = np.dot(u, v)  # scalar
        v_projection += v_jj * u  # vector
    v_prime = v_tilde - v_projection  # final embedding vector
    v_prime = v_prime / np.linalg.norm(v_prime)  # create unit vector
    return v_prime
\ No newline at end of file
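
A brief usage sketch, not part of the commit, for the projectEmbedding helper above: load the saved [E_prime, mu, U] pickle, project a fresh embedding, and score it against the stored isotropic embeddings with a dot product (all vectors are unit length, so this is cosine similarity). The file path and the random query vector are placeholders.

import pickle

import numpy as np

with open('/path/to/your/data/Embedding-Latest-Isotropic.pkl', 'rb') as f:
    E_prime, mu, U = pickle.load(f)

query_embedding = np.random.rand(len(mu))               # stand-in for a fresh ada-002 embedding
query_prime = projectEmbedding(query_embedding, mu, U)  # apply the same isotropic transform

scores = {key: float(np.dot(query_prime, vec)) for key, vec in E_prime.items()}
best_hash = max(scores, key=scores.get)
print(best_hash, scores[best_hash])                     # hash of the closest stored message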
api/core/indexing_runner.py
View file @ 4c596272

...
@@ -71,18 +71,18 @@ class IndexingRunner:
                 dataset_document=dataset_document,
                 processing_rule=processing_rule
             )
-            new_documents = []
-            for document in documents:
-                response = LLMGenerator.generate_qa_document(dataset.tenant_id, document.page_content)
-                document_qa_list = self.format_split_text(response)
-                for result in document_qa_list:
-                    document = Document(page_content=result['question'], metadata={'source': result['answer']})
-                    new_documents.append(document)
+            # new_documents = []
+            # for document in documents:
+            #     response = LLMGenerator.generate_qa_document(dataset.tenant_id, document.page_content)
+            #     document_qa_list = self.format_split_text(response)
+            #     for result in document_qa_list:
+            #         document = Document(page_content=result['question'], metadata={'source': result['answer']})
+            #         new_documents.append(document)
             # build index
             self._build_index(
                 dataset=dataset,
                 dataset_document=dataset_document,
-                documents=new_documents
+                documents=documents
             )
         except DocumentIsPausedException:
             raise DocumentIsPausedException('Document paused, document id: {}'.format(dataset_document.id))
...
@@ -251,7 +251,8 @@ class IndexingRunner:
             documents = self._split_to_documents(
                 text_docs=text_docs,
                 splitter=splitter,
-                processing_rule=processing_rule
+                processing_rule=processing_rule,
+                tenant_id='84b2202c-c359-46b7-a810-bce50feaa4d1'
             )
             total_segments += len(documents)
             for document in documents:
...
@@ -311,7 +312,8 @@ class IndexingRunner:
             documents = self._split_to_documents(
                 text_docs=documents,
                 splitter=splitter,
-                processing_rule=processing_rule
+                processing_rule=processing_rule,
+                tenant_id='84b2202c-c359-46b7-a810-bce50feaa4d1'
             )
             total_segments += len(documents)
             for document in documents:
...
@@ -414,7 +416,8 @@ class IndexingRunner:
         documents = self._split_to_documents(
             text_docs=text_docs,
             splitter=splitter,
-            processing_rule=processing_rule
+            processing_rule=processing_rule,
+            tenant_id=dataset.tenant_id
         )

         # save node to document segment
...
@@ -469,18 +472,18 @@ class IndexingRunner:
                 if document.page_content is None or not document.page_content.strip():
                     continue
-                response = LLMGenerator.generate_qa_document(processing_rule.tenant_id, document.page_content)
+                response = LLMGenerator.generate_qa_document(tenant_id, document.page_content)
                 document_qa_list = self.format_split_text(response)
+                qa_documents = []
                 for result in document_qa_list:
                     document = Document(page_content=result['question'], metadata={'source': result['answer']})
-                    new_documents.append(document)
-                doc_id = str(uuid.uuid4())
-                hash = helper.generate_text_hash(document.page_content)
-                document.metadata['doc_id'] = doc_id
-                document.metadata['doc_hash'] = hash
-                split_documents.append(document)
+                    doc_id = str(uuid.uuid4())
+                    hash = helper.generate_text_hash(document.page_content)
+                    document.metadata['doc_id'] = doc_id
+                    document.metadata['doc_hash'] = hash
+                    qa_documents.append(document)
+                split_documents.extend(qa_documents)

             all_documents.extend(split_documents)
...
api/core/prompt/prompts.py
View file @ 4c596272

...
@@ -51,6 +51,7 @@ GENERATOR_QA_PROMPT = (
     'Step3:可分解或结合多个信息与概念\n'
     'Step4:将这些关键信息与概念生成 10 个问题与答案,问题描述清楚并且详细完整,答案详细完整.\n'
     "按格式回答: Q1:\nA1:\nQ2:\nA2:...\n"
+    "只输出Step4中的内容"
 )

 RULE_CONFIG_GENERATE_TEMPLATE = """Given MY INTENDED AUDIENCES and HOPING TO SOLVE using a language model, please select \
...