Commit 71981eac (ai-tech/dify)
authored Jun 20, 2023 by John Wang
parent ced9fc52

fix: kw table bugs

Showing 11 changed files with 109 additions and 53 deletions
api/core/data_loader/file_extractor.py                     +17 -17
api/core/data_loader/loader/excel.py                        +0  -4
api/core/data_loader/loader/html.py                         +2  -2
api/core/data_loader/loader/pdf.py                          +0  -3
api/core/embedding/cached_embedding.py                      +2  -2
api/core/index/keyword_table_index/keyword_table_index.py  +53 -13
api/core/index/vector_index/base.py                         +2  -1
api/core/index/vector_index/qdrant_vector_index.py          +9  -5
api/core/index/vector_index/weaviate_vector_index.py       +13  -4
api/core/indexing_runner.py                                 +1  -1
api/models/dataset.py                                      +10  -1
api/core/data_loader/file_extractor.py

@@ -22,21 +22,21 @@ class FileExtractor:
             file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}"
             storage.download(upload_file.key, file_path)
 
-            input_file = Path(file_path)
-            if input_file.suffix == '.xlxs':
-                loader = ExcelLoader(file_path)
-            elif input_file.suffix == '.pdf':
-                loader = PdfLoader(file_path, upload_file=upload_file)
-            elif input_file.suffix in ['.md', '.markdown']:
-                loader = MarkdownLoader(file_path, autodetect_encoding=True)
-            elif input_file.suffix in ['.htm', '.html']:
-                loader = HTMLLoader(file_path)
-            elif input_file.suffix == '.docx':
-                loader = Docx2txtLoader(file_path)
-            elif input_file.suffix == '.csv':
-                loader = CSVLoader(file_path, autodetect_encoding=True)
-            else:
-                # txt
-                loader = TextLoader(file_path, autodetect_encoding=True)
+            input_file = Path(file_path)
+            if input_file.suffix == '.xlxs':
+                loader = ExcelLoader(file_path)
+            elif input_file.suffix == '.pdf':
+                loader = PdfLoader(file_path, upload_file=upload_file)
+            elif input_file.suffix in ['.md', '.markdown']:
+                loader = MarkdownLoader(file_path, autodetect_encoding=True)
+            elif input_file.suffix in ['.htm', '.html']:
+                loader = HTMLLoader(file_path)
+            elif input_file.suffix == '.docx':
+                loader = Docx2txtLoader(file_path)
+            elif input_file.suffix == '.csv':
+                loader = CSVLoader(file_path, autodetect_encoding=True)
+            else:
+                # txt
+                loader = TextLoader(file_path, autodetect_encoding=True)
 
-        return loader.load_as_text() if return_text else loader.load()
+        return '\n'.join([document.page_content for document in loader.load()]) if return_text else loader.load()
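The new return expression makes the per-loader load_as_text methods unnecessary (they are removed from the loader files below): text extraction is now done once, by joining the page_content of every loaded document. A minimal standalone sketch of that behavior, assuming the langchain.schema Document shape and hypothetical values:

    from langchain.schema import Document

    def join_documents(documents: list[Document]) -> str:
        # mirror FileExtractor's new return_text path:
        # one newline between each loaded document's text
        return '\n'.join([document.page_content for document in documents])

    docs = [Document(page_content='first chunk'), Document(page_content='second chunk')]
    assert join_documents(docs) == 'first chunk\nsecond chunk'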
api/core/data_loader/loader/excel.py

@@ -40,7 +40,3 @@ class ExcelLoader(BaseLoader):
         metadata = {"source": self._file_path}
 
         return [Document(page_content='\n\n'.join(data), metadata=metadata)]
-
-    def load_as_text(self) -> str:
-        documents = self.load()
-        return ''.join([document.page_content for document in documents])
api/core/data_loader/loader/html.py

@@ -25,9 +25,9 @@ class HTMLLoader(BaseLoader):
     def load(self) -> List[Document]:
         metadata = {"source": self._file_path}
-        return [Document(page_content=self.load_as_text(), metadata=metadata)]
+        return [Document(page_content=self._load_as_text(), metadata=metadata)]
 
-    def load_as_text(self) -> str:
+    def _load_as_text(self) -> str:
         with open(self._file_path, "rb") as fp:
             soup = BeautifulSoup(fp, 'html.parser')
             text = soup.get_text()
api/core/data_loader/loader/pdf.py

@@ -64,6 +64,3 @@ class PdfLoader(BaseLoader):
         metadata = {"source": self._file_path}
 
         return [Document(page_content=text, metadata=metadata)]
-    def load_as_text(self) -> str:
-        documents = self.load()
-        return '\n'.join([document.page_content for document in documents])
api/core/embedding/cached_embedding.py

@@ -46,8 +46,8 @@ class CacheEmbedding(Embeddings):
                 i += 1
 
-        embedding_queue_texts.extend(embedding_results)
-        return embedding_queue_texts
+        text_embeddings.extend(embedding_results)
+        return text_embeddings
 
     def embed_query(self, text: str) -> List[float]:
         """Embed query text."""
api/core/index/keyword_table_index/keyword_table_index.py

@@ -30,11 +30,20 @@ class KeywordTableIndex(BaseIndex):
         dataset_keyword_table = DatasetKeywordTable(
             dataset_id=self._dataset.id,
-            keyword_table=json.dumps(keyword_table)
+            keyword_table=json.dumps({
+                '__type__': 'keyword_table',
+                '__data__': {
+                    "index_id": self._dataset.id,
+                    "summary": None,
+                    "table": {}
+                }
+            }, cls=SetEncoder)
         )
         db.session.add(dataset_keyword_table)
         db.session.commit()
 
+        self._save_dataset_keyword_table(keyword_table)
+
         return self
 
     def add_texts(self, texts: list[Document], **kwargs):
@@ -46,8 +55,7 @@ class KeywordTableIndex(BaseIndex):
             self._update_segment_keywords(text.metadata['doc_id'], list(keywords))
             keyword_table = self._add_text_to_keyword_table(keyword_table, text.metadata['doc_id'], list(keywords))
 
-        self._dataset.dataset_keyword_table.keyword_table = json.dumps(keyword_table)
-        db.session.commit()
+        self._save_dataset_keyword_table(keyword_table)
 
     def text_exists(self, id: str) -> bool:
         keyword_table = self._get_dataset_keyword_table()
@@ -57,8 +65,7 @@ class KeywordTableIndex(BaseIndex):
         keyword_table = self._get_dataset_keyword_table()
         keyword_table = self._delete_ids_from_keyword_table(keyword_table, ids)
 
-        self._dataset.dataset_keyword_table.keyword_table = json.dumps(keyword_table)
-        db.session.commit()
+        self._save_dataset_keyword_table(keyword_table)
 
     def delete_by_document_id(self, document_id: str):
         # get segment ids by document_id
@@ -72,8 +79,7 @@ class KeywordTableIndex(BaseIndex):
         keyword_table = self._get_dataset_keyword_table()
         keyword_table = self._delete_ids_from_keyword_table(keyword_table, ids)
 
-        self._dataset.dataset_keyword_table.keyword_table = json.dumps(keyword_table)
-        db.session.commit()
+        self._save_dataset_keyword_table(keyword_table)
 
     def get_retriever(self, **kwargs: Any) -> BaseRetriever:
         return KeywordTableRetriever(index=self, **kwargs)
@@ -108,10 +114,38 @@ class KeywordTableIndex(BaseIndex):
         return documents
 
+    def _save_dataset_keyword_table(self, keyword_table):
+        keyword_table_dict = {
+            '__type__': 'keyword_table',
+            '__data__': {
+                "index_id": self._dataset.id,
+                "summary": None,
+                "table": keyword_table
+            }
+        }
+        self._dataset.dataset_keyword_table.keyword_table = json.dumps(keyword_table_dict, cls=SetEncoder)
+        db.session.commit()
+
     def _get_dataset_keyword_table(self) -> Optional[dict]:
-        keyword_table_dict = self._dataset.dataset_keyword_table.keyword_table_dict
-        if keyword_table_dict:
-            return keyword_table_dict
+        dataset_keyword_table = self._dataset.dataset_keyword_table
+        if dataset_keyword_table:
+            if dataset_keyword_table.keyword_table_dict:
+                return dataset_keyword_table.keyword_table_dict['__data__']['table']
+        else:
+            dataset_keyword_table = DatasetKeywordTable(
+                dataset_id=self._dataset.id,
+                keyword_table=json.dumps({
+                    '__type__': 'keyword_table',
+                    '__data__': {
+                        "index_id": self._dataset.id,
+                        "summary": None,
+                        "table": {}
+                    }
+                }, cls=SetEncoder)
+            )
+            db.session.add(dataset_keyword_table)
+            db.session.commit()
+
         return {}
 
     def _add_text_to_keyword_table(self, keyword_table: dict, id: str, keywords: list[str]) -> dict:
@@ -146,9 +180,9 @@ class KeywordTableIndex(BaseIndex):
         # go through text chunks in order of most matching keywords
         chunk_indices_count: Dict[str, int] = defaultdict(int)
-        keywords = [k for k in keywords if k in set(keyword_table.keys())]
-        for k in keywords:
-            for node_id in keyword_table[k]:
+        keywords = [keyword for keyword in keywords if keyword in set(keyword_table.keys())]
+        for keyword in keywords:
+            for node_id in keyword_table[keyword]:
                 chunk_indices_count[node_id] += 1
 
         sorted_chunk_indices = sorted(
@@ -190,3 +224,9 @@ class KeywordTableRetriever(BaseRetriever, BaseModel):
     async def aget_relevant_documents(self, query: str) -> List[Document]:
        raise NotImplementedError("KeywordTableRetriever does not support async")
+
+class SetEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, set):
+            return list(obj)
+        return super().default(obj)
\ No newline at end of file
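The SetEncoder added at the bottom of this file exists because the in-memory keyword table maps each keyword to a set of segment ids, and the standard json.dumps raises "TypeError: Object of type set is not JSON serializable" on sets. A minimal sketch of the persisted shape, with hypothetical ids:

    import json

    class SetEncoder(json.JSONEncoder):
        def default(self, obj):
            # fall back to a JSON array for sets; defer everything else
            if isinstance(obj, set):
                return list(obj)
            return super().default(obj)

    keyword_table = {'invoice': {'seg-1', 'seg-2'}, 'refund': {'seg-3'}}
    payload = {
        '__type__': 'keyword_table',
        '__data__': {'index_id': 'dataset-id', 'summary': None, 'table': keyword_table},
    }
    blob = json.dumps(payload, cls=SetEncoder)  # sets become JSON arrays
    print(blob)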
api/core/index/vector_index/base.py

@@ -5,6 +5,7 @@ from langchain.schema import Document, BaseRetriever
 from langchain.vectorstores import VectorStore
 
 from core.index.base import BaseIndex
+from models.dataset import Dataset
 
 
 class BaseVectorIndex(BaseIndex):
@@ -12,7 +13,7 @@ class BaseVectorIndex(BaseIndex):
         raise NotImplementedError
 
     @abstractmethod
-    def get_index_name(self, dataset_id: str) -> str:
+    def get_index_name(self, dataset: Dataset) -> str:
         raise NotImplementedError
 
     @abstractmethod
api/core/index/vector_index/qdrant_vector_index.py

@@ -44,13 +44,17 @@ class QdrantVectorIndex(BaseVectorIndex):
     def get_type(self) -> str:
         return 'qdrant'
 
-    def get_index_name(self, dataset_id: str) -> str:
-        return "Vector_index_" + dataset_id.replace("-", "_") + '_Node'
+    def get_index_name(self, dataset: Dataset) -> str:
+        if self._dataset.index_struct_dict:
+            return self._dataset.index_struct_dict['vector_store']['collection_name']
+
+        dataset_id = dataset.id
+        return "Vector_index_" + dataset_id.replace("-", "_")
 
     def to_index_struct(self) -> dict:
         return {
             "type": self.get_type(),
-            "vector_store": {"collection_name": self.get_index_name(self._dataset.id)}
+            "vector_store": {"collection_name": self.get_index_name(self._dataset)}
         }
 
     def create(self, texts: list[Document], **kwargs) -> BaseIndex:
@@ -58,7 +62,7 @@ class QdrantVectorIndex(BaseVectorIndex):
         self._vector_store = QdrantVectorStore.from_documents(
             texts,
             self._embeddings,
-            collection_name=self.get_index_name(self._dataset.id),
+            collection_name=self.get_index_name(self._dataset),
             ids=uuids,
             **self._client_config.to_qdrant_params()
         )
@@ -76,7 +80,7 @@ class QdrantVectorIndex(BaseVectorIndex):
         return QdrantVectorStore(
             client=client,
-            collection_name=self.get_index_name(self._dataset.id),
+            collection_name=self.get_index_name(self._dataset),
             embeddings=self._embeddings
         )
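The signature change lets get_index_name prefer a collection name already persisted in the dataset's index_struct_dict, so existing Qdrant collections keep resolving; only datasets without one fall back to the id-derived name, which no longer appends '_Node'. A standalone sketch of that resolution logic, with hypothetical inputs:

    from typing import Optional

    def resolve_collection_name(dataset_id: str, index_struct_dict: Optional[dict]) -> str:
        # prefer the persisted collection name; derive one from the id otherwise
        if index_struct_dict:
            return index_struct_dict['vector_store']['collection_name']
        return "Vector_index_" + dataset_id.replace("-", "_")

    assert resolve_collection_name('1f2a-3b4c', None) == "Vector_index_1f2a_3b4c"
    assert resolve_collection_name(
        '1f2a-3b4c',
        {'vector_store': {'collection_name': 'Vector_index_1f2a_3b4c_Node'}},
    ) == "Vector_index_1f2a_3b4c_Node"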
api/core/index/vector_index/weaviate_vector_index.py

@@ -59,13 +59,22 @@ class WeaviateVectorIndex(BaseVectorIndex):
     def get_type(self) -> str:
         return 'weaviate'
 
-    def get_index_name(self, dataset_id: str) -> str:
+    def get_index_name(self, dataset: Dataset) -> str:
+        if self._dataset.index_struct_dict:
+            class_prefix: str = self._dataset.index_struct_dict['vector_store']['class_prefix']
+            if not class_prefix.endswith('_Node'):
+                # original class_prefix
+                class_prefix += '_Node'
+
+            return class_prefix
+
+        dataset_id = dataset.id
         return "Vector_index_" + dataset_id.replace("-", "_") + '_Node'
 
     def to_index_struct(self) -> dict:
         return {
             "type": self.get_type(),
-            "vector_store": {"class_prefix": self.get_index_name(self._dataset.id)}
+            "vector_store": {"class_prefix": self.get_index_name(self._dataset)}
         }
 
     def create(self, texts: list[Document], **kwargs) -> BaseIndex:
@@ -74,7 +83,7 @@ class WeaviateVectorIndex(BaseVectorIndex):
             texts,
             self._embeddings,
             client=self._client,
-            index_name=self.get_index_name(self._dataset.id),
+            index_name=self.get_index_name(self._dataset),
             uuids=uuids,
             by_text=False
         )
@@ -88,7 +97,7 @@ class WeaviateVectorIndex(BaseVectorIndex):
         return WeaviateVectorStore(
             client=self._client,
-            index_name=self.get_index_name(self._dataset.id),
+            index_name=self.get_index_name(self._dataset),
             text_key='text',
             embedding=self._embeddings,
             attributes=self._attributes,
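The Weaviate variant adds one more backward-compatibility rule: a legacy class_prefix persisted without the '_Node' suffix is normalized so old and new datasets resolve to the same class name. A standalone sketch of that rule, with hypothetical inputs:

    from typing import Optional

    def resolve_class_name(dataset_id: str, index_struct_dict: Optional[dict]) -> str:
        # a persisted class_prefix wins; legacy prefixes written without the
        # '_Node' suffix are normalized so they keep matching the stored class
        if index_struct_dict:
            class_prefix: str = index_struct_dict['vector_store']['class_prefix']
            if not class_prefix.endswith('_Node'):
                class_prefix += '_Node'
            return class_prefix
        return "Vector_index_" + dataset_id.replace("-", "_") + '_Node'

    assert resolve_class_name('1f2a-3b4c', None) == "Vector_index_1f2a_3b4c_Node"
    assert resolve_class_name(
        '1f2a-3b4c', {'vector_store': {'class_prefix': 'Vector_index_1f2a_3b4c'}}
    ) == "Vector_index_1f2a_3b4c_Node"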
api/core/indexing_runner.py

@@ -329,7 +329,7 @@ class IndexingRunner:
             document_id=dataset_document.id,
             after_indexing_status="splitting",
             extra_update_params={
-                DatasetDocument.word_count: sum([len(text_doc.text) for text_doc in text_docs]),
+                DatasetDocument.word_count: sum([len(text_doc.page_content) for text_doc in text_docs]),
                 DatasetDocument.parsing_completed_at: datetime.datetime.utcnow()
             }
         )
api/models/dataset.py

@@ -395,7 +395,16 @@ class DatasetKeywordTable(db.Model):
     @property
     def keyword_table_dict(self):
-        return json.loads(self.keyword_table) if self.keyword_table else None
+        class SetDecoder(json.JSONDecoder):
+            def __init__(self, *args, **kwargs):
+                super().__init__(object_hook=self.object_hook, *args, **kwargs)
+
+            def object_hook(self, dct):
+                if "__set__" in dct:
+                    return set(dct["__set__"])
+                return dct
+
+        return json.loads(self.keyword_table, cls=SetDecoder) if self.keyword_table else None
 
 
 class Embedding(db.Model):
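A round-trip sketch of the new decoder, with hypothetical values. Note that SetEncoder (in keyword_table_index.py above) emits sets as plain JSON arrays without a "__set__" marker, so the object_hook below only rebuilds sets for payloads that carry that marker explicitly:

    import json

    class SetDecoder(json.JSONDecoder):
        def __init__(self, *args, **kwargs):
            super().__init__(object_hook=self.object_hook, *args, **kwargs)

        def object_hook(self, dct):
            # rebuild a set for any object tagged with the "__set__" marker
            if "__set__" in dct:
                return set(dct["__set__"])
            return dct

    blob = '{"invoice": {"__set__": ["seg-1", "seg-2"]}}'
    table = json.loads(blob, cls=SetDecoder)
    assert table == {"invoice": {"seg-1", "seg-2"}}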