ai-tech / dify · Commits

Commit e2bf1805 (Unverified)
Authored May 23, 2023 by Jyong
Committed by GitHub on May 23, 2023
Fix/dateset update rule (#177)
parent 4350bb9a
Showing 2 changed files with 82 additions and 1 deletion:
- api/services/dataset_service.py (+7, -1)
- api/tasks/deal_dataset_vector_index_task.py (+75, -0)
api/services/dataset_service.py
@@ -18,6 +18,7 @@ from services.errors.account import NoPermissionError
 from services.errors.dataset import DatasetNameDuplicateError
 from services.errors.document import DocumentIndexingError
 from services.errors.file import FileNotExistsError
+from tasks.deal_dataset_vector_index_task import deal_dataset_vector_index_task
 from tasks.document_indexing_task import document_indexing_task
@@ -97,7 +98,12 @@ class DatasetService:
     def update_dataset(dataset_id, data, user):
         dataset = DatasetService.get_dataset(dataset_id)
         DatasetService.check_dataset_permission(dataset, user)
+        if dataset.indexing_technique != data['indexing_technique']:
+            # if update indexing_technique
+            if data['indexing_technique'] == 'economy':
+                deal_dataset_vector_index_task.delay(dataset_id, 'remove')
+            elif data['indexing_technique'] == 'high_quality':
+                deal_dataset_vector_index_task.delay(dataset_id, 'add')
         filtered_data = {k: v for k, v in data.items() if v is not None or k == 'description'}
         filtered_data['updated_by'] = user.id
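For context, a minimal sketch of how this new branch is exercised when a caller switches a dataset's indexing technique. The payload and current_user below are hypothetical; only update_dataset, the 'economy'/'high_quality' values, and deal_dataset_vector_index_task come from the diff above.

    # Hypothetical caller of the updated service method.
    data = {
        'name': 'support-articles',   # fields passed as None are dropped by the filtered_data comprehension
        'description': None,          # 'description' is kept even when None
        'indexing_technique': 'economy',
    }
    # If the dataset was previously 'high_quality', update_dataset enqueues
    # deal_dataset_vector_index_task.delay(dataset_id, 'remove') before applying the update.
    DatasetService.update_dataset(dataset_id, data, current_user)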
api/tasks/deal_dataset_vector_index_task.py (new file, mode 100644)
import logging
import time

import click
from celery import shared_task
from llama_index.data_structs.node_v2 import DocumentRelationship, Node

from core.index.vector_index import VectorIndex
from extensions.ext_database import db
from models.dataset import DocumentSegment, Document, Dataset


@shared_task
def deal_dataset_vector_index_task(dataset_id: str, action: str):
    """
    Async deal dataset from index
    :param dataset_id: dataset_id
    :param action: action
    Usage: deal_dataset_vector_index_task.delay(dataset_id, action)
    """
    logging.info(click.style('Start deal dataset vector index: {}'.format(dataset_id), fg='green'))
    start_at = time.perf_counter()

    try:
        dataset = Dataset.query.filter_by(id=dataset_id).first()
        if not dataset:
            raise Exception('Dataset not found')

        documents = Document.query.filter_by(dataset_id=dataset_id).all()
        if documents:
            vector_index = VectorIndex(dataset=dataset)
            for document in documents:
                # delete from vector index
                if action == "remove":
                    vector_index.del_doc(document.id)
                elif action == "add":
                    segments = db.session.query(DocumentSegment).filter(
                        DocumentSegment.document_id == document.id,
                        DocumentSegment.enabled == True
                    ).order_by(DocumentSegment.position.asc()).all()

                    nodes = []
                    previous_node = None
                    for segment in segments:
                        relationships = {
                            DocumentRelationship.SOURCE: document.id
                        }

                        if previous_node:
                            relationships[DocumentRelationship.PREVIOUS] = previous_node.doc_id
                            previous_node.relationships[DocumentRelationship.NEXT] = segment.index_node_id

                        node = Node(
                            doc_id=segment.index_node_id,
                            doc_hash=segment.index_node_hash,
                            text=segment.content,
                            extra_info=None,
                            node_info=None,
                            relationships=relationships
                        )

                        previous_node = node
                        nodes.append(node)

                    # save vector index
                    vector_index.add_nodes(
                        nodes=nodes,
                        duplicate_check=True
                    )

        end_at = time.perf_counter()
        logging.info(
            click.style('Deal dataset vector index: {} latency: {}'.format(dataset_id, end_at - start_at),
                        fg='green'))
    except Exception:
        logging.exception("Deal dataset vector index failed")
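As the docstring indicates, the task is meant to be dispatched asynchronously via Celery. A minimal usage sketch, assuming a Celery worker is running; the dataset_id value is a placeholder:

    from tasks.deal_dataset_vector_index_task import deal_dataset_vector_index_task

    # Re-add every enabled segment of the dataset's documents to the vector index
    # (the 'high_quality' path taken by DatasetService.update_dataset).
    deal_dataset_vector_index_task.delay('your-dataset-uuid', 'add')

    # Remove the dataset's documents from the vector index (the 'economy' path).
    deal_dataset_vector_index_task.delay('your-dataset-uuid', 'remove')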