ai-tech / dify · Commits · 289c93d0

Unverified commit 289c93d0, authored Oct 12, 2023 by Jyong, committed by GitHub on Oct 12, 2023.

Feat/improve document delete logic (#1325)

Co-authored-by: jyong <jyong@dify.ai>
parent c0fe7065
Showing 4 changed files with 34 additions and 26 deletions (+34 / -26):

api/core/indexing_runner.py          +14  -3
api/services/dataset_service.py       +0  -3
api/tasks/clean_document_task.py     +15 -13
api/tasks/document_indexing_task.py   +5  -7
api/core/indexing_runner.py

...
@@ -11,6 +11,7 @@ from flask import current_app, Flask
 from flask_login import current_user
 from langchain.schema import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
+from sqlalchemy.orm.exc import ObjectDeletedError
 from core.data_loader.file_extractor import FileExtractor
 from core.data_loader.loader.notion import NotionLoader
...
@@ -79,6 +80,8 @@ class IndexingRunner:
                 dataset_document.error = str(e.description)
                 dataset_document.stopped_at = datetime.datetime.utcnow()
                 db.session.commit()
+            except ObjectDeletedError:
+                logging.warning('Document deleted, document id: {}'.format(dataset_document.id))
             except Exception as e:
                 logging.exception("consume document failed")
                 dataset_document.indexing_status = 'error'
...
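The new handler relies on SQLAlchemy raising ObjectDeletedError when the row backing a loaded ORM object has been deleted and the object is refreshed. A minimal sketch of the pattern, with a caller-supplied process_document standing in for the real indexing steps (that helper is illustrative, not from the diff):

import logging

from sqlalchemy.orm.exc import ObjectDeletedError


def index_documents(dataset_documents, process_document):
    """Illustrative loop: skip documents deleted while indexing is in flight."""
    for dataset_document in dataset_documents:
        try:
            process_document(dataset_document)
        except ObjectDeletedError:
            # The row behind this ORM object was deleted (e.g. by the new
            # delete flow); log and continue instead of failing the batch.
            logging.warning('Document deleted, document id: {}'.format(dataset_document.id))
        except Exception:
            logging.exception("consume document failed")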
...
@@ -276,7 +279,8 @@ class IndexingRunner:
                 )
             if len(preview_texts) > 0:
                 # qa model document
                 response = LLMGenerator.generate_qa_document(current_user.current_tenant_id, preview_texts[0], doc_language)
                 document_qa_list = self.format_split_text(response)
                 return {
                     "total_segments": total_segments * 20,
...
...
@@ -372,7 +376,8 @@ class IndexingRunner:
                 )
             if len(preview_texts) > 0:
                 # qa model document
                 response = LLMGenerator.generate_qa_document(current_user.current_tenant_id, preview_texts[0], doc_language)
                 document_qa_list = self.format_split_text(response)
                 return {
                     "total_segments": total_segments * 20,
...
...
@@ -582,7 +587,6 @@ class IndexingRunner:
             all_qa_documents.extend(format_documents)

     def _split_to_documents_for_estimate(self, text_docs: List[Document], splitter: TextSplitter,
                                          processing_rule: DatasetProcessRule) -> List[Document]:
         """
...
...
@@ -734,6 +738,9 @@ class IndexingRunner:
         count = DatasetDocument.query.filter_by(id=document_id, is_paused=True).count()
         if count > 0:
             raise DocumentIsPausedException()
+        document = DatasetDocument.query.filter_by(id=document_id).first()
+        if not document:
+            raise DocumentIsDeletedPausedException()

         update_params = {
             DatasetDocument.indexing_status: after_indexing_status
...
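In context, the guard means a status update now fails fast with a dedicated exception when the document has been deleted mid-run, rather than silently updating nothing. A hedged sketch of the guarded update; the helper name, import paths, and session handling are assumptions based on the surrounding repo, and the two exception classes are the ones added at the bottom of this file:

from extensions.ext_database import db
from models.dataset import Document as DatasetDocument


def update_document_index_status(document_id, after_indexing_status):
    # A paused document must not advance.
    count = DatasetDocument.query.filter_by(id=document_id, is_paused=True).count()
    if count > 0:
        raise DocumentIsPausedException()

    # New in this commit: abort with a dedicated exception if the document
    # was deleted while indexing was still running.
    document = DatasetDocument.query.filter_by(id=document_id).first()
    if not document:
        raise DocumentIsDeletedPausedException()

    DatasetDocument.query.filter_by(id=document_id).update(
        {DatasetDocument.indexing_status: after_indexing_status}
    )
    db.session.commit()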
...
@@ -781,3 +788,7 @@ class IndexingRunner:
 class DocumentIsPausedException(Exception):
     pass
+
+
+class DocumentIsDeletedPausedException(Exception):
+    pass
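The two module-level exceptions let callers tell "stopped because paused" apart from "stopped because deleted". The diff does not show the call sites, so the following is only an assumed illustration of how a caller might branch on them (module path and the no-argument IndexingRunner constructor are taken from the repo layout):

import logging

from core.indexing_runner import (
    DocumentIsDeletedPausedException,
    DocumentIsPausedException,
    IndexingRunner,
)


def run_indexing(dataset_documents):
    try:
        IndexingRunner().run(dataset_documents)
    except DocumentIsPausedException:
        logging.info('Indexing stopped: document was paused.')
    except DocumentIsDeletedPausedException:
        logging.info('Indexing stopped: document was deleted mid-run.')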
api/services/dataset_service.py

...
@@ -385,9 +385,6 @@ class DocumentService:
     @staticmethod
     def delete_document(document):
-        if document.indexing_status in ["parsing", "cleaning", "splitting", "indexing"]:
-            raise DocumentIndexingError()
-
         # trigger document_was_deleted signal
         document_was_deleted.send(document.id, dataset_id=document.dataset_id)
...
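With the guard removed, delete_document no longer rejects documents that are still parsing, cleaning, splitting, or indexing; deletion fires the signal and the indexing side recovers via ObjectDeletedError and DocumentIsDeletedPausedException above. A minimal sketch of the resulting service method, with assumed import paths and with the trailing delete/commit shown only for completeness (the diff elides the rest of the method):

from events.document_event import document_was_deleted
from extensions.ext_database import db


class DocumentService:

    @staticmethod
    def delete_document(document):
        # The old indexing_status pre-check is gone: a document may now be
        # deleted even while it is being indexed. Downstream cleanup of
        # segments and indexes is triggered by the signal.
        document_was_deleted.send(document.id, dataset_id=document.dataset_id)

        db.session.delete(document)
        db.session.commit()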
api/tasks/clean_document_task.py

...
@@ -31,6 +31,8 @@ def clean_document_task(document_id: str, dataset_id: str):
         kw_index = IndexBuilder.get_index(dataset, 'economy')

         segments = db.session.query(DocumentSegment).filter(DocumentSegment.document_id == document_id).all()
+        # check segment is exist
+        if segments:
             index_node_ids = [segment.index_node_id for segment in segments]
             # delete from vector index
...
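The new "if segments:" guard makes the cleanup task skip index removal entirely when a document has no segments, for example when it is deleted before splitting ever produced any. A hedged sketch of the guarded cleanup; the model/session imports and the indexes' delete_by_ids API are assumptions based on the visible code, and the enclosing Celery task is omitted:

from extensions.ext_database import db
from models.dataset import DocumentSegment


def clean_up_document_segments(document_id, vector_index, kw_index):
    segments = db.session.query(DocumentSegment).filter(
        DocumentSegment.document_id == document_id
    ).all()

    # check segment is exist
    if segments:
        index_node_ids = [segment.index_node_id for segment in segments]

        # delete from vector index (method name assumed from the repo's index API)
        if vector_index:
            vector_index.delete_by_ids(index_node_ids)

        # delete from keyword index
        kw_index.delete_by_ids(index_node_ids)

        # finally remove the segment rows themselves
        for segment in segments:
            db.session.delete(segment)
        db.session.commit()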
api/tasks/document_indexing_task.py

...
@@ -30,9 +30,7 @@ def document_indexing_task(dataset_id: str, document_ids: list):
                 Document.dataset_id == dataset_id
             ).first()

-            if not document:
-                raise NotFound('Document not found')
-
-            document.indexing_status = 'parsing'
-            document.processing_started_at = datetime.datetime.utcnow()
-            documents.append(document)
+            if document:
+                document.indexing_status = 'parsing'
+                document.processing_started_at = datetime.datetime.utcnow()
+                documents.append(document)
...
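Taken together with the service change, the indexing task now tolerates documents that disappear between being queued and being processed: missing documents are skipped instead of aborting the whole batch with NotFound. A minimal sketch of that selection loop, assuming the repo's db session and Document model and omitting the surrounding Celery task:

import datetime

from extensions.ext_database import db
from models.dataset import Document


def collect_documents_to_index(dataset_id, document_ids):
    documents = []
    for document_id in document_ids:
        document = db.session.query(Document).filter(
            Document.id == document_id,
            Document.dataset_id == dataset_id
        ).first()

        # A document deleted after the task was queued is skipped rather than
        # failing the whole batch with NotFound.
        if document:
            document.indexing_status = 'parsing'
            document.processing_started_at = datetime.datetime.utcnow()
            documents.append(document)

    db.session.commit()
    return documents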