ai-tech / dify · Commits · 4354b680

Commit 4354b680, authored Jun 14, 2023 by jyong

    add get index estimate by batch

Parent: 44fcd1c1

Showing 3 changed files, with 85 additions and 4 deletions (+85 −4):
api/controllers/console/datasets/datasets_document.py   +69 −0
api/core/indexing_runner.py                               +1 −1
api/services/dataset_service.py                          +15 −3
api/controllers/console/datasets/datasets_document.py (view file @ 4354b680)
@@ -351,6 +351,73 @@ class DocumentIndexingEstimateApi(DocumentResource):
         return response
 
 
+class DocumentBatchIndexingEstimateApi(DocumentResource):
+
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def get(self, dataset_id, batch):
+        dataset_id = str(dataset_id)
+        batch = str(batch)
+        dataset = DatasetService.get_dataset(dataset_id)
+        if dataset is None:
+            raise NotFound("Dataset not found.")
+        documents = self.get_batch_documents(dataset_id, batch)
+        response = {
+            "tokens": 0,
+            "total_price": 0,
+            "currency": "USD",
+            "total_segments": 0,
+            "preview": []
+        }
+        if not documents:
+            return response
+        data_process_rule = documents[0].dataset_process_rule
+        data_process_rule_dict = data_process_rule.to_dict()
+        info_list = []
+        for document in documents:
+            if document.indexing_status in ['completed', 'error']:
+                raise DocumentAlreadyFinishedError()
+            data_source_info = document.data_source_info_dict
+            # format document files info
+            if data_source_info and 'upload_file_id' in data_source_info:
+                file_id = data_source_info['upload_file_id']
+                info_list.append(file_id)
+            # format document notion info
+            elif data_source_info and 'notion_workspace_id' in data_source_info \
+                    and 'notion_page_id' in data_source_info:
+                pages = []
+                page = {
+                    'page_id': data_source_info['notion_page_id'],
+                    'type': data_source_info['type']
+                }
+                pages.append(page)
+                notion_info = {
+                    'workspace_id': data_source_info['notion_workspace_id'],
+                    'pages': pages
+                }
+                info_list.append(notion_info)
+
+        if dataset.data_source_type == 'upload_file':
+            file_details = db.session.query(UploadFile).filter(
+                UploadFile.tenant_id == current_user.current_tenant_id,
+                UploadFile.id in info_list
+            ).all()
+
+            if file_details is None:
+                raise NotFound("File not found.")
+
+            indexing_runner = IndexingRunner()
+            response = indexing_runner.file_indexing_estimate(file_details, data_process_rule_dict)
+        elif dataset.data_source_type:
+            indexing_runner = IndexingRunner()
+            response = indexing_runner.notion_indexing_estimate(info_list, data_process_rule_dict)
+        else:
+            raise ValueError('Data source type not support')
+
+        return response
+
+
 class DocumentBatchIndexingStatusApi(DocumentResource):
     document_status_fields = {
         'id': fields.String,
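A note on the new class: UploadFile.id in info_list is evaluated as a plain Python membership test while the query is being built, not compiled into a SQL IN clause, and .all() returns a list (possibly empty), never None, so the file_details is None check cannot fire. The conventional SQLAlchemy spelling, sketched here with the same names as the diff uses:

    # Sketch only, reusing the names from the diff above.
    file_details = db.session.query(UploadFile).filter(
        UploadFile.tenant_id == current_user.current_tenant_id,
        UploadFile.id.in_(info_list)      # .in_() compiles to `id IN (...)`
    ).all()
    if not file_details:                  # .all() returns [], never None
        raise NotFound("File not found.")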
@@ -750,6 +817,8 @@ api.add_resource(DatasetInitApi,
                  '/datasets/init')
 api.add_resource(DocumentIndexingEstimateApi,
                  '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-estimate')
+api.add_resource(DocumentBatchIndexingEstimateApi,
+                 '/datasets/<uuid:dataset_id>/batch/<uuid:batch>/indexing-estimate')
 api.add_resource(DocumentBatchIndexingStatusApi,
                  '/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-status')
 api.add_resource(DocumentIndexingStatusApi,
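With these routes in place, the console API exposes a per-batch estimate endpoint next to the existing per-batch status endpoint. One wrinkle: the new estimate route declares the batch segment as <uuid:batch>, while the status route uses <string:batch>; batch values (see dataset_service.py below) are timestamp-plus-random strings that Flask's uuid converter would not match, so the string form looks like the intended one and is what the sketch below assumes. A hypothetical client call, with placeholder host, ids, and auth (none of these specifics are in the commit):

    import requests

    # All specifics here (base URL, ids, session auth) are assumptions.
    base = "http://localhost:5001/console/api"
    dataset_id = "d9f5f3a0-7c2e-4b1a-9d3e-5f6a7b8c9d0e"
    batch = "20230614093012123456"

    resp = requests.get(
        f"{base}/datasets/{dataset_id}/batch/{batch}/indexing-estimate",
        cookies={"session": "<console session cookie>"},
    )
    # Expected shape: {"tokens": ..., "total_price": ..., "currency": "USD",
    #                  "total_segments": ..., "preview": [...]}
    print(resp.json())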
api/core/indexing_runner.py (view file @ 4354b680)
@@ -210,7 +210,7 @@ class IndexingRunner:
             "preview": preview_texts
         }
 
-    def notion_indexing_estimate(self, notion_info_list: dict, tmp_processing_rule: dict) -> dict:
+    def notion_indexing_estimate(self, notion_info_list: list, tmp_processing_rule: dict) -> dict:
         """
         Estimate the indexing for the document.
         """
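The annotation change from dict to list matches the new call site: DocumentBatchIndexingEstimateApi passes info_list, a list of per-workspace Notion descriptors. Its shape, as assembled in the controller above (ids are placeholders):

    # Shape of notion_info_list as built by DocumentBatchIndexingEstimateApi.
    notion_info_list = [
        {
            'workspace_id': 'notion-workspace-id',
            'pages': [
                {'page_id': 'notion-page-id', 'type': 'page'},
            ],
        },
    ]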
api/services/dataset_service.py (view file @ 4354b680)
@@ -384,6 +384,7 @@ class DocumentService:
         if dataset.indexing_technique == 'high_quality':
             IndexBuilder.get_default_service_context(dataset.tenant_id)
         documents = []
+        batch = time.strftime('%Y%m%d%H%M%S') + str(random.randint(100000, 999999))
         if 'original_document_id' in document_data and document_data["original_document_id"]:
             document = DocumentService.update_document_with_dataset_id(dataset, document_data, account)
             documents.append(document)
@@ -408,7 +409,6 @@ class DocumentService:
             db.session.add(dataset_process_rule)
             db.session.commit()
         position = DocumentService.get_documents_position(dataset.id)
-        batch = time.strftime('%Y%m%d%H%M%S') + str(random.randint(100000, 999999))
         document_ids = []
         if document_data["data_source"]["type"] == "upload_file":
             upload_file_list = document_data["data_source"]["info_list"]['file_info_list']['file_ids']
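These two hunks move the batch assignment ahead of the original_document_id branch, evidently so that a batch value exists on the update path as well as the create path. The format itself is unchanged; a minimal reproduction:

    import random
    import time

    # A 14-digit local-time timestamp plus a 6-digit random suffix,
    # e.g. '20230614093012483920'.
    batch = time.strftime('%Y%m%d%H%M%S') + str(random.randint(100000, 999999))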
@@ -466,15 +466,26 @@ class DocumentService:
                     if page['page_id'] not in exist_page_ids:
                         data_source_info = {
                             "notion_workspace_id": workspace_id,
-                            "notion_page_id": page['page_id']
+                            "notion_page_id": page['page_id'],
+                            "type": page['type']
                         }
                         document = DocumentService.save_document(dataset, dataset_process_rule.id,
                                                                  document_data["data_source"]["type"],
                                                                  data_source_info, created_from, position,
                                                                  account, page['page_name'], batch)
+                        if page['type'] == 'database':
+                            document.splitting_completed_at = datetime.datetime.utcnow()
+                            document.cleaning_completed_at = datetime.datetime.utcnow()
+                            document.parsing_completed_at = datetime.datetime.utcnow()
+                            document.completed_at = datetime.datetime.utcnow()
+                            document.indexing_status = 'completed'
+                            document.word_count = 0
+                            document.tokens = 0
+                            document.indexing_latency = 0
                         db.session.add(document)
                         db.session.flush()
-                        document_ids.append(document.id)
+                        if page['type'] != 'database':
+                            document_ids.append(document.id)
                         documents.append(document)
                         position += 1
                     else:
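Reading this hunk: Notion pages of type 'database' are now saved with every pipeline timestamp stamped, indexing_status set to 'completed', and zero word, token, and latency counts, and they are left out of document_ids. Since document_ids is presumably what feeds the async indexing task, database pages end up recorded but skipped by the indexer, while ordinary pages continue through the pipeline; storing the page 'type' in data_source_info lets downstream code tell the two apart.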
@@ -571,6 +582,7 @@ class DocumentService:
                     data_source_info = {
                         "notion_workspace_id": workspace_id,
                         "notion_page_id": page['page_id'],
+                        "type": page['type']
                     }
                     document.data_source_type = document_data["data_source"]["type"]
                     document.data_source_info = json.dumps(data_source_info)