Commit 4354b680 authored by jyong

add get index estimate by batch

parent 44fcd1c1
@@ -351,6 +351,73 @@ class DocumentIndexingEstimateApi(DocumentResource):
         return response
+
+
+class DocumentBatchIndexingEstimateApi(DocumentResource):
+
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def get(self, dataset_id, batch):
+        dataset_id = str(dataset_id)
+        batch = str(batch)
+        dataset = DatasetService.get_dataset(dataset_id)
+        if dataset is None:
+            raise NotFound("Dataset not found.")
+        documents = self.get_batch_documents(dataset_id, batch)
+        response = {
+            "tokens": 0,
+            "total_price": 0,
+            "currency": "USD",
+            "total_segments": 0,
+            "preview": []
+        }
+        if not documents:
+            return response
+        data_process_rule = documents[0].dataset_process_rule
+        data_process_rule_dict = data_process_rule.to_dict()
+        info_list = []
+        for document in documents:
+            if document.indexing_status in ['completed', 'error']:
+                raise DocumentAlreadyFinishedError()
+            data_source_info = document.data_source_info_dict
+            # format document files info
+            if data_source_info and 'upload_file_id' in data_source_info:
+                file_id = data_source_info['upload_file_id']
+                info_list.append(file_id)
+            # format document notion info
+            elif data_source_info and 'notion_workspace_id' in data_source_info \
+                    and 'notion_page_id' in data_source_info:
+                pages = []
+                page = {
+                    'page_id': data_source_info['notion_page_id'],
+                    'type': data_source_info['type']
+                }
+                pages.append(page)
+                notion_info = {
+                    'workspace_id': data_source_info['notion_workspace_id'],
+                    'pages': pages
+                }
+                info_list.append(notion_info)
+
+        if dataset.data_source_type == 'upload_file':
+            file_details = db.session.query(UploadFile).filter(
+                UploadFile.tenant_id == current_user.current_tenant_id,
+                UploadFile.id.in_(info_list)
+            ).all()
+
+            if not file_details:
+                raise NotFound("File not found.")
+
+            indexing_runner = IndexingRunner()
+            response = indexing_runner.file_indexing_estimate(file_details, data_process_rule_dict)
+        elif dataset.data_source_type == 'notion_import':
+            indexing_runner = IndexingRunner()
+            response = indexing_runner.notion_indexing_estimate(info_list,
+                                                                data_process_rule_dict)
+        else:
+            raise ValueError('Data source type is not supported.')
+
+        return response
+
+
 class DocumentBatchIndexingStatusApi(DocumentResource):
     document_status_fields = {
         'id': fields.String,
@@ -750,6 +817,8 @@ api.add_resource(DatasetInitApi,
                  '/datasets/init')
 api.add_resource(DocumentIndexingEstimateApi,
                  '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-estimate')
+api.add_resource(DocumentBatchIndexingEstimateApi,
+                 '/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-estimate')
 api.add_resource(DocumentBatchIndexingStatusApi,
                  '/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-status')
 api.add_resource(DocumentIndexingStatusApi,
...
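For reference, a minimal sketch of calling the new batch estimate endpoint from a client. The base URL and the auth header are assumptions for illustration, not part of this commit; the response shape mirrors the controller's default response above.

```python
import requests

# Assumed deployment details -- adjust for your installation.
BASE_URL = "https://example.com/console/api"  # hypothetical console API root
HEADERS = {"Authorization": "Bearer <console-session-token>"}  # placeholder auth


def get_batch_indexing_estimate(dataset_id: str, batch: str) -> dict:
    """Call the route registered above: one estimate covering every document in a batch."""
    url = f"{BASE_URL}/datasets/{dataset_id}/batch/{batch}/indexing-estimate"
    resp = requests.get(url, headers=HEADERS)
    resp.raise_for_status()
    # Expected keys, per the controller's default response:
    # {"tokens": ..., "total_price": ..., "currency": "USD",
    #  "total_segments": ..., "preview": [...]}
    return resp.json()
```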
@@ -210,7 +210,7 @@ class IndexingRunner:
             "preview": preview_texts
         }
 
-    def notion_indexing_estimate(self, notion_info_list: dict, tmp_processing_rule: dict) -> dict:
+    def notion_indexing_estimate(self, notion_info_list: list, tmp_processing_rule: dict) -> dict:
         """
         Estimate the indexing for the document.
         """
...
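The corrected type hint reflects how the controller above now builds the argument: a list of per-workspace dicts rather than a single dict. A sketch of the expected shape, inferred from DocumentBatchIndexingEstimateApi; the IDs are placeholders, not real Notion identifiers.

```python
# Shape inferred from the controller above.
notion_info_list = [
    {
        'workspace_id': 'workspace-123',   # data_source_info['notion_workspace_id']
        'pages': [
            {
                'page_id': 'page-456',     # data_source_info['notion_page_id']
                'type': 'page',            # data_source_info['type']; 'database' is the other value
            },
        ],
    },
]

# The controller passes this straight through:
#     IndexingRunner().notion_indexing_estimate(notion_info_list, data_process_rule_dict)
```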
@@ -384,6 +384,7 @@ class DocumentService:
         if dataset.indexing_technique == 'high_quality':
             IndexBuilder.get_default_service_context(dataset.tenant_id)
         documents = []
+        batch = time.strftime('%Y%m%d%H%M%S') + str(random.randint(100000, 999999))
         if 'original_document_id' in document_data and document_data["original_document_id"]:
             document = DocumentService.update_document_with_dataset_id(dataset, document_data, account)
             documents.append(document)
@@ -408,7 +409,6 @@ class DocumentService:
             db.session.add(dataset_process_rule)
             db.session.commit()
             position = DocumentService.get_documents_position(dataset.id)
-            batch = time.strftime('%Y%m%d%H%M%S') + str(random.randint(100000, 999999))
             document_ids = []
             if document_data["data_source"]["type"] == "upload_file":
                 upload_file_list = document_data["data_source"]["info_list"]['file_info_list']['file_ids']
@@ -466,14 +466,25 @@ class DocumentService:
                         if page['page_id'] not in exist_page_ids:
                             data_source_info = {
                                 "notion_workspace_id": workspace_id,
-                                "notion_page_id": page['page_id']
+                                "notion_page_id": page['page_id'],
+                                "type": page['type']
                             }
                             document = DocumentService.save_document(dataset, dataset_process_rule.id,
                                                                      document_data["data_source"]["type"],
                                                                      data_source_info, created_from, position,
                                                                      account, page['page_name'], batch)
+                            # Notion databases are saved but not indexed, so mark them completed up front.
+                            if page['type'] == 'database':
+                                document.splitting_completed_at = datetime.datetime.utcnow()
+                                document.cleaning_completed_at = datetime.datetime.utcnow()
+                                document.parsing_completed_at = datetime.datetime.utcnow()
+                                document.completed_at = datetime.datetime.utcnow()
+                                document.indexing_status = 'completed'
+                                document.word_count = 0
+                                document.tokens = 0
+                                document.indexing_latency = 0
                             db.session.add(document)
                             db.session.flush()
-                            document_ids.append(document.id)
+                            # Only non-database documents go on to the indexing task.
+                            if page['type'] != 'database':
+                                document_ids.append(document.id)
                             documents.append(document)
                             position += 1
@@ -571,6 +582,7 @@ class DocumentService:
                 data_source_info = {
                     "notion_workspace_id": workspace_id,
                     "notion_page_id": page['page_id'],
+                    "type": page['type']
                 }
                 document.data_source_type = document_data["data_source"]["type"]
                 document.data_source_info = json.dumps(data_source_info)
...
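The batch identifier generated above is a local-time timestamp plus a random six-digit suffix. A quick sketch of the same expression, with illustrative output:

```python
import random
import time

# Same expression the service uses: 14-digit local timestamp + 6 random digits.
batch = time.strftime('%Y%m%d%H%M%S') + str(random.randint(100000, 999999))
print(batch)  # e.g. '20230515123045417382' -- 20 characters, digits only
```

Note the value is purely numeric, which is why the routes above match it with `<string:batch>` rather than a UUID converter; collisions are unlikely but not impossible for batches created within the same second.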