Commit a1aee4e7 authored by jyong's avatar jyong

Add integration dataset and document sync from Notion

parent a842f1c2
...@@ -63,5 +63,25 @@ class OAuthDataSourceCallback(Resource): ...@@ -63,5 +63,25 @@ class OAuthDataSourceCallback(Resource):
return redirect(f'{current_app.config.get("CONSOLE_URL")}?oauth_data_source=success') return redirect(f'{current_app.config.get("CONSOLE_URL")}?oauth_data_source=success')
class OAuthDataSourceSync(Resource):
    """Endpoint that re-synchronises an existing OAuth data source binding."""

    def get(self, provider, binding_id):
        """Ask the OAuth provider to refresh the binding identified by ``binding_id``.

        :param provider: provider key looked up in ``get_oauth_providers()``.
        :param binding_id: id of the data source binding (a UUID from the route).
        :return: ``({'result': 'success'}, 200)`` on success, or
            ``({'error': ...}, 400)`` when the provider is unknown or the
            provider's HTTP call fails.
        """
        provider = str(provider)
        binding_id = str(binding_id)
        oauth_providers = get_oauth_providers()
        with current_app.app_context():
            oauth_provider = oauth_providers.get(provider)
        if not oauth_provider:
            return {'error': 'Invalid provider'}, 400

        try:
            oauth_provider.sync_data_source(binding_id)
        except requests.exceptions.HTTPError as e:
            # An HTTPError is not guaranteed to carry a response object;
            # guard the attribute access so the handler itself cannot fail.
            response = getattr(e, 'response', None)
            detail = response.text if response is not None else str(e)
            logging.exception(
                "An error occurred during the OAuth data source sync process with %s: %s",
                provider, detail)
            return {'error': 'OAuth data source process failed'}, 400

        return {'result': 'success'}, 200
api.add_resource(OAuthDataSource, '/oauth/data-source/<string:provider>') api.add_resource(OAuthDataSource, '/oauth/data-source/<string:provider>')
api.add_resource(OAuthDataSourceCallback, '/oauth/data-source/callback/<string:provider>') api.add_resource(OAuthDataSourceCallback, '/oauth/data-source/callback/<string:provider>')
# Route for re-syncing one data source binding of the given provider.
api.add_resource(OAuthDataSourceSync, '/oauth/data-source/<string:provider>/<uuid:binding_id>/sync')
...@@ -18,6 +18,7 @@ from libs.oauth_data_source import NotionOAuth ...@@ -18,6 +18,7 @@ from libs.oauth_data_source import NotionOAuth
from models.dataset import Document from models.dataset import Document
from models.source import DataSourceBinding from models.source import DataSourceBinding
from services.dataset_service import DatasetService, DocumentService from services.dataset_service import DatasetService, DocumentService
from tasks.document_indexing_sync_task import document_indexing_sync_task
cache = TTLCache(maxsize=None, ttl=30) cache = TTLCache(maxsize=None, ttl=30)
...@@ -231,7 +232,7 @@ class DataSourceNotionApi(Resource): ...@@ -231,7 +232,7 @@ class DataSourceNotionApi(Resource):
return response, 200 return response, 200
class DataSourceNotionSyncApi(Resource): class DataSourceNotionDatasetSyncApi(Resource):
@setup_required @setup_required
@login_required @login_required
...@@ -244,7 +245,26 @@ class DataSourceNotionSyncApi(Resource): ...@@ -244,7 +245,26 @@ class DataSourceNotionSyncApi(Resource):
documents = DocumentService.get_document_by_dataset_id(dataset_id_str) documents = DocumentService.get_document_by_dataset_id(dataset_id_str)
for document in documents: for document in documents:
document_indexing_sync_task.delay(dataset_id, document.id)
return 200
class DataSourceNotionDocumentSyncApi(Resource):
    """Endpoint that queues a sync task for a single document of a dataset."""

    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        """Validate the dataset/document pair and enqueue the sync task.

        :param dataset_id: dataset id taken from the route (UUID).
        :param document_id: document id taken from the route (UUID).
        :return: the bare value ``200`` once the task has been queued.
        :raises NotFound: when the dataset or the document does not exist.
        """
        dataset_id_str = str(dataset_id)
        document_id_str = str(document_id)

        dataset = DatasetService.get_dataset(dataset_id_str)
        if dataset is None:
            raise NotFound("Dataset not found.")

        document = DocumentService.get_document(dataset_id_str, document_id_str)
        if document is None:
            raise NotFound("Document not found.")

        # The task is declared with string ids; pass strings rather than the
        # raw UUID route value so the task payload has a predictable type.
        document_indexing_sync_task.delay(dataset_id_str, str(document.id))
        return 200
...@@ -252,4 +272,5 @@ api.add_resource(DataSourceApi, '/data-source/integrates', '/data-source/integra ...@@ -252,4 +272,5 @@ api.add_resource(DataSourceApi, '/data-source/integrates', '/data-source/integra
api.add_resource(DataSourceNotionListApi, '/notion/pre-import/pages') api.add_resource(DataSourceNotionListApi, '/notion/pre-import/pages')
api.add_resource(DataSourceNotionApi, '/notion/workspaces/<uuid:workspace_id>/pages/<uuid:page_id>/preview', api.add_resource(DataSourceNotionApi, '/notion/workspaces/<uuid:workspace_id>/pages/<uuid:page_id>/preview',
'/datasets/notion-indexing-estimate') '/datasets/notion-indexing-estimate')
api.add_resource(DataSourceNotionSyncApi, '/datasets/<uuid:dataset_id>/notion/sync') api.add_resource(DataSourceNotionDatasetSyncApi, '/datasets/<uuid:dataset_id>/notion/sync')
# Route for queueing a sync of a single document inside a dataset.
api.add_resource(DataSourceNotionDocumentSyncApi, '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/notion/sync')
...@@ -84,6 +84,29 @@ class NotionOAuth(OAuthDataSource): ...@@ -84,6 +84,29 @@ class NotionOAuth(OAuthDataSource):
db.session.add(new_data_source_binding) db.session.add(new_data_source_binding)
db.session.commit() db.session.commit()
def sync_data_source(self, binding_id: str):
    """Refresh the stored page list of an existing Notion binding.

    Looks up the enabled Notion binding with id ``binding_id`` for the
    current user's tenant, fetches the pages it is authorized for, and
    writes them (with their count) back into the binding's ``source_info``.

    :param binding_id: id of the data source binding to refresh.
    :raises ValueError: when no matching enabled binding exists.
    """
    data_source_binding = DataSourceBinding.query.filter(
        db.and_(
            DataSourceBinding.tenant_id == current_user.current_tenant_id,
            DataSourceBinding.provider == 'notion',
            DataSourceBinding.id == binding_id,
            DataSourceBinding.disabled == False  # noqa: E712 - builds a SQL expression
        )
    ).first()
    if not data_source_binding:
        raise ValueError('Data source binding not found')

    # get all authorized pages
    pages = self.get_authorized_pages(data_source_binding.access_token)

    # NOTE(review): source_info looks like a JSON column, in which case the
    # ORM already returns a dict and json.loads() would raise TypeError.
    # Accept both a serialized string and an already-decoded mapping.
    stored_info = data_source_binding.source_info
    if isinstance(stored_info, (str, bytes, bytearray)):
        source_info = json.loads(stored_info)
    else:
        # Copy so that a new object is assigned back to the attribute.
        source_info = dict(stored_info or {})

    source_info['pages'] = pages
    source_info['total'] = len(pages)
    data_source_binding.source_info = source_info
    data_source_binding.disabled = False
    db.session.add(data_source_binding)
    db.session.commit()
def get_authorized_pages(self, access_token: str): def get_authorized_pages(self, access_token: str):
pages = [] pages = []
data = { data = {
......
...@@ -6,12 +6,14 @@ import click ...@@ -6,12 +6,14 @@ import click
from celery import shared_task from celery import shared_task
from werkzeug.exceptions import NotFound from werkzeug.exceptions import NotFound
from core.data_source.notion import NotionPageReader
from core.index.keyword_table_index import KeywordTableIndex from core.index.keyword_table_index import KeywordTableIndex
from core.index.vector_index import VectorIndex from core.index.vector_index import VectorIndex
from core.indexing_runner import IndexingRunner, DocumentIsPausedException from core.indexing_runner import IndexingRunner, DocumentIsPausedException
from core.llm.error import ProviderTokenNotInitError from core.llm.error import ProviderTokenNotInitError
from extensions.ext_database import db from extensions.ext_database import db
from models.dataset import Document, Dataset, DocumentSegment from models.dataset import Document, Dataset, DocumentSegment
from models.source import DataSourceBinding
@shared_task @shared_task
...@@ -21,9 +23,9 @@ def document_indexing_sync_task(dataset_id: str, document_id: str): ...@@ -21,9 +23,9 @@ def document_indexing_sync_task(dataset_id: str, document_id: str):
:param dataset_id: :param dataset_id:
:param document_id: :param document_id:
Usage: document_indexing_update_task.delay(dataset_id, document_id) Usage: document_indexing_sync_task.delay(dataset_id, document_id)
""" """
logging.info(click.style('Start update document: {}'.format(document_id), fg='green')) logging.info(click.style('Start sync document: {}'.format(document_id), fg='green'))
start_at = time.perf_counter() start_at = time.perf_counter()
document = db.session.query(Document).filter( document = db.session.query(Document).filter(
...@@ -34,6 +36,28 @@ def document_indexing_sync_task(dataset_id: str, document_id: str): ...@@ -34,6 +36,28 @@ def document_indexing_sync_task(dataset_id: str, document_id: str):
if not document: if not document:
raise NotFound('Document not found') raise NotFound('Document not found')
data_source_info = document.data_source_info_dict
if document.data_source_type == 'notion_import':
if not data_source_info or 'notion_page_id' not in data_source_info \
or 'notion_workspace_id' not in data_source_info:
raise ValueError("no notion page found")
workspace_id = data_source_info['notion_workspace_id']
page_id = data_source_info['notion_page_id']
page_edited_time = data_source_info['last_edited_time']
data_source_binding = DataSourceBinding.query.filter(
db.and_(
DataSourceBinding.tenant_id == document.tenant_id,
DataSourceBinding.provider == 'notion',
DataSourceBinding.disabled == False,
DataSourceBinding.source_info['workspace_id'] == f'"{workspace_id}"'
)
).first()
if not data_source_binding:
raise ValueError('Data source binding not found.')
reader = NotionPageReader(integration_token=data_source_binding.access_token)
last_edited_time = reader.get_page_last_edited_time(page_id)
# check the page is updated
if last_edited_time != page_edited_time:
document.indexing_status = 'parsing' document.indexing_status = 'parsing'
document.processing_started_at = datetime.datetime.utcnow() document.processing_started_at = datetime.datetime.utcnow()
db.session.commit() db.session.commit()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment