Commit d6c08ca6 authored by StyleZhang

merge main

parent 586c90c9
......@@ -39,11 +39,6 @@ class OAuthDataSource(Resource):
print(vars(oauth_provider))
if not oauth_provider:
return {'error': 'Invalid provider'}, 400
<<<<<<< HEAD
auth_url = oauth_provider.get_authorization_url()
return redirect(auth_url)
=======
if current_app.config.get('NOTION_INTEGRATION_TYPE') == 'internal':
internal_secret = current_app.config.get('NOTION_INTERNAL_SECRET')
oauth_provider.save_internal_access_token(internal_secret)
......@@ -53,7 +48,6 @@ class OAuthDataSource(Resource):
return redirect(auth_url)
>>>>>>> main
class OAuthDataSourceCallback(Resource):
......
......@@ -219,11 +219,7 @@ class DataSourceNotionApi(Resource):
@setup_required
@login_required
@account_initialization_required
<<<<<<< HEAD
def get(self, workspace_id, page_id):
=======
def get(self, workspace_id, page_id, page_type):
>>>>>>> main
workspace_id = str(workspace_id)
page_id = str(page_id)
data_source_binding = DataSourceBinding.query.filter(
......@@ -237,16 +233,12 @@ class DataSourceNotionApi(Resource):
if not data_source_binding:
raise NotFound('Data source binding not found.')
reader = NotionPageReader(integration_token=data_source_binding.access_token)
<<<<<<< HEAD
page_content = reader.read_page(page_id)
=======
if page_type == 'page':
page_content = reader.read_page(page_id)
elif page_type == 'database':
page_content = reader.query_database_data(page_id)
else:
page_content = ""
>>>>>>> main
return {
'content': page_content
}, 200
......@@ -304,12 +296,8 @@ class DataSourceNotionDocumentSyncApi(Resource):
api.add_resource(DataSourceApi, '/data-source/integrates', '/data-source/integrates/<uuid:binding_id>/<string:action>')
api.add_resource(DataSourceNotionListApi, '/notion/pre-import/pages')
<<<<<<< HEAD
api.add_resource(DataSourceNotionApi, '/notion/workspaces/<uuid:workspace_id>/pages/<uuid:page_id>/preview',
=======
api.add_resource(DataSourceNotionApi,
'/notion/workspaces/<uuid:workspace_id>/pages/<uuid:page_id>/<string:page_type>/preview',
>>>>>>> main
'/datasets/notion-indexing-estimate')
api.add_resource(DataSourceNotionDatasetSyncApi, '/datasets/<uuid:dataset_id>/notion/sync')
api.add_resource(DataSourceNotionDocumentSyncApi, '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/notion/sync')
......@@ -143,14 +143,10 @@ class FilePreviewApi(Resource):
with open(filepath, "rb") as fp:
data = fp.read()
encoding = chardet.detect(data)['encoding']
<<<<<<< HEAD
text = data.decode(encoding=encoding).strip() if data else ''
=======
if encoding:
text = data.decode(encoding=encoding).strip() if data else ''
else:
text = data.decode(encoding='utf-8').strip() if data else ''
>>>>>>> main
text = text[0:PREVIEW_WORDS_LIMIT] if text else ''
return {'content': text}
......
"""Notion reader."""
<<<<<<< HEAD
=======
import json
>>>>>>> main
import logging
import os
from datetime import datetime
......@@ -18,10 +15,7 @@ BLOCK_CHILD_URL_TMPL = "https://api.notion.com/v1/blocks/{block_id}/children"
DATABASE_URL_TMPL = "https://api.notion.com/v1/databases/{database_id}/query"
SEARCH_URL = "https://api.notion.com/v1/search"
RETRIEVE_PAGE_URL_TMPL = "https://api.notion.com/v1/pages/{page_id}"
<<<<<<< HEAD
=======
RETRIEVE_DATABASE_URL_TMPL = "https://api.notion.com/v1/databases/{database_id}"
>>>>>>> main
HEADING_TYPE = ['heading_1', 'heading_2', 'heading_3']
logger = logging.getLogger(__name__)
......@@ -66,11 +60,7 @@ class NotionPageReader(BaseReader):
"GET", block_url, headers=self.headers, json=query_dict
)
data = res.json()
<<<<<<< HEAD
if data["results"] is None:
=======
if 'results' not in data or data["results"] is None:
>>>>>>> main
done = True
break
heading = ''
......@@ -94,12 +84,8 @@ class NotionPageReader(BaseReader):
heading = text
result_block_id = result["id"]
has_children = result["has_children"]
<<<<<<< HEAD
if has_children:
=======
block_type = result["type"]
if has_children and block_type != 'child_page':
>>>>>>> main
children_text = self._read_block(
result_block_id, num_tabs=num_tabs + 1
)
......@@ -199,12 +185,8 @@ class NotionPageReader(BaseReader):
result_block_id = result["id"]
has_children = result["has_children"]
<<<<<<< HEAD
if has_children:
=======
block_type = result["type"]
if has_children and block_type != 'child_page':
>>>>>>> main
children_text = self._read_block(
result_block_id, num_tabs=num_tabs + 1
)
......@@ -232,8 +214,6 @@ class NotionPageReader(BaseReader):
"""Read a page as documents."""
return self._read_parent_blocks(page_id)
<<<<<<< HEAD
=======
def query_database_data(
self, database_id: str, query_dict: Dict[str, Any] = {}
) -> str:
......@@ -275,7 +255,6 @@ class NotionPageReader(BaseReader):
return "\n\n".join(database_content_list)
>>>>>>> main
def query_database(
self, database_id: str, query_dict: Dict[str, Any] = {}
) -> List[str]:
......@@ -354,15 +333,8 @@ class NotionPageReader(BaseReader):
docs = []
if database_id is not None:
# get all the pages in the database
<<<<<<< HEAD
page_ids = self.query_database(database_id)
for page_id in page_ids:
page_text = self.read_page(page_id)
docs.append(Document(page_text))
=======
page_text = self.query_database_data(database_id)
docs.append(Document(page_text))
>>>>>>> main
else:
for page_id in page_ids:
page_text_list = self.read_page_as_documents(page_id)
......@@ -379,9 +351,6 @@ class NotionPageReader(BaseReader):
"GET", retrieve_page_url, headers=self.headers, json=query_dict
)
data = res.json()
<<<<<<< HEAD
# last_edited_time = datetime.fromisoformat(data["last_edited_time"])
=======
return data["last_edited_time"]
def get_database_last_edited_time(self, database_id: str) -> str:
......@@ -392,7 +361,6 @@ class NotionPageReader(BaseReader):
"GET", retrieve_page_url, headers=self.headers, json=query_dict
)
data = res.json()
>>>>>>> main
return data["last_edited_time"]
......
......@@ -235,29 +235,6 @@ class IndexingRunner:
if page['type'] == 'page':
page_ids = [page['page_id']]
documents = reader.load_data_as_documents(page_ids=page_ids)
<<<<<<< HEAD
processing_rule = DatasetProcessRule(
mode=tmp_processing_rule["mode"],
rules=json.dumps(tmp_processing_rule["rules"])
)
# get node parser for splitting
node_parser = self._get_node_parser(processing_rule)
# split to nodes
nodes = self._split_to_nodes(
text_docs=documents,
node_parser=node_parser,
processing_rule=processing_rule
)
total_segments += len(nodes)
for node in nodes:
if len(preview_texts) < 5:
preview_texts.append(node.get_text())
tokens += TokenCalculator.get_num_tokens(self.embedding_model_name, node.get_text())
=======
elif page['type'] == 'database':
documents = reader.load_data_as_documents(database_id=page['page_id'])
else:
......@@ -282,7 +259,6 @@ class IndexingRunner:
preview_texts.append(node.get_text())
tokens += TokenCalculator.get_num_tokens(self.embedding_model_name, node.get_text())
>>>>>>> main
return {
"total_segments": total_segments,
......@@ -314,10 +290,7 @@ class IndexingRunner:
raise ValueError("no notion page found")
workspace_id = data_source_info['notion_workspace_id']
page_id = data_source_info['notion_page_id']
<<<<<<< HEAD
=======
page_type = data_source_info['type']
>>>>>>> main
data_source_binding = DataSourceBinding.query.filter(
db.and_(
DataSourceBinding.tenant_id == document.tenant_id,
......@@ -328,11 +301,6 @@ class IndexingRunner:
).first()
if not data_source_binding:
raise ValueError('Data source binding not found.')
<<<<<<< HEAD
# add page last_edited_time to data_source_info
self._get_notion_page_last_edited_time(page_id, data_source_binding.access_token, document)
text_docs = self._load_data_from_notion(page_id, data_source_binding.access_token)
=======
if page_type == 'page':
# add page last_edited_time to data_source_info
self._get_notion_page_last_edited_time(page_id, data_source_binding.access_token, document)
......@@ -341,7 +309,6 @@ class IndexingRunner:
# add page last_edited_time to data_source_info
self._get_notion_database_last_edited_time(page_id, data_source_binding.access_token, document)
text_docs = self._load_database_data_from_notion(page_id, data_source_binding.access_token)
>>>>>>> main
# update document status to splitting
self._update_document_index_status(
document_id=document.id,
......@@ -383,24 +350,17 @@ class IndexingRunner:
return text_docs
<<<<<<< HEAD
def _load_data_from_notion(self, page_id: str, access_token: str) -> List[Document]:
=======
def _load_page_data_from_notion(self, page_id: str, access_token: str) -> List[Document]:
>>>>>>> main
page_ids = [page_id]
reader = NotionPageReader(integration_token=access_token)
text_docs = reader.load_data_as_documents(page_ids=page_ids)
return text_docs
<<<<<<< HEAD
=======
def _load_database_data_from_notion(self, database_id: str, access_token: str) -> List[Document]:
    """Load the Notion database `database_id` as documents.

    A NotionPageReader is built with `access_token` as its integration
    token, and its `load_data_as_documents` result is returned unchanged.
    """
    return NotionPageReader(integration_token=access_token).load_data_as_documents(
        database_id=database_id
    )
>>>>>>> main
def _get_notion_page_last_edited_time(self, page_id: str, access_token: str, document: Document):
reader = NotionPageReader(integration_token=access_token)
last_edited_time = reader.get_page_last_edited_time(page_id)
......@@ -413,8 +373,6 @@ class IndexingRunner:
Document.query.filter_by(id=document.id).update(update_params)
db.session.commit()
<<<<<<< HEAD
=======
def _get_notion_database_last_edited_time(self, page_id: str, access_token: str, document: Document):
reader = NotionPageReader(integration_token=access_token)
last_edited_time = reader.get_database_last_edited_time(page_id)
......@@ -427,7 +385,6 @@ class IndexingRunner:
Document.query.filter_by(id=document.id).update(update_params)
db.session.commit()
>>>>>>> main
def _get_node_parser(self, processing_rule: DatasetProcessRule) -> NodeParser:
"""
Get the NodeParser object according to the processing rule.
......
......@@ -26,10 +26,7 @@ class NotionOAuth(OAuthDataSource):
_TOKEN_URL = 'https://api.notion.com/v1/oauth/token'
_NOTION_PAGE_SEARCH = "https://api.notion.com/v1/search"
_NOTION_BLOCK_SEARCH = "https://api.notion.com/v1/blocks"
<<<<<<< HEAD
=======
_NOTION_BOT_USER = "https://api.notion.com/v1/users/me"
>>>>>>> main
def get_authorization_url(self):
params = {
......@@ -88,8 +85,6 @@ class NotionOAuth(OAuthDataSource):
db.session.add(new_data_source_binding)
db.session.commit()
<<<<<<< HEAD
=======
def save_internal_access_token(self, access_token: str):
workspace_name = self.notion_workspace_name(access_token)
workspace_icon = None
......@@ -125,7 +120,6 @@ class NotionOAuth(OAuthDataSource):
db.session.add(new_data_source_binding)
db.session.commit()
>>>>>>> main
def sync_data_source(self, binding_id: str):
# save data source binding
data_source_binding = DataSourceBinding.query.filter(
......@@ -170,14 +164,11 @@ class NotionOAuth(OAuthDataSource):
page_name = page_result['properties']['title']['title'][0]['plain_text']
else:
page_name = 'Untitled'
<<<<<<< HEAD
=======
elif 'Title' in page_result['properties']:
if len(page_result['properties']['Title']['title']) > 0:
page_name = page_result['properties']['Title']['title'][0]['plain_text']
else:
page_name = 'Untitled'
>>>>>>> main
else:
page_name = 'Untitled'
page_icon = page_result['icon']
......@@ -267,14 +258,10 @@ class NotionOAuth(OAuthDataSource):
}
response = requests.post(url=self._NOTION_PAGE_SEARCH, json=data, headers=headers)
response_json = response.json()
<<<<<<< HEAD
results = response_json['results']
=======
if 'results' in response_json:
results = response_json['results']
else:
results = []
>>>>>>> main
return results
def notion_block_parent_page_id(self, access_token: str, block_id: str):
......@@ -290,8 +277,6 @@ class NotionOAuth(OAuthDataSource):
return self.notion_block_parent_page_id(access_token, parent[parent_type])
return parent[parent_type]
<<<<<<< HEAD
=======
def notion_workspace_name(self, access_token: str):
headers = {
'Authorization': f"Bearer {access_token}",
......@@ -306,7 +291,6 @@ class NotionOAuth(OAuthDataSource):
return user_info['workspace_name']
return 'workspace'
>>>>>>> main
def notion_database_search(self, access_token: str):
data = {
'filter': {
......@@ -321,12 +305,8 @@ class NotionOAuth(OAuthDataSource):
}
response = requests.post(url=self._NOTION_PAGE_SEARCH, json=data, headers=headers)
response_json = response.json()
<<<<<<< HEAD
results = response_json['results']
=======
if 'results' in response_json:
results = response_json['results']
else:
results = []
>>>>>>> main
return results
......@@ -479,21 +479,6 @@ class DocumentService:
document_data["data_source"]["type"],
data_source_info, created_from, position,
account, page['page_name'], batch)
<<<<<<< HEAD
if page['type'] == 'database':
document.splitting_completed_at = datetime.datetime.utcnow()
document.cleaning_completed_at = datetime.datetime.utcnow()
document.parsing_completed_at = datetime.datetime.utcnow()
document.completed_at = datetime.datetime.utcnow()
document.indexing_status = 'completed'
document.word_count = 0
document.tokens = 0
document.indexing_latency = 0
db.session.add(document)
db.session.flush()
if page['type'] != 'database':
document_ids.append(document.id)
=======
# if page['type'] == 'database':
# document.splitting_completed_at = datetime.datetime.utcnow()
# document.cleaning_completed_at = datetime.datetime.utcnow()
......@@ -507,7 +492,6 @@ class DocumentService:
db.session.flush()
# if page['type'] != 'database':
document_ids.append(document.id)
>>>>>>> main
documents.append(document)
position += 1
else:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment