merge main

d6c08ca6 · StyleZhang · 586c90c9 · d6c08ca6 · d6c08ca6 · d6c08ca6
Commit d6c08ca6 authored Jun 19, 2023 by StyleZhang
7 changed files
--- a/api/controllers/console/auth/data_source_oauth.py
+++ b/api/controllers/console/auth/data_source_oauth.py
@@ -39,11 +39,6 @@ class OAuthDataSource(Resource):
            print(vars(oauth_provider))
        if not oauth_provider:
            return {'error': 'Invalid provider'}, 400
-<<<<<<< HEAD
-
-        auth_url = oauth_provider.get_authorization_url()
-        return redirect(auth_url)
-=======
        if current_app.config.get('NOTION_INTEGRATION_TYPE') == 'internal':
            internal_secret = current_app.config.get('NOTION_INTERNAL_SECRET')
            oauth_provider.save_internal_access_token(internal_secret)
@@ -53,7 +48,6 @@ class OAuthDataSource(Resource):
            return redirect(auth_url)


->>>>>>> main


 class OAuthDataSourceCallback(Resource):

--- a/api/controllers/console/datasets/data_source.py
+++ b/api/controllers/console/datasets/data_source.py
@@ -219,11 +219,7 @@ class DataSourceNotionApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
-<<<<<<< HEAD
-    def get(self, workspace_id, page_id):
-=======
    def get(self, workspace_id, page_id, page_type):
->>>>>>> main
        workspace_id = str(workspace_id)
        page_id = str(page_id)
        data_source_binding = DataSourceBinding.query.filter(
@@ -237,16 +233,12 @@ class DataSourceNotionApi(Resource):
        if not data_source_binding:
            raise NotFound('Data source binding not found.')
        reader = NotionPageReader(integration_token=data_source_binding.access_token)
-<<<<<<< HEAD
-        page_content = reader.read_page(page_id)
-=======
        if page_type == 'page':
            page_content = reader.read_page(page_id)
        elif page_type == 'database':
            page_content = reader.query_database_data(page_id)
        else:
            page_content = ""
->>>>>>> main
        return {
            'content': page_content
        }, 200
@@ -304,12 +296,8 @@ class DataSourceNotionDocumentSyncApi(Resource):

 api.add_resource(DataSourceApi, '/data-source/integrates', '/data-source/integrates/<uuid:binding_id>/<string:action>')
 api.add_resource(DataSourceNotionListApi, '/notion/pre-import/pages')
-<<<<<<< HEAD
-api.add_resource(DataSourceNotionApi, '/notion/workspaces/<uuid:workspace_id>/pages/<uuid:page_id>/preview',
-=======
 api.add_resource(DataSourceNotionApi,
                 '/notion/workspaces/<uuid:workspace_id>/pages/<uuid:page_id>/<string:page_type>/preview',
->>>>>>> main
                 '/datasets/notion-indexing-estimate')
 api.add_resource(DataSourceNotionDatasetSyncApi, '/datasets/<uuid:dataset_id>/notion/sync')
 api.add_resource(DataSourceNotionDocumentSyncApi, '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/notion/sync')
--- a/api/controllers/console/datasets/file.py
+++ b/api/controllers/console/datasets/file.py
@@ -143,14 +143,10 @@ class FilePreviewApi(Resource):
                with open(filepath, "rb") as fp:
                    data = fp.read()
                    encoding = chardet.detect(data)['encoding']
-<<<<<<< HEAD
-                    text = data.decode(encoding=encoding).strip() if data else ''
-=======
                    if encoding:
                        text = data.decode(encoding=encoding).strip() if data else ''
                    else:
                        text = data.decode(encoding='utf-8').strip() if data else ''
->>>>>>> main

        text = text[0:PREVIEW_WORDS_LIMIT] if text else ''
        return {'content': text}

--- a/api/core/data_source/notion.py
+++ b/api/core/data_source/notion.py
 """Notion reader."""
-<<<<<<< HEAD
-=======
 import json
->>>>>>> main
 import logging
 import os
 from datetime import datetime
@@ -18,10 +15,7 @@ BLOCK_CHILD_URL_TMPL = "https://api.notion.com/v1/blocks/{block_id}/children"
 DATABASE_URL_TMPL = "https://api.notion.com/v1/databases/{database_id}/query"
 SEARCH_URL = "https://api.notion.com/v1/search"
 RETRIEVE_PAGE_URL_TMPL = "https://api.notion.com/v1/pages/{page_id}"
-<<<<<<< HEAD
-=======
 RETRIEVE_DATABASE_URL_TMPL = "https://api.notion.com/v1/databases/{database_id}"
->>>>>>> main
 HEADING_TYPE = ['heading_1', 'heading_2', 'heading_3']
 logger = logging.getLogger(__name__)

@@ -66,11 +60,7 @@ class NotionPageReader(BaseReader):
                "GET", block_url, headers=self.headers, json=query_dict
            )
            data = res.json()
-<<<<<<< HEAD
-            if data["results"] is None:
-=======
            if 'results' not in data or data["results"] is None:
->>>>>>> main
                done = True
                break
            heading = ''
@@ -94,12 +84,8 @@ class NotionPageReader(BaseReader):
                                    heading = text
                    result_block_id = result["id"]
                    has_children = result["has_children"]
-<<<<<<< HEAD
-                    if has_children:
-=======
                    block_type = result["type"]
                    if has_children and block_type != 'child_page':
->>>>>>> main
                        children_text = self._read_block(
                            result_block_id, num_tabs=num_tabs + 1
                        )
@@ -199,12 +185,8 @@ class NotionPageReader(BaseReader):

                    result_block_id = result["id"]
                    has_children = result["has_children"]
-<<<<<<< HEAD
-                    if has_children:
-=======
                    block_type = result["type"]
                    if has_children and block_type != 'child_page':
->>>>>>> main
                        children_text = self._read_block(
                            result_block_id, num_tabs=num_tabs + 1
                        )
@@ -232,8 +214,6 @@ class NotionPageReader(BaseReader):
        """Read a page as documents."""
        return self._read_parent_blocks(page_id)

-<<<<<<< HEAD
-=======
    def query_database_data(
            self, database_id: str, query_dict: Dict[str, Any] = {}
    ) -> str:
@@ -275,7 +255,6 @@ class NotionPageReader(BaseReader):

        return "\n\n".join(database_content_list)

->>>>>>> main
    def query_database(
            self, database_id: str, query_dict: Dict[str, Any] = {}
    ) -> List[str]:
@@ -354,15 +333,8 @@ class NotionPageReader(BaseReader):
        docs = []
        if database_id is not None:
            # get all the pages in the database
-<<<<<<< HEAD
-            page_ids = self.query_database(database_id)
-            for page_id in page_ids:
-                page_text = self.read_page(page_id)
-                docs.append(Document(page_text))
-=======
            page_text = self.query_database_data(database_id)
            docs.append(Document(page_text))
->>>>>>> main
        else:
            for page_id in page_ids:
                page_text_list = self.read_page_as_documents(page_id)
@@ -379,9 +351,6 @@ class NotionPageReader(BaseReader):
            "GET", retrieve_page_url, headers=self.headers, json=query_dict
        )
        data = res.json()
-<<<<<<< HEAD
-        # last_edited_time = datetime.fromisoformat(data["last_edited_time"])
-=======
        return data["last_edited_time"]

    def get_database_last_edited_time(self, database_id: str) -> str:
@@ -392,7 +361,6 @@ class NotionPageReader(BaseReader):
            "GET", retrieve_page_url, headers=self.headers, json=query_dict
        )
        data = res.json()
->>>>>>> main
        return data["last_edited_time"]



--- a/api/core/indexing_runner.py
+++ b/api/core/indexing_runner.py
@@ -235,29 +235,6 @@ class IndexingRunner:
                if page['type'] == 'page':
                    page_ids = [page['page_id']]
                    documents = reader.load_data_as_documents(page_ids=page_ids)
-<<<<<<< HEAD
-
-                    processing_rule = DatasetProcessRule(
-                        mode=tmp_processing_rule["mode"],
-                        rules=json.dumps(tmp_processing_rule["rules"])
-                    )
-
-                    # get node parser for splitting
-                    node_parser = self._get_node_parser(processing_rule)
-
-                    # split to nodes
-                    nodes = self._split_to_nodes(
-                        text_docs=documents,
-                        node_parser=node_parser,
-                        processing_rule=processing_rule
-                    )
-                    total_segments += len(nodes)
-                    for node in nodes:
-                        if len(preview_texts) < 5:
-                            preview_texts.append(node.get_text())
-
-                        tokens += TokenCalculator.get_num_tokens(self.embedding_model_name, node.get_text())
-=======
                elif page['type'] == 'database':
                    documents = reader.load_data_as_documents(database_id=page['page_id'])
                else:
@@ -282,7 +259,6 @@ class IndexingRunner:
                        preview_texts.append(node.get_text())

                    tokens += TokenCalculator.get_num_tokens(self.embedding_model_name, node.get_text())
->>>>>>> main

        return {
            "total_segments": total_segments,
@@ -314,10 +290,7 @@ class IndexingRunner:
                raise ValueError("no notion page found")
            workspace_id = data_source_info['notion_workspace_id']
            page_id = data_source_info['notion_page_id']
-<<<<<<< HEAD
-=======
            page_type = data_source_info['type']
->>>>>>> main
            data_source_binding = DataSourceBinding.query.filter(
                db.and_(
                    DataSourceBinding.tenant_id == document.tenant_id,
@@ -328,11 +301,6 @@ class IndexingRunner:
            ).first()
            if not data_source_binding:
                raise ValueError('Data source binding not found.')
-<<<<<<< HEAD
-            # add page last_edited_time to data_source_info
-            self._get_notion_page_last_edited_time(page_id, data_source_binding.access_token, document)
-            text_docs = self._load_data_from_notion(page_id, data_source_binding.access_token)
-=======
            if page_type == 'page':
                # add page last_edited_time to data_source_info
                self._get_notion_page_last_edited_time(page_id, data_source_binding.access_token, document)
@@ -341,7 +309,6 @@ class IndexingRunner:
                # add page last_edited_time to data_source_info
                self._get_notion_database_last_edited_time(page_id, data_source_binding.access_token, document)
                text_docs = self._load_database_data_from_notion(page_id, data_source_binding.access_token)
->>>>>>> main
        # update document status to splitting
        self._update_document_index_status(
            document_id=document.id,
@@ -383,24 +350,17 @@ class IndexingRunner:

            return text_docs

-<<<<<<< HEAD
-    def _load_data_from_notion(self, page_id: str, access_token: str) -> List[Document]:
-=======
    def _load_page_data_from_notion(self, page_id: str, access_token: str) -> List[Document]:
->>>>>>> main
        page_ids = [page_id]
        reader = NotionPageReader(integration_token=access_token)
        text_docs = reader.load_data_as_documents(page_ids=page_ids)
        return text_docs

-<<<<<<< HEAD
-=======
    def _load_database_data_from_notion(self, database_id: str, access_token: str) -> List[Document]:
        reader = NotionPageReader(integration_token=access_token)
        text_docs = reader.load_data_as_documents(database_id=database_id)
        return text_docs

->>>>>>> main
    def _get_notion_page_last_edited_time(self, page_id: str, access_token: str, document: Document):
        reader = NotionPageReader(integration_token=access_token)
        last_edited_time = reader.get_page_last_edited_time(page_id)
@@ -413,8 +373,6 @@ class IndexingRunner:
        Document.query.filter_by(id=document.id).update(update_params)
        db.session.commit()

-<<<<<<< HEAD
-=======
    def _get_notion_database_last_edited_time(self, page_id: str, access_token: str, document: Document):
        reader = NotionPageReader(integration_token=access_token)
        last_edited_time = reader.get_database_last_edited_time(page_id)
@@ -427,7 +385,6 @@ class IndexingRunner:
        Document.query.filter_by(id=document.id).update(update_params)
        db.session.commit()

->>>>>>> main
    def _get_node_parser(self, processing_rule: DatasetProcessRule) -> NodeParser:
        """
        Get the NodeParser object according to the processing rule.

--- a/api/libs/oauth_data_source.py
+++ b/api/libs/oauth_data_source.py
@@ -26,10 +26,7 @@ class NotionOAuth(OAuthDataSource):
    _TOKEN_URL = 'https://api.notion.com/v1/oauth/token'
    _NOTION_PAGE_SEARCH = "https://api.notion.com/v1/search"
    _NOTION_BLOCK_SEARCH = "https://api.notion.com/v1/blocks"
-<<<<<<< HEAD
-=======
    _NOTION_BOT_USER = "https://api.notion.com/v1/users/me"
->>>>>>> main

    def get_authorization_url(self):
        params = {
@@ -88,8 +85,6 @@ class NotionOAuth(OAuthDataSource):
            db.session.add(new_data_source_binding)
            db.session.commit()

-<<<<<<< HEAD
-=======
    def save_internal_access_token(self, access_token: str):
        workspace_name = self.notion_workspace_name(access_token)
        workspace_icon = None
@@ -125,7 +120,6 @@ class NotionOAuth(OAuthDataSource):
            db.session.add(new_data_source_binding)
            db.session.commit()

->>>>>>> main
    def sync_data_source(self, binding_id: str):
        # save data source binding
        data_source_binding = DataSourceBinding.query.filter(
@@ -170,14 +164,11 @@ class NotionOAuth(OAuthDataSource):
                    page_name = page_result['properties']['title']['title'][0]['plain_text']
                else:
                    page_name = 'Untitled'
-<<<<<<< HEAD
-=======
            elif 'Title' in page_result['properties']:
                if len(page_result['properties']['Title']['title']) > 0:
                    page_name = page_result['properties']['Title']['title'][0]['plain_text']
                else:
                    page_name = 'Untitled'
->>>>>>> main
            else:
                page_name = 'Untitled'
            page_icon = page_result['icon']
@@ -267,14 +258,10 @@ class NotionOAuth(OAuthDataSource):
        }
        response = requests.post(url=self._NOTION_PAGE_SEARCH, json=data, headers=headers)
        response_json = response.json()
-<<<<<<< HEAD
-        results = response_json['results']
-=======
        if 'results' in response_json:
            results = response_json['results']
        else:
            results = []
->>>>>>> main
        return results

    def notion_block_parent_page_id(self, access_token: str, block_id: str):
@@ -290,8 +277,6 @@ class NotionOAuth(OAuthDataSource):
            return self.notion_block_parent_page_id(access_token, parent[parent_type])
        return parent[parent_type]

-<<<<<<< HEAD
-=======
    def notion_workspace_name(self, access_token: str):
        headers = {
            'Authorization': f"Bearer {access_token}",
@@ -306,7 +291,6 @@ class NotionOAuth(OAuthDataSource):
                return user_info['workspace_name']
        return 'workspace'

->>>>>>> main
    def notion_database_search(self, access_token: str):
        data = {
            'filter': {
@@ -321,12 +305,8 @@ class NotionOAuth(OAuthDataSource):
        }
        response = requests.post(url=self._NOTION_PAGE_SEARCH, json=data, headers=headers)
        response_json = response.json()
-<<<<<<< HEAD
-        results = response_json['results']
-=======
        if 'results' in response_json:
            results = response_json['results']
        else:
            results = []
->>>>>>> main
        return results
--- a/api/services/dataset_service.py
+++ b/api/services/dataset_service.py
@@ -479,21 +479,6 @@ class DocumentService:
                                                                     document_data["data_source"]["type"],
                                                                     data_source_info, created_from, position,
                                                                     account, page['page_name'], batch)
-<<<<<<< HEAD
-                            if page['type'] == 'database':
-                                document.splitting_completed_at = datetime.datetime.utcnow()
-                                document.cleaning_completed_at = datetime.datetime.utcnow()
-                                document.parsing_completed_at = datetime.datetime.utcnow()
-                                document.completed_at = datetime.datetime.utcnow()
-                                document.indexing_status = 'completed'
-                                document.word_count = 0
-                                document.tokens = 0
-                                document.indexing_latency = 0
-                            db.session.add(document)
-                            db.session.flush()
-                            if page['type'] != 'database':
-                                document_ids.append(document.id)
-=======
                            # if page['type'] == 'database':
                            #     document.splitting_completed_at = datetime.datetime.utcnow()
                            #     document.cleaning_completed_at = datetime.datetime.utcnow()
@@ -507,7 +492,6 @@ class DocumentService:
                            db.session.flush()
                            # if page['type'] != 'database':
                            document_ids.append(document.id)
->>>>>>> main
                            documents.append(document)
                            position += 1
                        else: