Commit 7241253a authored by jyong's avatar jyong

add notion table support

parent d3713c51
...@@ -14,7 +14,7 @@ BLOCK_CHILD_URL_TMPL = "https://api.notion.com/v1/blocks/{block_id}/children" ...@@ -14,7 +14,7 @@ BLOCK_CHILD_URL_TMPL = "https://api.notion.com/v1/blocks/{block_id}/children"
DATABASE_URL_TMPL = "https://api.notion.com/v1/databases/{database_id}/query" DATABASE_URL_TMPL = "https://api.notion.com/v1/databases/{database_id}/query"
SEARCH_URL = "https://api.notion.com/v1/search" SEARCH_URL = "https://api.notion.com/v1/search"
RETRIEVE_PAGE_URL_TMPL = "https://api.notion.com/v1/pages/{page_id}" RETRIEVE_PAGE_URL_TMPL = "https://api.notion.com/v1/pages/{page_id}"
HEADING_TYPE = ['heading_1', 'heading_2', 'heading_3']
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -58,12 +58,16 @@ class NotionPageReader(BaseReader): ...@@ -58,12 +58,16 @@ class NotionPageReader(BaseReader):
"GET", block_url, headers=self.headers, json=query_dict "GET", block_url, headers=self.headers, json=query_dict
) )
data = res.json() data = res.json()
heading = ''
for result in data["results"]: for result in data["results"]:
result_type = result["type"] result_type = result["type"]
result_obj = result[result_type] result_obj = result[result_type]
cur_result_text_arr = [] cur_result_text_arr = []
if result_type == 'table':
result_block_id = result["id"]
text = self._read_table_rows(result_block_id)
result_lines_arr.append(text)
else:
if "rich_text" in result_obj: if "rich_text" in result_obj:
for rich_text in result_obj["rich_text"]: for rich_text in result_obj["rich_text"]:
# skip if doesn't have text object # skip if doesn't have text object
...@@ -71,7 +75,8 @@ class NotionPageReader(BaseReader): ...@@ -71,7 +75,8 @@ class NotionPageReader(BaseReader):
text = rich_text["text"]["content"] text = rich_text["text"]["content"]
prefix = "\t" * num_tabs prefix = "\t" * num_tabs
cur_result_text_arr.append(prefix + text) cur_result_text_arr.append(prefix + text)
if result_type in HEADING_TYPE:
heading = text
result_block_id = result["id"] result_block_id = result["id"]
has_children = result["has_children"] has_children = result["has_children"]
if has_children: if has_children:
...@@ -81,7 +86,10 @@ class NotionPageReader(BaseReader): ...@@ -81,7 +86,10 @@ class NotionPageReader(BaseReader):
cur_result_text_arr.append(children_text) cur_result_text_arr.append(children_text)
cur_result_text = "\n".join(cur_result_text_arr) cur_result_text = "\n".join(cur_result_text_arr)
if result_type in HEADING_TYPE:
result_lines_arr.append(cur_result_text) result_lines_arr.append(cur_result_text)
else:
result_lines_arr.append(f'{heading}\n{cur_result_text}')
if data["next_cursor"] is None: if data["next_cursor"] is None:
done = True done = True
...@@ -92,6 +100,49 @@ class NotionPageReader(BaseReader): ...@@ -92,6 +100,49 @@ class NotionPageReader(BaseReader):
result_lines = "\n".join(result_lines_arr) result_lines = "\n".join(result_lines_arr)
return result_lines return result_lines
def _read_table_rows(self, block_id: str) -> str:
"""Read table rows."""
done = False
result_lines_arr = []
cur_block_id = block_id
while not done:
block_url = BLOCK_CHILD_URL_TMPL.format(block_id=cur_block_id)
query_dict: Dict[str, Any] = {}
res = requests.request(
"GET", block_url, headers=self.headers, json=query_dict
)
data = res.json()
# get table headers text
table_header_cell_texts = []
tabel_header_cells = data["results"][0]['table_row']['cells']
for tabel_header_cell in tabel_header_cells:
if tabel_header_cell:
for table_header_cell_text in tabel_header_cell:
text = table_header_cell_text["text"]["content"]
table_header_cell_texts.append(text)
# get table columns text and format
results = data["results"]
for i in range(len(results-1)):
column_texts = []
tabel_column_cells = data["results"][i+1]['table_row']['cells']
for j in range(len(tabel_column_cells)):
if tabel_column_cells[j]:
for table_column_cell_text in tabel_column_cells[j]:
column_text = table_column_cell_text["text"]["content"]
column_texts.append(f'{table_header_cell_texts[j]}:{column_text}')
cur_result_text = "\n".join(column_texts)
result_lines_arr.append(cur_result_text)
if data["next_cursor"] is None:
done = True
break
else:
cur_block_id = data["next_cursor"]
result_lines = "\n".join(result_lines_arr)
return result_lines
def _read_parent_blocks(self, block_id: str, num_tabs: int = 0) -> List[str]: def _read_parent_blocks(self, block_id: str, num_tabs: int = 0) -> List[str]:
"""Read a block.""" """Read a block."""
done = False done = False
...@@ -105,19 +156,26 @@ class NotionPageReader(BaseReader): ...@@ -105,19 +156,26 @@ class NotionPageReader(BaseReader):
"GET", block_url, headers=self.headers, json=query_dict "GET", block_url, headers=self.headers, json=query_dict
) )
data = res.json() data = res.json()
# current block's heading
heading = ''
for result in data["results"]: for result in data["results"]:
result_type = result["type"] result_type = result["type"]
result_obj = result[result_type] result_obj = result[result_type]
cur_result_text_arr = [] cur_result_text_arr = []
if result_type == 'table':
result_block_id = result["id"]
text = self._read_table_rows(result_block_id)
text += "\n\n"
result_lines_arr.append(text)
else:
if "rich_text" in result_obj: if "rich_text" in result_obj:
for rich_text in result_obj["rich_text"]: for rich_text in result_obj["rich_text"]:
# skip if doesn't have text object # skip if doesn't have text object
if "text" in rich_text: if "text" in rich_text:
text = rich_text["text"]["content"] text = rich_text["text"]["content"]
prefix = "\t" * num_tabs cur_result_text_arr.append(text)
cur_result_text_arr.append(prefix + text) if result_type in HEADING_TYPE:
heading = text
result_block_id = result["id"] result_block_id = result["id"]
has_children = result["has_children"] has_children = result["has_children"]
...@@ -129,7 +187,10 @@ class NotionPageReader(BaseReader): ...@@ -129,7 +187,10 @@ class NotionPageReader(BaseReader):
cur_result_text = "\n".join(cur_result_text_arr) cur_result_text = "\n".join(cur_result_text_arr)
cur_result_text += "\n\n" cur_result_text += "\n\n"
if result_type in HEADING_TYPE:
result_lines_arr.append(cur_result_text) result_lines_arr.append(cur_result_text)
else:
result_lines_arr.append(f'{heading}\n{cur_result_text}')
if data["next_cursor"] is None: if data["next_cursor"] is None:
done = True done = True
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment