Commit 7241253a authored by jyong's avatar jyong

add notion table support

parent d3713c51
...@@ -14,7 +14,7 @@ BLOCK_CHILD_URL_TMPL = "https://api.notion.com/v1/blocks/{block_id}/children" ...@@ -14,7 +14,7 @@ BLOCK_CHILD_URL_TMPL = "https://api.notion.com/v1/blocks/{block_id}/children"
DATABASE_URL_TMPL = "https://api.notion.com/v1/databases/{database_id}/query" DATABASE_URL_TMPL = "https://api.notion.com/v1/databases/{database_id}/query"
SEARCH_URL = "https://api.notion.com/v1/search" SEARCH_URL = "https://api.notion.com/v1/search"
RETRIEVE_PAGE_URL_TMPL = "https://api.notion.com/v1/pages/{page_id}" RETRIEVE_PAGE_URL_TMPL = "https://api.notion.com/v1/pages/{page_id}"
HEADING_TYPE = ['heading_1', 'heading_2', 'heading_3']
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -58,30 +58,38 @@ class NotionPageReader(BaseReader): ...@@ -58,30 +58,38 @@ class NotionPageReader(BaseReader):
"GET", block_url, headers=self.headers, json=query_dict "GET", block_url, headers=self.headers, json=query_dict
) )
data = res.json() data = res.json()
heading = ''
for result in data["results"]: for result in data["results"]:
result_type = result["type"] result_type = result["type"]
result_obj = result[result_type] result_obj = result[result_type]
cur_result_text_arr = [] cur_result_text_arr = []
if "rich_text" in result_obj: if result_type == 'table':
for rich_text in result_obj["rich_text"]: result_block_id = result["id"]
# skip if doesn't have text object text = self._read_table_rows(result_block_id)
if "text" in rich_text: result_lines_arr.append(text)
text = rich_text["text"]["content"] else:
prefix = "\t" * num_tabs if "rich_text" in result_obj:
cur_result_text_arr.append(prefix + text) for rich_text in result_obj["rich_text"]:
# skip if doesn't have text object
result_block_id = result["id"] if "text" in rich_text:
has_children = result["has_children"] text = rich_text["text"]["content"]
if has_children: prefix = "\t" * num_tabs
children_text = self._read_block( cur_result_text_arr.append(prefix + text)
result_block_id, num_tabs=num_tabs + 1 if result_type in HEADING_TYPE:
) heading = text
cur_result_text_arr.append(children_text) result_block_id = result["id"]
has_children = result["has_children"]
cur_result_text = "\n".join(cur_result_text_arr) if has_children:
result_lines_arr.append(cur_result_text) children_text = self._read_block(
result_block_id, num_tabs=num_tabs + 1
)
cur_result_text_arr.append(children_text)
cur_result_text = "\n".join(cur_result_text_arr)
if result_type in HEADING_TYPE:
result_lines_arr.append(cur_result_text)
else:
result_lines_arr.append(f'{heading}\n{cur_result_text}')
if data["next_cursor"] is None: if data["next_cursor"] is None:
done = True done = True
...@@ -92,6 +100,49 @@ class NotionPageReader(BaseReader): ...@@ -92,6 +100,49 @@ class NotionPageReader(BaseReader):
result_lines = "\n".join(result_lines_arr) result_lines = "\n".join(result_lines_arr)
return result_lines return result_lines
def _read_table_rows(self, block_id: str) -> str:
    """Read the rows of a table block and flatten them to text.

    The children of ``block_id`` are fetched page by page. The first row
    of the first page is treated as the header row; every following row
    is rendered as one ``header:cell text`` line per rich-text segment,
    and all lines are joined with newlines.

    Args:
        block_id: id of the table block whose child rows are read.

    Returns:
        The flattened table text, or an empty string when the table has
        no rows after the header.
    """
    result_lines_arr = []
    # The table id stays fixed; paging is driven by the cursor, not the URL.
    block_url = BLOCK_CHILD_URL_TMPL.format(block_id=block_id)
    header_texts = None
    next_cursor = None
    while True:
        query_dict: Dict[str, Any] = {}
        params: Dict[str, Any] = {}
        if next_cursor is not None:
            # next_cursor is an opaque paging token, not a block id, so it
            # must be sent back as the start_cursor query parameter.
            params["start_cursor"] = next_cursor
        res = requests.request(
            "GET",
            block_url,
            headers=self.headers,
            params=params,
            json=query_dict,
        )
        data = res.json()
        rows = [result["table_row"]["cells"] for result in data["results"]]

        # Only the very first row of the table is the header; rows at the
        # top of later pages are ordinary data rows.
        if header_texts is None and rows:
            header_cells = rows.pop(0)
            # Keep exactly one header per column (empty string for an
            # empty cell) so that header_texts[j] lines up with column j.
            header_texts = [
                "".join(
                    rich_text["text"]["content"]
                    for rich_text in header_cell
                    if "text" in rich_text
                )
                for header_cell in header_cells
            ]

        for cells in rows:
            column_texts = []
            for j, cell in enumerate(cells):
                for rich_text in cell:
                    # skip if doesn't have text object
                    if "text" in rich_text:
                        column_text = rich_text["text"]["content"]
                        column_texts.append(f'{header_texts[j]}:{column_text}')
            result_lines_arr.append("\n".join(column_texts))

        next_cursor = data["next_cursor"]
        if next_cursor is None:
            break

    return "\n".join(result_lines_arr)
def _read_parent_blocks(self, block_id: str, num_tabs: int = 0) -> List[str]: def _read_parent_blocks(self, block_id: str, num_tabs: int = 0) -> List[str]:
"""Read a block.""" """Read a block."""
done = False done = False
...@@ -105,31 +156,41 @@ class NotionPageReader(BaseReader): ...@@ -105,31 +156,41 @@ class NotionPageReader(BaseReader):
"GET", block_url, headers=self.headers, json=query_dict "GET", block_url, headers=self.headers, json=query_dict
) )
data = res.json() data = res.json()
# current block's heading
heading = ''
for result in data["results"]: for result in data["results"]:
result_type = result["type"] result_type = result["type"]
result_obj = result[result_type] result_obj = result[result_type]
cur_result_text_arr = [] cur_result_text_arr = []
if "rich_text" in result_obj: if result_type == 'table':
for rich_text in result_obj["rich_text"]: result_block_id = result["id"]
# skip if doesn't have text object text = self._read_table_rows(result_block_id)
if "text" in rich_text: text += "\n\n"
text = rich_text["text"]["content"] result_lines_arr.append(text)
prefix = "\t" * num_tabs else:
cur_result_text_arr.append(prefix + text) if "rich_text" in result_obj:
for rich_text in result_obj["rich_text"]:
result_block_id = result["id"] # skip if doesn't have text object
has_children = result["has_children"] if "text" in rich_text:
if has_children: text = rich_text["text"]["content"]
children_text = self._read_block( cur_result_text_arr.append(text)
result_block_id, num_tabs=num_tabs + 1 if result_type in HEADING_TYPE:
) heading = text
cur_result_text_arr.append(children_text)
result_block_id = result["id"]
cur_result_text = "\n".join(cur_result_text_arr) has_children = result["has_children"]
cur_result_text += "\n\n" if has_children:
result_lines_arr.append(cur_result_text) children_text = self._read_block(
result_block_id, num_tabs=num_tabs + 1
)
cur_result_text_arr.append(children_text)
cur_result_text = "\n".join(cur_result_text_arr)
cur_result_text += "\n\n"
if result_type in HEADING_TYPE:
result_lines_arr.append(cur_result_text)
else:
result_lines_arr.append(f'{heading}\n{cur_result_text}')
if data["next_cursor"] is None: if data["next_cursor"] is None:
done = True done = True
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment