Commit 8f27ea10 authored by John Wang

fix: notion error

parent 7a7fb8c6
......@@ -54,7 +54,7 @@ class CSVLoader(LCCSVLoader):
source = (
row[self.source_column]
if self.source_column is not None
else self.file_path
else ''
)
except KeyError:
raise ValueError(
......
......@@ -40,5 +40,4 @@ class ExcelLoader(BaseLoader):
row_dict = {k: v for k, v in row_dict.items() if v}
data.append(json.dumps(row_dict, ensure_ascii=False))
metadata = {"source": self._file_path}
return [Document(page_content='\n\n'.join(data), metadata=metadata)]
return [Document(page_content='\n\n'.join(data))]
......@@ -24,8 +24,7 @@ class HTMLLoader(BaseLoader):
self._file_path = file_path
def load(self) -> List[Document]:
metadata = {"source": self._file_path}
return [Document(page_content=self._load_as_text(), metadata=metadata)]
return [Document(page_content=self._load_as_text())]
def _load_as_text(self) -> str:
with open(self._file_path, "rb") as fp:
......
......@@ -45,13 +45,12 @@ class MarkdownLoader(BaseLoader):
def load(self) -> List[Document]:
tups = self.parse_tups(self._file_path)
documents = []
metadata = {"source": self._file_path}
for header, value in tups:
value = value.strip()
if header is None:
documents.append(Document(page_content=value, metadata=metadata))
documents.append(Document(page_content=value))
else:
documents.append(Document(page_content=f"\n\n{header}\n{value}", metadata=metadata))
documents.append(Document(page_content=f"\n\n{header}\n{value}"))
return documents
......
......@@ -182,7 +182,7 @@ class NotionLoader(BaseLoader):
block_type = result["type"]
if has_children and block_type != 'child_page':
children_text = self._read_block(
result_block_id, num_tabs=num_tabs + 1
result_block_id, num_tabs=1
)
cur_result_text_arr.append(children_text)
......
......@@ -4,7 +4,6 @@ from typing import List, Optional
from langchain.document_loaders import PyPDFium2Loader
from langchain.document_loaders.base import BaseLoader
from langchain.schema import Document
from pypdf import PdfReader
from extensions.ext_storage import storage
from models.model import UploadFile
......@@ -39,8 +38,7 @@ class PdfLoader(BaseLoader):
try:
text = storage.load(plaintext_file_key).decode('utf-8')
plaintext_file_exists = True
metadata = {"source": self._file_path}
return [Document(page_content=text, metadata=metadata)]
return [Document(page_content=text)]
except FileNotFoundError:
pass
documents = PyPDFium2Loader(file_path=self._file_path).load()
......@@ -53,6 +51,5 @@ class PdfLoader(BaseLoader):
if not plaintext_file_exists and plaintext_file_key:
storage.save(plaintext_file_key, text.encode('utf-8'))
metadata = {"source": self._file_path}
return documents
......@@ -28,7 +28,6 @@ sentry-sdk[flask]~=1.21.1
jieba==0.42.1
celery==5.2.7
redis~=4.5.4
pypdf==3.8.1
openpyxl==3.1.2
chardet~=5.1.0
docx2txt==0.8
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment