Commit 8f27ea10 authored by John Wang

fix: notion error

parent 7a7fb8c6
...@@ -54,7 +54,7 @@ class CSVLoader(LCCSVLoader): ...@@ -54,7 +54,7 @@ class CSVLoader(LCCSVLoader):
source = ( source = (
row[self.source_column] row[self.source_column]
if self.source_column is not None if self.source_column is not None
else self.file_path else ''
) )
except KeyError: except KeyError:
raise ValueError( raise ValueError(
......
...@@ -40,5 +40,4 @@ class ExcelLoader(BaseLoader): ...@@ -40,5 +40,4 @@ class ExcelLoader(BaseLoader):
row_dict = {k: v for k, v in row_dict.items() if v} row_dict = {k: v for k, v in row_dict.items() if v}
data.append(json.dumps(row_dict, ensure_ascii=False)) data.append(json.dumps(row_dict, ensure_ascii=False))
metadata = {"source": self._file_path} return [Document(page_content='\n\n'.join(data))]
return [Document(page_content='\n\n'.join(data), metadata=metadata)]
...@@ -24,8 +24,7 @@ class HTMLLoader(BaseLoader): ...@@ -24,8 +24,7 @@ class HTMLLoader(BaseLoader):
self._file_path = file_path self._file_path = file_path
def load(self) -> List[Document]: def load(self) -> List[Document]:
metadata = {"source": self._file_path} return [Document(page_content=self._load_as_text())]
return [Document(page_content=self._load_as_text(), metadata=metadata)]
def _load_as_text(self) -> str: def _load_as_text(self) -> str:
with open(self._file_path, "rb") as fp: with open(self._file_path, "rb") as fp:
......
...@@ -45,13 +45,12 @@ class MarkdownLoader(BaseLoader): ...@@ -45,13 +45,12 @@ class MarkdownLoader(BaseLoader):
def load(self) -> List[Document]: def load(self) -> List[Document]:
tups = self.parse_tups(self._file_path) tups = self.parse_tups(self._file_path)
documents = [] documents = []
metadata = {"source": self._file_path}
for header, value in tups: for header, value in tups:
value = value.strip() value = value.strip()
if header is None: if header is None:
documents.append(Document(page_content=value, metadata=metadata)) documents.append(Document(page_content=value))
else: else:
documents.append(Document(page_content=f"\n\n{header}\n{value}", metadata=metadata)) documents.append(Document(page_content=f"\n\n{header}\n{value}"))
return documents return documents
......
...@@ -182,7 +182,7 @@ class NotionLoader(BaseLoader): ...@@ -182,7 +182,7 @@ class NotionLoader(BaseLoader):
block_type = result["type"] block_type = result["type"]
if has_children and block_type != 'child_page': if has_children and block_type != 'child_page':
children_text = self._read_block( children_text = self._read_block(
result_block_id, num_tabs=num_tabs + 1 result_block_id, num_tabs=1
) )
cur_result_text_arr.append(children_text) cur_result_text_arr.append(children_text)
......
...@@ -4,7 +4,6 @@ from typing import List, Optional ...@@ -4,7 +4,6 @@ from typing import List, Optional
from langchain.document_loaders import PyPDFium2Loader from langchain.document_loaders import PyPDFium2Loader
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.base import BaseLoader
from langchain.schema import Document from langchain.schema import Document
from pypdf import PdfReader
from extensions.ext_storage import storage from extensions.ext_storage import storage
from models.model import UploadFile from models.model import UploadFile
...@@ -39,8 +38,7 @@ class PdfLoader(BaseLoader): ...@@ -39,8 +38,7 @@ class PdfLoader(BaseLoader):
try: try:
text = storage.load(plaintext_file_key).decode('utf-8') text = storage.load(plaintext_file_key).decode('utf-8')
plaintext_file_exists = True plaintext_file_exists = True
metadata = {"source": self._file_path} return [Document(page_content=text)]
return [Document(page_content=text, metadata=metadata)]
except FileNotFoundError: except FileNotFoundError:
pass pass
documents = PyPDFium2Loader(file_path=self._file_path).load() documents = PyPDFium2Loader(file_path=self._file_path).load()
...@@ -53,6 +51,5 @@ class PdfLoader(BaseLoader): ...@@ -53,6 +51,5 @@ class PdfLoader(BaseLoader):
if not plaintext_file_exists and plaintext_file_key: if not plaintext_file_exists and plaintext_file_key:
storage.save(plaintext_file_key, text.encode('utf-8')) storage.save(plaintext_file_key, text.encode('utf-8'))
metadata = {"source": self._file_path}
return documents return documents
...@@ -28,7 +28,6 @@ sentry-sdk[flask]~=1.21.1 ...@@ -28,7 +28,6 @@ sentry-sdk[flask]~=1.21.1
jieba==0.42.1 jieba==0.42.1
celery==5.2.7 celery==5.2.7
redis~=4.5.4 redis~=4.5.4
pypdf==3.8.1
openpyxl==3.1.2 openpyxl==3.1.2
chardet~=5.1.0 chardet~=5.1.0
docx2txt==0.8 docx2txt==0.8
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment