Commit 0039be67 authored by John Wang

fix: loader bugs

parent 1d1c56b5
...@@ -23,6 +23,7 @@ class FileExtractor: ...@@ -23,6 +23,7 @@ class FileExtractor:
storage.download(upload_file.key, file_path) storage.download(upload_file.key, file_path)
input_file = Path(file_path) input_file = Path(file_path)
delimiter = '\n'
if input_file.suffix == '.xlsx': if input_file.suffix == '.xlsx':
loader = ExcelLoader(file_path) loader = ExcelLoader(file_path)
elif input_file.suffix == '.pdf': elif input_file.suffix == '.pdf':
...@@ -39,4 +40,4 @@ class FileExtractor: ...@@ -39,4 +40,4 @@ class FileExtractor:
# txt # txt
loader = TextLoader(file_path, autodetect_encoding=True) loader = TextLoader(file_path, autodetect_encoding=True)
return '\n'.join([document.page_content for document in loader.load()]) if return_text else loader.load() return delimiter.join([document.page_content for document in loader.load()]) if return_text else loader.load()
...@@ -47,6 +47,7 @@ class MarkdownLoader(BaseLoader): ...@@ -47,6 +47,7 @@ class MarkdownLoader(BaseLoader):
documents = [] documents = []
metadata = {"source": self._file_path} metadata = {"source": self._file_path}
for header, value in tups: for header, value in tups:
value = value.strip()
if header is None: if header is None:
documents.append(Document(page_content=value, metadata=metadata)) documents.append(Document(page_content=value, metadata=metadata))
else: else:
......
...@@ -56,7 +56,7 @@ class PdfLoader(BaseLoader): ...@@ -56,7 +56,7 @@ class PdfLoader(BaseLoader):
# Extract the text from the page # Extract the text from the page
page_text = pdf.pages[page].extract_text() page_text = pdf.pages[page].extract_text()
text_list.append(page_text) text_list.append(page_text)
text = "\n".join(text_list) text = "\n\n".join(text_list)
# save plaintext file for caching # save plaintext file for caching
if not plaintext_file_exists and plaintext_file_key: if not plaintext_file_exists and plaintext_file_key:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment