Commit 0039be67 authored by John Wang

fix: loader bugs

parent 1d1c56b5
......@@ -23,6 +23,7 @@ class FileExtractor:
storage.download(upload_file.key, file_path)
input_file = Path(file_path)
delimiter = '\n'
if input_file.suffix == '.xlsx':
loader = ExcelLoader(file_path)
elif input_file.suffix == '.pdf':
......@@ -39,4 +40,4 @@ class FileExtractor:
# txt
loader = TextLoader(file_path, autodetect_encoding=True)
return '\n'.join([document.page_content for document in loader.load()]) if return_text else loader.load()
return delimiter.join([document.page_content for document in loader.load()]) if return_text else loader.load()
......@@ -47,6 +47,7 @@ class MarkdownLoader(BaseLoader):
documents = []
metadata = {"source": self._file_path}
for header, value in tups:
value = value.strip()
if header is None:
documents.append(Document(page_content=value, metadata=metadata))
else:
......
......@@ -56,7 +56,7 @@ class PdfLoader(BaseLoader):
# Extract the text from the page
page_text = pdf.pages[page].extract_text()
text_list.append(page_text)
text = "\n".join(text_list)
text = "\n\n".join(text_list)
# save plaintext file for caching
if not plaintext_file_exists and plaintext_file_key:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment