Commit 7a7fb8c6 authored by jyong

improve pdf import

parent a92b028a
import logging import logging
from typing import List, Optional from typing import List, Optional
from langchain.document_loaders import PyPDFium2Loader
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.base import BaseLoader
from langchain.schema import Document from langchain.schema import Document
from pypdf import PdfReader from pypdf import PdfReader
...@@ -42,20 +43,10 @@ class PdfLoader(BaseLoader): ...@@ -42,20 +43,10 @@ class PdfLoader(BaseLoader):
return [Document(page_content=text, metadata=metadata)] return [Document(page_content=text, metadata=metadata)]
except FileNotFoundError: except FileNotFoundError:
pass pass
documents = PyPDFium2Loader(file_path=self._file_path).load()
text_list = [] text_list = []
with open(self._file_path, "rb") as fp: for document in documents:
# Create a PDF object text_list.append(document.page_content)
pdf = PdfReader(fp)
# Get the number of pages in the PDF document
num_pages = len(pdf.pages)
# Iterate over every page
for page in range(num_pages):
# Extract the text from the page
page_text = pdf.pages[page].extract_text()
text_list.append(page_text)
text = "\n\n".join(text_list) text = "\n\n".join(text_list)
# save plaintext file for caching # save plaintext file for caching
...@@ -63,5 +54,5 @@ class PdfLoader(BaseLoader): ...@@ -63,5 +54,5 @@ class PdfLoader(BaseLoader):
storage.save(plaintext_file_key, text.encode('utf-8')) storage.save(plaintext_file_key, text.encode('utf-8'))
metadata = {"source": self._file_path} metadata = {"source": self._file_path}
return [Document(page_content=text, metadata=metadata)] return documents
...@@ -32,3 +32,4 @@ pypdf==3.8.1 ...@@ -32,3 +32,4 @@ pypdf==3.8.1
openpyxl==3.1.2 openpyxl==3.1.2
chardet~=5.1.0 chardet~=5.1.0
docx2txt==0.8 docx2txt==0.8
pypdfium2==4.16.0
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment