Unverified commit b1635457, authored by Bowen Liang and committed by GitHub

Use `python-docx` to extract docx files (#2654)

parent c0b82f8e
...@@ -10,7 +10,7 @@ from core.rag.models.document import Document ...@@ -10,7 +10,7 @@ from core.rag.models.document import Document
class WordExtractor(BaseExtractor): class WordExtractor(BaseExtractor):
"""Load pdf files. """Load docx files.
Args: Args:
...@@ -46,14 +46,16 @@ class WordExtractor(BaseExtractor): ...@@ -46,14 +46,16 @@ class WordExtractor(BaseExtractor):
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
"""Load given path as single page.""" """Load given path as single page."""
import docx2txt from docx import Document as docx_Document
return [ document = docx_Document(self.file_path)
Document( doc_texts = [paragraph.text for paragraph in document.paragraphs]
page_content=docx2txt.process(self.file_path), content = '\n'.join(doc_texts)
metadata={"source": self.file_path},
) return [Document(
] page_content=content,
metadata={"source": self.file_path},
)]
@staticmethod @staticmethod
def _is_valid_url(url: str) -> bool: def _is_valid_url(url: str) -> bool:
......
...@@ -32,7 +32,7 @@ celery==5.2.7 ...@@ -32,7 +32,7 @@ celery==5.2.7
redis~=4.5.4 redis~=4.5.4
openpyxl==3.1.2 openpyxl==3.1.2
chardet~=5.1.0 chardet~=5.1.0
docx2txt==0.8 python-docx~=1.1.0
pypdfium2==4.16.0 pypdfium2==4.16.0
resend~=0.7.0 resend~=0.7.0
pyjwt~=2.8.0 pyjwt~=2.8.0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment