Unverified commit 64642fab, authored by Charlie.Wei and committed by GitHub

Parse base64 eml file (#1796)

Co-authored-by: luowei <glpat-EjySCyNjWiLqAED-YmwM>
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
parent 7083a05a
import logging import logging
import re import base64
from typing import Optional, List, Tuple, cast from typing import List
from bs4 import BeautifulSoup
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.helpers import detect_file_encodings
from langchain.schema import Document from langchain.schema import Document
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -11,8 +10,6 @@ logger = logging.getLogger(__name__) ...@@ -11,8 +10,6 @@ logger = logging.getLogger(__name__)
class UnstructuredEmailLoader(BaseLoader): class UnstructuredEmailLoader(BaseLoader):
"""Load msg files. """Load msg files.
Args: Args:
file_path: Path to the file to load. file_path: Path to the file to load.
""" """
...@@ -26,16 +23,28 @@ class UnstructuredEmailLoader(BaseLoader): ...@@ -26,16 +23,28 @@ class UnstructuredEmailLoader(BaseLoader):
self._file_path = file_path self._file_path = file_path
self._api_url = api_url self._api_url = api_url
def load(self) -> List[Document]: def load(self) -> List[Document]:
from unstructured.partition.email import partition_email from unstructured.partition.email import partition_email
elements = partition_email(filename=self._file_path, api_url=self._api_url) elements = partition_email(filename=self._file_path, api_url=self._api_url)
# noinspection PyBroadException
try:
for element in elements:
element_text = element.text.strip()
padding_needed = 4 - len(element_text) % 4
element_text += '=' * padding_needed
element_decode = base64.b64decode(element_text)
soup = BeautifulSoup(element_decode.decode('utf-8'), 'html.parser')
element.text = soup.get_text()
except Exception:
pass
from unstructured.chunking.title import chunk_by_title from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0) chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
documents = [] documents = []
for chunk in chunks: for chunk in chunks:
text = chunk.text.strip() text = chunk.text.strip()
documents.append(Document(page_content=text)) documents.append(Document(page_content=text))
return documents return documents
...@@ -55,4 +55,6 @@ pymilvus==2.3.0 ...@@ -55,4 +55,6 @@ pymilvus==2.3.0
qdrant-client==1.6.4 qdrant-client==1.6.4
cohere~=4.32 cohere~=4.32
unstructured~=0.10.27 unstructured~=0.10.27
unstructured[docx,pptx]~=0.10.27 unstructured[docx,pptx]~=0.10.27
\ No newline at end of file bs4~=0.0.1
markdown~=3.5.1
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment