Unverified commit 1f48e3d4, authored by crazywoola and committed by GitHub

feat: support legacy doc (#2100)

parent 0113627d
...@@ -9,7 +9,7 @@ from flask import current_app, request ...@@ -9,7 +9,7 @@ from flask import current_app, request
from flask_login import current_user from flask_login import current_user
from flask_restful import Resource, marshal_with from flask_restful import Resource, marshal_with
from libs.login import login_required from libs.login import login_required
from services.file_service import FileService from services.file_service import FileService, ALLOWED_EXTENSIONS, UNSTRUSTURED_ALLOWED_EXTENSIONS
PREVIEW_WORDS_LIMIT = 3000 PREVIEW_WORDS_LIMIT = 3000
...@@ -71,11 +71,7 @@ class FileSupportTypeApi(Resource): ...@@ -71,11 +71,7 @@ class FileSupportTypeApi(Resource):
@account_initialization_required @account_initialization_required
def get(self): def get(self):
etl_type = current_app.config['ETL_TYPE'] etl_type = current_app.config['ETL_TYPE']
if etl_type == 'Unstructured': allowed_extensions = UNSTRUSTURED_ALLOWED_EXTENSIONS if etl_type == 'Unstructured' else ALLOWED_EXTENSIONS
allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml']
else:
allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
return {'allowed_extensions': allowed_extensions} return {'allowed_extensions': allowed_extensions}
......
...@@ -27,7 +27,7 @@ USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTM ...@@ -27,7 +27,7 @@ USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTM
class FileExtractor: class FileExtractor:
@classmethod @classmethod
def load(cls, upload_file: UploadFile, return_text: bool = False, is_automatic: bool = False) -> Union[List[Document] | str]: def load(cls, upload_file: UploadFile, return_text: bool = False, is_automatic: bool = False) -> Union[List[Document], str]:
with tempfile.TemporaryDirectory() as temp_dir: with tempfile.TemporaryDirectory() as temp_dir:
suffix = Path(upload_file.key).suffix suffix = Path(upload_file.key).suffix
file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}" file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}"
...@@ -36,7 +36,7 @@ class FileExtractor: ...@@ -36,7 +36,7 @@ class FileExtractor:
return cls.load_from_file(file_path, return_text, upload_file, is_automatic) return cls.load_from_file(file_path, return_text, upload_file, is_automatic)
@classmethod @classmethod
def load_from_url(cls, url: str, return_text: bool = False) -> Union[List[Document] | str]: def load_from_url(cls, url: str, return_text: bool = False) -> Union[List[Document], str]:
response = requests.get(url, headers={ response = requests.get(url, headers={
"User-Agent": USER_AGENT "User-Agent": USER_AGENT
}) })
...@@ -52,7 +52,7 @@ class FileExtractor: ...@@ -52,7 +52,7 @@ class FileExtractor:
@classmethod @classmethod
def load_from_file(cls, file_path: str, return_text: bool = False, def load_from_file(cls, file_path: str, return_text: bool = False,
upload_file: Optional[UploadFile] = None, upload_file: Optional[UploadFile] = None,
is_automatic: bool = False) -> Union[List[Document] | str]: is_automatic: bool = False) -> Union[List[Document], str]:
input_file = Path(file_path) input_file = Path(file_path)
delimiter = '\n' delimiter = '\n'
file_extension = input_file.suffix.lower() file_extension = input_file.suffix.lower()
...@@ -68,7 +68,7 @@ class FileExtractor: ...@@ -68,7 +68,7 @@ class FileExtractor:
else MarkdownLoader(file_path, autodetect_encoding=True) else MarkdownLoader(file_path, autodetect_encoding=True)
elif file_extension in ['.htm', '.html']: elif file_extension in ['.htm', '.html']:
loader = HTMLLoader(file_path) loader = HTMLLoader(file_path)
elif file_extension == '.docx': elif file_extension in ['.docx', '.doc']:
loader = Docx2txtLoader(file_path) loader = Docx2txtLoader(file_path)
elif file_extension == '.csv': elif file_extension == '.csv':
loader = CSVLoader(file_path, autodetect_encoding=True) loader = CSVLoader(file_path, autodetect_encoding=True)
...@@ -95,7 +95,7 @@ class FileExtractor: ...@@ -95,7 +95,7 @@ class FileExtractor:
loader = MarkdownLoader(file_path, autodetect_encoding=True) loader = MarkdownLoader(file_path, autodetect_encoding=True)
elif file_extension in ['.htm', '.html']: elif file_extension in ['.htm', '.html']:
loader = HTMLLoader(file_path) loader = HTMLLoader(file_path)
elif file_extension == '.docx': elif file_extension in ['.docx', '.doc']:
loader = Docx2txtLoader(file_path) loader = Docx2txtLoader(file_path)
elif file_extension == '.csv': elif file_extension == '.csv':
loader = CSVLoader(file_path, autodetect_encoding=True) loader = CSVLoader(file_path, autodetect_encoding=True)
......
import logging import logging
import re from typing import List
from typing import List, Optional, Tuple, cast
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.helpers import detect_file_encodings
from langchain.schema import Document from langchain.schema import Document
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
......
import logging import logging
import re from typing import List
from typing import List, Optional, Tuple, cast
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.helpers import detect_file_encodings
from langchain.schema import Document from langchain.schema import Document
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class UnstructuredPPTLoader(BaseLoader): class UnstructuredPPTLoader(BaseLoader):
"""Load msg files. """Load msg files.
......
import logging import logging
import re from typing import List
from typing import List, Optional, Tuple, cast
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.helpers import detect_file_encodings
from langchain.schema import Document from langchain.schema import Document
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class UnstructuredPPTXLoader(BaseLoader): class UnstructuredPPTXLoader(BaseLoader):
"""Load msg files. """Load msg files.
......
import logging import logging
import re from typing import List
from typing import List, Optional, Tuple, cast
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.helpers import detect_file_encodings
from langchain.schema import Document from langchain.schema import Document
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
......
import logging import logging
import re from typing import List
from typing import List, Optional, Tuple, cast
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.helpers import detect_file_encodings
from langchain.schema import Document from langchain.schema import Document
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
......
...@@ -15,9 +15,10 @@ from services.errors.file import FileTooLargeError, UnsupportedFileTypeError ...@@ -15,9 +15,10 @@ from services.errors.file import FileTooLargeError, UnsupportedFileTypeError
from werkzeug.datastructures import FileStorage from werkzeug.datastructures import FileStorage
from werkzeug.exceptions import NotFound from werkzeug.exceptions import NotFound
ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv',
'jpg', 'jpeg', 'png', 'webp', 'gif', 'svg']
IMAGE_EXTENSIONS = ['jpg', 'jpeg', 'png', 'webp', 'gif', 'svg'] IMAGE_EXTENSIONS = ['jpg', 'jpeg', 'png', 'webp', 'gif', 'svg']
ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'doc', 'csv'] + IMAGE_EXTENSIONS
UNSTRUSTURED_ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
'docx', 'doc', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml'] + IMAGE_EXTENSIONS
PREVIEW_WORDS_LIMIT = 3000 PREVIEW_WORDS_LIMIT = 3000
...@@ -27,13 +28,7 @@ class FileService: ...@@ -27,13 +28,7 @@ class FileService:
def upload_file(file: FileStorage, user: Union[Account, EndUser], only_image: bool = False) -> UploadFile: def upload_file(file: FileStorage, user: Union[Account, EndUser], only_image: bool = False) -> UploadFile:
extension = file.filename.split('.')[-1] extension = file.filename.split('.')[-1]
etl_type = current_app.config['ETL_TYPE'] etl_type = current_app.config['ETL_TYPE']
if etl_type == 'Unstructured': allowed_extensions = UNSTRUSTURED_ALLOWED_EXTENSIONS if etl_type == 'Unstructured' else ALLOWED_EXTENSIONS
allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml',
'jpg', 'jpeg', 'png', 'webp', 'gif', 'svg']
else:
allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv',
'jpg', 'jpeg', 'png', 'webp', 'gif', 'svg']
if extension.lower() not in allowed_extensions: if extension.lower() not in allowed_extensions:
raise UnsupportedFileTypeError() raise UnsupportedFileTypeError()
elif only_image and extension.lower() not in IMAGE_EXTENSIONS: elif only_image and extension.lower() not in IMAGE_EXTENSIONS:
...@@ -133,13 +128,7 @@ class FileService: ...@@ -133,13 +128,7 @@ class FileService:
# extract text from file # extract text from file
extension = upload_file.extension extension = upload_file.extension
etl_type = current_app.config['ETL_TYPE'] etl_type = current_app.config['ETL_TYPE']
if etl_type == 'Unstructured': allowed_extensions = UNSTRUSTURED_ALLOWED_EXTENSIONS if etl_type == 'Unstructured' else ALLOWED_EXTENSIONS
allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml',
'jpg', 'jpeg', 'png', 'webp', 'gif', 'svg']
else:
allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv',
'jpg', 'jpeg', 'png', 'webp', 'gif', 'svg']
if extension.lower() not in allowed_extensions: if extension.lower() not in allowed_extensions:
raise UnsupportedFileTypeError() raise UnsupportedFileTypeError()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment