Commit 66342741 authored by jyong

Fix the issue of decoding a non-UTF-8 encoded file using UTF-8

parent ba1fa57d
import datetime import datetime
import hashlib import hashlib
import tempfile import tempfile
import chardet
import time import time
import uuid import uuid
from pathlib import Path from pathlib import Path
...@@ -141,7 +142,8 @@ class FilePreviewApi(Resource): ...@@ -141,7 +142,8 @@ class FilePreviewApi(Resource):
# ['txt', 'markdown', 'md'] # ['txt', 'markdown', 'md']
with open(filepath, "rb") as fp: with open(filepath, "rb") as fp:
data = fp.read() data = fp.read()
text = data.decode(encoding='utf-8').strip() if data else '' encoding = chardet.detect(data)['encoding']
text = data.decode(encoding=encoding).strip() if data else ''
text = text[0:PREVIEW_WORDS_LIMIT] if text else '' text = text[0:PREVIEW_WORDS_LIMIT] if text else ''
return {'content': text} return {'content': text}
......
...@@ -30,4 +30,16 @@ jieba==0.42.1 ...@@ -30,4 +30,16 @@ jieba==0.42.1
celery==5.2.7 celery==5.2.7
redis~=4.5.4 redis~=4.5.4
pypdf==3.8.1 pypdf==3.8.1
openpyxl==3.1.2 openpyxl==3.1.2
\ No newline at end of file requests~=2.28.2
pydantic~=1.10.8
SQLAlchemy~=1.4.48
Werkzeug~=2.3.4
click~=8.1.3
blinker~=1.6.2
numpy~=1.24.3
itsdangerous~=2.1.2
botocore~=1.29.146
alembic~=1.11.1
pytz~=2022.7.1
chardet~=5.1.0
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment