Commit 66342741 authored by jyong

Fix the issue of decoding a non-UTF-8 encoded file using UTF-8

parent ba1fa57d
import datetime
import hashlib
import tempfile
import chardet
import time
import uuid
from pathlib import Path
......@@ -141,7 +142,8 @@ class FilePreviewApi(Resource):
# ['txt', 'markdown', 'md']
with open(filepath, "rb") as fp:
data = fp.read()
text = data.decode(encoding='utf-8').strip() if data else ''
encoding = chardet.detect(data)['encoding']
text = data.decode(encoding=encoding).strip() if data else ''
text = text[0:PREVIEW_WORDS_LIMIT] if text else ''
return {'content': text}
......
......@@ -31,3 +31,15 @@ celery==5.2.7
redis~=4.5.4
pypdf==3.8.1
openpyxl==3.1.2
requests~=2.28.2
pydantic~=1.10.8
SQLAlchemy~=1.4.48
Werkzeug~=2.3.4
click~=8.1.3
blinker~=1.6.2
numpy~=1.24.3
itsdangerous~=2.1.2
botocore~=1.29.146
alembic~=1.11.1
pytz~=2022.7.1
chardet~=5.1.0
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment