Fix file preview decoding for non-UTF-8 encoded files: detect the encoding with chardet instead of always decoding as UTF-8

66342741 · jyong · ba1fa57d · 66342741 · 66342741
Commit 66342741 authored Jun 16, 2023 by jyong
Show whitespace changes
Inline Side-by-side

Showing with 16 additions and 2 deletions

file.py api/controllers/console/datasets/file.py +3 -1

requirements.txt api/requirements.txt +13 -1

No files found.
--- a/api/controllers/console/datasets/file.py
+++ b/api/controllers/console/datasets/file.py
 import datetime
 import hashlib
 import tempfile
+import chardet
 import time
 import uuid
 from pathlib import Path
@@ -141,7 +142,8 @@ class FilePreviewApi(Resource):
                # ['txt', 'markdown', 'md']
                with open(filepath, "rb") as fp:
                    data = fp.read()
-                    text = data.decode(encoding='utf-8').strip() if data else ''
+                    encoding = chardet.detect(data)['encoding']
+                    text = data.decode(encoding=encoding).strip() if data else ''

        text = text[0:PREVIEW_WORDS_LIMIT] if text else ''
        return {'content': text}

--- a/api/requirements.txt
+++ b/api/requirements.txt
@@ -31,3 +31,15 @@ celery==5.2.7
 redis~=4.5.4
 pypdf==3.8.1
 openpyxl==3.1.2
+requests~=2.28.2
+pydantic~=1.10.8
+SQLAlchemy~=1.4.48
+Werkzeug~=2.3.4
+click~=8.1.3
+blinker~=1.6.2
+numpy~=1.24.3
+itsdangerous~=2.1.2
+botocore~=1.29.146
+alembic~=1.11.1
+pytz~=2022.7.1
+chardet~=5.1.0
\ No newline at end of file