Unverified Commit 4a3d15b6 authored by Jyong, committed by GitHub

fix custom splitter character (#1915)

Co-authored-by: jyong <jyong@dify.ai>
parent a798dcfa
@@ -65,7 +65,8 @@ class FileExtractor:
     elif file_extension == '.pdf':
         loader = PdfLoader(file_path, upload_file=upload_file)
     elif file_extension in ['.md', '.markdown']:
-        loader = UnstructuredMarkdownLoader(file_path, unstructured_api_url)
+        loader = UnstructuredMarkdownLoader(file_path, unstructured_api_url) if is_automatic \
+            else MarkdownLoader(file_path, autodetect_encoding=True)
     elif file_extension in ['.htm', '.html']:
         loader = HTMLLoader(file_path)
     elif file_extension == '.docx':
@@ -84,7 +85,8 @@ class FileExtractor:
         loader = UnstructuredXmlLoader(file_path, unstructured_api_url)
     else:
         # txt
-        loader = UnstructuredTextLoader(file_path, unstructured_api_url)
+        loader = UnstructuredTextLoader(file_path, unstructured_api_url) if is_automatic \
+            else TextLoader(file_path, autodetect_encoding=True)
 else:
     if file_extension == '.xlsx':
         loader = ExcelLoader(file_path)
......
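Both FileExtractor hunks apply the same pattern: in automatic mode the file keeps going through the Unstructured API loaders, while custom rules fall back to the plain loaders with encoding autodetection, so the user's splitter character is applied to the raw text. A minimal sketch of that selection, assuming the loader classes FileExtractor already imports (the helper name below is hypothetical, not part of the commit):

    # Hypothetical helper mirroring the selection introduced above; the loader
    # classes are the ones FileExtractor already imports, so imports are omitted.
    def pick_loader(file_path: str, file_extension: str, is_automatic: bool,
                    unstructured_api_url: str):
        if file_extension in ['.md', '.markdown']:
            # automatic: let the Unstructured API segment the markdown;
            # custom: read it verbatim so the custom separator survives
            return UnstructuredMarkdownLoader(file_path, unstructured_api_url) if is_automatic \
                else MarkdownLoader(file_path, autodetect_encoding=True)
        # default: treat anything unrecognised as plain text
        return UnstructuredTextLoader(file_path, unstructured_api_url) if is_automatic \
            else TextLoader(file_path, autodetect_encoding=True)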
@@ -59,7 +59,7 @@ class IndexingRunner:
     first()
 # load file
-text_docs = self._load_data(dataset_document)
+text_docs = self._load_data(dataset_document, processing_rule.mode == 'automatic')
 # get splitter
 splitter = self._get_splitter(processing_rule)
@@ -113,15 +113,14 @@ class IndexingRunner:
 for document_segment in document_segments:
     db.session.delete(document_segment)
 db.session.commit()
-# load file
-text_docs = self._load_data(dataset_document)
 # get the process rule
 processing_rule = db.session.query(DatasetProcessRule). \
     filter(DatasetProcessRule.id == dataset_document.dataset_process_rule_id). \
     first()
+# load file
+text_docs = self._load_data(dataset_document, processing_rule.mode == 'automatic')
 # get splitter
 splitter = self._get_splitter(processing_rule)
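Fetching the DatasetProcessRule before loading matters because the same rule drives both the loader choice and the splitter. This diff does not show how the custom splitter character reaches the splitter; the sketch below is an assumption about the rule-JSON layout (the 'segmentation' keys) and uses a generic LangChain splitter purely for illustration:

    import json
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    def build_splitter(processing_rule):
        # assumed rule layout: {"segmentation": {"separator": "***", "max_tokens": 500}}
        if processing_rule.mode == 'custom':
            segmentation = json.loads(processing_rule.rules)['segmentation']
            return RecursiveCharacterTextSplitter(
                separators=[segmentation['separator'], '\n'],
                chunk_size=segmentation['max_tokens'],
                chunk_overlap=0
            )
        # automatic mode: default separators and sizing
        return RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)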
@@ -238,14 +237,15 @@ class IndexingRunner:
 preview_texts = []
 total_segments = 0
 for file_detail in file_details:
-    # load data from file
-    text_docs = FileExtractor.load(file_detail)
     processing_rule = DatasetProcessRule(
         mode=tmp_processing_rule["mode"],
         rules=json.dumps(tmp_processing_rule["rules"])
     )
+    # load data from file
+    text_docs = FileExtractor.load(file_detail, is_automatic=processing_rule.mode == 'automatic')
     # get splitter
     splitter = self._get_splitter(processing_rule)
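In the indexing-estimate path the rule is now built before the file is read, so the preview mirrors real indexing. A rough end-to-end sketch of that loop, reusing the hypothetical build_splitter above (everything except FileExtractor.load and DatasetProcessRule is an assumption):

    preview_texts = []
    for file_detail in file_details:
        processing_rule = DatasetProcessRule(
            mode=tmp_processing_rule["mode"],
            rules=json.dumps(tmp_processing_rule["rules"])
        )
        # custom mode loads the raw text; automatic mode goes through Unstructured
        text_docs = FileExtractor.load(
            file_detail, is_automatic=processing_rule.mode == 'automatic'
        )
        splitter = build_splitter(processing_rule)
        for doc in text_docs:
            preview_texts.extend(splitter.split_text(doc.page_content))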
@@ -459,7 +459,7 @@ class IndexingRunner:
     one_or_none()
 if file_detail:
-    text_docs = FileExtractor.load(file_detail, is_automatic=True)
+    text_docs = FileExtractor.load(file_detail, is_automatic=automatic)
 elif dataset_document.data_source_type == 'notion_import':
     loader = NotionLoader.from_document(dataset_document)
     text_docs = loader.load()
......
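The last hunk suggests _load_data itself now receives the mode as a flag instead of hard-coding is_automatic=True for uploaded files. A minimal sketch of the assumed signature (the bare name `automatic` appears in the diff; the default value and the elided lookup are assumptions):

    def _load_data(self, dataset_document, automatic: bool = False) -> list:
        if dataset_document.data_source_type == 'upload_file':
            file_detail = ...  # UploadFile row resolved from the document's data source info
            return FileExtractor.load(file_detail, is_automatic=automatic)
        if dataset_document.data_source_type == 'notion_import':
            loader = NotionLoader.from_document(dataset_document)
            return loader.load()
        return []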