Commit f2605278 authored by jyong

ascii filter Unicode U+FFFE

parent b0e114b9
......@@ -528,11 +528,13 @@ class IndexingRunner:
return text_docs
65534
def filter_string(self, text):
    """Strip unwanted delimiters and unprintable characters from ``text``.

    The sequences ``<|`` and ``|>`` are collapsed to ``<`` and ``>``.
    C0 control characters (except tab, line feed and carriage return),
    DEL (U+007F) and the noncharacter U+FFFE are removed. All other
    characters, including accented Latin-1 letters, are left untouched.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = text.replace('<|', '<').replace('|>', '>')
    # U+FFFE is matched by code point. Listing its UTF-8 bytes
    # (\xEF\xBF\xBE) or the range \xA0-\xFF in a str pattern would match
    # Latin-1 characters such as 'é', 'ñ' and '¿' and silently delete them.
    return re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\uFFFE]', '', text)
def _get_splitter(self, processing_rule: DatasetProcessRule,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment