Unverified commit 998f819b, authored by crazywoola and committed by GitHub

use sub to operate all (#475)

parent 6194b827
...@@ -346,10 +346,10 @@ class IndexingRunner: ...@@ -346,10 +346,10 @@ class IndexingRunner:
return text_docs return text_docs
def filter_string(self, text):
    """Return ``text`` with ``<|``/``|>`` delimiters flattened and control characters removed.

    Args:
        text: The string to clean.

    Returns:
        A new string in which every ``<|`` has become ``<``, every ``|>``
        has become ``>``, and all control characters other than tab,
        line feed and carriage return have been deleted.
    """
    # Flatten "<|" and "|>" into plain angle brackets.
    # NOTE(review): presumably this keeps markers such as "<|endoftext|>"
    # from being read as special tokens downstream -- confirm with callers.
    text = re.sub(r'<\|', '<', text)
    text = re.sub(r'\|>', '>', text)

    # Strip C0 controls (except \t, \n, \r), DEL, and C1 controls.
    # The class deliberately stops at \x9F: ``text`` is a ``str``, so a
    # range reaching \xFF would also delete printable Latin-1 characters
    # (e.g. "é", "ñ", "ü") and corrupt non-English text.
    text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]', '', text)
    return text
def _get_splitter(self, processing_rule: DatasetProcessRule) -> TextSplitter: def _get_splitter(self, processing_rule: DatasetProcessRule) -> TextSplitter:
""" """
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment