Unverified Commit 2eea114a authored by Jyong's avatar Jyong Committed by GitHub

fix special code (#473)

parent 97e9ebd2
......@@ -235,7 +235,8 @@ class IndexingRunner:
if len(preview_texts) < 5:
preview_texts.append(document.page_content)
tokens += TokenCalculator.get_num_tokens(self.embedding_model_name, document.page_content)
tokens += TokenCalculator.get_num_tokens(self.embedding_model_name,
self.filter_string(document.page_content))
return {
"total_segments": total_segments,
......@@ -345,6 +346,8 @@ class IndexingRunner:
return text_docs
def filter_string(self, text):
    """Return *text* with special-token brackets and control characters removed.

    Two clean-ups are applied, in this order:

    1. Every ``<|`` becomes ``<`` and every ``|>`` becomes ``>``, so a
       marker such as ``<|endoftext|>`` turns into ``<endoftext>``.
       NOTE(review): presumably this keeps such markers from being treated
       as special tokens downstream -- confirm against the tokenizer used.
    2. Non-printable control characters are deleted: the C0 controls except
       TAB, LF and CR, then DEL (U+007F) and the C1 controls
       (U+0080-U+009F).

    Printable Latin-1 Supplement characters (U+00A0-U+00FF, e.g. accented
    letters) are preserved; stripping them would corrupt non-English text.

    :param text: the string to clean.
    :return: the cleaned string.
    """
    text = text.replace('<|', '<')
    text = text.replace('|>', '>')
    # The class stops at \x9F on purpose: \xA0-\xFF are printable characters
    # in a str, not stray bytes, and must survive the filter.
    pattern = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]')
    return pattern.sub('', text)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment