Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
D
dify
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ai-tech
dify
Commits
2bf48514
Unverified
Commit
2bf48514
authored
Jun 06, 2023
by
Jyong
Committed by
GitHub
Jun 06, 2023
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix markdown parser (#230)
parent
c109b1a9
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
113 additions
and
2 deletions
+113
-2
markdown_parser.py
api/core/index/readers/markdown_parser.py
+111
-0
indexing_runner.py
api/core/indexing_runner.py
+2
-2
No files found.
api/core/index/readers/markdown_parser.py
0 → 100644
View file @
2bf48514
"""Markdown parser.
Contains parser for md files.
"""
import
re
from
pathlib
import
Path
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Tuple
,
Union
,
cast
from
llama_index.readers.file.base_parser
import
BaseParser
class MarkdownParser(BaseParser):
    """Markdown parser.

    Extract text from markdown files by splitting them into sections at
    ATX-style header lines (one or more ``#`` followed by whitespace).

    ``parse_tups`` returns ``(header, text)`` tuples in document order and
    ``parse_file`` renders those tuples as a list of strings.
    """

    def __init__(
        self,
        *args: Any,
        remove_hyperlinks: bool = True,
        remove_images: bool = True,
        **kwargs: Any,
    ) -> None:
        """Init params.

        Args:
            *args: Forwarded unchanged to ``BaseParser.__init__``.
            remove_hyperlinks: When true, ``parse_tups`` replaces
                ``[text](url)`` links with just ``text``.
            remove_images: When true, ``parse_tups`` strips ``![[...]]``
                image embeds.
            **kwargs: Forwarded unchanged to ``BaseParser.__init__``.
        """
        super().__init__(*args, **kwargs)
        self._remove_hyperlinks = remove_hyperlinks
        self._remove_images = remove_images

    @staticmethod
    def _clean_header(header_line: str) -> str:
        """Return the title of a header line without its ``#`` markers."""
        # Only strip the leading marker and an optional closing run of
        # hashes ("## Title ##"); hashes inside the title ("C#") are kept.
        title = re.sub(r"^#+\s+", "", header_line)
        title = re.sub(r"(^|\s+)#+\s*$", "", title)
        return title.strip()

    def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
        """Convert markdown text to a list of ``(header, text)`` tuples.

        Args:
            markdown_text: Raw markdown content.

        Returns:
            One tuple per section, in document order.

            * If the text contains no header line, a single
              ``(None, text)`` tuple whose text has all newlines removed.
            * Otherwise one tuple per header, where the header has its
              ``#`` markers stripped and the text has ``<...>`` tags
              removed.  Non-blank text that precedes the first header is
              kept as a leading ``(None, text)`` tuple instead of being
              discarded.
        """
        sections: List[Tuple[Optional[str], str]] = []
        current_header: Optional[str] = None
        current_lines: List[str] = []

        for line in markdown_text.split("\n"):
            if re.match(r"^#+\s", line):
                body = "".join(current_lines)
                # Flush the section that just ended.  The preamble (no
                # header yet) is only kept when it has real content.
                if current_header is not None or body.strip():
                    sections.append((current_header, body))
                current_header = line
                current_lines = []
            else:
                current_lines.append(line + "\n")
        sections.append((current_header, "".join(current_lines)))

        if current_header is None:
            # No header anywhere in the document: single untitled section.
            return [(key, re.sub("\n", "", value)) for key, value in sections]

        return [
            (
                None if key is None else self._clean_header(key),
                # "." does not match newlines, so only tags that open and
                # close on the same line are removed.
                re.sub(r"<.*?>", "", value),
            )
            for key, value in sections
        ]

    def remove_images(self, content: str) -> str:
        """Remove ``![[...]]`` image embeds from the content.

        The match is non-greedy so that two embeds on the same line do not
        swallow the text between them.
        """
        pattern = r"!\[\[(.*?)\]\]"
        return re.sub(pattern, "", content)

    def remove_hyperlinks(self, content: str) -> str:
        """Replace ``[text](url)`` hyperlinks with their link text."""
        pattern = r"\[(.*?)\]\((.*?)\)"
        return re.sub(pattern, r"\1", content)

    def _init_parser(self) -> Dict:
        """Initialize the parser with the config.

        This parser needs no configuration, so an empty dict is returned.
        """
        return {}

    def parse_tups(
        self, filepath: Path, errors: str = "ignore"
    ) -> List[Tuple[Optional[str], str]]:
        """Parse file into tuples.

        Args:
            filepath: Path of the markdown file, read as UTF-8.
            errors: Decoding error handler passed to ``open`` (for example
                ``"ignore"``, ``"replace"`` or ``"strict"``).

        Returns:
            The ``(header, text)`` tuples produced by ``markdown_to_tups``
            after the optional hyperlink and image clean-up.
        """
        # Forward ``errors`` so undecodable bytes are handled as requested
        # instead of always raising UnicodeDecodeError.
        with open(filepath, "r", encoding="utf-8", errors=errors) as f:
            content = f.read()
        if self._remove_hyperlinks:
            content = self.remove_hyperlinks(content)
        if self._remove_images:
            content = self.remove_images(content)
        return self.markdown_to_tups(content)

    def parse_file(
        self, filepath: Path, errors: str = "ignore"
    ) -> Union[str, List[str]]:
        """Parse file into a list of strings, one per section.

        Args:
            filepath: Path of the markdown file.
            errors: Decoding error handler, forwarded to ``parse_tups``.

        Returns:
            One string per section.  Sections without a header contribute
            their text as is; sections with a header are rendered as
            ``"\\n\\n{header}\\n{text}"``.
        """
        tups = self.parse_tups(filepath, errors=errors)
        results = []
        for header, value in tups:
            if header is None:
                results.append(value)
            else:
                results.append(f"\n\n{header}\n{value}")
        return results
api/core/indexing_runner.py
View file @
2bf48514
...
@@ -12,11 +12,10 @@ from llama_index.data_structs import Node
...
@@ -12,11 +12,10 @@ from llama_index.data_structs import Node
from
llama_index.data_structs.node_v2
import
DocumentRelationship
from
llama_index.data_structs.node_v2
import
DocumentRelationship
from
llama_index.node_parser
import
SimpleNodeParser
,
NodeParser
from
llama_index.node_parser
import
SimpleNodeParser
,
NodeParser
from
llama_index.readers.file.base
import
DEFAULT_FILE_EXTRACTOR
from
llama_index.readers.file.base
import
DEFAULT_FILE_EXTRACTOR
from
llama_index.readers.file.markdown_parser
import
MarkdownParser
from
core.docstore.dataset_docstore
import
DatesetDocumentStore
from
core.docstore.dataset_docstore
import
DatesetDocumentStore
from
core.index.keyword_table_index
import
KeywordTableIndex
from
core.index.keyword_table_index
import
KeywordTableIndex
from
core.index.readers.html_parser
import
HTMLParser
from
core.index.readers.html_parser
import
HTMLParser
from
core.index.readers.markdown_parser
import
MarkdownParser
from
core.index.readers.pdf_parser
import
PDFParser
from
core.index.readers.pdf_parser
import
PDFParser
from
core.index.spiltter.fixed_text_splitter
import
FixedRecursiveCharacterTextSplitter
from
core.index.spiltter.fixed_text_splitter
import
FixedRecursiveCharacterTextSplitter
from
core.index.vector_index
import
VectorIndex
from
core.index.vector_index
import
VectorIndex
...
@@ -247,6 +246,7 @@ class IndexingRunner:
...
@@ -247,6 +246,7 @@ class IndexingRunner:
file_extractor
=
DEFAULT_FILE_EXTRACTOR
.
copy
()
file_extractor
=
DEFAULT_FILE_EXTRACTOR
.
copy
()
file_extractor
[
".markdown"
]
=
MarkdownParser
()
file_extractor
[
".markdown"
]
=
MarkdownParser
()
file_extractor
[
".md"
]
=
MarkdownParser
()
file_extractor
[
".html"
]
=
HTMLParser
()
file_extractor
[
".html"
]
=
HTMLParser
()
file_extractor
[
".htm"
]
=
HTMLParser
()
file_extractor
[
".htm"
]
=
HTMLParser
()
file_extractor
[
".pdf"
]
=
PDFParser
({
'upload_file'
:
upload_file
})
file_extractor
[
".pdf"
]
=
PDFParser
({
'upload_file'
:
upload_file
})
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment