Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
D
dify
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ai-tech
dify
Commits
0039be67
Commit
0039be67
authored
Jun 20, 2023
by
John Wang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix: loader bugs
parent
1d1c56b5
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
4 additions
and
2 deletions
+4
-2
file_extractor.py
api/core/data_loader/file_extractor.py
+2
-1
markdown.py
api/core/data_loader/loader/markdown.py
+1
-0
pdf.py
api/core/data_loader/loader/pdf.py
+1
-1
No files found.
api/core/data_loader/file_extractor.py
View file @
0039be67
...
...
@@ -23,6 +23,7 @@ class FileExtractor:
storage
.
download
(
upload_file
.
key
,
file_path
)
input_file
=
Path
(
file_path
)
delimiter
=
'
\n
'
if
input_file
.
suffix
==
'.xlsx'
:
loader
=
ExcelLoader
(
file_path
)
elif
input_file
.
suffix
==
'.pdf'
:
...
...
@@ -39,4 +40,4 @@ class FileExtractor:
# txt
loader
=
TextLoader
(
file_path
,
autodetect_encoding
=
True
)
return
'
\n
'
.
join
([
document
.
page_content
for
document
in
loader
.
load
()])
if
return_text
else
loader
.
load
()
return
delimiter
.
join
([
document
.
page_content
for
document
in
loader
.
load
()])
if
return_text
else
loader
.
load
()
api/core/data_loader/loader/markdown.py
View file @
0039be67
...
...
@@ -47,6 +47,7 @@ class MarkdownLoader(BaseLoader):
documents
=
[]
metadata
=
{
"source"
:
self
.
_file_path
}
for
header
,
value
in
tups
:
value
=
value
.
strip
()
if
header
is
None
:
documents
.
append
(
Document
(
page_content
=
value
,
metadata
=
metadata
))
else
:
...
...
api/core/data_loader/loader/pdf.py
View file @
0039be67
...
...
@@ -56,7 +56,7 @@ class PdfLoader(BaseLoader):
# Extract the text from the page
page_text
=
pdf
.
pages
[
page
]
.
extract_text
()
text_list
.
append
(
page_text
)
text
=
"
\n
"
.
join
(
text_list
)
text
=
"
\n
\n
"
.
join
(
text_list
)
# save plaintext file for caching
if
not
plaintext_file_exists
and
plaintext_file_key
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment