Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
D
dify
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ai-tech
dify
Commits
df150998
Unverified
Commit
df150998
authored
Dec 19, 2023
by
Jyong
Committed by
GitHub
Dec 19, 2023
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
ppt & pptx improve (#1790)
Co-authored-by:
jyong
<
jyong@dify.ai
>
parent
185c2f86
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
31 additions
and
9 deletions
+31
-9
unstructured_ppt.py
api/core/data_loader/loader/unstructured/unstructured_ppt.py
+12
-5
unstructured_pptx.py
...core/data_loader/loader/unstructured/unstructured_pptx.py
+12
-4
indexing_runner.py
api/core/indexing_runner.py
+7
-0
No files found.
api/core/data_loader/loader/unstructured/unstructured_ppt.py
View file @
df150998
...
@@ -30,11 +30,18 @@ class UnstructuredPPTLoader(BaseLoader):
...
@@ -30,11 +30,18 @@ class UnstructuredPPTLoader(BaseLoader):
from
unstructured.partition.ppt
import
partition_ppt
from
unstructured.partition.ppt
import
partition_ppt
elements
=
partition_ppt
(
filename
=
self
.
_file_path
,
api_url
=
self
.
_api_url
)
elements
=
partition_ppt
(
filename
=
self
.
_file_path
,
api_url
=
self
.
_api_url
)
from
unstructured.chunking.title
import
chunk_by_title
text_by_page
=
{}
chunks
=
chunk_by_title
(
elements
,
max_characters
=
2000
,
combine_text_under_n_chars
=
0
)
for
element
in
elements
:
page
=
element
.
metadata
.
page_number
text
=
element
.
text
if
page
in
text_by_page
:
text_by_page
[
page
]
+=
"
\n
"
+
text
else
:
text_by_page
[
page
]
=
text
combined_texts
=
list
(
text_by_page
.
values
())
documents
=
[]
documents
=
[]
for
c
hunk
in
chunk
s
:
for
c
ombined_text
in
combined_text
s
:
text
=
c
hunk
.
text
.
strip
()
text
=
c
ombined_
text
.
strip
()
documents
.
append
(
Document
(
page_content
=
text
))
documents
.
append
(
Document
(
page_content
=
text
))
return
documents
return
documents
api/core/data_loader/loader/unstructured/unstructured_pptx.py
View file @
df150998
...
@@ -30,11 +30,19 @@ class UnstructuredPPTXLoader(BaseLoader):
...
@@ -30,11 +30,19 @@ class UnstructuredPPTXLoader(BaseLoader):
from
unstructured.partition.pptx
import
partition_pptx
from
unstructured.partition.pptx
import
partition_pptx
elements
=
partition_pptx
(
filename
=
self
.
_file_path
,
api_url
=
self
.
_api_url
)
elements
=
partition_pptx
(
filename
=
self
.
_file_path
,
api_url
=
self
.
_api_url
)
from
unstructured.chunking.title
import
chunk_by_title
text_by_page
=
{}
chunks
=
chunk_by_title
(
elements
,
max_characters
=
2000
,
combine_text_under_n_chars
=
0
)
for
element
in
elements
:
page
=
element
.
metadata
.
page_number
text
=
element
.
text
if
page
in
text_by_page
:
text_by_page
[
page
]
+=
"
\n
"
+
text
else
:
text_by_page
[
page
]
=
text
combined_texts
=
list
(
text_by_page
.
values
())
documents
=
[]
documents
=
[]
for
c
hunk
in
chunk
s
:
for
c
ombined_text
in
combined_text
s
:
text
=
c
hunk
.
text
.
strip
()
text
=
c
ombined_
text
.
strip
()
documents
.
append
(
Document
(
page_content
=
text
))
documents
.
append
(
Document
(
page_content
=
text
))
return
documents
return
documents
api/core/indexing_runner.py
View file @
df150998
...
@@ -529,6 +529,13 @@ class IndexingRunner:
...
@@ -529,6 +529,13 @@ class IndexingRunner:
hash
=
helper
.
generate_text_hash
(
document_node
.
page_content
)
hash
=
helper
.
generate_text_hash
(
document_node
.
page_content
)
document_node
.
metadata
[
'doc_id'
]
=
doc_id
document_node
.
metadata
[
'doc_id'
]
=
doc_id
document_node
.
metadata
[
'doc_hash'
]
=
hash
document_node
.
metadata
[
'doc_hash'
]
=
hash
# delete Spliter character
page_content
=
document_node
.
page_content
if
page_content
.
startswith
(
"."
)
or
page_content
.
startswith
(
"。"
):
page_content
=
page_content
[
1
:]
else
:
page_content
=
page_content
document_node
.
page_content
=
page_content
split_documents
.
append
(
document_node
)
split_documents
.
append
(
document_node
)
all_documents
.
extend
(
split_documents
)
all_documents
.
extend
(
split_documents
)
# processing qa document
# processing qa document
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment