Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
D
dify
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ai-tech
dify
Commits
5e34f938
Unverified
Commit
5e34f938
authored
Dec 18, 2023
by
Jyong
Committed by
GitHub
Dec 18, 2023
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Feat/add unstructured support (#1780)
Co-authored-by:
jyong
<
jyong@dify.ai
>
parent
2fd56cb0
Changes
15
Hide whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
361 additions
and
14 deletions
+361
-14
.env.example
api/.env.example
+3
-0
config.py
api/config.py
+5
-1
file.py
api/controllers/console/datasets/file.py
+15
-0
file_extractor.py
api/core/data_loader/file_extractor.py
+37
-9
unstructured_eml.py
api/core/data_loader/loader/unstructured/unstructured_eml.py
+41
-0
unstructured_markdown.py
.../data_loader/loader/unstructured/unstructured_markdown.py
+48
-0
unstructured_msg.py
api/core/data_loader/loader/unstructured/unstructured_msg.py
+40
-0
unstructured_ppt.py
api/core/data_loader/loader/unstructured/unstructured_ppt.py
+40
-0
unstructured_pptx.py
...core/data_loader/loader/unstructured/unstructured_pptx.py
+40
-0
unstructured_text.py
...core/data_loader/loader/unstructured/unstructured_text.py
+40
-0
unstructured_xml.py
api/core/data_loader/loader/unstructured/unstructured_xml.py
+40
-0
indexing_runner.py
api/core/indexing_runner.py
+1
-1
dataset.py
api/models/dataset.py
+1
-1
requirements.txt
api/requirements.txt
+3
-1
file_service.py
api/services/file_service.py
+7
-1
No files found.
api/.env.example
View file @
5e34f938
...
...
@@ -117,3 +117,6 @@ HOSTED_ANTHROPIC_API_BASE=
HOSTED_ANTHROPIC_API_KEY=
HOSTED_ANTHROPIC_QUOTA_LIMIT=600000
HOSTED_ANTHROPIC_PAID_ENABLED=false
ETL_TYPE=dify
UNSTRUCTURED_API_URL=
\ No newline at end of file
api/config.py
View file @
5e34f938
...
...
@@ -54,7 +54,8 @@ DEFAULTS = {
'UPLOAD_IMAGE_FILE_SIZE_LIMIT'
:
10
,
'OUTPUT_MODERATION_BUFFER_SIZE'
:
300
,
'MULTIMODAL_SEND_IMAGE_FORMAT'
:
'base64'
,
'INVITE_EXPIRY_HOURS'
:
72
'INVITE_EXPIRY_HOURS'
:
72
,
'ETL_TYPE'
:
'dify'
,
}
...
...
@@ -276,6 +277,9 @@ class Config:
self
.
HOSTED_MODERATION_ENABLED
=
get_bool_env
(
'HOSTED_MODERATION_ENABLED'
)
self
.
HOSTED_MODERATION_PROVIDERS
=
get_env
(
'HOSTED_MODERATION_PROVIDERS'
)
self
.
ETL_TYPE
=
get_env
(
'ETL_TYPE'
)
self
.
UNSTRUCTURED_API_URL
=
get_env
(
'UNSTRUCTURED_API_URL'
)
class
CloudEditionConfig
(
Config
):
...
...
api/controllers/console/datasets/file.py
View file @
5e34f938
...
...
@@ -69,5 +69,20 @@ class FilePreviewApi(Resource):
return
{
'content'
:
text
}
class FileeSupportTypApi(Resource):
    """Resource that reports which file extensions are accepted for upload.

    The answer depends on the application's ``ETL_TYPE`` setting: the
    ``'Unstructured'`` setting adds several formats on top of the base list.
    """

    @setup_required
    @login_required
    @account_initialization_required
    def get(self):
        """Return ``{'allowed_extensions': [...]}`` for the configured ETL type."""
        # Formats accepted regardless of the ETL type.
        supported = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
        if current_app.config['ETL_TYPE'] == 'Unstructured':
            # Extra formats only offered when ETL_TYPE is 'Unstructured'.
            supported += ['eml', 'msg', 'pptx', 'ppt', 'xml']
        return {'allowed_extensions': supported}
# Route registration for the file resources: upload, preview of an uploaded
# file by UUID, and the list of supported upload extensions.
api.add_resource(FileApi, '/files/upload')
api.add_resource(FilePreviewApi, '/files/<uuid:file_id>/preview')
api.add_resource(FileeSupportTypApi, '/files/support-type')
api/core/data_loader/file_extractor.py
View file @
5e34f938
...
...
@@ -3,7 +3,8 @@ from pathlib import Path
from
typing
import
List
,
Union
,
Optional
import
requests
from
langchain.document_loaders
import
TextLoader
,
Docx2txtLoader
,
UnstructuredFileLoader
,
UnstructuredAPIFileLoader
from
flask
import
current_app
from
langchain.document_loaders
import
TextLoader
,
Docx2txtLoader
from
langchain.schema
import
Document
from
core.data_loader.loader.csv_loader
import
CSVLoader
...
...
@@ -11,6 +12,13 @@ from core.data_loader.loader.excel import ExcelLoader
from
core.data_loader.loader.html
import
HTMLLoader
from
core.data_loader.loader.markdown
import
MarkdownLoader
from
core.data_loader.loader.pdf
import
PdfLoader
from
core.data_loader.loader.unstructured.unstructured_eml
import
UnstructuredEmailLoader
from
core.data_loader.loader.unstructured.unstructured_markdown
import
UnstructuredMarkdownLoader
from
core.data_loader.loader.unstructured.unstructured_msg
import
UnstructuredMsgLoader
from
core.data_loader.loader.unstructured.unstructured_ppt
import
UnstructuredPPTLoader
from
core.data_loader.loader.unstructured.unstructured_pptx
import
UnstructuredPPTXLoader
from
core.data_loader.loader.unstructured.unstructured_text
import
UnstructuredTextLoader
from
core.data_loader.loader.unstructured.unstructured_xml
import
UnstructuredXmlLoader
from
extensions.ext_storage
import
storage
from
models.model
import
UploadFile
...
...
@@ -49,14 +57,34 @@ class FileExtractor:
input_file
=
Path
(
file_path
)
delimiter
=
'
\n
'
file_extension
=
input_file
.
suffix
.
lower
()
if
is_automatic
:
loader
=
UnstructuredFileLoader
(
file_path
,
strategy
=
"hi_res"
,
mode
=
"elements"
)
# loader = UnstructuredAPIFileLoader(
# file_path=filenames[0],
# api_key="FAKE_API_KEY",
# )
etl_type
=
current_app
.
config
[
'ETL_TYPE'
]
unstructured_api_url
=
current_app
.
config
[
'UNSTRUCTURED_API_URL'
]
if
etl_type
==
'Unstructured'
:
if
file_extension
==
'.xlsx'
:
loader
=
ExcelLoader
(
file_path
)
elif
file_extension
==
'.pdf'
:
loader
=
PdfLoader
(
file_path
,
upload_file
=
upload_file
)
elif
file_extension
in
[
'.md'
,
'.markdown'
]:
loader
=
UnstructuredMarkdownLoader
(
file_path
,
unstructured_api_url
)
elif
file_extension
in
[
'.htm'
,
'.html'
]:
loader
=
HTMLLoader
(
file_path
)
elif
file_extension
==
'.docx'
:
loader
=
Docx2txtLoader
(
file_path
)
elif
file_extension
==
'.csv'
:
loader
=
CSVLoader
(
file_path
,
autodetect_encoding
=
True
)
elif
file_extension
==
'.msg'
:
loader
=
UnstructuredMsgLoader
(
file_path
,
unstructured_api_url
)
elif
file_extension
==
'.eml'
:
loader
=
UnstructuredEmailLoader
(
file_path
,
unstructured_api_url
)
elif
file_extension
==
'.ppt'
:
loader
=
UnstructuredPPTLoader
(
file_path
,
unstructured_api_url
)
elif
file_extension
==
'.pptx'
:
loader
=
UnstructuredPPTXLoader
(
file_path
,
unstructured_api_url
)
elif
file_extension
==
'.xml'
:
loader
=
UnstructuredXmlLoader
(
file_path
,
unstructured_api_url
)
else
:
# txt
loader
=
UnstructuredTextLoader
(
file_path
,
unstructured_api_url
)
else
:
if
file_extension
==
'.xlsx'
:
loader
=
ExcelLoader
(
file_path
)
...
...
api/core/data_loader/loader/unstructured/unstructured_eml.py
0 → 100644
View file @
5e34f938
import
logging
import
re
from
typing
import
Optional
,
List
,
Tuple
,
cast
from
langchain.document_loaders.base
import
BaseLoader
from
langchain.document_loaders.helpers
import
detect_file_encodings
from
langchain.schema
import
Document
logger
=
logging
.
getLogger
(
__name__
)
class UnstructuredEmailLoader(BaseLoader):
    """Load email (.eml) files using ``unstructured``.

    Args:
        file_path: Path to the file to load.
        api_url: Unstructured API URL, forwarded to ``partition_email``.
            NOTE(review): confirm that ``partition_email`` actually honors
            this keyword rather than ignoring it.
    """

    def __init__(
        self,
        file_path: str,
        api_url: str,
    ):
        """Initialize with file path and Unstructured API URL."""
        self._file_path = file_path
        self._api_url = api_url

    def load(self) -> List[Document]:
        """Partition the email, chunk it by title and wrap each chunk.

        Returns:
            One ``Document`` per chunk, whose ``page_content`` is the chunk
            text with surrounding whitespace stripped.
        """
        # unstructured is imported inside the method, not at module import time.
        from unstructured.partition.email import partition_email
        elements = partition_email(filename=self._file_path, api_url=self._api_url)

        from unstructured.chunking.title import chunk_by_title
        # Chunks are capped at 2000 characters; 0 disables combining of
        # short sections.
        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)

        return [Document(page_content=chunk.text.strip()) for chunk in chunks]
api/core/data_loader/loader/unstructured/unstructured_markdown.py
0 → 100644
View file @
5e34f938
import
logging
from
typing
import
List
from
langchain.document_loaders.base
import
BaseLoader
from
langchain.schema
import
Document
logger
=
logging
.
getLogger
(
__name__
)
class UnstructuredMarkdownLoader(BaseLoader):
    """Load Markdown (.md / .markdown) files using ``unstructured``.

    Args:
        file_path: Path to the file to load.
        api_url: Unstructured API URL, forwarded to ``partition_md``.
            NOTE(review): confirm that ``partition_md`` actually honors
            this keyword rather than ignoring it.
    """

    def __init__(
        self,
        file_path: str,
        api_url: str,
    ):
        """Initialize with file path and Unstructured API URL."""
        self._file_path = file_path
        self._api_url = api_url

    def load(self) -> List[Document]:
        """Partition the Markdown file, chunk it by title and wrap each chunk.

        Returns:
            One ``Document`` per chunk, whose ``page_content`` is the chunk
            text with surrounding whitespace stripped.
        """
        # unstructured is imported inside the method, not at module import time.
        from unstructured.partition.md import partition_md
        elements = partition_md(filename=self._file_path, api_url=self._api_url)

        from unstructured.chunking.title import chunk_by_title
        # Chunks are capped at 2000 characters; 0 disables combining of
        # short sections.
        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)

        return [Document(page_content=chunk.text.strip()) for chunk in chunks]
api/core/data_loader/loader/unstructured/unstructured_msg.py
0 → 100644
View file @
5e34f938
import
logging
import
re
from
typing
import
Optional
,
List
,
Tuple
,
cast
from
langchain.document_loaders.base
import
BaseLoader
from
langchain.document_loaders.helpers
import
detect_file_encodings
from
langchain.schema
import
Document
logger
=
logging
.
getLogger
(
__name__
)
class UnstructuredMsgLoader(BaseLoader):
    """Loader for Outlook .msg files built on ``unstructured``.

    Args:
        file_path: Path to the file to load.
        api_url: Unstructured API URL, handed to ``partition_msg`` as-is.
    """

    def __init__(self, file_path: str, api_url: str):
        """Remember the file path and API URL for a later ``load`` call."""
        self._file_path = file_path
        self._api_url = api_url

    def load(self) -> List[Document]:
        """Return one ``Document`` per title-based chunk of the message."""
        from unstructured.partition.msg import partition_msg
        parsed = partition_msg(filename=self._file_path, api_url=self._api_url)

        from unstructured.chunking.title import chunk_by_title
        pieces = chunk_by_title(parsed, max_characters=2000, combine_text_under_n_chars=0)

        # Each chunk's text is stripped before being wrapped in a Document.
        return [Document(page_content=piece.text.strip()) for piece in pieces]
api/core/data_loader/loader/unstructured/unstructured_ppt.py
0 → 100644
View file @
5e34f938
import
logging
import
re
from
typing
import
Optional
,
List
,
Tuple
,
cast
from
langchain.document_loaders.base
import
BaseLoader
from
langchain.document_loaders.helpers
import
detect_file_encodings
from
langchain.schema
import
Document
logger
=
logging
.
getLogger
(
__name__
)
class UnstructuredPPTLoader(BaseLoader):
    """Load PowerPoint (.ppt) files using ``unstructured``.

    Args:
        file_path: Path to the file to load.
        api_url: Unstructured API URL, forwarded to ``partition_ppt``.
            NOTE(review): confirm that ``partition_ppt`` actually honors
            this keyword rather than ignoring it.
    """

    def __init__(self, file_path: str, api_url: str):
        """Initialize with file path and Unstructured API URL."""
        self._file_path = file_path
        self._api_url = api_url

    def load(self) -> List[Document]:
        """Partition the presentation, chunk it by title and wrap each chunk.

        Returns:
            One ``Document`` per chunk, whose ``page_content`` is the chunk
            text with surrounding whitespace stripped.
        """
        # unstructured is imported inside the method, not at module import time.
        from unstructured.partition.ppt import partition_ppt
        elements = partition_ppt(filename=self._file_path, api_url=self._api_url)

        from unstructured.chunking.title import chunk_by_title
        # Chunks are capped at 2000 characters; 0 disables combining of
        # short sections.
        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)

        return [Document(page_content=chunk.text.strip()) for chunk in chunks]
api/core/data_loader/loader/unstructured/unstructured_pptx.py
0 → 100644
View file @
5e34f938
import
logging
import
re
from
typing
import
Optional
,
List
,
Tuple
,
cast
from
langchain.document_loaders.base
import
BaseLoader
from
langchain.document_loaders.helpers
import
detect_file_encodings
from
langchain.schema
import
Document
logger
=
logging
.
getLogger
(
__name__
)
class UnstructuredPPTXLoader(BaseLoader):
    """Load PowerPoint (.pptx) files using ``unstructured``.

    Args:
        file_path: Path to the file to load.
        api_url: Unstructured API URL, forwarded to ``partition_pptx``.
            NOTE(review): confirm that ``partition_pptx`` actually honors
            this keyword rather than ignoring it.
    """

    def __init__(self, file_path: str, api_url: str):
        """Initialize with file path and Unstructured API URL."""
        self._file_path = file_path
        self._api_url = api_url

    def load(self) -> List[Document]:
        """Partition the presentation, chunk it by title and wrap each chunk.

        Returns:
            One ``Document`` per chunk, whose ``page_content`` is the chunk
            text with surrounding whitespace stripped.
        """
        # unstructured is imported inside the method, not at module import time.
        from unstructured.partition.pptx import partition_pptx
        elements = partition_pptx(filename=self._file_path, api_url=self._api_url)

        from unstructured.chunking.title import chunk_by_title
        # Chunks are capped at 2000 characters; 0 disables combining of
        # short sections.
        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)

        return [Document(page_content=chunk.text.strip()) for chunk in chunks]
api/core/data_loader/loader/unstructured/unstructured_text.py
0 → 100644
View file @
5e34f938
import
logging
import
re
from
typing
import
Optional
,
List
,
Tuple
,
cast
from
langchain.document_loaders.base
import
BaseLoader
from
langchain.document_loaders.helpers
import
detect_file_encodings
from
langchain.schema
import
Document
logger
=
logging
.
getLogger
(
__name__
)
class UnstructuredTextLoader(BaseLoader):
    """Load plain-text files using ``unstructured``.

    Args:
        file_path: Path to the file to load.
        api_url: Unstructured API URL, forwarded to ``partition_text``.
            NOTE(review): confirm that ``partition_text`` actually honors
            this keyword rather than ignoring it.
    """

    def __init__(self, file_path: str, api_url: str):
        """Initialize with file path and Unstructured API URL."""
        self._file_path = file_path
        self._api_url = api_url

    def load(self) -> List[Document]:
        """Partition the text file, chunk it by title and wrap each chunk.

        Returns:
            One ``Document`` per chunk, whose ``page_content`` is the chunk
            text with surrounding whitespace stripped.
        """
        # unstructured is imported inside the method, not at module import time.
        from unstructured.partition.text import partition_text
        elements = partition_text(filename=self._file_path, api_url=self._api_url)

        from unstructured.chunking.title import chunk_by_title
        # Chunks are capped at 2000 characters; 0 disables combining of
        # short sections.
        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)

        return [Document(page_content=chunk.text.strip()) for chunk in chunks]
api/core/data_loader/loader/unstructured/unstructured_xml.py
0 → 100644
View file @
5e34f938
import
logging
import
re
from
typing
import
Optional
,
List
,
Tuple
,
cast
from
langchain.document_loaders.base
import
BaseLoader
from
langchain.document_loaders.helpers
import
detect_file_encodings
from
langchain.schema
import
Document
logger
=
logging
.
getLogger
(
__name__
)
class UnstructuredXmlLoader(BaseLoader):
    """Load XML files using ``unstructured``.

    Args:
        file_path: Path to the file to load.
        api_url: Unstructured API URL, forwarded to ``partition_xml``.
            NOTE(review): confirm that ``partition_xml`` actually honors
            this keyword rather than ignoring it.
    """

    def __init__(self, file_path: str, api_url: str):
        """Initialize with file path and Unstructured API URL."""
        self._file_path = file_path
        self._api_url = api_url

    def load(self) -> List[Document]:
        """Partition the XML file, chunk it by title and wrap each chunk.

        Returns:
            One ``Document`` per chunk, whose ``page_content`` is the chunk
            text with surrounding whitespace stripped.
        """
        # unstructured is imported inside the method, not at module import time.
        from unstructured.partition.xml import partition_xml
        # xml_keep_tags=True is passed so the partitioner is asked to retain
        # the XML tags in its output.
        elements = partition_xml(
            filename=self._file_path,
            xml_keep_tags=True,
            api_url=self._api_url,
        )

        from unstructured.chunking.title import chunk_by_title
        # Chunks are capped at 2000 characters; 0 disables combining of
        # short sections.
        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)

        return [Document(page_content=chunk.text.strip()) for chunk in chunks]
api/core/indexing_runner.py
View file @
5e34f938
...
...
@@ -397,7 +397,7 @@ class IndexingRunner:
one_or_none
()
if
file_detail
:
text_docs
=
FileExtractor
.
load
(
file_detail
,
is_automatic
=
Fals
e
)
text_docs
=
FileExtractor
.
load
(
file_detail
,
is_automatic
=
Tru
e
)
elif
dataset_document
.
data_source_type
==
'notion_import'
:
loader
=
NotionLoader
.
from_document
(
dataset_document
)
text_docs
=
loader
.
load
()
...
...
api/models/dataset.py
View file @
5e34f938
...
...
@@ -135,7 +135,7 @@ class DatasetProcessRule(db.Model):
],
'segmentation'
:
{
'delimiter'
:
'
\n
'
,
'max_tokens'
:
512
'max_tokens'
:
1000
}
}
...
...
api/requirements.txt
View file @
5e34f938
...
...
@@ -53,4 +53,6 @@ zhipuai==1.0.7
werkzeug==2.3.7
pymilvus==2.3.0
qdrant-client==1.6.4
cohere~=4.32
\ No newline at end of file
cohere~=4.32
unstructured~=0.10.27
unstructured[docx,pptx]~=0.10.27
\ No newline at end of file
api/services/file_service.py
View file @
5e34f938
...
...
@@ -27,7 +27,13 @@ class FileService:
@
staticmethod
def
upload_file
(
file
:
FileStorage
,
user
:
Union
[
Account
,
EndUser
],
only_image
:
bool
=
False
)
->
UploadFile
:
extension
=
file
.
filename
.
split
(
'.'
)[
-
1
]
if
extension
.
lower
()
not
in
ALLOWED_EXTENSIONS
:
etl_type
=
current_app
.
config
[
'ETL_TYPE'
]
if
etl_type
==
'Unstructured'
:
allowed_extensions
=
[
'txt'
,
'markdown'
,
'md'
,
'pdf'
,
'html'
,
'htm'
,
'xlsx'
,
'docx'
,
'csv'
,
'eml'
,
'msg'
,
'pptx'
,
'ppt'
,
'xml'
]
else
:
allowed_extensions
=
[
'txt'
,
'markdown'
,
'md'
,
'pdf'
,
'html'
,
'htm'
,
'xlsx'
,
'docx'
,
'csv'
]
if
extension
.
lower
()
not
in
allowed_extensions
:
raise
UnsupportedFileTypeError
()
elif
only_image
and
extension
.
lower
()
not
in
IMAGE_EXTENSIONS
:
raise
UnsupportedFileTypeError
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment