Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
D
dify
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ai-tech
dify
Commits
5b953c1e
Unverified
Commit
5b953c1e
authored
Feb 27, 2024
by
Jyong
Committed by
GitHub
Feb 27, 2024
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Fix some RAG bugs (#2570)
Co-authored-by:
jyong
<
jyong@dify.ai
>
parent
562ca45e
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
33 additions
and
59 deletions
+33
-59
data_source.py
api/controllers/console/datasets/data_source.py
+4
-2
datasets.py
api/controllers/console/datasets/datasets.py
+2
-1
datasets_document.py
api/controllers/console/datasets/datasets_document.py
+2
-1
indexing_runner.py
api/core/indexing_runner.py
+2
-1
retrieval_service.py
api/core/rag/datasource/retrieval_service.py
+2
-1
extract_setting.py
api/core/rag/extractor/entity/extract_setting.py
+1
-0
extract_processor.py
api/core/rag/extractor/extract_processor.py
+2
-1
html_extractor.py
api/core/rag/extractor/html_extractor.py
+13
-50
notion_extractor.py
api/core/rag/extractor/notion_extractor.py
+3
-1
document_indexing_sync_task.py
api/tasks/document_indexing_sync_task.py
+2
-1
No files found.
api/controllers/console/datasets/data_source.py
View file @
5b953c1e
...
@@ -178,7 +178,8 @@ class DataSourceNotionApi(Resource):
...
@@ -178,7 +178,8 @@ class DataSourceNotionApi(Resource):
notion_workspace_id
=
workspace_id
,
notion_workspace_id
=
workspace_id
,
notion_obj_id
=
page_id
,
notion_obj_id
=
page_id
,
notion_page_type
=
page_type
,
notion_page_type
=
page_type
,
notion_access_token
=
data_source_binding
.
access_token
notion_access_token
=
data_source_binding
.
access_token
,
tenant_id
=
current_user
.
current_tenant_id
)
)
text_docs
=
extractor
.
extract
()
text_docs
=
extractor
.
extract
()
...
@@ -208,7 +209,8 @@ class DataSourceNotionApi(Resource):
...
@@ -208,7 +209,8 @@ class DataSourceNotionApi(Resource):
notion_info
=
{
notion_info
=
{
"notion_workspace_id"
:
workspace_id
,
"notion_workspace_id"
:
workspace_id
,
"notion_obj_id"
:
page
[
'page_id'
],
"notion_obj_id"
:
page
[
'page_id'
],
"notion_page_type"
:
page
[
'type'
]
"notion_page_type"
:
page
[
'type'
],
"tenant_id"
:
current_user
.
current_tenant_id
},
},
document_model
=
args
[
'doc_form'
]
document_model
=
args
[
'doc_form'
]
)
)
...
...
api/controllers/console/datasets/datasets.py
View file @
5b953c1e
...
@@ -298,7 +298,8 @@ class DatasetIndexingEstimateApi(Resource):
...
@@ -298,7 +298,8 @@ class DatasetIndexingEstimateApi(Resource):
notion_info
=
{
notion_info
=
{
"notion_workspace_id"
:
workspace_id
,
"notion_workspace_id"
:
workspace_id
,
"notion_obj_id"
:
page
[
'page_id'
],
"notion_obj_id"
:
page
[
'page_id'
],
"notion_page_type"
:
page
[
'type'
]
"notion_page_type"
:
page
[
'type'
],
"tenant_id"
:
current_user
.
current_tenant_id
},
},
document_model
=
args
[
'doc_form'
]
document_model
=
args
[
'doc_form'
]
)
)
...
...
api/controllers/console/datasets/datasets_document.py
View file @
5b953c1e
...
@@ -455,7 +455,8 @@ class DocumentBatchIndexingEstimateApi(DocumentResource):
...
@@ -455,7 +455,8 @@ class DocumentBatchIndexingEstimateApi(DocumentResource):
notion_info
=
{
notion_info
=
{
"notion_workspace_id"
:
data_source_info
[
'notion_workspace_id'
],
"notion_workspace_id"
:
data_source_info
[
'notion_workspace_id'
],
"notion_obj_id"
:
data_source_info
[
'notion_page_id'
],
"notion_obj_id"
:
data_source_info
[
'notion_page_id'
],
"notion_page_type"
:
data_source_info
[
'type'
]
"notion_page_type"
:
data_source_info
[
'type'
],
"tenant_id"
:
current_user
.
current_tenant_id
},
},
document_model
=
document
.
doc_form
document_model
=
document
.
doc_form
)
)
...
...
api/core/indexing_runner.py
View file @
5b953c1e
...
@@ -366,7 +366,8 @@ class IndexingRunner:
...
@@ -366,7 +366,8 @@ class IndexingRunner:
"notion_workspace_id"
:
data_source_info
[
'notion_workspace_id'
],
"notion_workspace_id"
:
data_source_info
[
'notion_workspace_id'
],
"notion_obj_id"
:
data_source_info
[
'notion_page_id'
],
"notion_obj_id"
:
data_source_info
[
'notion_page_id'
],
"notion_page_type"
:
data_source_info
[
'type'
],
"notion_page_type"
:
data_source_info
[
'type'
],
"document"
:
dataset_document
"document"
:
dataset_document
,
"tenant_id"
:
dataset_document
.
tenant_id
},
},
document_model
=
dataset_document
.
doc_form
document_model
=
dataset_document
.
doc_form
)
)
...
...
api/core/rag/datasource/retrieval_service.py
View file @
5b953c1e
...
@@ -39,7 +39,8 @@ class RetrievalService:
...
@@ -39,7 +39,8 @@ class RetrievalService:
'flask_app'
:
current_app
.
_get_current_object
(),
'flask_app'
:
current_app
.
_get_current_object
(),
'dataset_id'
:
dataset_id
,
'dataset_id'
:
dataset_id
,
'query'
:
query
,
'query'
:
query
,
'top_k'
:
top_k
'top_k'
:
top_k
,
'all_documents'
:
all_documents
})
})
threads
.
append
(
keyword_thread
)
threads
.
append
(
keyword_thread
)
keyword_thread
.
start
()
keyword_thread
.
start
()
...
...
api/core/rag/extractor/entity/extract_setting.py
View file @
5b953c1e
...
@@ -12,6 +12,7 @@ class NotionInfo(BaseModel):
...
@@ -12,6 +12,7 @@ class NotionInfo(BaseModel):
notion_obj_id
:
str
notion_obj_id
:
str
notion_page_type
:
str
notion_page_type
:
str
document
:
Document
=
None
document
:
Document
=
None
tenant_id
:
str
class
Config
:
class
Config
:
arbitrary_types_allowed
=
True
arbitrary_types_allowed
=
True
...
...
api/core/rag/extractor/extract_processor.py
View file @
5b953c1e
...
@@ -132,7 +132,8 @@ class ExtractProcessor:
...
@@ -132,7 +132,8 @@ class ExtractProcessor:
notion_workspace_id
=
extract_setting
.
notion_info
.
notion_workspace_id
,
notion_workspace_id
=
extract_setting
.
notion_info
.
notion_workspace_id
,
notion_obj_id
=
extract_setting
.
notion_info
.
notion_obj_id
,
notion_obj_id
=
extract_setting
.
notion_info
.
notion_obj_id
,
notion_page_type
=
extract_setting
.
notion_info
.
notion_page_type
,
notion_page_type
=
extract_setting
.
notion_info
.
notion_page_type
,
document_model
=
extract_setting
.
notion_info
.
document
document_model
=
extract_setting
.
notion_info
.
document
,
tenant_id
=
extract_setting
.
notion_info
.
tenant_id
,
)
)
return
extractor
.
extract
()
return
extractor
.
extract
()
else
:
else
:
...
...
api/core/rag/extractor/html_extractor.py
View file @
5b953c1e
"""Abstract interface for document loader implementations."""
"""Abstract interface for document loader implementations."""
from
typing
import
Optional
from
bs4
import
BeautifulSoup
from
core.rag.extractor.extractor_base
import
BaseExtractor
from
core.rag.extractor.extractor_base
import
BaseExtractor
from
core.rag.extractor.helpers
import
detect_file_encodings
from
core.rag.models.document
import
Document
from
core.rag.models.document
import
Document
class
HtmlExtractor
(
BaseExtractor
):
class
HtmlExtractor
(
BaseExtractor
):
"""Load html files.
"""
Load html files.
Args:
Args:
...
@@ -15,57 +16,19 @@ class HtmlExtractor(BaseExtractor):
...
@@ -15,57 +16,19 @@ class HtmlExtractor(BaseExtractor):
"""
"""
def
__init__
(
def
__init__
(
self
,
self
,
file_path
:
str
,
file_path
:
str
encoding
:
Optional
[
str
]
=
None
,
autodetect_encoding
:
bool
=
False
,
source_column
:
Optional
[
str
]
=
None
,
csv_args
:
Optional
[
dict
]
=
None
,
):
):
"""Initialize with file path."""
"""Initialize with file path."""
self
.
_file_path
=
file_path
self
.
_file_path
=
file_path
self
.
_encoding
=
encoding
self
.
_autodetect_encoding
=
autodetect_encoding
self
.
source_column
=
source_column
self
.
csv_args
=
csv_args
or
{}
def
extract
(
self
)
->
list
[
Document
]:
def
extract
(
self
)
->
list
[
Document
]:
"""Load data into document objects."""
return
[
Document
(
page_content
=
self
.
_load_as_text
())]
try
:
with
open
(
self
.
_file_path
,
newline
=
""
,
encoding
=
self
.
_encoding
)
as
csvfile
:
docs
=
self
.
_read_from_file
(
csvfile
)
except
UnicodeDecodeError
as
e
:
if
self
.
_autodetect_encoding
:
detected_encodings
=
detect_file_encodings
(
self
.
_file_path
)
for
encoding
in
detected_encodings
:
try
:
with
open
(
self
.
_file_path
,
newline
=
""
,
encoding
=
encoding
.
encoding
)
as
csvfile
:
docs
=
self
.
_read_from_file
(
csvfile
)
break
except
UnicodeDecodeError
:
continue
else
:
raise
RuntimeError
(
f
"Error loading {self._file_path}"
)
from
e
return
docs
def
_read_from_file
(
self
,
csvfile
)
->
list
[
Document
]:
def
_load_as_text
(
self
)
->
str
:
docs
=
[]
with
open
(
self
.
_file_path
,
"rb"
)
as
fp
:
csv_reader
=
csv
.
DictReader
(
csvfile
,
**
self
.
csv_args
)
# type: ignore
soup
=
BeautifulSoup
(
fp
,
'html.parser'
)
for
i
,
row
in
enumerate
(
csv_reader
):
text
=
soup
.
get_text
()
content
=
"
\n
"
.
join
(
f
"{k.strip()}: {v.strip()}"
for
k
,
v
in
row
.
items
())
text
=
text
.
strip
()
if
text
else
''
try
:
source
=
(
row
[
self
.
source_column
]
if
self
.
source_column
is
not
None
else
''
)
except
KeyError
:
raise
ValueError
(
f
"Source column '{self.source_column}' not found in CSV file."
)
metadata
=
{
"source"
:
source
,
"row"
:
i
}
doc
=
Document
(
page_content
=
content
,
metadata
=
metadata
)
docs
.
append
(
doc
)
return
docs
return
text
\ No newline at end of file
api/core/rag/extractor/notion_extractor.py
View file @
5b953c1e
...
@@ -30,8 +30,10 @@ class NotionExtractor(BaseExtractor):
...
@@ -30,8 +30,10 @@ class NotionExtractor(BaseExtractor):
notion_workspace_id
:
str
,
notion_workspace_id
:
str
,
notion_obj_id
:
str
,
notion_obj_id
:
str
,
notion_page_type
:
str
,
notion_page_type
:
str
,
tenant_id
:
str
,
document_model
:
Optional
[
DocumentModel
]
=
None
,
document_model
:
Optional
[
DocumentModel
]
=
None
,
notion_access_token
:
Optional
[
str
]
=
None
notion_access_token
:
Optional
[
str
]
=
None
,
):
):
self
.
_notion_access_token
=
None
self
.
_notion_access_token
=
None
self
.
_document_model
=
document_model
self
.
_document_model
=
document_model
...
...
api/tasks/document_indexing_sync_task.py
View file @
5b953c1e
...
@@ -58,7 +58,8 @@ def document_indexing_sync_task(dataset_id: str, document_id: str):
...
@@ -58,7 +58,8 @@ def document_indexing_sync_task(dataset_id: str, document_id: str):
notion_workspace_id
=
workspace_id
,
notion_workspace_id
=
workspace_id
,
notion_obj_id
=
page_id
,
notion_obj_id
=
page_id
,
notion_page_type
=
page_type
,
notion_page_type
=
page_type
,
notion_access_token
=
data_source_binding
.
access_token
notion_access_token
=
data_source_binding
.
access_token
,
tenant_id
=
document
.
tenant_id
)
)
last_edited_time
=
loader
.
get_notion_last_edited_time
()
last_edited_time
=
loader
.
get_notion_last_edited_time
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment