Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
D
dify
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ai-tech
dify
Commits
5b953c1e
Unverified
Commit
5b953c1e
authored
Feb 27, 2024
by
Jyong
Committed by
GitHub
Feb 27, 2024
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Fix some RAG bugs (#2570)
Co-authored-by:
jyong
<
jyong@dify.ai
>
parent
562ca45e
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
33 additions
and
59 deletions
+33
-59
data_source.py
api/controllers/console/datasets/data_source.py
+4
-2
datasets.py
api/controllers/console/datasets/datasets.py
+2
-1
datasets_document.py
api/controllers/console/datasets/datasets_document.py
+2
-1
indexing_runner.py
api/core/indexing_runner.py
+2
-1
retrieval_service.py
api/core/rag/datasource/retrieval_service.py
+2
-1
extract_setting.py
api/core/rag/extractor/entity/extract_setting.py
+1
-0
extract_processor.py
api/core/rag/extractor/extract_processor.py
+2
-1
html_extractor.py
api/core/rag/extractor/html_extractor.py
+13
-50
notion_extractor.py
api/core/rag/extractor/notion_extractor.py
+3
-1
document_indexing_sync_task.py
api/tasks/document_indexing_sync_task.py
+2
-1
No files found.
api/controllers/console/datasets/data_source.py
View file @
5b953c1e
...
...
@@ -178,7 +178,8 @@ class DataSourceNotionApi(Resource):
notion_workspace_id
=
workspace_id
,
notion_obj_id
=
page_id
,
notion_page_type
=
page_type
,
notion_access_token
=
data_source_binding
.
access_token
notion_access_token
=
data_source_binding
.
access_token
,
tenant_id
=
current_user
.
current_tenant_id
)
text_docs
=
extractor
.
extract
()
...
...
@@ -208,7 +209,8 @@ class DataSourceNotionApi(Resource):
notion_info
=
{
"notion_workspace_id"
:
workspace_id
,
"notion_obj_id"
:
page
[
'page_id'
],
"notion_page_type"
:
page
[
'type'
]
"notion_page_type"
:
page
[
'type'
],
"tenant_id"
:
current_user
.
current_tenant_id
},
document_model
=
args
[
'doc_form'
]
)
...
...
api/controllers/console/datasets/datasets.py
View file @
5b953c1e
...
...
@@ -298,7 +298,8 @@ class DatasetIndexingEstimateApi(Resource):
notion_info
=
{
"notion_workspace_id"
:
workspace_id
,
"notion_obj_id"
:
page
[
'page_id'
],
"notion_page_type"
:
page
[
'type'
]
"notion_page_type"
:
page
[
'type'
],
"tenant_id"
:
current_user
.
current_tenant_id
},
document_model
=
args
[
'doc_form'
]
)
...
...
api/controllers/console/datasets/datasets_document.py
View file @
5b953c1e
...
...
@@ -455,7 +455,8 @@ class DocumentBatchIndexingEstimateApi(DocumentResource):
notion_info
=
{
"notion_workspace_id"
:
data_source_info
[
'notion_workspace_id'
],
"notion_obj_id"
:
data_source_info
[
'notion_page_id'
],
"notion_page_type"
:
data_source_info
[
'type'
]
"notion_page_type"
:
data_source_info
[
'type'
],
"tenant_id"
:
current_user
.
current_tenant_id
},
document_model
=
document
.
doc_form
)
...
...
api/core/indexing_runner.py
View file @
5b953c1e
...
...
@@ -366,7 +366,8 @@ class IndexingRunner:
"notion_workspace_id"
:
data_source_info
[
'notion_workspace_id'
],
"notion_obj_id"
:
data_source_info
[
'notion_page_id'
],
"notion_page_type"
:
data_source_info
[
'type'
],
"document"
:
dataset_document
"document"
:
dataset_document
,
"tenant_id"
:
dataset_document
.
tenant_id
},
document_model
=
dataset_document
.
doc_form
)
...
...
api/core/rag/datasource/retrieval_service.py
View file @
5b953c1e
...
...
@@ -39,7 +39,8 @@ class RetrievalService:
'flask_app'
:
current_app
.
_get_current_object
(),
'dataset_id'
:
dataset_id
,
'query'
:
query
,
'top_k'
:
top_k
'top_k'
:
top_k
,
'all_documents'
:
all_documents
})
threads
.
append
(
keyword_thread
)
keyword_thread
.
start
()
...
...
api/core/rag/extractor/entity/extract_setting.py
View file @
5b953c1e
...
...
@@ -12,6 +12,7 @@ class NotionInfo(BaseModel):
notion_obj_id
:
str
notion_page_type
:
str
document
:
Document
=
None
tenant_id
:
str
class
Config
:
arbitrary_types_allowed
=
True
...
...
api/core/rag/extractor/extract_processor.py
View file @
5b953c1e
...
...
@@ -132,7 +132,8 @@ class ExtractProcessor:
notion_workspace_id
=
extract_setting
.
notion_info
.
notion_workspace_id
,
notion_obj_id
=
extract_setting
.
notion_info
.
notion_obj_id
,
notion_page_type
=
extract_setting
.
notion_info
.
notion_page_type
,
document_model
=
extract_setting
.
notion_info
.
document
document_model
=
extract_setting
.
notion_info
.
document
,
tenant_id
=
extract_setting
.
notion_info
.
tenant_id
,
)
return
extractor
.
extract
()
else
:
...
...
api/core/rag/extractor/html_extractor.py
View file @
5b953c1e
"""Abstract interface for document loader implementations."""
from
typing
import
Optional
from
bs4
import
BeautifulSoup
from
core.rag.extractor.extractor_base
import
BaseExtractor
from
core.rag.extractor.helpers
import
detect_file_encodings
from
core.rag.models.document
import
Document
class
HtmlExtractor
(
BaseExtractor
):
"""Load html files.
"""
Load html files.
Args:
...
...
@@ -15,57 +16,19 @@ class HtmlExtractor(BaseExtractor):
"""
def
__init__
(
self
,
file_path
:
str
,
encoding
:
Optional
[
str
]
=
None
,
autodetect_encoding
:
bool
=
False
,
source_column
:
Optional
[
str
]
=
None
,
csv_args
:
Optional
[
dict
]
=
None
,
self
,
file_path
:
str
):
"""Initialize with file path."""
self
.
_file_path
=
file_path
self
.
_encoding
=
encoding
self
.
_autodetect_encoding
=
autodetect_encoding
self
.
source_column
=
source_column
self
.
csv_args
=
csv_args
or
{}
def
extract
(
self
)
->
list
[
Document
]:
"""Load data into document objects."""
try
:
with
open
(
self
.
_file_path
,
newline
=
""
,
encoding
=
self
.
_encoding
)
as
csvfile
:
docs
=
self
.
_read_from_file
(
csvfile
)
except
UnicodeDecodeError
as
e
:
if
self
.
_autodetect_encoding
:
detected_encodings
=
detect_file_encodings
(
self
.
_file_path
)
for
encoding
in
detected_encodings
:
try
:
with
open
(
self
.
_file_path
,
newline
=
""
,
encoding
=
encoding
.
encoding
)
as
csvfile
:
docs
=
self
.
_read_from_file
(
csvfile
)
break
except
UnicodeDecodeError
:
continue
else
:
raise
RuntimeError
(
f
"Error loading {self._file_path}"
)
from
e
return
docs
return
[
Document
(
page_content
=
self
.
_load_as_text
())]
def
_read_from_file
(
self
,
csvfile
)
->
list
[
Document
]:
docs
=
[]
csv_reader
=
csv
.
DictReader
(
csvfile
,
**
self
.
csv_args
)
# type: ignore
for
i
,
row
in
enumerate
(
csv_reader
):
content
=
"
\n
"
.
join
(
f
"{k.strip()}: {v.strip()}"
for
k
,
v
in
row
.
items
())
try
:
source
=
(
row
[
self
.
source_column
]
if
self
.
source_column
is
not
None
else
''
)
except
KeyError
:
raise
ValueError
(
f
"Source column '{self.source_column}' not found in CSV file."
)
metadata
=
{
"source"
:
source
,
"row"
:
i
}
doc
=
Document
(
page_content
=
content
,
metadata
=
metadata
)
docs
.
append
(
doc
)
def
_load_as_text
(
self
)
->
str
:
with
open
(
self
.
_file_path
,
"rb"
)
as
fp
:
soup
=
BeautifulSoup
(
fp
,
'html.parser'
)
text
=
soup
.
get_text
()
text
=
text
.
strip
()
if
text
else
''
return
docs
return
text
\ No newline at end of file
api/core/rag/extractor/notion_extractor.py
View file @
5b953c1e
...
...
@@ -30,8 +30,10 @@ class NotionExtractor(BaseExtractor):
notion_workspace_id
:
str
,
notion_obj_id
:
str
,
notion_page_type
:
str
,
tenant_id
:
str
,
document_model
:
Optional
[
DocumentModel
]
=
None
,
notion_access_token
:
Optional
[
str
]
=
None
notion_access_token
:
Optional
[
str
]
=
None
,
):
self
.
_notion_access_token
=
None
self
.
_document_model
=
document_model
...
...
api/tasks/document_indexing_sync_task.py
View file @
5b953c1e
...
...
@@ -58,7 +58,8 @@ def document_indexing_sync_task(dataset_id: str, document_id: str):
notion_workspace_id
=
workspace_id
,
notion_obj_id
=
page_id
,
notion_page_type
=
page_type
,
notion_access_token
=
data_source_binding
.
access_token
notion_access_token
=
data_source_binding
.
access_token
,
tenant_id
=
document
.
tenant_id
)
last_edited_time
=
loader
.
get_notion_last_edited_time
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment