52e6f458
Commit
52e6f458
authored
Mar 06, 2024
by
jyong
Browse files
add rag test
parent 703aefbd
Showing 1 changed file with 90 additions and 88 deletions.
api/tests/integration_tests/rag/index_processor/test_paragraph_index_processor.py
@@ -3,6 +3,8 @@ import datetime
 import uuid
 from typing import Optional
 
+import pytest
+
 from core.rag.cleaner.clean_processor import CleanProcessor
 from core.rag.datasource.keyword.keyword_factory import Keyword
 from core.rag.datasource.retrieval_service import RetrievalService
@@ -16,98 +18,98 @@ from models.dataset import Dataset
 from models.model import UploadFile
 
-class ParagraphIndexProcessor(BaseIndexProcessor):
-
-    def extract(self) -> list[Document]:
+@pytest.mark.parametrize('setup_unstructured_mock', [['partition_md', 'chunk_by_title']], indirect=True)
+def extract() -> list[Document]:
     file_detail = UploadFile(
         tenant_id='test',
         storage_type='local',
         key='test.txt',
         name='test.txt',
         size=1024,
         extension='txt',
         mime_type='text/plain',
         created_by='test',
         created_at=datetime.datetime.utcnow(),
         used=True,
         used_by='d48632d7-c972-484a-8ed9-262490919c79',
         used_at=datetime.datetime.utcnow())
     extract_setting = ExtractSetting(
         datasource_type="upload_file",
         upload_file=file_detail,
         document_model='text_model')
-        text_docs = ExtractProcessor.extract(extract_setting=extract_setting,
-                                             is_automatic=False)
+    text_docs = ExtractProcessor.extract(extract_setting=extract_setting, is_automatic=True)
+    assert isinstance(text_docs, list)
     return text_docs
 
 def transform(self, documents: list[Document], **kwargs) -> list[Document]:
     # Split the text documents into nodes.
     splitter = self._get_splitter(processing_rule=kwargs.get('process_rule'),
                                   embedding_model_instance=kwargs.get('embedding_model_instance'))
     all_documents = []
     for document in documents:
         # document clean
         document_text = CleanProcessor.clean(document.page_content, kwargs.get('process_rule'))
         document.page_content = document_text
         # parse document to nodes
         document_nodes = splitter.split_documents([document])
         split_documents = []
         for document_node in document_nodes:
             if document_node.page_content.strip():
                 doc_id = str(uuid.uuid4())
                 hash = helper.generate_text_hash(document_node.page_content)
                 document_node.metadata['doc_id'] = doc_id
                 document_node.metadata['doc_hash'] = hash
                 # delete Spliter character
                 page_content = document_node.page_content
                 if page_content.startswith(".") or page_content.startswith("。"):
                     page_content = page_content[1:]
                 else:
                     page_content = page_content
                 document_node.page_content = page_content
                 split_documents.append(document_node)
         all_documents.extend(split_documents)
     return all_documents
 
 def load(self, dataset: Dataset, documents: list[Document], with_keywords: bool = True):
     if dataset.indexing_technique == 'high_quality':
         vector = Vector(dataset)
         vector.create(documents)
     if with_keywords:
         keyword = Keyword(dataset)
         keyword.create(documents)
 
 def clean(self, dataset: Dataset, node_ids: Optional[list[str]], with_keywords: bool = True):
     if dataset.indexing_technique == 'high_quality':
         vector = Vector(dataset)
         if node_ids:
             vector.delete_by_ids(node_ids)
         else:
             vector.delete()
     if with_keywords:
         keyword = Keyword(dataset)
         if node_ids:
             keyword.delete_by_ids(node_ids)
         else:
             keyword.delete()
 
 def retrieve(self, retrival_method: str, query: str, dataset: Dataset, top_k: int,
              score_threshold: float, reranking_model: dict) -> list[Document]:
     # Set search parameters.
     results = RetrievalService.retrieve(retrival_method=retrival_method, dataset_id=dataset.id, query=query,
                                         top_k=top_k, score_threshold=score_threshold,
                                         reranking_model=reranking_model)
     # Organize results.
     docs = []
     for result in results:
         metadata = result.metadata
         metadata['score'] = result.score
         if result.score > score_threshold:
             doc = Document(page_content=result.page_content, metadata=metadata)
             docs.append(doc)
     return docs
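
The new extract test depends on a setup_unstructured_mock fixture that is not part of this diff; with indirect=True, pytest routes the parametrize values to that fixture as request.param instead of passing them to the test directly. A minimal, self-contained sketch of that mechanism follows. The fixture body and the test name test_indirect_parametrize are hypothetical stand-ins; only the parametrize line is taken from the commit, and the real fixture presumably patches the named unstructured helpers rather than merely recording them.

import pytest


@pytest.fixture
def setup_unstructured_mock(request):
    # With indirect=True, the values from @pytest.mark.parametrize arrive here
    # as request.param rather than being passed straight to the test function.
    mocked_helpers = request.param  # e.g. ['partition_md', 'chunk_by_title']
    # Hypothetical stand-in: the real dify fixture would patch these unstructured
    # helpers; this sketch only yields the names so it stays runnable on its own.
    yield mocked_helpers


@pytest.mark.parametrize('setup_unstructured_mock', [['partition_md', 'chunk_by_title']], indirect=True)
def test_indirect_parametrize(setup_unstructured_mock):
    # The test body receives whatever the fixture yielded.
    assert setup_unstructured_mock == ['partition_md', 'chunk_by_title']

Under this setup, pytest collects one test case per parameter list and tears the fixture down after each run.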