Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
D
dify
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ai-tech
dify
Commits
026f0bfc
Unverified
Commit
026f0bfc
authored
Jul 19, 2023
by
Jyong
Committed by
GitHub
Jul 19, 2023
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Feat/clean vector dataset (#605)
parent
d19181fb
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
69 additions
and
6 deletions
+69
-6
commands.py
api/commands.py
+66
-4
config.py
api/config.py
+3
-1
datasets.py
api/controllers/console/datasets/datasets.py
+0
-1
No files found.
api/commands.py
View file @
026f0bfc
...
@@ -2,6 +2,7 @@ import datetime
...
@@ -2,6 +2,7 @@ import datetime
import
logging
import
logging
import
random
import
random
import
string
import
string
import
time
import
click
import
click
from
flask
import
current_app
from
flask
import
current_app
...
@@ -13,7 +14,7 @@ from libs.helper import email as email_validate
...
@@ -13,7 +14,7 @@ from libs.helper import email as email_validate
from
extensions.ext_database
import
db
from
extensions.ext_database
import
db
from
libs.rsa
import
generate_key_pair
from
libs.rsa
import
generate_key_pair
from
models.account
import
InvitationCode
,
Tenant
from
models.account
import
InvitationCode
,
Tenant
from
models.dataset
import
Dataset
from
models.dataset
import
Dataset
,
DatasetQuery
,
Document
,
DocumentSegment
from
models.model
import
Account
from
models.model
import
Account
import
secrets
import
secrets
import
base64
import
base64
...
@@ -172,7 +173,7 @@ def recreate_all_dataset_indexes():
...
@@ -172,7 +173,7 @@ def recreate_all_dataset_indexes():
page
=
1
page
=
1
while
True
:
while
True
:
try
:
try
:
datasets
=
db
.
session
.
query
(
Dataset
)
.
filter
(
Dataset
.
indexing_technique
==
'high_quality'
)
\
datasets
=
db
.
session
.
query
(
Dataset
)
.
filter
(
Dataset
.
indexing_technique
==
'high_quality'
)
\
.
order_by
(
Dataset
.
created_at
.
desc
())
.
paginate
(
page
=
page
,
per_page
=
50
)
.
order_by
(
Dataset
.
created_at
.
desc
())
.
paginate
(
page
=
page
,
per_page
=
50
)
except
NotFound
:
except
NotFound
:
break
break
...
@@ -188,12 +189,70 @@ def recreate_all_dataset_indexes():
...
@@ -188,12 +189,70 @@ def recreate_all_dataset_indexes():
else
:
else
:
click
.
echo
(
'passed.'
)
click
.
echo
(
'passed.'
)
except
Exception
as
e
:
except
Exception
as
e
:
click
.
echo
(
click
.
style
(
'Recreate dataset index error: {} {}'
.
format
(
e
.
__class__
.
__name__
,
str
(
e
)),
fg
=
'red'
))
click
.
echo
(
click
.
style
(
'Recreate dataset index error: {} {}'
.
format
(
e
.
__class__
.
__name__
,
str
(
e
)),
fg
=
'red'
))
continue
continue
click
.
echo
(
click
.
style
(
'Congratulations! Recreate {} dataset indexes.'
.
format
(
recreate_count
),
fg
=
'green'
))
click
.
echo
(
click
.
style
(
'Congratulations! Recreate {} dataset indexes.'
.
format
(
recreate_count
),
fg
=
'green'
))
@
click
.
command
(
'clean-unused-dataset-indexes'
,
help
=
'Clean unused dataset indexes.'
)
def
clean_unused_dataset_indexes
():
click
.
echo
(
click
.
style
(
'Start clean unused dataset indexes.'
,
fg
=
'green'
))
clean_days
=
int
(
current_app
.
config
.
get
(
'CLEAN_DAY_SETTING'
))
start_at
=
time
.
perf_counter
()
thirty_days_ago
=
datetime
.
datetime
.
now
()
-
datetime
.
timedelta
(
days
=
clean_days
)
page
=
1
while
True
:
try
:
datasets
=
db
.
session
.
query
(
Dataset
)
.
filter
(
Dataset
.
created_at
<
thirty_days_ago
)
\
.
order_by
(
Dataset
.
created_at
.
desc
())
.
paginate
(
page
=
page
,
per_page
=
50
)
except
NotFound
:
break
page
+=
1
for
dataset
in
datasets
:
dataset_query
=
db
.
session
.
query
(
DatasetQuery
)
.
filter
(
DatasetQuery
.
created_at
>
thirty_days_ago
,
DatasetQuery
.
dataset_id
==
dataset
.
id
)
.
all
()
if
not
dataset_query
or
len
(
dataset_query
)
==
0
:
documents
=
db
.
session
.
query
(
Document
)
.
filter
(
Document
.
dataset_id
==
dataset
.
id
,
Document
.
indexing_status
==
'completed'
,
Document
.
enabled
==
True
,
Document
.
archived
==
False
,
Document
.
updated_at
>
thirty_days_ago
)
.
all
()
if
not
documents
or
len
(
documents
)
==
0
:
try
:
all_documents
=
db
.
session
.
query
(
Document
)
.
filter
(
Document
.
dataset_id
==
dataset
.
id
,
Document
.
indexing_status
==
'completed'
,
Document
.
enabled
==
True
,
Document
.
archived
==
False
,
)
.
all
()
if
all_documents
and
len
(
all_documents
)
>
0
:
update_params
=
{
Document
.
enabled
:
False
}
Document
.
query
.
filter_by
(
dataset_id
=
dataset
.
id
)
.
update
(
update_params
)
db
.
session
.
commit
()
# remove index
vector_index
=
IndexBuilder
.
get_index
(
dataset
,
'high_quality'
)
kw_index
=
IndexBuilder
.
get_index
(
dataset
,
'economy'
)
# delete from vector index
if
vector_index
:
vector_index
.
delete
()
kw_index
.
delete
()
except
Exception
as
e
:
click
.
echo
(
click
.
style
(
'clean dataset index error: {} {}'
.
format
(
e
.
__class__
.
__name__
,
str
(
e
)),
fg
=
'red'
))
end_at
=
time
.
perf_counter
()
click
.
echo
(
click
.
style
(
'Cleaned unused dataset from db success latency: {}'
.
format
(
end_at
-
start_at
),
fg
=
'green'
))
@
click
.
command
(
'sync-anthropic-hosted-providers'
,
help
=
'Sync anthropic hosted providers.'
)
@
click
.
command
(
'sync-anthropic-hosted-providers'
,
help
=
'Sync anthropic hosted providers.'
)
def
sync_anthropic_hosted_providers
():
def
sync_anthropic_hosted_providers
():
click
.
echo
(
click
.
style
(
'Start sync anthropic hosted providers.'
,
fg
=
'green'
))
click
.
echo
(
click
.
style
(
'Start sync anthropic hosted providers.'
,
fg
=
'green'
))
...
@@ -218,7 +277,9 @@ def sync_anthropic_hosted_providers():
...
@@ -218,7 +277,9 @@ def sync_anthropic_hosted_providers():
)
)
count
+=
1
count
+=
1
except
Exception
as
e
:
except
Exception
as
e
:
click
.
echo
(
click
.
style
(
'Sync tenant anthropic hosted provider error: {} {}'
.
format
(
e
.
__class__
.
__name__
,
str
(
e
)),
fg
=
'red'
))
click
.
echo
(
click
.
style
(
'Sync tenant anthropic hosted provider error: {} {}'
.
format
(
e
.
__class__
.
__name__
,
str
(
e
)),
fg
=
'red'
))
continue
continue
click
.
echo
(
click
.
style
(
'Congratulations! Synced {} anthropic hosted providers.'
.
format
(
count
),
fg
=
'green'
))
click
.
echo
(
click
.
style
(
'Congratulations! Synced {} anthropic hosted providers.'
.
format
(
count
),
fg
=
'green'
))
...
@@ -231,3 +292,4 @@ def register_commands(app):
...
@@ -231,3 +292,4 @@ def register_commands(app):
app
.
cli
.
add_command
(
reset_encrypt_key_pair
)
app
.
cli
.
add_command
(
reset_encrypt_key_pair
)
app
.
cli
.
add_command
(
recreate_all_dataset_indexes
)
app
.
cli
.
add_command
(
recreate_all_dataset_indexes
)
app
.
cli
.
add_command
(
sync_anthropic_hosted_providers
)
app
.
cli
.
add_command
(
sync_anthropic_hosted_providers
)
app
.
cli
.
add_command
(
clean_unused_dataset_indexes
)
api/config.py
View file @
026f0bfc
...
@@ -53,7 +53,8 @@ DEFAULTS = {
...
@@ -53,7 +53,8 @@ DEFAULTS = {
'DEFAULT_LLM_PROVIDER'
:
'openai'
,
'DEFAULT_LLM_PROVIDER'
:
'openai'
,
'OPENAI_HOSTED_QUOTA_LIMIT'
:
200
,
'OPENAI_HOSTED_QUOTA_LIMIT'
:
200
,
'ANTHROPIC_HOSTED_QUOTA_LIMIT'
:
1000
,
'ANTHROPIC_HOSTED_QUOTA_LIMIT'
:
1000
,
'TENANT_DOCUMENT_COUNT'
:
100
'TENANT_DOCUMENT_COUNT'
:
100
,
'CLEAN_DAY_SETTING'
:
30
}
}
...
@@ -215,6 +216,7 @@ class Config:
...
@@ -215,6 +216,7 @@ class Config:
self
.
NOTION_INTEGRATION_TOKEN
=
get_env
(
'NOTION_INTEGRATION_TOKEN'
)
self
.
NOTION_INTEGRATION_TOKEN
=
get_env
(
'NOTION_INTEGRATION_TOKEN'
)
self
.
TENANT_DOCUMENT_COUNT
=
get_env
(
'TENANT_DOCUMENT_COUNT'
)
self
.
TENANT_DOCUMENT_COUNT
=
get_env
(
'TENANT_DOCUMENT_COUNT'
)
self
.
CLEAN_DAY_SETTING
=
get_env
(
'CLEAN_DAY_SETTING'
)
class
CloudEditionConfig
(
Config
):
class
CloudEditionConfig
(
Config
):
...
...
api/controllers/console/datasets/datasets.py
View file @
026f0bfc
...
@@ -3,7 +3,6 @@ from flask import request
...
@@ -3,7 +3,6 @@ from flask import request
from
flask_login
import
login_required
,
current_user
from
flask_login
import
login_required
,
current_user
from
flask_restful
import
Resource
,
reqparse
,
fields
,
marshal
,
marshal_with
from
flask_restful
import
Resource
,
reqparse
,
fields
,
marshal
,
marshal_with
from
werkzeug.exceptions
import
NotFound
,
Forbidden
from
werkzeug.exceptions
import
NotFound
,
Forbidden
import
services
import
services
from
controllers.console
import
api
from
controllers.console
import
api
from
controllers.console.datasets.error
import
DatasetNameDuplicateError
from
controllers.console.datasets.error
import
DatasetNameDuplicateError
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment