ai-tech / dify · Commit 3b33e54a

fix: web reader read failed

Authored Jul 21, 2023 by John Wang
Parent: 5c522e80

Showing 1 changed file with 36 additions and 33 deletions.

api/core/tool/web_reader_tool.py  (+36 / -33)
@@ -67,37 +67,40 @@ class WebReaderTool(BaseTool):
     llm: BaseLanguageModel
 
     def _run(self, url: str, summary: bool = False, cursor: int = 0) -> str:
-        if not self.page_contents or self.url != url:
-            page_contents = get_url(url)
-            self.page_contents = page_contents
-            self.url = url
-        else:
-            page_contents = self.page_contents
-
-        if summary:
-            character_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
-                chunk_size=self.summary_chunk_tokens,
-                chunk_overlap=self.summary_chunk_overlap,
-                separators=self.summary_separators
-            )
-
-            texts = character_splitter.split_text(page_contents)
-            docs = [Document(page_content=t) for t in texts]
-
-            # only use first 10 docs
-            if len(docs) > 10:
-                docs = docs[:10]
-
-            chain = load_summarize_chain(self.llm, chain_type="refine", callbacks=self.callbacks)
-            page_contents = chain.run(docs)
-            # todo use cache
-        else:
-            page_contents = page_result(page_contents, cursor, self.max_chunk_length)
-
-            if self.continue_reading and len(page_contents) >= self.max_chunk_length:
-                page_contents += f"\nPAGE WAS TRUNCATED. IF YOU FIND INFORMATION THAT CAN ANSWER QUESTION " \
-                                 f"THEN DIRECT ANSWER AND STOP INVOKING web_reader TOOL, OTHERWISE USE " \
-                                 f"CURSOR={cursor+len(page_contents)} TO CONTINUE READING."
+        try:
+            if not self.page_contents or self.url != url:
+                page_contents = get_url(url)
+                self.page_contents = page_contents
+                self.url = url
+            else:
+                page_contents = self.page_contents
+
+            if summary:
+                character_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+                    chunk_size=self.summary_chunk_tokens,
+                    chunk_overlap=self.summary_chunk_overlap,
+                    separators=self.summary_separators
+                )
+
+                texts = character_splitter.split_text(page_contents)
+                docs = [Document(page_content=t) for t in texts]
+
+                # only use first 10 docs
+                if len(docs) > 10:
+                    docs = docs[:10]
+
+                chain = load_summarize_chain(self.llm, chain_type="refine", callbacks=self.callbacks)
+                page_contents = chain.run(docs)
+                # todo use cache
+            else:
+                page_contents = page_result(page_contents, cursor, self.max_chunk_length)
+
+                if self.continue_reading and len(page_contents) >= self.max_chunk_length:
+                    page_contents += f"\nPAGE WAS TRUNCATED. IF YOU FIND INFORMATION THAT CAN ANSWER QUESTION " \
+                                     f"THEN DIRECT ANSWER AND STOP INVOKING web_reader TOOL, OTHERWISE USE " \
+                                     f"CURSOR={cursor+len(page_contents)} TO CONTINUE READING."
+        except Exception as e:
+            return f'failed to read the website, cause {str(e)}.'
 
         return page_contents
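In short, this hunk wraps the entire body of _run in a try/except, so a failure while fetching, splitting, or summarizing a page is returned to the calling agent as a readable observation instead of an unhandled exception. A minimal sketch of the before/after behavior; fetch below is a hypothetical stand-in for get_url, not the real implementation:

import requests

def fetch(url: str) -> str:
    # get_url ultimately calls requests, which raises on connection
    # errors, timeouts, etc.; simulate that here.
    raise requests.exceptions.ConnectTimeout("connect timed out")

def run_before(url: str) -> str:
    # Pre-fix behavior: the exception propagates out of the tool
    # and aborts the agent run.
    return fetch(url)

def run_after(url: str) -> str:
    # Post-fix behavior: the exception is converted into a plain
    # string the LLM can read and react to.
    try:
        return fetch(url)
    except Exception as e:
        return f'failed to read the website, cause {str(e)}.'

print(run_after("https://example.com"))
# -> failed to read the website, cause connect timed out.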
@@ -117,7 +120,7 @@ def get_url(url: str) -> str:
     }
     supported_content_types = file_extractor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
 
-    head_response = requests.head(url, headers=headers, allow_redirects=True)
+    head_response = requests.head(url, headers=headers, allow_redirects=True, timeout=10)
 
     if head_response.status_code != 200:
         return "URL returned status code {}.".format(head_response.status_code)
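A likely cause of the reported read failures: without a timeout argument, requests.head can block indefinitely on an unresponsive host. With timeout=10 the call raises requests.exceptions.Timeout after roughly ten seconds, which the new try/except in _run converts into an error string. A small sketch with placeholder url and headers (not the tool's exact values):

import requests

url = "https://example.com"  # placeholder
headers = {"User-Agent": "Mozilla/5.0"}  # illustrative only

try:
    # With timeout=10, requests raises Timeout if the server does not
    # respond within 10 seconds, instead of hanging forever.
    head_response = requests.head(url, headers=headers, allow_redirects=True, timeout=10)
    print(head_response.status_code)
except requests.exceptions.Timeout:
    # In the patched tool this propagates up to _run's try/except,
    # which turns it into a readable error message.
    print("HEAD request timed out")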
@@ -130,7 +133,7 @@ def get_url(url: str) -> str:
     if main_content_type in file_extractor.SUPPORT_URL_CONTENT_TYPES:
         return FileExtractor.load_from_url(url, return_text=True)
 
-    response = requests.get(url, headers=headers, allow_redirects=True)
+    response = requests.get(url, headers=headers, allow_redirects=True, timeout=30)
 
     a = extract_using_readabilipy(response.text)
 
     if not a['plain_text'] or not a['plain_text'].strip():
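The page download itself gets a longer timeout=30, which is reasonable since transferring a full HTML body takes longer than a HEAD probe. For reference, requests also accepts a (connect, read) tuple when the two phases need different limits; the sketch below shows both forms with a placeholder URL (the tuple form is an alternative, not what this commit uses):

import requests

url = "https://example.com"  # placeholder
headers = {"User-Agent": "Mozilla/5.0"}  # illustrative only

# A scalar timeout applies separately to the connect and read phases.
response = requests.get(url, headers=headers, allow_redirects=True, timeout=30)

# Alternative: cap connection setup at 5s but allow a 30s read.
response = requests.get(url, headers=headers, allow_redirects=True, timeout=(5, 30))

print(len(response.text))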