fix: Some web pages are unable to be crawled (#3897)

This commit is contained in:
shaohuzhang1 2025-08-20 16:15:59 +08:00 committed by GitHub
parent 76c0b66152
commit 959187b5d2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 33 additions and 2 deletions

View File

@ -53,6 +53,28 @@ def remove_fragment(url: str) -> str:
return urlunparse(modified_url)
def remove_last_path_robust(url):
    """Return ``url`` with its last non-empty path segment removed.

    The path is split on ``/`` and empty pieces are discarded, so repeated,
    leading and trailing slashes do not count as segments. The remaining
    segments are re-joined under a single leading ``/``; when nothing remains
    the path becomes ``/``. Scheme, netloc, params, query and fragment are
    handed back to ``urlunparse`` exactly as ``urlparse`` produced them.

    >>> remove_last_path_robust('https://example.com/docs/index.html')
    'https://example.com/docs'
    """
    scheme, netloc, path, params, query, fragment = urlparse(url)
    # Drop empty pieces so '//' and a trailing '/' are not treated as segments.
    segments = list(filter(None, path.split('/')))
    # Slice deletion is a no-op on an empty list, so no emptiness check is needed.
    del segments[-1:]
    # Joining an empty list yields '', which leaves the bare root path '/'.
    shortened_path = '/' + '/'.join(segments)
    return urlunparse((scheme, netloc, shortened_path, params, query, fragment))
class Fork:
class Response:
def __init__(self, content: str, child_link_list: List[ChildLink], status, message: str):
@ -72,7 +94,7 @@ class Fork:
def __init__(self, base_fork_url: str, selector_list: List[str]):
base_fork_url = remove_fragment(base_fork_url)
if any([True for end_str in ['index.html', '.htm', '.html'] if base_fork_url.endswith(end_str)]):
self.base_fork_url = str(Path(base_fork_url).parent)
base_fork_url =remove_last_path_robust(base_fork_url)
self.base_fork_url = urljoin(base_fork_url if base_fork_url.endswith("/") else base_fork_url + '/', '.')
parsed = urlsplit(base_fork_url)
query = parsed.query
@ -190,4 +212,4 @@ class Fork:
def handler(base_url, response: Fork.Response):
    """Print ``base_url.url``, the text of ``base_url.tag`` (or None), and ``response.content``."""
    link_url = base_url.url
    link_tag = base_url.tag
    # A falsy tag is reported as None instead of reading its text.
    tag_text = link_tag.text if link_tag else None
    print(link_url, tag_text, response.content)
# ForkManage('https://bbs.fit2cloud.com/c/de/6', ['.md-content']).fork(3, set(), handler)
# ForkManage('https://hzqcgc.htc.edu.cn/jxky.htm', ['.md-content']).fork(3, set(), handler)

View File

@ -63,6 +63,9 @@ export default {
limitMessage2: 'files',
sizeLimit: 'Each file must not exceed',
imageMessage: 'Please process the image content',
documentMessage: 'Please understand the content of the document',
audioMessage: 'Please understand the video content',
otherMessage: 'Please understand the file content',
errorMessage: 'Upload Failed'
},
executionDetails: {

View File

@ -61,6 +61,9 @@ export default {
limitMessage2: '个文件',
sizeLimit: '单个文件大小不能超过',
imageMessage: '请解析图片内容',
documentMessage: '请理解文档内容',
audioMessage: '请理解视频内容',
otherMessage: '请理解文件内容',
errorMessage: '上传失败'
},
executionDetails: {

View File

@ -61,6 +61,9 @@ export default {
limitMessage2: '個文件',
sizeLimit: '單個文件大小不能超過',
imageMessage: '請解析圖片內容',
documentMessage: '請理解檔案內容',
audioMessage: '請理解視頻內容',
otherMessage: '請理解檔案內容',
errorMessage: '上傳失敗'
},
executionDetails: {