mirror of
https://github.com/1Panel-dev/MaxKB.git
synced 2025-12-26 01:33:05 +00:00
fix: Some web pages are unable to be crawled (#3897)
This commit is contained in:
parent
76c0b66152
commit
959187b5d2
|
|
@ -53,6 +53,28 @@ def remove_fragment(url: str) -> str:
|
|||
return urlunparse(modified_url)
|
||||
|
||||
|
||||
def remove_last_path_robust(url: str) -> str:
    """Return *url* with its last non-empty path segment removed.

    Empty path pieces (from leading, trailing or doubled slashes) are
    ignored, so ``//a//b/`` is treated as the two segments ``a`` and ``b``.
    The rebuilt path always starts with ``/``; when no segment is left the
    path is just ``/``.

    Args:
        url: The URL to shorten.

    Returns:
        The URL with the same scheme, netloc, query and fragment, and a path
        that lacks the final segment. Path parameters (``;key=value``) are
        dropped together with the segment they were attached to; they are
        kept only when there was no segment to remove.
    """
    parsed = urlparse(url)

    # Split the path and discard empty pieces so stray slashes are harmless.
    segments = [segment for segment in parsed.path.split('/') if segment]
    segment_removed = bool(segments)
    remaining = segments[:-1]

    # '/'.join([]) is '', so this yields '/' when nothing is left.
    new_path = '/' + '/'.join(remaining)

    # urlparse only extracts params from the last path element; once that
    # element is gone its params must not be re-attached to the parent.
    new_params = '' if segment_removed else parsed.params

    return urlunparse(parsed._replace(path=new_path, params=new_params))
|
||||
class Fork:
|
||||
class Response:
|
||||
def __init__(self, content: str, child_link_list: List[ChildLink], status, message: str):
|
||||
|
|
@ -72,7 +94,7 @@ class Fork:
|
|||
def __init__(self, base_fork_url: str, selector_list: List[str]):
|
||||
base_fork_url = remove_fragment(base_fork_url)
|
||||
if any([True for end_str in ['index.html', '.htm', '.html'] if base_fork_url.endswith(end_str)]):
|
||||
self.base_fork_url = str(Path(base_fork_url).parent)
|
||||
base_fork_url =remove_last_path_robust(base_fork_url)
|
||||
self.base_fork_url = urljoin(base_fork_url if base_fork_url.endswith("/") else base_fork_url + '/', '.')
|
||||
parsed = urlsplit(base_fork_url)
|
||||
query = parsed.query
|
||||
|
|
@ -190,4 +212,4 @@ class Fork:
|
|||
def handler(base_url, response: Fork.Response):
    """Print base_url's URL, its tag text (None when tag is falsy), and response.content."""
    link_url = base_url.url
    tag_text = None
    if base_url.tag:
        tag_text = base_url.tag.text
    page_content = response.content
    print(link_url, tag_text, page_content)
|
||||
|
||||
# ForkManage('https://bbs.fit2cloud.com/c/de/6', ['.md-content']).fork(3, set(), handler)
|
||||
# ForkManage('https://hzqcgc.htc.edu.cn/jxky.htm', ['.md-content']).fork(3, set(), handler)
|
||||
|
|
|
|||
|
|
@ -63,6 +63,9 @@ export default {
|
|||
limitMessage2: 'files',
|
||||
sizeLimit: 'Each file must not exceed',
|
||||
imageMessage: 'Please process the image content',
|
||||
documentMessage: 'Please understand the content of the document',
|
||||
audioMessage: 'Please understand the video content',
|
||||
otherMessage: 'Please understand the file content',
|
||||
errorMessage: 'Upload Failed'
|
||||
},
|
||||
executionDetails: {
|
||||
|
|
|
|||
|
|
@ -61,6 +61,9 @@ export default {
|
|||
limitMessage2: '个文件',
|
||||
sizeLimit: '单个文件大小不能超过',
|
||||
imageMessage: '请解析图片内容',
|
||||
documentMessage: '请理解文档内容',
|
||||
audioMessage: '请理解视频内容',
|
||||
otherMessage: '请理解文件内容',
|
||||
errorMessage: '上传失败'
|
||||
},
|
||||
executionDetails: {
|
||||
|
|
|
|||
|
|
@ -61,6 +61,9 @@ export default {
|
|||
limitMessage2: '個文件',
|
||||
sizeLimit: '單個文件大小不能超過',
|
||||
imageMessage: '請解析圖片內容',
|
||||
documentMessage: '請理解檔案內容',
|
||||
audioMessage: '請理解視頻內容',
|
||||
otherMessage: '請理解檔案內容',
|
||||
errorMessage: '上傳失敗'
|
||||
},
|
||||
executionDetails: {
|
||||
|
|
|
|||
Loading…
Reference in New Issue