fix: 同步知识库,无法获取内容

This commit is contained in:
shaohuzhang1 2024-02-29 15:14:53 +08:00
parent 8450b3598c
commit 22c319a2bf
2 changed files with 6 additions and 2 deletions

View File

@@ -7,7 +7,7 @@ from typing import List, Set
import requests
import html2text as ht
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, ParseResult
from urllib.parse import urljoin, urlparse, ParseResult, urlsplit, parse_qs
requests.packages.urllib3.disable_warnings()
@@ -60,7 +60,11 @@ class Fork:
def __init__(self, base_fork_url: str, selector_list: List[str]):
self.base_fork_url = urljoin(base_fork_url if base_fork_url.endswith("/") else base_fork_url + '/', '.')
parsed = urlsplit(base_fork_url)
query = parsed.query
self.base_fork_url = self.base_fork_url[:-1]
if query is not None and len(query) > 0:
self.base_fork_url = self.base_fork_url + '?' + query
self.selector_list = [selector for selector in selector_list if selector is not None and len(selector) > 0]
self.urlparse = urlparse(self.base_fork_url)
self.base_url = ParseResult(scheme=self.urlparse.scheme, netloc=self.urlparse.netloc, path='', params='',

View File

@@ -26,7 +26,7 @@ tiktoken = "^0.5.1"
qianfan = "^0.1.1"
pycryptodome = "^3.19.0"
beautifulsoup4 = "^4.12.2"
html2text = "^2020.1.16"
html2text = "^2024.2.26"
[build-system]