From 22c319a2bf89315fc3ebc3d02d6fa58543dde9f8 Mon Sep 17 00:00:00 2001 From: shaohuzhang1 Date: Thu, 29 Feb 2024 15:14:53 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E5=90=8C=E6=AD=A5=E7=9F=A5=E8=AF=86?= =?UTF-8?q?=E5=BA=93,=E6=97=A0=E6=B3=95=E8=8E=B7=E5=8F=96=E5=86=85?= =?UTF-8?q?=E5=AE=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/common/util/fork.py | 6 +++++- pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/apps/common/util/fork.py b/apps/common/util/fork.py index 09904cc49..4c9ac81e4 100644 --- a/apps/common/util/fork.py +++ b/apps/common/util/fork.py @@ -7,7 +7,7 @@ from typing import List, Set import requests import html2text as ht from bs4 import BeautifulSoup -from urllib.parse import urljoin, urlparse, ParseResult +from urllib.parse import urljoin, urlparse, ParseResult, urlsplit, parse_qs requests.packages.urllib3.disable_warnings() @@ -60,7 +60,11 @@ class Fork: def __init__(self, base_fork_url: str, selector_list: List[str]): self.base_fork_url = urljoin(base_fork_url if base_fork_url.endswith("/") else base_fork_url + '/', '.') + parsed = urlsplit(base_fork_url) + query = parsed.query self.base_fork_url = self.base_fork_url[:-1] + if query is not None and len(query) > 0: + self.base_fork_url = self.base_fork_url + '?' + query self.selector_list = [selector for selector in selector_list if selector is not None and len(selector) > 0] self.urlparse = urlparse(self.base_fork_url) self.base_url = ParseResult(scheme=self.urlparse.scheme, netloc=self.urlparse.netloc, path='', params='', diff --git a/pyproject.toml b/pyproject.toml index 13fab8095..7ec3b0086 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ tiktoken = "^0.5.1" qianfan = "^0.1.1" pycryptodome = "^3.19.0" beautifulsoup4 = "^4.12.2" -html2text = "^2020.1.16" +html2text = "^2024.2.26" [build-system]