fix: improve charset detection in HTML parsing
Some checks failed
sync2gitee / repo-sync (push) Has been cancelled

This commit is contained in:
CaptainB 2025-07-31 13:35:11 +08:00
parent 37100281ee
commit 9d4679a835

View File

@ -137,18 +137,30 @@ class Fork:
html_content = response.content.decode(encoding)
beautiful_soup = BeautifulSoup(html_content, "html.parser")
meta_list = beautiful_soup.find_all('meta')
charset_list = [meta.attrs.get('charset') for meta in meta_list if
meta.attrs is not None and 'charset' in meta.attrs]
charset_list = Fork.get_charset_list(meta_list)
if len(charset_list) > 0:
charset = charset_list[0]
if charset != encoding:
try:
html_content = response.content.decode(charset)
html_content = response.content.decode(charset, errors='replace')
except Exception as e:
logging.getLogger("max_kb").error(f'{e}')
logging.getLogger("max_kb").error(f'{e}: {traceback.format_exc()}')
return BeautifulSoup(html_content, "html.parser")
return beautiful_soup
@staticmethod
def get_charset_list(meta_list):
    """Collect the character encodings declared by a page's ``<meta>`` tags.

    Two declaration forms are recognised:

    * ``<meta charset="...">``
    * ``<meta http-equiv="Content-Type" content="text/html; charset=...">``

    Args:
        meta_list: Iterable of tag-like objects exposing an ``attrs``
            mapping (``attrs`` may be ``None``).

    Returns:
        list[str]: Declared charset names in document order, with
        surrounding whitespace and quotes removed. Blank declarations are
        skipped so they cannot shadow a usable one that appears later.
    """
    charset_list = []
    for meta in meta_list:
        attrs = meta.attrs
        if attrs is None:
            continue
        if 'charset' in attrs:
            charset = attrs.get('charset')
        elif (attrs.get('http-equiv') or '').lower() == 'content-type' and 'content' in attrs:
            # Allow optional whitespace around "=" and an optional opening
            # quote; stop at whitespace, ";" or a quote so that
            # charset="gbk" yields gbk rather than "gbk".
            match = re.search(r'charset\s*=\s*["\']?([^\s;"\']+)', attrs['content'] or '', re.I)
            charset = match.group(1) if match else None
        else:
            charset = None
        if not charset:
            continue
        # Normalise stray whitespace/quotes that would make decode() fail.
        charset = str(charset).strip().strip('"\'').strip()
        if charset:
            charset_list.append(charset)
    return charset_list
def fork(self):
try: