From 6184058e4d4f134f5fa90863d61ce6ceb07b826d Mon Sep 17 00:00:00 2001 From: CaptainB Date: Thu, 31 Jul 2025 11:38:53 +0800 Subject: [PATCH] feat: implement charset detection utility in fork.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --bug=1059829 --user=刘瑞斌 【知识库】web导入知识库中文字符乱码 https://www.tapd.cn/62980211/s/1746162 --- apps/common/utils/fork.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/apps/common/utils/fork.py b/apps/common/utils/fork.py index 7bea7d805..d9a9a8c48 100644 --- a/apps/common/utils/fork.py +++ b/apps/common/utils/fork.py @@ -1,5 +1,4 @@ import copy -import logging import re import traceback from functools import reduce @@ -139,18 +138,30 @@ class Fork: html_content = response.content.decode(encoding) beautiful_soup = BeautifulSoup(html_content, "html.parser") meta_list = beautiful_soup.find_all('meta') - charset_list = [meta.attrs.get('charset') for meta in meta_list if - meta.attrs is not None and 'charset' in meta.attrs] + charset_list = Fork.get_charset_list(meta_list) if len(charset_list) > 0: charset = charset_list[0] if charset != encoding: try: - html_content = response.content.decode(charset) + html_content = response.content.decode(charset, errors='replace') except Exception as e: - maxkb_logger.error(f'{e}') + maxkb_logger.error(f'{e}: {traceback.format_exc()}') return BeautifulSoup(html_content, "html.parser") return beautiful_soup + @staticmethod + def get_charset_list(meta_list): + charset_list = [] + for meta in meta_list: + if meta.attrs is not None: + if 'charset' in meta.attrs: + charset_list.append(meta.attrs.get('charset')) + elif meta.attrs.get('http-equiv', '').lower() == 'content-type' and 'content' in meta.attrs: + match = re.search(r'charset=([^\s;]+)', meta.attrs['content'], re.I) + if match: + charset_list.append(match.group(1)) + return charset_list + def fork(self): try: