mirror of
https://github.com/1Panel-dev/MaxKB.git
synced 2025-12-26 01:33:05 +00:00
feat: implement charset detection utility in fork.py
--bug=1059829 --user=刘瑞斌 【知识库】web导入知识库中文字符乱码 https://www.tapd.cn/62980211/s/1746162
This commit is contained in:
parent
1742db681d
commit
6184058e4d
|
|
@ -1,5 +1,4 @@
|
|||
import copy
|
||||
import logging
|
||||
import re
|
||||
import traceback
|
||||
from functools import reduce
|
||||
|
|
@ -139,18 +138,30 @@ class Fork:
|
|||
html_content = response.content.decode(encoding)
|
||||
beautiful_soup = BeautifulSoup(html_content, "html.parser")
|
||||
meta_list = beautiful_soup.find_all('meta')
|
||||
charset_list = [meta.attrs.get('charset') for meta in meta_list if
|
||||
meta.attrs is not None and 'charset' in meta.attrs]
|
||||
charset_list = Fork.get_charset_list(meta_list)
|
||||
if len(charset_list) > 0:
|
||||
charset = charset_list[0]
|
||||
if charset != encoding:
|
||||
try:
|
||||
html_content = response.content.decode(charset)
|
||||
html_content = response.content.decode(charset, errors='replace')
|
||||
except Exception as e:
|
||||
maxkb_logger.error(f'{e}')
|
||||
maxkb_logger.error(f'{e}: {traceback.format_exc()}')
|
||||
return BeautifulSoup(html_content, "html.parser")
|
||||
return beautiful_soup
|
||||
|
||||
@staticmethod
def get_charset_list(meta_list):
    """
    Collect the character encodings declared by a page's ``<meta>`` tags.

    Two declaration forms are recognised:

    * ``<meta charset="...">``
    * ``<meta http-equiv="Content-Type" content="...; charset=...">``

    :param meta_list: iterable of parsed ``<meta>`` elements; each element
                      only needs to expose an ``attrs`` mapping (or ``None``)
    :return: list of declared charset names in document order, with
             surrounding whitespace/quotes removed; empty or malformed
             declarations are skipped so they cannot mask a later valid one
    """
    # Whitespace around '=' and a quoted value are both tolerated, so that
    # content such as `text/html; charset = "gbk"` yields a usable codec name.
    charset_pattern = re.compile(r'charset\s*=\s*["\']?([^\s;"\']+)', re.I)
    charset_list = []
    for meta in meta_list:
        attrs = meta.attrs
        if attrs is None:
            continue
        charset = ''
        if 'charset' in attrs:
            # A tag carrying a charset attribute is never inspected for
            # http-equiv, even when the attribute turns out to be empty.
            raw_charset = attrs.get('charset')
            if isinstance(raw_charset, str):
                charset = raw_charset.strip().strip('"\'').strip()
        else:
            http_equiv = attrs.get('http-equiv', '')
            content = attrs.get('content')
            if (isinstance(http_equiv, str) and isinstance(content, str)
                    and http_equiv.strip().lower() == 'content-type'):
                match = charset_pattern.search(content)
                if match:
                    charset = match.group(1)
        if charset:
            charset_list.append(charset)
    return charset_list
|
||||
|
||||
def fork(self):
|
||||
try:
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue