diff --git a/apps/common/util/fork.py b/apps/common/util/fork.py
index 4405b9b76..90527db9c 100644
--- a/apps/common/util/fork.py
+++ b/apps/common/util/fork.py
@@ -137,17 +137,41 @@ class Fork:
         html_content = response.content.decode(encoding)
         beautiful_soup = BeautifulSoup(html_content, "html.parser")
         meta_list = beautiful_soup.find_all('meta')
-        charset_list = [meta.attrs.get('charset') for meta in meta_list if
-                        meta.attrs is not None and 'charset' in meta.attrs]
+        charset_list = Fork.get_charset_list(meta_list)
         if len(charset_list) > 0:
             charset = charset_list[0]
             if charset != encoding:
                 try:
-                    html_content = response.content.decode(charset)
+                    html_content = response.content.decode(charset, errors='replace')
                 except Exception as e:
-                    logging.getLogger("max_kb").error(f'{e}')
+                    logging.getLogger("max_kb").error(f'{e}: {traceback.format_exc()}')
                 return BeautifulSoup(html_content, "html.parser")
         return beautiful_soup
 
+    @staticmethod
+    def get_charset_list(meta_list):
+        """Return the charsets declared by the given <meta> tags, in document order.
+
+        Recognizes both ``<meta charset="...">`` and
+        ``<meta http-equiv="Content-Type" content="...; charset=...">``.
+        Empty declarations are skipped so the caller never tries to decode
+        with a blank codec name.
+        """
+        charset_list = []
+        for meta in meta_list:
+            attrs = meta.attrs or {}
+            if 'charset' in attrs:
+                charset = attrs.get('charset')
+            elif str(attrs.get('http-equiv', '')).lower() == 'content-type':
+                # Tolerate spaces around '=' and quotes around the value.
+                content = str(attrs.get('content', ''))
+                match = re.search(r'charset\s*=\s*["\']?([^\s;"\']+)', content, re.I)
+                charset = match.group(1) if match else None
+            else:
+                charset = None
+            if isinstance(charset, str) and charset.strip():
+                charset_list.append(charset.strip())
+        return charset_list
+
     def fork(self):
         try: