fix: improve charset detection in HTML parsing

2025-12-26 01:33:05 +00:00 · 2025-07-31 13:35:11 +08:00 · 2025-07-31 13:35:11 +08:00 · 9d4679a835
parent 37100281ee
commit 9d4679a835
1 changed files with 16 additions and 4 deletions
--- a/apps/common/util/fork.py
+++ b/apps/common/util/fork.py
@@ -137,18 +137,30 @@ class Fork:
        html_content = response.content.decode(encoding)
        beautiful_soup = BeautifulSoup(html_content, "html.parser")
        meta_list = beautiful_soup.find_all('meta')
-        charset_list = [meta.attrs.get('charset') for meta in meta_list if
-                        meta.attrs is not None and 'charset' in meta.attrs]
+        charset_list = Fork.get_charset_list(meta_list)
        if len(charset_list) > 0:
            charset = charset_list[0]
            if charset != encoding:
                try:
-                    html_content = response.content.decode(charset)
+                    html_content = response.content.decode(charset, errors='replace')
                except Exception as e:
-                    logging.getLogger("max_kb").error(f'{e}')
+                    logging.getLogger("max_kb").error(f'{e}: {traceback.format_exc()}')
                return BeautifulSoup(html_content, "html.parser")
        return beautiful_soup

+    @staticmethod
+    def get_charset_list(meta_list):
+        charset_list = []
+        for meta in meta_list:
+            if meta.attrs is not None:
+                if 'charset' in meta.attrs:
+                    charset_list.append(meta.attrs.get('charset'))
+                elif meta.attrs.get('http-equiv', '').lower() == 'content-type' and 'content' in meta.attrs:
+                    match = re.search(r'charset=([^\s;]+)', meta.attrs['content'], re.I)
+                    if match:
+                        charset_list.append(match.group(1))
+        return charset_list
+
    def fork(self):
        try: