fix: 同步web站点内容编码错误,导致乱码

This commit is contained in:
shaohuzhang1 2024-03-25 18:46:25 +08:00
parent a01d5beb59
commit cf003aa2d2

View File

@@ -6,6 +6,7 @@ from functools import reduce
from typing import List, Set
from urllib.parse import urljoin, urlparse, ParseResult, urlsplit
import chardet
import html2text as ht
import requests
from bs4 import BeautifulSoup
@@ -121,7 +122,7 @@ class Fork:
@staticmethod
def get_beautiful_soup(response):
    """Decode an HTTP response body and parse it as HTML.

    The text encoding is chosen in this order:

    1. ``response.encoding``, unless it is empty or exactly
       ``'ISO-8859-1'``.
    2. ``response.apparent_encoding`` (detected from the body bytes).
    3. ``'utf-8'`` as a last resort when neither of the above is usable.

    :param response: object exposing ``encoding``, ``apparent_encoding``
        and ``content`` (bytes) -- presumably a ``requests.Response``;
        confirm against callers.
    :return: ``BeautifulSoup`` tree built with the ``"html.parser"`` backend.
    :raises UnicodeDecodeError: if the body is not valid in the chosen
        encoding.
    :raises LookupError: if the chosen encoding name is unknown to Python.
    """
    declared = response.encoding
    # 'ISO-8859-1' is treated as "no real charset information": per the
    # requests documentation it is the default assigned to text responses
    # whose headers carry no charset, and trusting it garbles non-Latin pages.
    if declared and declared != 'ISO-8859-1':
        encoding = declared
    else:
        # apparent_encoding can be None (e.g. detection finds nothing);
        # bytes.decode(None) would raise TypeError, so fall back to UTF-8.
        encoding = response.apparent_encoding or 'utf-8'
    html_content = response.content.decode(encoding)
    return BeautifulSoup(html_content, "html.parser")