MaxKB/apps/common/handle/impl/text/html_split_handle.py

74 lines
2.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding=utf-8
"""
@project: maxkb
@Author
@file html_split_handle.py
@date2024/5/23 10:58
@desc:
"""
import re
import traceback
from typing import List
from bs4 import BeautifulSoup
from charset_normalizer import detect
from html2text import html2text
from common.handle.base_split_handle import BaseSplitHandle
from common.utils.split_model import SplitModel
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")]
def get_encoding(buffer):
beautiful_soup = BeautifulSoup(buffer, "html.parser")
meta_list = beautiful_soup.find_all('meta')
charset_list = [meta.attrs.get('charset') for meta in meta_list if
meta.attrs is not None and 'charset' in meta.attrs]
if len(charset_list) > 0:
charset = charset_list[0]
return charset
return detect(buffer)['encoding']
class HTMLSplitHandle(BaseSplitHandle):
def support(self, file, get_buffer):
file_name: str = file.name.lower()
if file_name.endswith(".html") or file_name.endswith(".HTML"):
return True
return False
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
buffer = get_buffer(file)
if pattern_list is not None and len(pattern_list) > 0:
split_model = SplitModel(pattern_list, with_filter, limit)
else:
split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
try:
encoding = get_encoding(buffer)
content = buffer.decode(encoding)
content = html2text(content)
except BaseException as e:
return {'name': file.name,
'content': []}
return {'name': file.name,
'content': split_model.parse(content)
}
def get_content(self, file, save_image):
buffer = file.read()
try:
encoding = get_encoding(buffer)
content = buffer.decode(encoding)
return html2text(content)
except BaseException as e:
traceback.print_exception(e)
return f'{e}'