MaxKB/apps/common/handle/impl/text/html_split_handle.py
CaptainB 82a2203be6 fix: handle string type for limit and improve error logging in pdf_split_handle
--bug=1057493 --user=刘瑞斌 【知识库】上传文档,使用高级分段报错 https://www.tapd.cn/62980211/s/1720110
2025-06-30 12:47:47 +08:00

79 lines
2.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding=utf-8
"""
@project: maxkb
@Author
@file html_split_handle.py
@date2024/5/23 10:58
@desc:
"""
import re
import traceback
from typing import List
from bs4 import BeautifulSoup
from charset_normalizer import detect
from html2text import html2text
from common.handle.base_split_handle import BaseSplitHandle
from common.utils.logger import maxkb_logger
from common.utils.split_model import SplitModel
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")]
def get_encoding(buffer):
beautiful_soup = BeautifulSoup(buffer, "html.parser")
meta_list = beautiful_soup.find_all('meta')
charset_list = [meta.attrs.get('charset') for meta in meta_list if
meta.attrs is not None and 'charset' in meta.attrs]
if len(charset_list) > 0:
charset = charset_list[0]
return charset
return detect(buffer)['encoding']
class HTMLSplitHandle(BaseSplitHandle):
def support(self, file, get_buffer):
file_name: str = file.name.lower()
if file_name.endswith(".html") or file_name.endswith(".HTML"):
return True
return False
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
buffer = get_buffer(file)
if pattern_list is not None and len(pattern_list) > 0:
split_model = SplitModel(pattern_list, with_filter, limit)
else:
split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
try:
encoding = get_encoding(buffer)
content = buffer.decode(encoding)
content = html2text(content)
except BaseException as e:
maxkb_logger.error(f"Error processing HTML file {file.name}: {e}, {traceback.format_exc()}")
return {
'name': file.name, 'content': []
}
return {
'name': file.name,
'content': split_model.parse(content)
}
def get_content(self, file, save_image):
buffer = file.read()
try:
encoding = get_encoding(buffer)
content = buffer.decode(encoding)
return html2text(content)
except BaseException as e:
traceback.print_exception(e)
return f'{e}'