# coding=utf-8
"""
    @project: maxkb
    @Author:虎
    @file: text_split_handle.py
    @date:2024/3/27 18:19
    @desc:
"""
import re
import traceback
from typing import List

from charset_normalizer import detect

from common.handle.base_split_handle import BaseSplitHandle
from common.util.split_model import SplitModel

# Heading patterns used by ``handle`` when the caller supplies no pattern list.
# NOTE(review): the original text is cut off partway through the second
# pattern. Only the first entry is verbatim; entries 2-6 were reconstructed by
# extending it to deeper Markdown heading levels. Confirm against version
# control before merging.
default_pattern_list = [
    re.compile('(?<=^)# .*|(?<=\\n)# .*'),
    re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
    re.compile('(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*'),
    re.compile('(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*'),
    re.compile('(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*'),
    re.compile('(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*'),
]


# NOTE(review): the class header was lost in the damaged region. The name is
# inferred from the file name and the base class from the import above —
# confirm both.
class TextSplitHandle(BaseSplitHandle):
    """Split handler for text files whose encoding is auto-detected."""

    # NOTE(review): only the tail of this method ("... > 0.5: return True
    # return False") survived. The name, signature and everything before the
    # confidence comparison are reconstructed and must be verified.
    def support(self, file, get_buffer):
        """Return True when this handler should process ``file``.

        :param file: object exposing a ``name`` attribute
        :param get_buffer: callable returning the raw bytes of ``file``
        :return: True if the file is accepted, otherwise False
        """
        buffer = get_buffer(file)
        file_name: str = file.name.lower()
        if file_name.endswith(('.md', '.txt')):
            return True
        result = detect(buffer)
        if (result['encoding'] is not None
                and result['confidence'] is not None
                and result['encoding'] != 'ascii'
                and result['confidence'] > 0.5):
            return True
        return False

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        """Decode ``file`` and split its text into segments.

        :param file: object exposing a ``name`` attribute
        :param pattern_list: split patterns; ``default_pattern_list`` is used
            when this is None or empty
        :param with_filter: forwarded to ``SplitModel``
        :param limit: forwarded to ``SplitModel``
        :param get_buffer: callable returning the raw bytes of ``file``
        :param save_image: not used by this method
        :return: dict with ``name`` (the file name) and ``content`` (the result
            of ``SplitModel.parse``, or an empty list if the bytes could not
            be decoded)
        """
        buffer = get_buffer(file)
        patterns = pattern_list if pattern_list else default_pattern_list
        split_model = SplitModel(patterns, with_filter=with_filter, limit=limit)
        try:
            content = buffer.decode(detect(buffer)['encoding'])
        except Exception:
            # Best effort: an undetectable encoding or a decode failure yields
            # an empty result instead of aborting the caller. Exception (not
            # BaseException) is caught so that KeyboardInterrupt and
            # SystemExit still propagate.
            return {'name': file.name, 'content': []}
        return {'name': file.name, 'content': split_model.parse(content)}

    def get_content(self, file, save_image):
        """Return the decoded text of ``file``.

        :param file: object exposing ``read()`` that returns bytes
        :param save_image: not used by this method
        :return: the decoded text; if detection or decoding fails, the
            traceback is printed and the exception message is returned instead
        """
        buffer = file.read()
        try:
            return buffer.decode(detect(buffer)['encoding'])
        except Exception as e:
            # Exception (not BaseException) so interpreter-exit signals are
            # not converted into returned text.
            traceback.print_exception(e)
            return f'{e}'