From d147b794ce22a640bc45e3a9c65ca017361943c9 Mon Sep 17 00:00:00 2001 From: CaptainB Date: Mon, 27 Oct 2025 14:15:36 +0800 Subject: [PATCH] chore: replace split_text with smart_split_paragraph in pdf_split_handle.py --- .../handle/impl/text/pdf_split_handle.py | 29 ++----------------- apps/common/utils/split_model.py | 14 ++++----- 2 files changed, 9 insertions(+), 34 deletions(-) diff --git a/apps/common/handle/impl/text/pdf_split_handle.py b/apps/common/handle/impl/text/pdf_split_handle.py index d666796b9..eca98b0f0 100644 --- a/apps/common/handle/impl/text/pdf_split_handle.py +++ b/apps/common/handle/impl/text/pdf_split_handle.py @@ -19,7 +19,7 @@ from langchain_community.document_loaders import PyPDFLoader from common.handle.base_split_handle import BaseSplitHandle from common.utils.logger import maxkb_logger -from common.utils.split_model import SplitModel +from common.utils.split_model import SplitModel, smart_split_paragraph default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'), re.compile('(?<=\\n)(?= length: - # 查找最近的句号 - last_period_index = current_segment.rfind('.') - if last_period_index != -1: - segments.append(current_segment[:last_period_index + 1]) - current_segment = current_segment[last_period_index + 1:] # 更新当前段落 - else: - segments.append(current_segment) - current_segment = "" - - # 处理剩余的部分 - if current_segment: - segments.append(current_segment) - - return segments - @staticmethod def handle_chapter_title(title): title = re.sub(r'[一二三四五六七八九十\s*]、\s*', '', title) diff --git a/apps/common/utils/split_model.py b/apps/common/utils/split_model.py index b80409e10..8bf2df59e 100644 --- a/apps/common/utils/split_model.py +++ b/apps/common/utils/split_model.py @@ -276,7 +276,7 @@ def post_handler_paragraph(content: str, limit: int): def smart_split_paragraph(content: str, limit: int): """ - 智能分段:在limit前找到合适的分割点(句号、回车等) + 智能分段:在limit前找到合适的分割点(句号、回车等) :param content: 需要分段的文本 :param limit: 最大字符限制 :return: 分段后的文本列表 @@ -291,31 +291,29 @@ def smart_split_paragraph(content: str, limit: int): end = start + limit if end >= len(content): - # 剩余文本不超过限制,直接添加 + # 剩余文本不超过限制,直接添加 result.append(content[start:]) break # 在limit范围内寻找最佳分割点 best_split = end - # 优先级:句号 > 感叹号/问号 > 回车 > 分号/逗号 > 空格 + # 优先级:句号 > 感叹号/问号 > 回车 split_chars = [ - ('。', -1), ('!', -1), ('?', -1), # 句子结束符 + ('。', 0), ('!', 0), ('?', 0), # 句子结束符,包含在当前段 ('\n', 0), # 回车符 - (';', -1), (',', -1), # 标点符号 - (' ', -1) # 空格 ] # 从后往前找分割点 for i in range(end - 1, start + limit // 2, -1): # 至少保留一半内容 for char, offset in split_chars: if content[i] == char: - best_split = i + 1 + offset + best_split = i + 1 # 包含分隔符在当前段 break if best_split != end: break - # 如果找不到合适分割点,使用原始limit + # 如果找不到合适分割点,使用原始limit if best_split == end and end < len(content): best_split = end