chore: replace split_text with smart_split_paragraph in pdf_split_handle.py

This commit is contained in:
CaptainB 2025-10-27 14:15:36 +08:00
parent 8da6f7421c
commit d147b794ce
2 changed files with 9 additions and 34 deletions

View File

@ -19,7 +19,7 @@ from langchain_community.document_loaders import PyPDFLoader
from common.handle.base_split_handle import BaseSplitHandle
from common.utils.logger import maxkb_logger
from common.utils.split_model import SplitModel
from common.utils.split_model import SplitModel, smart_split_paragraph
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
@ -183,7 +183,7 @@ class PdfSplitHandle(BaseSplitHandle):
real_chapter_title = chapter_title[:256]
# 限制章节内容长度
if 0 < limit < len(chapter_text):
split_text = PdfSplitHandle.split_text(chapter_text, limit)
split_text = smart_split_paragraph(chapter_text, limit)
for text in split_text:
chapters.append({"title": real_chapter_title, "content": text})
else:
@ -262,7 +262,7 @@ class PdfSplitHandle(BaseSplitHandle):
# 限制章节内容长度
if 0 < limit < len(chapter_text):
split_text = PdfSplitHandle.split_text(chapter_text, limit)
split_text = smart_split_paragraph(chapter_text, limit)
for text in split_text:
chapters.append({"title": link_title, "content": text})
else:
@ -296,29 +296,6 @@ class PdfSplitHandle(BaseSplitHandle):
chapters = pre_toc + chapters
return chapters
@staticmethod
def split_text(text, length, delimiters='.'):
    """Split ``text`` into consecutive segments of at most ``length`` characters.

    The text is scanned left to right. Each time the pending segment reaches
    ``length`` characters it is cut right after the last delimiter it
    contains, and whatever follows that delimiter is carried over into the
    next segment. When the pending segment contains no delimiter at all, it
    is emitted whole. Concatenating the returned segments always yields the
    original text.

    :param text: the string to split; an empty value yields an empty list
    :param length: maximum number of characters per segment; zero or a
        negative value is treated as "no limit" and the text is returned as
        a single segment
    :param delimiters: single characters after which a segment may end;
        defaults to the ASCII full stop, pass e.g. ``'.。'`` to also cut
        after CJK full stops
    :return: list of segments, in order
    """
    if not text:
        return []
    if length <= 0:
        # A non-positive limit cannot bound a segment; without this guard
        # every character would become its own segment.
        return [text]

    segments = []
    start = 0  # index in ``text`` where the pending segment begins
    for end in range(1, len(text) + 1):
        if end - start < length:
            continue
        # Find the nearest delimiter, searching only the pending window.
        last_delimiter = max(
            (text.rfind(mark, start, end) for mark in delimiters),
            default=-1,
        )
        # No delimiter in the window: fall back to a hard cut at the limit.
        cut = end if last_delimiter == -1 else last_delimiter + 1
        segments.append(text[start:cut])
        start = cut  # text after the delimiter stays pending
    # Flush whatever is left over.
    if start < len(text):
        segments.append(text[start:])
    return segments
@staticmethod
def handle_chapter_title(title):
title = re.sub(r'[一二三四五六七八九十\s*]、\s*', '', title)

View File

@ -276,7 +276,7 @@ def post_handler_paragraph(content: str, limit: int):
def smart_split_paragraph(content: str, limit: int):
"""
智能分段在limit前找到合适的分割点句号回车等
智能分段:在limit前找到合适的分割点(句号回车等)
:param content: 需要分段的文本
:param limit: 最大字符限制
:return: 分段后的文本列表
@ -291,31 +291,29 @@ def smart_split_paragraph(content: str, limit: int):
end = start + limit
if end >= len(content):
# 剩余文本不超过限制直接添加
# 剩余文本不超过限制,直接添加
result.append(content[start:])
break
# 在limit范围内寻找最佳分割点
best_split = end
# 优先级:句号 > 感叹号/问号 > 回车 > 分号/逗号 > 空格
# 优先级:句号 > 感叹号/问号 > 回车
split_chars = [
('', -1), ('', -1), ('', -1), # 句子结束符
('', 0), ('!', 0), ('?', 0), # 句子结束符,包含在当前段
('\n', 0), # 回车符
('', -1), ('', -1), # 标点符号
(' ', -1) # 空格
]
# 从后往前找分割点
for i in range(end - 1, start + limit // 2, -1): # 至少保留一半内容
for char, offset in split_chars:
if content[i] == char:
best_split = i + 1 + offset
best_split = i + 1 # 包含分隔符在当前段
break
if best_split != end:
break
# 如果找不到合适分割点使用原始limit
# 如果找不到合适分割点,使用原始limit
if best_split == end and end < len(content):
best_split = end