mirror of
https://github.com/1Panel-dev/MaxKB.git
synced 2025-12-26 10:12:51 +00:00
chore: replace split_text with smart_split_paragraph in pdf_split_handle.py
This commit is contained in:
parent
8da6f7421c
commit
d147b794ce
|
|
@ -19,7 +19,7 @@ from langchain_community.document_loaders import PyPDFLoader
|
|||
|
||||
from common.handle.base_split_handle import BaseSplitHandle
|
||||
from common.utils.logger import maxkb_logger
|
||||
from common.utils.split_model import SplitModel
|
||||
from common.utils.split_model import SplitModel, smart_split_paragraph
|
||||
|
||||
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
|
||||
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
|
||||
|
|
@ -183,7 +183,7 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||
real_chapter_title = chapter_title[:256]
|
||||
# 限制章节内容长度
|
||||
if 0 < limit < len(chapter_text):
|
||||
split_text = PdfSplitHandle.split_text(chapter_text, limit)
|
||||
split_text = smart_split_paragraph(chapter_text, limit)
|
||||
for text in split_text:
|
||||
chapters.append({"title": real_chapter_title, "content": text})
|
||||
else:
|
||||
|
|
@ -262,7 +262,7 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||
|
||||
# 限制章节内容长度
|
||||
if 0 < limit < len(chapter_text):
|
||||
split_text = PdfSplitHandle.split_text(chapter_text, limit)
|
||||
split_text = smart_split_paragraph(chapter_text, limit)
|
||||
for text in split_text:
|
||||
chapters.append({"title": link_title, "content": text})
|
||||
else:
|
||||
|
|
@ -296,29 +296,6 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||
chapters = pre_toc + chapters
|
||||
return chapters
|
||||
|
||||
@staticmethod
|
||||
def split_text(text, length):
|
||||
segments = []
|
||||
current_segment = ""
|
||||
|
||||
for char in text:
|
||||
current_segment += char
|
||||
if len(current_segment) >= length:
|
||||
# 查找最近的句号
|
||||
last_period_index = current_segment.rfind('.')
|
||||
if last_period_index != -1:
|
||||
segments.append(current_segment[:last_period_index + 1])
|
||||
current_segment = current_segment[last_period_index + 1:] # 更新当前段落
|
||||
else:
|
||||
segments.append(current_segment)
|
||||
current_segment = ""
|
||||
|
||||
# 处理剩余的部分
|
||||
if current_segment:
|
||||
segments.append(current_segment)
|
||||
|
||||
return segments
|
||||
|
||||
@staticmethod
|
||||
def handle_chapter_title(title):
|
||||
title = re.sub(r'[一二三四五六七八九十\s*]、\s*', '', title)
|
||||
|
|
|
|||
|
|
@ -276,7 +276,7 @@ def post_handler_paragraph(content: str, limit: int):
|
|||
|
||||
def smart_split_paragraph(content: str, limit: int):
|
||||
"""
|
||||
智能分段:在limit前找到合适的分割点(句号、回车等)
|
||||
智能分段:在limit前找到合适的分割点(句号、回车等)
|
||||
:param content: 需要分段的文本
|
||||
:param limit: 最大字符限制
|
||||
:return: 分段后的文本列表
|
||||
|
|
@ -291,31 +291,29 @@ def smart_split_paragraph(content: str, limit: int):
|
|||
end = start + limit
|
||||
|
||||
if end >= len(content):
|
||||
# 剩余文本不超过限制,直接添加
|
||||
# 剩余文本不超过限制,直接添加
|
||||
result.append(content[start:])
|
||||
break
|
||||
|
||||
# 在limit范围内寻找最佳分割点
|
||||
best_split = end
|
||||
|
||||
# 优先级:句号 > 感叹号/问号 > 回车 > 分号/逗号 > 空格
|
||||
# 优先级:句号 > 感叹号/问号 > 回车
|
||||
split_chars = [
|
||||
('。', -1), ('!', -1), ('?', -1), # 句子结束符
|
||||
('。', 0), ('!', 0), ('?', 0), # 句子结束符,包含在当前段
|
||||
('\n', 0), # 回车符
|
||||
(';', -1), (',', -1), # 标点符号
|
||||
(' ', -1) # 空格
|
||||
]
|
||||
|
||||
# 从后往前找分割点
|
||||
for i in range(end - 1, start + limit // 2, -1): # 至少保留一半内容
|
||||
for char, offset in split_chars:
|
||||
if content[i] == char:
|
||||
best_split = i + 1 + offset
|
||||
best_split = i + 1 # 包含分隔符在当前段
|
||||
break
|
||||
if best_split != end:
|
||||
break
|
||||
|
||||
# 如果找不到合适分割点,使用原始limit
|
||||
# 如果找不到合适分割点,使用原始limit
|
||||
if best_split == end and end < len(content):
|
||||
best_split = end
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue