chore: replace split_text with smart_split_paragraph in pdf_split_handle.py

2025-12-26 10:12:51 +00:00 · 2025-10-27 14:15:36 +08:00 · 2025-10-27 14:15:36 +08:00 · d147b794ce
parent 8da6f7421c
commit d147b794ce
2 changed files with 9 additions and 34 deletions
--- a/apps/common/handle/impl/text/pdf_split_handle.py
+++ b/apps/common/handle/impl/text/pdf_split_handle.py
@@ -19,7 +19,7 @@ from langchain_community.document_loaders import PyPDFLoader

 from common.handle.base_split_handle import BaseSplitHandle
 from common.utils.logger import maxkb_logger
-from common.utils.split_model import SplitModel
+from common.utils.split_model import SplitModel, smart_split_paragraph

 default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
                        re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
@@ -183,7 +183,7 @@ class PdfSplitHandle(BaseSplitHandle):
            real_chapter_title = chapter_title[:256]
            # 限制章节内容长度
            if 0 < limit < len(chapter_text):
-                split_text = PdfSplitHandle.split_text(chapter_text, limit)
+                split_text = smart_split_paragraph(chapter_text, limit)
                for text in split_text:
                    chapters.append({"title": real_chapter_title, "content": text})
            else:
@@ -262,7 +262,7 @@ class PdfSplitHandle(BaseSplitHandle):

                    # 限制章节内容长度
                    if 0 < limit < len(chapter_text):
-                        split_text = PdfSplitHandle.split_text(chapter_text, limit)
+                        split_text = smart_split_paragraph(chapter_text, limit)
                        for text in split_text:
                            chapters.append({"title": link_title, "content": text})
                    else:
@@ -296,29 +296,6 @@ class PdfSplitHandle(BaseSplitHandle):
            chapters = pre_toc + chapters
        return chapters

-    @staticmethod
-    def split_text(text, length):
-        segments = []
-        current_segment = ""
-
-        for char in text:
-            current_segment += char
-            if len(current_segment) >= length:
-                # 查找最近的句号
-                last_period_index = current_segment.rfind('.')
-                if last_period_index != -1:
-                    segments.append(current_segment[:last_period_index + 1])
-                    current_segment = current_segment[last_period_index + 1:]  # 更新当前段落
-                else:
-                    segments.append(current_segment)
-                    current_segment = ""
-
-        # 处理剩余的部分
-        if current_segment:
-            segments.append(current_segment)
-
-        return segments
-
    @staticmethod
    def handle_chapter_title(title):
        title = re.sub(r'[一二三四五六七八九十\s*]、\s*', '', title)
--- a/apps/common/utils/split_model.py
+++ b/apps/common/utils/split_model.py
@@ -276,7 +276,7 @@ def post_handler_paragraph(content: str, limit: int):

 def smart_split_paragraph(content: str, limit: int):
    """
-    智能分段：在limit前找到合适的分割点（句号、回车等）
+    智能分段:在limit前找到合适的分割点(句号、回车等)
    :param content: 需要分段的文本
    :param limit: 最大字符限制
    :return: 分段后的文本列表
@@ -291,31 +291,29 @@ def smart_split_paragraph(content: str, limit: int):
        end = start + limit

        if end >= len(content):
-            # 剩余文本不超过限制，直接添加
+            # 剩余文本不超过限制,直接添加
            result.append(content[start:])
            break

        # 在limit范围内寻找最佳分割点
        best_split = end

-        # 优先级：句号 > 感叹号/问号 > 回车 > 分号/逗号 > 空格
+        # 优先级:句号 > 感叹号/问号 > 回车
        split_chars = [
-            ('。', -1), ('！', -1), ('？', -1),  # 句子结束符
+            ('。', 0), ('!', 0), ('?', 0),  # 句子结束符,包含在当前段
            ('\n', 0),  # 回车符
-            ('；', -1), ('，', -1),  # 标点符号
-            (' ', -1)  # 空格
        ]

        # 从后往前找分割点
        for i in range(end - 1, start + limit // 2, -1):  # 至少保留一半内容
            for char, offset in split_chars:
                if content[i] == char:
-                    best_split = i + 1 + offset
+                    best_split = i + 1  # 包含分隔符在当前段
                    break
            if best_split != end:
                break

-        # 如果找不到合适分割点，使用原始limit
+        # 如果找不到合适分割点,使用原始limit
        if best_split == end and end < len(content):
            best_split = end