feat: add smart_split_paragraph function for intelligent text segmentation

--bug=1061634 --user=刘瑞斌【知识库】知识库智能分段，如果切分段落太长的话，可以按照完整段落进行切分，创建多个片段。 https://www.tapd.cn/62980211/s/1783998
2025-12-31 10:12:51 +00:00 · 2025-10-13 10:26:06 +08:00 · 2025-10-13 10:26:06 +08:00 · cae5682110
parent c0715c5b17
commit cae5682110
1 changed files with 53 additions and 2 deletions
--- a/apps/common/utils/split_model.py
+++ b/apps/common/utils/split_model.py
@ -274,6 +274,57 @@ def post_handler_paragraph(content: str, limit: int):
    return reduce(lambda x, y: [*x, *y], map(lambda row: re.findall(pattern, row), result), [])


+def smart_split_paragraph(content: str, limit: int):
+    """
+    智能分段：在limit前找到合适的分割点（句号、回车等）
+    :param content: 需要分段的文本
+    :param limit: 最大字符限制
+    :return: 分段后的文本列表
+    """
+    if len(content) <= limit:
+        return [content]
+
+    result = []
+    start = 0
+
+    while start < len(content):
+        end = start + limit
+
+        if end >= len(content):
+            # 剩余文本不超过限制，直接添加
+            result.append(content[start:])
+            break
+
+        # 在limit范围内寻找最佳分割点
+        best_split = end
+
+        # 优先级：句号 > 感叹号/问号 > 回车 > 分号/逗号 > 空格
+        split_chars = [
+            ('。', -1), ('！', -1), ('？', -1),  # 句子结束符
+            ('\n', 0),  # 回车符
+            ('；', -1), ('，', -1),  # 标点符号
+            (' ', -1)  # 空格
+        ]
+
+        # 从后往前找分割点
+        for i in range(end - 1, start + limit // 2, -1):  # 至少保留一半内容
+            for char, offset in split_chars:
+                if content[i] == char:
+                    best_split = i + 1 + offset
+                    break
+            if best_split != end:
+                break
+
+        # 如果找不到合适分割点，使用原始limit
+        if best_split == end and end < len(content):
+            best_split = end
+
+        result.append(content[start:best_split])
+        start = best_split
+
+    return [text for text in result if text.strip()]
+
+
 replace_map = {
    re.compile('\n+'): '\n',
    re.compile(' +'): ' ',
@ -316,7 +367,7 @@ class SplitModel:
        """
        level_content_list = parse_title_level(text, self.content_level_pattern, index)
        if len(level_content_list) == 0:
-            return [to_tree_obj(row, 'block') for row in post_handler_paragraph(text, limit=self.limit)]
+            return [to_tree_obj(row, 'block') for row in smart_split_paragraph(text, limit=self.limit)]
        if index == 0 and text.lstrip().index(level_content_list[0]["content"].lstrip()) != 0:
            level_content_list.insert(0, to_tree_obj(""))

@ -325,7 +376,7 @@ class SplitModel:
        for i in range(len(level_title_content_list)):
            start_content: str = level_title_content_list[i].get('content')
            if cursor < text.index(start_content, cursor):
-                for row in post_handler_paragraph(text[cursor:   text.index(start_content, cursor)], limit=self.limit):
+                for row in smart_split_paragraph(text[cursor:   text.index(start_content, cursor)], limit=self.limit):
                    level_content_list.insert(0, to_tree_obj(row, 'block'))

            block, cursor = get_level_block(text, level_title_content_list, i, cursor)