mirror of
https://github.com/1Panel-dev/MaxKB.git
synced 2025-12-31 10:12:51 +00:00
feat: add smart_split_paragraph function for intelligent text segmentation
--bug=1061634 --user=刘瑞斌 【知识库】知识库智能分段,如果切分段落太长的话,可以按照完整段落进行切分,创建多个片段。 https://www.tapd.cn/62980211/s/1783998
This commit is contained in:
parent
c0715c5b17
commit
cae5682110
|
|
@ -274,6 +274,57 @@ def post_handler_paragraph(content: str, limit: int):
|
|||
return reduce(lambda x, y: [*x, *y], map(lambda row: re.findall(pattern, row), result), [])
|
||||
|
||||
|
||||
def smart_split_paragraph(content: str, limit: int):
|
||||
"""
|
||||
智能分段:在limit前找到合适的分割点(句号、回车等)
|
||||
:param content: 需要分段的文本
|
||||
:param limit: 最大字符限制
|
||||
:return: 分段后的文本列表
|
||||
"""
|
||||
if len(content) <= limit:
|
||||
return [content]
|
||||
|
||||
result = []
|
||||
start = 0
|
||||
|
||||
while start < len(content):
|
||||
end = start + limit
|
||||
|
||||
if end >= len(content):
|
||||
# 剩余文本不超过限制,直接添加
|
||||
result.append(content[start:])
|
||||
break
|
||||
|
||||
# 在limit范围内寻找最佳分割点
|
||||
best_split = end
|
||||
|
||||
# 优先级:句号 > 感叹号/问号 > 回车 > 分号/逗号 > 空格
|
||||
split_chars = [
|
||||
('。', -1), ('!', -1), ('?', -1), # 句子结束符
|
||||
('\n', 0), # 回车符
|
||||
(';', -1), (',', -1), # 标点符号
|
||||
(' ', -1) # 空格
|
||||
]
|
||||
|
||||
# 从后往前找分割点
|
||||
for i in range(end - 1, start + limit // 2, -1): # 至少保留一半内容
|
||||
for char, offset in split_chars:
|
||||
if content[i] == char:
|
||||
best_split = i + 1 + offset
|
||||
break
|
||||
if best_split != end:
|
||||
break
|
||||
|
||||
# 如果找不到合适分割点,使用原始limit
|
||||
if best_split == end and end < len(content):
|
||||
best_split = end
|
||||
|
||||
result.append(content[start:best_split])
|
||||
start = best_split
|
||||
|
||||
return [text for text in result if text.strip()]
|
||||
|
||||
|
||||
replace_map = {
|
||||
re.compile('\n+'): '\n',
|
||||
re.compile(' +'): ' ',
|
||||
|
|
@ -316,7 +367,7 @@ class SplitModel:
|
|||
"""
|
||||
level_content_list = parse_title_level(text, self.content_level_pattern, index)
|
||||
if len(level_content_list) == 0:
|
||||
return [to_tree_obj(row, 'block') for row in post_handler_paragraph(text, limit=self.limit)]
|
||||
return [to_tree_obj(row, 'block') for row in smart_split_paragraph(text, limit=self.limit)]
|
||||
if index == 0 and text.lstrip().index(level_content_list[0]["content"].lstrip()) != 0:
|
||||
level_content_list.insert(0, to_tree_obj(""))
|
||||
|
||||
|
|
@ -325,7 +376,7 @@ class SplitModel:
|
|||
for i in range(len(level_title_content_list)):
|
||||
start_content: str = level_title_content_list[i].get('content')
|
||||
if cursor < text.index(start_content, cursor):
|
||||
for row in post_handler_paragraph(text[cursor: text.index(start_content, cursor)], limit=self.limit):
|
||||
for row in smart_split_paragraph(text[cursor: text.index(start_content, cursor)], limit=self.limit):
|
||||
level_content_list.insert(0, to_tree_obj(row, 'block'))
|
||||
|
||||
block, cursor = get_level_block(text, level_title_content_list, i, cursor)
|
||||
|
|
|
|||
Loading…
Reference in New Issue