fix: 修复上传PDF文件智能分段时提示 分段内容不能超过102400个字符 #998

This commit is contained in:
shaohuzhang1 2024-08-26 14:15:05 +08:00 committed by shaohuzhang1
parent bb6f5d6096
commit df172b530c

View File

@@ -246,11 +246,15 @@ def post_handler_paragraph(content: str, limit: int):
while (pos := content.find("\n", start)) != -1:
split, start = content[start:pos + 1], pos + 1
if len(temp_char + split) > limit:
if len(temp_char) > 4096:
pass
result.append(temp_char)
temp_char = ''
temp_char = temp_char + split
temp_char = temp_char + content[start:]
if len(temp_char) > 0:
if len(temp_char) > 4096:
pass
result.append(temp_char)
pattern = "[\\S\\s]{1," + str(limit) + '}'
@@ -298,7 +302,7 @@ class SplitModel:
"""
level_content_list = parse_title_level(text, self.content_level_pattern, index)
if len(level_content_list) == 0:
return list(map(lambda row: to_tree_obj(row, 'block'), post_handler_paragraph(text, limit=self.limit)))
return [to_tree_obj(row, 'block') for row in post_handler_paragraph(text, limit=self.limit)]
if index == 0 and text.lstrip().index(level_content_list[0]["content"].lstrip()) != 0:
level_content_list.insert(0, to_tree_obj(""))
@@ -307,7 +311,9 @@ class SplitModel:
for i in range(len(level_title_content_list)):
start_content: str = level_title_content_list[i].get('content')
if cursor < text.index(start_content, cursor):
level_content_list.insert(0, to_tree_obj(text[cursor: text.index(start_content, cursor)], 'block'))
for row in post_handler_paragraph(text[cursor: text.index(start_content, cursor)], limit=self.limit):
level_content_list.insert(0, to_tree_obj(row, 'block'))
block, cursor = get_level_block(text, level_title_content_list, i, cursor)
if len(block) == 0:
continue
@@ -330,6 +336,9 @@ class SplitModel:
text = text.replace("\0", '')
result_tree = self.parse_to_tree(text, 0)
result = result_tree_to_paragraph(result_tree, [], [], self.with_filter)
for e in result:
if len(e['content']) > 4096:
pass
return [item for item in [self.post_reset_paragraph(row) for row in result] if
'content' in item and len(item.get('content').strip()) > 0]