From df172b530c440804fa8fb7813bc1c798caeba5ee Mon Sep 17 00:00:00 2001 From: shaohuzhang1 Date: Mon, 26 Aug 2024 14:15:05 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E4=B8=8A=E4=BC=A0PDF?= =?UTF-8?q?=E6=96=87=E4=BB=B6=E6=99=BA=E8=83=BD=E5=88=86=E6=AE=B5=E6=97=B6?= =?UTF-8?q?=E6=8F=90=E7=A4=BA=20=E5=88=86=E6=AE=B5=E5=86=85=E5=AE=B9?= =?UTF-8?q?=E4=B8=8D=E8=83=BD=E8=B6=85=E8=BF=87102400=E4=B8=AA=E5=AD=97?= =?UTF-8?q?=E7=AC=A6=20#998?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/common/util/split_model.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/apps/common/util/split_model.py b/apps/common/util/split_model.py index e347b60c1..0e7bcd5e1 100644 --- a/apps/common/util/split_model.py +++ b/apps/common/util/split_model.py @@ -246,11 +246,15 @@ def post_handler_paragraph(content: str, limit: int): while (pos := content.find("\n", start)) != -1: split, start = content[start:pos + 1], pos + 1 if len(temp_char + split) > limit: + if len(temp_char) > 4096: + pass result.append(temp_char) temp_char = '' temp_char = temp_char + split temp_char = temp_char + content[start:] if len(temp_char) > 0: + if len(temp_char) > 4096: + pass result.append(temp_char) pattern = "[\\S\\s]{1," + str(limit) + '}' @@ -298,7 +302,7 @@ class SplitModel: """ level_content_list = parse_title_level(text, self.content_level_pattern, index) if len(level_content_list) == 0: - return list(map(lambda row: to_tree_obj(row, 'block'), post_handler_paragraph(text, limit=self.limit))) + return [to_tree_obj(row, 'block') for row in post_handler_paragraph(text, limit=self.limit)] if index == 0 and text.lstrip().index(level_content_list[0]["content"].lstrip()) != 0: level_content_list.insert(0, to_tree_obj("")) @@ -307,7 +311,9 @@ class SplitModel: for i in range(len(level_title_content_list)): start_content: str = level_title_content_list[i].get('content') if cursor < text.index(start_content, cursor): - level_content_list.insert(0, to_tree_obj(text[cursor: text.index(start_content, cursor)], 'block')) + for row in post_handler_paragraph(text[cursor: text.index(start_content, cursor)], limit=self.limit): + level_content_list.insert(0, to_tree_obj(row, 'block')) + block, cursor = get_level_block(text, level_title_content_list, i, cursor) if len(block) == 0: continue @@ -330,6 +336,9 @@ class SplitModel: text = text.replace("\0", '') result_tree = self.parse_to_tree(text, 0) result = result_tree_to_paragraph(result_tree, [], [], self.with_filter) + for e in result: + if len(e['content']) > 4096: + pass return [item for item in [self.post_reset_paragraph(row) for row in result] if 'content' in item and len(item.get('content').strip()) > 0]