From 765c79ed9d5190f8908f70e58a1d0e2111b93591 Mon Sep 17 00:00:00 2001 From: shaohuzhang1 Date: Tue, 9 Apr 2024 18:05:50 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E6=94=B9=E5=88=86=E6=AE=B5?= =?UTF-8?q?=E6=AD=A3=E5=88=99,=E4=BC=98=E5=8C=96=E5=88=86=E6=AE=B5?= =?UTF-8?q?=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/common/handle/impl/doc_split_handle.py | 10 +++-- apps/common/handle/impl/pdf_split_handle.py | 11 +++-- apps/common/handle/impl/text_split_handle.py | 10 +++-- apps/common/util/split_model.py | 42 +++++++++++++------ .../serializers/document_serializers.py | 10 +++-- 5 files changed, 55 insertions(+), 28 deletions(-) diff --git a/apps/common/handle/impl/doc_split_handle.py b/apps/common/handle/impl/doc_split_handle.py index 4bb70896c..d3002b37a 100644 --- a/apps/common/handle/impl/doc_split_handle.py +++ b/apps/common/handle/impl/doc_split_handle.py @@ -15,10 +15,12 @@ from docx import Document from common.handle.base_split_handle import BaseSplitHandle from common.util.split_model import SplitModel -default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'), re.compile('(? 0: - level_content_list = [*level_content_list, *list( - map(lambda row: to_tree_obj(row, 'block'), - post_handler_paragraph(other_content, with_filter=self.with_filter, limit=self.limit)))] + children = self.parse_to_tree(text=other_content, + index=index) + if len(children) > 0: + level_content_list = [*level_content_list, *children] + else: + if len(other_content.strip()) > 0: + level_content_list = [*level_content_list, *list( + map(lambda row: to_tree_obj(row, 'block'), + post_handler_paragraph(other_content, with_filter=self.with_filter, limit=self.limit)))] else: if len(text.strip()) > 0: level_content_list = [*level_content_list, *list( @@ -330,15 +335,16 @@ class SplitModel: :param text: 文本数据 :return: 解析后数据 {content:段落数据,keywords:[‘段落关键词’],parent_chain:['段落父级链路']} """ - result_tree = self.parse_to_tree(text.replace('\r', '\n'), 0) + text = text.replace('\r', '\n') + result_tree = self.parse_to_tree(text, 0) result = result_tree_to_paragraph(result_tree, [], []) - # 过滤段落内容不为空字符串的数据 - result = [item for item in result if 'content' in item and len(item.get('content').strip()) > 0] - return [self.post_reset_paragraph(item) for item in result] + return [item for item in [self.post_reset_paragraph(row) for row in result] if + 'content' in item and len(item.get('content').strip()) > 0] def post_reset_paragraph(self, paragraph: Dict): result = self.filter_title_special_characters(paragraph) result = self.sub_title(result) + result = self.content_is_null(result) return result @staticmethod @@ -349,6 +355,15 @@ class SplitModel: return {**paragraph, 'title': title[0:255], 'content': title[255:len(title)] + paragraph.get('content')} return paragraph + @staticmethod + def content_is_null(paragraph: Dict): + if 'title' in paragraph: + title = paragraph.get('title') + content = paragraph.get('content') + if (content is None or len(content.strip()) == 0) and (title is not None and len(title) > 0): + return {'title': '', 'content': title} + return paragraph + @staticmethod def filter_title_special_characters(paragraph: Dict): title = paragraph.get('title') if 'title' in paragraph else '' @@ -361,9 +376,12 @@ class SplitModel: title_special_characters_list = ['#', '\n', '\r', '\\s'] default_split_pattern = { - 'md': [re.compile('(?<=^)# .*|(?<=\\n)# .*'), re.compile('(?