From 5ec94860b2881d6a4657481a9cd37b6d6c8a1a4c Mon Sep 17 00:00:00 2001 From: shaohuzhang1 <80892890+shaohuzhang1@users.noreply.github.com> Date: Wed, 19 Mar 2025 12:04:43 +0800 Subject: [PATCH] perf: Enhance Word parsing (#2612) --- apps/common/handle/impl/doc_split_handle.py | 39 +++++++++++++++++---- apps/common/util/split_model.py | 12 ++++--- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/apps/common/handle/impl/doc_split_handle.py b/apps/common/handle/impl/doc_split_handle.py index 753e74fc4..1df7b6a66 100644 --- a/apps/common/handle/impl/doc_split_handle.py +++ b/apps/common/handle/impl/doc_split_handle.py @@ -110,24 +110,51 @@ def get_image_id_func(): return get_image_id +title_font_list = [ + [36, 100], + [26, 36], + [24, 26], + [22, 24], + [18, 22], + [16, 18] +] + + +def get_title_level(paragraph: Paragraph): + try: + if paragraph.style is not None: + psn = paragraph.style.name + if psn.startswith('Heading') or psn.startswith('TOC 标题') or psn.startswith('标题'): + return int(psn.replace("Heading ", '').replace('TOC 标题', '').replace('标题', + '')) + if len(paragraph.runs) == 1: + font_size = paragraph.runs[0].font.size + pt = font_size.pt + if pt >= 16: + for _value, index in zip(title_font_list, range(len(title_font_list))): + if pt >= _value[0] and pt < _value[1]: + return index + 1 + except Exception as e: + pass + return None + + class DocSplitHandle(BaseSplitHandle): @staticmethod def paragraph_to_md(paragraph: Paragraph, doc: Document, images_list, get_image_id): try: - psn = paragraph.style.name - if psn.startswith('Heading') or psn.startswith('TOC 标题') or psn.startswith('标题'): - title = "".join(["#" for i in range( - int(psn.replace("Heading ", '').replace('TOC 标题', '').replace('标题', - '')))]) + " " + paragraph.text + title_level = get_title_level(paragraph) + if title_level is not None: + title = "".join(["#" for i in range(title_level)]) + " " + paragraph.text images = reduce(lambda x, y: [*x, *y], [get_paragraph_element_images(e, doc, images_list, get_image_id) for e in paragraph._element], []) - if len(images) > 0: return title + '\n' + images_to_string(images, doc, images_list, get_image_id) if len( paragraph.text) > 0 else images_to_string(images, doc, images_list, get_image_id) return title + except Exception as e: traceback.print_exc() return paragraph.text diff --git a/apps/common/util/split_model.py b/apps/common/util/split_model.py index 0e7bcd5e1..5194bffea 100644 --- a/apps/common/util/split_model.py +++ b/apps/common/util/split_model.py @@ -339,13 +339,14 @@ class SplitModel: for e in result: if len(e['content']) > 4096: pass - return [item for item in [self.post_reset_paragraph(row) for row in result] if + title_list = list(set([row.get('title') for row in result])) + return [item for item in [self.post_reset_paragraph(row, title_list) for row in result] if 'content' in item and len(item.get('content').strip()) > 0] - def post_reset_paragraph(self, paragraph: Dict): + def post_reset_paragraph(self, paragraph: Dict, title_list: List[str]): result = self.filter_title_special_characters(paragraph) result = self.sub_title(result) - result = self.content_is_null(result) + result = self.content_is_null(result, title_list) return result @staticmethod @@ -357,11 +358,14 @@ class SplitModel: return paragraph @staticmethod - def content_is_null(paragraph: Dict): + def content_is_null(paragraph: Dict, title_list: List[str]): if 'title' in paragraph: title = paragraph.get('title') content = paragraph.get('content') if (content is None or len(content.strip()) == 0) and (title is not None and len(title) > 0): + find = [t for t in title_list if t.__contains__(title) and t != title] + if find: + return {'title': '', 'content': ''} return {'title': '', 'content': title} return paragraph