From 5ec94860b2881d6a4657481a9cd37b6d6c8a1a4c Mon Sep 17 00:00:00 2001
From: shaohuzhang1 <80892890+shaohuzhang1@users.noreply.github.com>
Date: Wed, 19 Mar 2025 12:04:43 +0800
Subject: [PATCH] perf: Enhance Word parsing (#2612)

---
 apps/common/handle/impl/doc_split_handle.py | 39 +++++++++++++++++----
 apps/common/util/split_model.py             | 12 ++++---
 2 files changed, 41 insertions(+), 10 deletions(-)

diff --git a/apps/common/handle/impl/doc_split_handle.py b/apps/common/handle/impl/doc_split_handle.py
index 753e74fc4..1df7b6a66 100644
--- a/apps/common/handle/impl/doc_split_handle.py
+++ b/apps/common/handle/impl/doc_split_handle.py
@@ -110,24 +110,51 @@ def get_image_id_func():
     return get_image_id
 
 
+title_font_list = [  # [min_pt, max_pt) font-size bands; a band's 1-based position is the title level
+    [36, 100],  # level 1
+    [26, 36],  # level 2
+    [24, 26],  # level 3
+    [22, 24],  # level 4
+    [18, 22],  # level 5
+    [16, 18]  # level 6
+]
+
+
+def get_title_level(paragraph: Paragraph):
+    """Return the title level of ``paragraph`` (style name first, then font size), else None."""
+    try:
+        if paragraph.style is not None:
+            psn = paragraph.style.name
+            if psn.startswith(('Heading', 'TOC 标题', '标题')):
+                # What is left after dropping the style prefix is the level number.
+                return int(psn.replace("Heading ", '').replace('TOC 标题', '').replace('标题', ''))
+        if len(paragraph.runs) == 1:
+            pt = paragraph.runs[0].font.size.pt
+            for level, (low, high) in enumerate(title_font_list, start=1):
+                if low <= pt < high:
+                    return level
+    except Exception:
+        # Best effort: anything unparsable (e.g. a non-numeric style suffix) means "not a title".
+        pass
+    return None
+
+
 class DocSplitHandle(BaseSplitHandle):
     @staticmethod
     def paragraph_to_md(paragraph: Paragraph, doc: Document, images_list, get_image_id):
         try:
-            psn = paragraph.style.name
-            if psn.startswith('Heading') or psn.startswith('TOC 标题') or psn.startswith('标题'):
-                title = "".join(["#" for i in range(
-                    int(psn.replace("Heading ", '').replace('TOC 标题', '').replace('标题',
-                                                                                    '')))]) + " " + paragraph.text
+            title_level = get_title_level(paragraph)
+            if title_level is not None:
+                title = "".join(["#" for i in range(title_level)]) + " " + paragraph.text
                 images = reduce(lambda x, y: [*x, *y],
                                 [get_paragraph_element_images(e, doc, images_list, get_image_id) for e in
                                  paragraph._element],
                                 [])
-
                 if len(images) > 0:
                     return title + '\n' + images_to_string(images, doc, images_list, get_image_id) if len(
                         paragraph.text) > 0 else images_to_string(images, doc, images_list, get_image_id)
                 return title
+
         except Exception as e:
             traceback.print_exc()
             return paragraph.text
diff --git a/apps/common/util/split_model.py b/apps/common/util/split_model.py
index 0e7bcd5e1..5194bffea 100644
--- a/apps/common/util/split_model.py
+++ b/apps/common/util/split_model.py
@@ -339,13 +339,14 @@ class SplitModel:
         for e in result:
             if len(e['content']) > 4096:
                 pass
-        return [item for item in [self.post_reset_paragraph(row) for row in result] if
+        title_list = list(set([row.get('title') for row in result]))
+        return [item for item in [self.post_reset_paragraph(row, title_list) for row in result] if
                 'content' in item and len(item.get('content').strip()) > 0]
 
-    def post_reset_paragraph(self, paragraph: Dict):
+    def post_reset_paragraph(self, paragraph: Dict, title_list: List[str]):
         result = self.filter_title_special_characters(paragraph)
         result = self.sub_title(result)
-        result = self.content_is_null(result)
+        result = self.content_is_null(result, title_list)  # title_list is matched against this row's title
         return result
 
     @staticmethod
@@ -357,11 +358,14 @@ class SplitModel:
         return paragraph
 
     @staticmethod
-    def content_is_null(paragraph: Dict):
+    def content_is_null(paragraph: Dict, title_list: List[str]):
         if 'title' in paragraph:
             title = paragraph.get('title')
             content = paragraph.get('content')
             if (content is None or len(content.strip()) == 0) and (title is not None and len(title) > 0):
+                # Drop a content-less title that is only a fragment of another, longer title.
+                if any(t is not None and title in t and t != title for t in title_list):
+                    return {'title': '', 'content': ''}
                 return {'title': '', 'content': title}
         return paragraph