fix: word分段支持表格数据

2025-12-26 01:33:05 +00:00 · 2024-04-10 10:38:17 +08:00 · 2024-04-10 10:38:17 +08:00 · bd3f6e4a9b
parent 4114d86a8c
commit bd3f6e4a9b
1 changed files with 26 additions and 2 deletions
--- a/apps/common/handle/impl/doc_split_handle.py
+++ b/apps/common/handle/impl/doc_split_handle.py
@@ -11,6 +11,8 @@ import re
 from typing import List

 from docx import Document
+from docx.table import Table
+from docx.text.paragraph import Paragraph

 from common.handle.base_split_handle import BaseSplitHandle
 from common.util.split_model import SplitModel
@@ -34,9 +36,31 @@ class DocSplitHandle(BaseSplitHandle):
            return paragraph.text
        return paragraph.text

+    @staticmethod
+    def table_to_md(table):
+        rows = table.rows
+        # 创建 Markdown 格式的表格
+        md_table = '| ' + ' | '.join([cell.text.replace("\n", '</br>') for cell in rows[0].cells]) + ' |\n'
+        md_table += '| ' + ' | '.join(['---' for i in range(len(rows[0].cells))]) + ' |\n'
+        for row in rows[1:]:
+            md_table += '| ' + ' | '.join([cell.text.replace("\n", '</br>') for cell in row.cells]) + ' |\n'
+        return md_table
+
    def to_md(self, doc):
-        ps = doc.paragraphs
-        return "\n".join([self.paragraph_to_md(para) for para in ps])
+        elements = []
+        for element in doc.element.body:
+            if element.tag.endswith('tbl'):
+                # 处理表格
+                table = Table(element, doc)
+                elements.append(table)
+            elif element.tag.endswith('p'):
+                # 处理段落
+                paragraph = Paragraph(element, doc)
+                elements.append(paragraph)
+
+        return "\n".join(
+            [self.paragraph_to_md(element) if isinstance(element, Paragraph) else self.table_to_md(element) for element
+             in elements])

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer):
        try: