From bd3f6e4a9bbae81ca972e81a0568cee16a3cb46e Mon Sep 17 00:00:00 2001 From: shaohuzhang1 Date: Wed, 10 Apr 2024 10:38:17 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20word=E5=88=86=E6=AE=B5=E6=94=AF=E6=8C=81?= =?UTF-8?q?=E8=A1=A8=E6=A0=BC=E6=95=B0=E6=8D=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/common/handle/impl/doc_split_handle.py | 28 +++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/apps/common/handle/impl/doc_split_handle.py b/apps/common/handle/impl/doc_split_handle.py index d3002b37a..d1f538936 100644 --- a/apps/common/handle/impl/doc_split_handle.py +++ b/apps/common/handle/impl/doc_split_handle.py @@ -11,6 +11,8 @@ import re from typing import List from docx import Document +from docx.table import Table +from docx.text.paragraph import Paragraph from common.handle.base_split_handle import BaseSplitHandle from common.util.split_model import SplitModel @@ -34,9 +36,31 @@ class DocSplitHandle(BaseSplitHandle): return paragraph.text return paragraph.text + @staticmethod + def table_to_md(table): + rows = table.rows + # 创建 Markdown 格式的表格 + md_table = '| ' + ' | '.join([cell.text.replace("\n", '
') for cell in rows[0].cells]) + ' |\n' + md_table += '| ' + ' | '.join(['---' for i in range(len(rows[0].cells))]) + ' |\n' + for row in rows[1:]: + md_table += '| ' + ' | '.join([cell.text.replace("\n", '
') for cell in row.cells]) + ' |\n' + return md_table + def to_md(self, doc): - ps = doc.paragraphs - return "\n".join([self.paragraph_to_md(para) for para in ps]) + elements = [] + for element in doc.element.body: + if element.tag.endswith('tbl'): + # 处理表格 + table = Table(element, doc) + elements.append(table) + elif element.tag.endswith('p'): + # 处理段落 + paragraph = Paragraph(element, doc) + elements.append(paragraph) + + return "\n".join( + [self.paragraph_to_md(element) if isinstance(element, Paragraph) else self.table_to_md(element) for element + in elements]) def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer): try: