fix: word分段支持表格数据

This commit is contained in:
shaohuzhang1 2024-04-10 10:38:17 +08:00
parent 4114d86a8c
commit bd3f6e4a9b

View File

@@ -11,6 +11,8 @@ import re
from typing import List
from docx import Document
from docx.table import Table
from docx.text.paragraph import Paragraph
from common.handle.base_split_handle import BaseSplitHandle
from common.util.split_model import SplitModel
@@ -34,9 +36,31 @@ class DocSplitHandle(BaseSplitHandle):
return paragraph.text
return paragraph.text
@staticmethod
def table_to_md(table):
    """Render a table as a Markdown pipe table.

    The first row is emitted as the header, followed by a ``---`` separator
    row with one column per header cell, then every remaining row as a body
    row. Newlines inside a cell's text are replaced with ``</br>`` so that
    each table row stays on a single output line.

    :param table: object exposing ``rows``; each row exposes ``cells`` and
        each cell exposes ``text``
    :return: the Markdown text, every row terminated by a newline; an empty
        string when the table has no rows
    """
    # Materialise once so indexing and slicing below work on a plain list.
    rows = list(table.rows)
    if not rows:
        # Without this guard ``rows[0]`` would raise IndexError.
        return ''

    def render_row(cells):
        # One Markdown row: "| a | b |" plus the trailing newline.
        return '| ' + ' | '.join(cell.text.replace("\n", '</br>') for cell in cells) + ' |\n'

    # Read the header cells once; they drive both the header and the separator.
    header_cells = rows[0].cells
    lines = [
        render_row(header_cells),
        '| ' + ' | '.join(['---'] * len(header_cells)) + ' |\n',
    ]
    lines.extend(render_row(row.cells) for row in rows[1:])
    return ''.join(lines)
def to_md(self, doc):
    """Convert the document body to Markdown, keeping paragraphs and tables in order.

    Walks the direct children of ``doc.element.body``. Each table element is
    wrapped in ``Table`` and rendered with ``self.table_to_md``; each
    paragraph element is wrapped in ``Paragraph`` and rendered with
    ``self.paragraph_to_md``. Any other child element is skipped.

    :param doc: document object exposing ``element.body``
    :return: the rendered pieces joined with a newline
    """
    rendered = []
    for element in doc.element.body:
        tag = element.tag
        if not isinstance(tag, str):
            # Presumably an XML comment or processing instruction, whose
            # tag is not a string; there is nothing to render for it.
            continue
        # Tags are namespace-qualified ("{uri}local"); compare the local
        # name exactly rather than by suffix.
        local_name = tag.rsplit('}', 1)[-1]
        if local_name == 'tbl':
            rendered.append(self.table_to_md(Table(element, doc)))
        elif local_name == 'p':
            rendered.append(self.paragraph_to_md(Paragraph(element, doc)))
    return "\n".join(rendered)
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer):
try: