mirror of
https://github.com/1Panel-dev/MaxKB.git
synced 2025-12-26 01:33:05 +00:00
fix: word分段支持表格数据
This commit is contained in:
parent
4114d86a8c
commit
bd3f6e4a9b
|
|
@ -11,6 +11,8 @@ import re
|
|||
from typing import List
|
||||
|
||||
from docx import Document
|
||||
from docx.table import Table
|
||||
from docx.text.paragraph import Paragraph
|
||||
|
||||
from common.handle.base_split_handle import BaseSplitHandle
|
||||
from common.util.split_model import SplitModel
|
||||
|
|
@ -34,9 +36,31 @@ class DocSplitHandle(BaseSplitHandle):
|
|||
return paragraph.text
|
||||
return paragraph.text
|
||||
|
||||
@staticmethod
def table_to_md(table):
    """
    Render a table as a Markdown table.

    The first row becomes the header row, followed by a ``---`` separator
    row with one column per header cell, then one line per remaining row.
    Line breaks inside a cell are replaced with ``</br>`` so every table row
    stays on a single line, and literal ``|`` characters are escaped so they
    are not read as column separators.

    :param table: object exposing ``rows``; each row exposes ``cells`` and
                  each cell exposes ``text``
    :return: the Markdown text, every row terminated by a newline; an empty
             string when the table has no rows
    """
    rows = list(table.rows)
    # A table without rows has no header to render; indexing rows[0] would fail.
    if not rows:
        return ''

    def cell_to_md(cell):
        # Escape the pipe first, then flatten line breaks.
        return cell.text.replace('|', '\\|').replace("\n", '</br>')

    def line(values):
        return '| ' + ' | '.join(values) + ' |\n'

    header = [cell_to_md(cell) for cell in rows[0].cells]
    lines = [line(header), line(['---'] * len(header))]
    lines.extend(line([cell_to_md(cell) for cell in row.cells]) for row in rows[1:])
    return ''.join(lines)
|
||||
|
||||
def to_md(self, doc):
    """
    Convert a document to Markdown, keeping paragraphs and tables in the
    order they appear in the document body.

    Each child of ``doc.element.body`` is inspected: table elements are
    rendered with ``table_to_md`` and paragraph elements with
    ``paragraph_to_md``. Any other child element is skipped.

    :param doc: document object exposing ``element.body``
    :return: the rendered blocks joined with a newline
    """
    blocks = []
    for element in doc.element.body:
        tag = element.tag
        # Nodes such as XML comments do not carry a string tag; skip them.
        if not isinstance(tag, str):
            continue
        # Compare the local name only, ignoring the "{namespace}" prefix.
        local_name = tag.rsplit('}', 1)[-1]
        if local_name == 'tbl':
            # Table
            blocks.append(self.table_to_md(Table(element, doc)))
        elif local_name == 'p':
            # Paragraph
            blocks.append(self.paragraph_to_md(Paragraph(element, doc)))
    return "\n".join(blocks)
|
||||
|
||||
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer):
|
||||
try:
|
||||
|
|
|
|||
Loading…
Reference in New Issue