MaxKB/apps/common/handle/impl/doc_split_handle.py
shaohuzhang1 fb7abb432f
Pr@main@fix bugs (#41)
* fix: 修复提示问题

* fix: 上传文档限制

* feat: 问题管理

* fix: 修改分段正则,优化分段逻辑

* feat: 问题管理

* fix: word分段支持表格数据

* fix: 问题批量插入去重

* fix: 修复文档问题

* feat: 文档分页优化

* fix: 优化关联问题

* fix: 嵌入样式
2024-04-10 14:16:56 +08:00

86 lines
3.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding=utf-8
"""
@project: maxkb
@Author
@file: doc_split_handle.py
@date: 2024/3/27 18:19
@desc:
"""
import io
import logging
import re
from typing import List

from docx import Document
from docx.table import Table
from docx.text.paragraph import Paragraph

from common.handle.base_split_handle import BaseSplitHandle
from common.util.split_model import SplitModel
# Default split patterns: one compiled regex per Markdown heading level,
# ordered from level 1 ("# ") to level 6 ("###### "). Each pattern matches a
# heading line that begins at the start of the text or right after a newline.
default_pattern_list = [
    # Level 1 has its own shape: no surrounding "#" look-around guards.
    re.compile('(?<=^)# .*|(?<=\\n)# .*'),
    # Levels 2-6 share one template; only the run of "#" characters differs.
    # The look-arounds reject a "#" directly before or after the marker.
    *(
        re.compile(
            '(?<=\\n)(?<!#){0} (?!#).*|(?<=^)(?<!#){0} (?!#).*'.format('#' * depth)
        )
        for depth in range(2, 7)
    ),
]
class DocSplitHandle(BaseSplitHandle):
    """Split handler for Word documents.

    The document body is rendered to Markdown-style text (headings become
    ``#`` lines, tables become pipe tables) and then split into sections by
    ``SplitModel``.
    """

    @staticmethod
    def paragraph_to_md(paragraph):
        """Render one paragraph as a line of Markdown-style text.

        A paragraph whose style name is ``Heading <n>`` is prefixed with
        ``n`` ``#`` characters and a space. Any other paragraph, or one whose
        style cannot be read or parsed, is returned as its plain text.

        :param paragraph: object exposing ``style.name`` and ``text``
        :return: the rendered text
        """
        try:
            style_name = paragraph.style.name
            if style_name.startswith('Heading'):
                level = int(style_name.replace("Heading ", ''))
                return "#" * level + " " + paragraph.text
        except Exception:
            # Best effort: a missing style or a non-numeric heading suffix
            # simply means the paragraph is emitted as ordinary text.
            return paragraph.text
        return paragraph.text

    @staticmethod
    def table_to_md(table):
        """Render a table as a Markdown pipe table.

        The first row is used as the header row. Newlines inside a cell are
        replaced with ``</br>`` so that every table row stays on one line.

        :param table: object exposing ``rows``; each row exposes ``cells`` and
            each cell exposes ``text``
        :return: the Markdown table text, or ``''`` when the table has no rows
        """
        rows = list(table.rows)
        if not rows:
            # Without this guard rows[0] raises IndexError, which would abort
            # the conversion of the whole document.
            return ''

        def row_to_md(row):
            cells = [cell.text.replace("\n", '</br>') for cell in row.cells]
            return '| ' + ' | '.join(cells) + ' |\n'

        header = rows[0]
        # Build the Markdown table: header, separator, then the data rows.
        lines = [row_to_md(header),
                 '| ' + ' | '.join(['---'] * len(header.cells)) + ' |\n']
        lines.extend(row_to_md(row) for row in rows[1:])
        return ''.join(lines)

    def to_md(self, doc):
        """Convert the document body to Markdown-style text.

        Body elements are visited in document order. Only elements whose tag
        ends with ``tbl`` (tables) or ``p`` (paragraphs) are rendered; all
        other elements are skipped. Rendered pieces are joined with newlines.

        :param doc: a loaded document exposing ``element.body``
        :return: the rendered text
        """
        parts = []
        for element in doc.element.body:
            if element.tag.endswith('tbl'):
                # Table element
                parts.append(self.table_to_md(Table(element, doc)))
            elif element.tag.endswith('p'):
                # Paragraph element
                parts.append(self.paragraph_to_md(Paragraph(element, doc)))
        return "\n".join(parts)

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer):
        """Read a Word file and split its content into sections.

        :param file: uploaded file object; ``file.name`` is used in the result
        :param pattern_list: split patterns; when ``None`` or empty,
            ``default_pattern_list`` is used instead
        :param with_filter: passed through to ``SplitModel``
        :param limit: passed through to ``SplitModel``
        :param get_buffer: callable returning the raw bytes of ``file``
        :return: ``{'name': ..., 'content': ...}`` where ``content`` is the
            result of ``SplitModel.parse``, or ``[]`` when the document could
            not be read or converted
        """
        try:
            buffer = get_buffer(file)
            doc = Document(io.BytesIO(buffer))
            content = self.to_md(doc)
            if pattern_list:
                split_model = SplitModel(pattern_list, with_filter, limit)
            else:
                split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
        except Exception:
            # Catch Exception rather than BaseException so KeyboardInterrupt
            # and SystemExit still propagate. The failure is logged because
            # the caller only ever sees an empty result.
            logging.getLogger(__name__).exception("Failed to parse Word document %s", file.name)
            return {'name': file.name,
                    'content': []}
        return {'name': file.name,
                'content': split_model.parse(content)
                }

    def support(self, file, get_buffer):
        """Report whether this handler accepts the file, judged by extension.

        :param file: uploaded file object; only ``file.name`` is inspected
        :param get_buffer: unused here
        :return: ``True`` for names ending in ``.docx`` or ``.doc``
            (case-insensitive), otherwise ``False``
        """
        # NOTE(review): python-docx is documented for .docx files; a legacy
        # binary .doc presumably fails in handle() and yields empty content.
        # Confirm whether accepting ".doc" here is intended.
        file_name: str = file.name.lower()
        return file_name.endswith((".docx", ".doc"))