MaxKB/apps/common/handle/impl/table/xlsx_parse_table_handle.py
2024-09-12 10:49:31 +08:00

58 lines
2.1 KiB
Python

# coding=utf-8
import io
import logging
from openpyxl import load_workbook
from common.handle.base_parse_table_handle import BaseParseTableHandle
from common.handle.impl.tools import xlsx_embed_cells_images
# Module-level logger registered under the name "max_kb"; used below to report
# workbook parsing failures.
max_kb = logging.getLogger("max_kb")
class XlsxSplitHandle(BaseParseTableHandle):
    """Table handler that turns an ``.xlsx`` workbook into paragraphs.

    Every worksheet becomes one ``{'name': ..., 'paragraphs': [...]}`` entry.
    The first row of a sheet is treated as the header row; each following row
    becomes one paragraph whose content is ``"header: value"`` pairs joined
    with ``"; "``.
    """

    def support(self, file, get_buffer):
        """Return ``True`` when the file name ends with ``.xlsx``.

        The comparison is case-insensitive. ``get_buffer`` is accepted for
        interface compatibility and is not used here.
        """
        return file.name.lower().endswith('.xlsx')

    @staticmethod
    def _row_to_text(row, headers, image_dict):
        """Render one worksheet row as ``"header: value; header: value"``.

        Cells whose value is ``None`` or an empty string are skipped; other
        falsy values such as ``0`` or ``False`` are real data and are kept.
        A cell whose text is a key of ``image_dict`` is replaced by a markdown
        image link built from that image's ``id``.
        """
        parts = []
        for index, cell in enumerate(row):
            value = cell.value
            if value is None or value == "":
                continue
            content = str(value)
            image = image_dict.get(content)
            if image is not None:
                content = f'![](/api/image/{image.id})'
            # Rows may be wider than the header row; such cells get no label.
            header = headers[index] if index < len(headers) else ""
            parts.append(f"{header}: {content}" if header else content)
        return "; ".join(parts)

    def handle(self, file, get_buffer, save_image):
        """Parse the workbook and return one entry per non-empty worksheet.

        :param file: uploaded file object; only ``file.name`` is read here.
        :param get_buffer: callable returning the raw bytes of ``file``.
        :param save_image: callable that receives the list of images
            extracted from the workbook.
        :return: list of ``{'name': sheet_name, 'paragraphs': [...]}`` dicts,
            where each paragraph is ``{'title': '', 'content': text}``. If the
            workbook cannot be parsed, a single entry named after the file
            with no paragraphs is returned instead of raising.
        """
        buffer = get_buffer(file)
        try:
            wb = load_workbook(io.BytesIO(buffer))
            try:
                image_dict: dict = xlsx_embed_cells_images(io.BytesIO(buffer))
                save_image(list(image_dict.values()))
            except Exception as e:
                # Image extraction is best-effort: fall back to text-only
                # output, but leave a trace instead of failing silently.
                max_kb.warning(f'excel split handle image extraction error: {e}')
                image_dict = {}

            result = []
            for sheetname in wb.sheetnames:
                rows = list(wb[sheetname].rows)
                if not rows:
                    continue
                # A blank header cell must not become the literal "None".
                headers = ["" if cell.value is None else str(cell.value)
                           for cell in rows[0]]
                # Default sheet names ("Sheet1", ...) carry no meaning, so the
                # sheet name is only appended when it looks user-chosen.
                suffix = "" if "sheet" in sheetname.lower() else " ——" + sheetname
                paragraphs = []
                for row in rows[1:]:
                    text = self._row_to_text(row, headers, image_dict)
                    if not text:
                        # Entirely blank row: nothing worth indexing.
                        continue
                    paragraphs.append({'title': '', 'content': text + suffix})
                result.append({'name': sheetname, 'paragraphs': paragraphs})
        except Exception as e:
            # Catch Exception rather than BaseException so KeyboardInterrupt
            # and SystemExit still propagate.
            max_kb.error(f'excel split handle error: {e}', exc_info=True)
            return [{'name': file.name, 'paragraphs': []}]
        return result