mirror of
https://github.com/1Panel-dev/MaxKB.git
synced 2025-12-26 01:33:05 +00:00
feat: 上传文档表格对支持xlsx文件单元格图片
This commit is contained in:
parent
bec2ed8067
commit
b924958176
|
|
@ -15,5 +15,5 @@ class BaseParseTableHandle(ABC):
|
|||
pass
|
||||
|
||||
@abstractmethod
|
||||
def handle(self, file, get_buffer):
|
||||
def handle(self, file, get_buffer,save_image):
|
||||
pass
|
||||
|
|
|
|||
|
|
@ -7,114 +7,11 @@
|
|||
@desc:
|
||||
"""
|
||||
import io
|
||||
import uuid
|
||||
from functools import reduce
|
||||
from io import BytesIO
|
||||
from xml.etree.ElementTree import fromstring
|
||||
from zipfile import ZipFile
|
||||
|
||||
import openpyxl
|
||||
from PIL import Image as PILImage
|
||||
from openpyxl.drawing.image import Image as openpyxl_Image
|
||||
from openpyxl.packaging.relationship import get_rels_path, get_dependents
|
||||
from openpyxl.xml.constants import SHEET_DRAWING_NS, REL_NS, SHEET_MAIN_NS
|
||||
|
||||
from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value
|
||||
from dataset.models import Image
|
||||
|
||||
|
||||
def parse_element(element) -> dict:
    """Map each picture name in a cell-images XML tree to its relationship id.

    Searches ``element`` for nodes that have an ``nvPicPr`` child in the
    spreadsheet-drawing namespace. For every such node the ``name`` attribute
    of the first child of ``nvPicPr`` becomes the key, and the relationship
    ``embed`` attribute of the first child of ``blipFill`` becomes the value.

    :param element: root XML element, e.g. the result of ``fromstring``.
    :return: dict of picture name -> embed relationship id. The value is ""
        when the picture node has no ``blipFill`` child; nodes whose name is
        empty are left out.
    :raises IndexError: if ``nvPicPr`` or ``blipFill`` has no child element.
    :raises KeyError: if the expected ``name`` / ``embed`` attribute is absent.
    """
    data = {}
    xdr_namespace = "{%s}" % SHEET_DRAWING_NS
    nv_pic_tag = xdr_namespace + "nvPicPr"
    blip_fill_tag = xdr_namespace + "blipFill"
    # Loop-invariant: fully qualified name of the relationship "embed" attribute.
    rel_embed = "{%s}embed" % REL_NS
    for target in level_order_traversal(element, nv_pic_tag):
        cNvPr = embed = ""
        for child in target:
            if child.tag == nv_pic_tag:
                cNvPr = child[0].attrib["name"]
            elif child.tag == blip_fill_tag:
                embed = child[0].attrib[rel_embed]
        # Only named pictures can be referenced, so unnamed ones are dropped.
        if cNvPr:
            data[cNvPr] = embed
    return data
|
||||
|
||||
|
||||
def parse_element_sheet_xml(element) -> list:
    """Collect the text of every formula (``f``) element found in a sheet XML tree.

    :param element: root XML element of a worksheet, e.g. from ``fromstring``.
    :return: list of formula strings in traversal order. ``f`` elements
        without text are skipped so that callers can safely run substring
        checks on every entry.
    """
    data = []
    formula_tag = "{%s}" % SHEET_MAIN_NS + "f"
    for target in level_order_traversal(element, formula_tag):
        for child in target:
            # An <f> element may carry no text; a None entry would make a
            # later ``key in text`` check raise TypeError.
            if child.tag == formula_tag and child.text is not None:
                data.append(child.text)
    return data
|
||||
|
||||
|
||||
def level_order_traversal(root, flag: str) -> list:
    """Breadth-first search for nodes that have a direct child tagged ``flag``.

    A node that matches is recorded and its subtree is not searched any
    further; children of non-matching nodes are queued for inspection.

    :param root: XML element to start from (it is itself a candidate).
    :param flag: fully qualified tag name to look for among direct children.
    :return: matching nodes in level order; empty list when nothing matches.
    """
    # Function-scope import keeps this block self-contained.
    from collections import deque

    # deque gives O(1) pops from the left; list.pop(0) is O(n) per pop.
    queue = deque([root])
    targets = []
    while queue:
        node = queue.popleft()
        if any(child.tag == flag for child in node):
            targets.append(node)
            continue
        queue.extend(node)
    return targets
|
||||
|
||||
|
||||
def handle_images(deps, archive: ZipFile) -> list:
    """Load the image behind every relationship in ``deps`` from ``archive``.

    Each successfully loaded image gets two extra attributes: ``embed`` (the
    relationship id) and ``target`` (the member path inside the archive).
    Entries that cannot be read or decoded are logged and skipped.

    :param deps: iterable of relationship objects exposing ``id`` and ``target``.
    :param archive: open zip archive the targets are read from.
    :return: list of loaded image objects; empty when ``deps`` is empty or
        nothing could be loaded.
    """
    # Function-scope import keeps this block self-contained.
    import logging

    # NOTE: the former ``if not PILImage`` guard was dropped. PILImage is
    # imported unconditionally as a module, so the guard could never trigger.
    images = []
    for dep in deps:
        try:
            image_io = archive.read(dep.target)
            image = openpyxl_Image(BytesIO(image_io))
        except Exception as e:
            # Best effort: one unreadable image must not abort the others.
            logging.getLogger("max_kb").warning(
                "Skipping unreadable cell image %s: %s", dep.target, e)
            continue
        image.embed = dep.id  # relationship id (rId) of the file
        image.target = dep.target  # path of the file inside the archive
        images.append(image)
    return images
|
||||
|
||||
|
||||
def xlsx_embed_cells_images(buffer) -> dict:
    """Extract images embedded in cells of an xlsx file, keyed by cell formula.

    Reads ``xl/cellimages.xml`` and its relationships to find the embedded
    pictures, collects the formulas of every ``xl/worksheets/sheet*`` member,
    and for each picture whose name occurs in a formula converts the picture
    to JPEG and wraps it in an ``Image`` model instance.

    :param buffer: file-like object (or path) accepted by ``ZipFile``.
    :return: dict mapping ``'=' + formula`` to the ``Image`` built for it.
        When several formulas contain the picture name, the last one wins.
    :raises KeyError: from ``ZipFile.read`` when the archive has no
        ``xl/cellimages.xml`` member.
    """
    result = {}
    # The context manager closes the archive even when parsing fails midway.
    with ZipFile(buffer) as archive:
        # Parse the relationships of cellimages.xml and load the images.
        deps = get_dependents(archive, get_rels_path("xl/cellimages.xml"))
        image_rel = handle_images(deps=deps, archive=archive)
        # Formulas of each worksheet, keyed by the text that follows "sheet"
        # in the member's file name (e.g. "1" for sheet1.xml).
        sheet_list = {}
        for item in archive.namelist():
            if not item.startswith('xl/worksheets/sheet'):
                continue
            key = item.split('/')[-1].split('.')[0].split('sheet')[-1]
            sheet_list[key] = parse_element_sheet_xml(fromstring(archive.read(item)))
        cell_images_xml = parse_element(fromstring(archive.read("xl/cellimages.xml")))
        cell_images_rel = {image.embed: image for image in image_rel}
        # Flatten all sheet formulas once instead of once per picture, and
        # drop None entries so the substring test below cannot raise.
        formulas = [formula for sheet in sheet_list.values()
                    for formula in sheet if formula is not None]
        for key, embed in cell_images_xml.items():
            img = cell_images_rel.get(embed)
            if img is None:
                # The image failed to load earlier; skip it, keep the others.
                continue
            image_excel_id_list = [formula for formula in formulas if key in formula]
            if not image_excel_id_list:
                continue
            image_excel_id = image_excel_id_list[-1]
            img_byte = io.BytesIO()
            with archive.open(img.target) as f:
                # JPEG has no alpha channel, hence the RGB conversion.
                PILImage.open(f).convert('RGB').save(img_byte, format='JPEG')
            image = Image(id=uuid.uuid1(), image=img_byte.getvalue(), image_name=img.path)
            result['=' + image_excel_id] = image
    return result
|
||||
from common.handle.impl.tools import xlsx_embed_cells_images
|
||||
|
||||
|
||||
def handle_sheet(file_name, sheet, image_dict):
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@ class CsvSplitHandle(BaseParseTableHandle):
|
|||
return True
|
||||
return False
|
||||
|
||||
def handle(self, file, get_buffer):
|
||||
def handle(self, file, get_buffer,save_image):
|
||||
buffer = get_buffer(file)
|
||||
try:
|
||||
content = buffer.decode(detect(buffer)['encoding'])
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ import logging
|
|||
from openpyxl import load_workbook
|
||||
|
||||
from common.handle.base_parse_table_handle import BaseParseTableHandle
|
||||
from common.handle.impl.tools import xlsx_embed_cells_images
|
||||
|
||||
max_kb = logging.getLogger("max_kb")
|
||||
|
||||
|
|
@ -16,10 +17,15 @@ class ExcelSplitHandle(BaseParseTableHandle):
|
|||
return True
|
||||
return False
|
||||
|
||||
def handle(self, file, get_buffer):
|
||||
def handle(self, file, get_buffer, save_image):
|
||||
buffer = get_buffer(file)
|
||||
try:
|
||||
wb = load_workbook(io.BytesIO(buffer))
|
||||
try:
|
||||
image_dict: dict = xlsx_embed_cells_images(io.BytesIO(buffer))
|
||||
save_image([item for item in image_dict.values()])
|
||||
except Exception as e:
|
||||
image_dict = {}
|
||||
result = []
|
||||
for sheetname in wb.sheetnames:
|
||||
paragraphs = []
|
||||
|
|
@ -35,7 +41,11 @@ class ExcelSplitHandle(BaseParseTableHandle):
|
|||
continue
|
||||
t = str(ti[i].value) if i < len(ti) else ""
|
||||
title.append(t)
|
||||
t += (": " if t else "") + str(c.value)
|
||||
content = str(c.value)
|
||||
image = image_dict.get(content, None)
|
||||
if image is not None:
|
||||
content = f''
|
||||
t += (": " if t else "") + content
|
||||
l.append(t)
|
||||
l = "; ".join(l)
|
||||
if sheetname.lower().find("sheet") < 0:
|
||||
|
|
|
|||
|
|
@ -0,0 +1,118 @@
|
|||
# coding=utf-8
|
||||
"""
|
||||
@project: MaxKB
|
||||
@Author:虎
|
||||
@file: tools.py
|
||||
@date:2024/9/11 16:41
|
||||
@desc:
|
||||
"""
|
||||
import io
|
||||
import uuid
|
||||
from functools import reduce
|
||||
from io import BytesIO
|
||||
from xml.etree.ElementTree import fromstring
|
||||
from zipfile import ZipFile
|
||||
|
||||
from PIL import Image as PILImage
|
||||
from openpyxl.drawing.image import Image as openpyxl_Image
|
||||
from openpyxl.packaging.relationship import get_rels_path, get_dependents
|
||||
from openpyxl.xml.constants import SHEET_DRAWING_NS, REL_NS, SHEET_MAIN_NS
|
||||
|
||||
from common.handle.base_parse_qa_handle import get_title_row_index_dict, get_row_value
|
||||
from dataset.models import Image
|
||||
|
||||
|
||||
def parse_element(element) -> dict:
    """Map each picture name in a cell-images XML tree to its relationship id.

    Searches ``element`` for nodes that have an ``nvPicPr`` child in the
    spreadsheet-drawing namespace. For every such node the ``name`` attribute
    of the first child of ``nvPicPr`` becomes the key, and the relationship
    ``embed`` attribute of the first child of ``blipFill`` becomes the value.

    :param element: root XML element, e.g. the result of ``fromstring``.
    :return: dict of picture name -> embed relationship id. The value is ""
        when the picture node has no ``blipFill`` child; nodes whose name is
        empty are left out.
    :raises IndexError: if ``nvPicPr`` or ``blipFill`` has no child element.
    :raises KeyError: if the expected ``name`` / ``embed`` attribute is absent.
    """
    data = {}
    xdr_namespace = "{%s}" % SHEET_DRAWING_NS
    nv_pic_tag = xdr_namespace + "nvPicPr"
    blip_fill_tag = xdr_namespace + "blipFill"
    # Loop-invariant: fully qualified name of the relationship "embed" attribute.
    rel_embed = "{%s}embed" % REL_NS
    for target in level_order_traversal(element, nv_pic_tag):
        cNvPr = embed = ""
        for child in target:
            if child.tag == nv_pic_tag:
                cNvPr = child[0].attrib["name"]
            elif child.tag == blip_fill_tag:
                embed = child[0].attrib[rel_embed]
        # Only named pictures can be referenced, so unnamed ones are dropped.
        if cNvPr:
            data[cNvPr] = embed
    return data
|
||||
|
||||
|
||||
def parse_element_sheet_xml(element) -> list:
    """Collect the text of every formula (``f``) element found in a sheet XML tree.

    :param element: root XML element of a worksheet, e.g. from ``fromstring``.
    :return: list of formula strings in traversal order. ``f`` elements
        without text are skipped so that callers can safely run substring
        checks on every entry.
    """
    data = []
    formula_tag = "{%s}" % SHEET_MAIN_NS + "f"
    for target in level_order_traversal(element, formula_tag):
        for child in target:
            # An <f> element may carry no text; a None entry would make a
            # later ``key in text`` check raise TypeError.
            if child.tag == formula_tag and child.text is not None:
                data.append(child.text)
    return data
|
||||
|
||||
|
||||
def level_order_traversal(root, flag: str) -> list:
    """Breadth-first search for nodes that have a direct child tagged ``flag``.

    A node that matches is recorded and its subtree is not searched any
    further; children of non-matching nodes are queued for inspection.

    :param root: XML element to start from (it is itself a candidate).
    :param flag: fully qualified tag name to look for among direct children.
    :return: matching nodes in level order; empty list when nothing matches.
    """
    # Function-scope import keeps this block self-contained.
    from collections import deque

    # deque gives O(1) pops from the left; list.pop(0) is O(n) per pop.
    queue = deque([root])
    targets = []
    while queue:
        node = queue.popleft()
        if any(child.tag == flag for child in node):
            targets.append(node)
            continue
        queue.extend(node)
    return targets
|
||||
|
||||
|
||||
def handle_images(deps, archive: ZipFile) -> list:
    """Load the image behind every relationship in ``deps`` from ``archive``.

    Each successfully loaded image gets two extra attributes: ``embed`` (the
    relationship id) and ``target`` (the member path inside the archive).
    Entries that cannot be read or decoded are logged and skipped.

    :param deps: iterable of relationship objects exposing ``id`` and ``target``.
    :param archive: open zip archive the targets are read from.
    :return: list of loaded image objects; empty when ``deps`` is empty or
        nothing could be loaded.
    """
    # Function-scope import keeps this block self-contained.
    import logging

    # NOTE: the former ``if not PILImage`` guard was dropped. PILImage is
    # imported unconditionally as a module, so the guard could never trigger.
    images = []
    for dep in deps:
        try:
            image_io = archive.read(dep.target)
            image = openpyxl_Image(BytesIO(image_io))
        except Exception as e:
            # Best effort: one unreadable image must not abort the others.
            logging.getLogger("max_kb").warning(
                "Skipping unreadable cell image %s: %s", dep.target, e)
            continue
        image.embed = dep.id  # relationship id (rId) of the file
        image.target = dep.target  # path of the file inside the archive
        images.append(image)
    return images
|
||||
|
||||
|
||||
def xlsx_embed_cells_images(buffer) -> dict:
    """Extract images embedded in cells of an xlsx file, keyed by cell formula.

    Reads ``xl/cellimages.xml`` and its relationships to find the embedded
    pictures, collects the formulas of every ``xl/worksheets/sheet*`` member,
    and for each picture whose name occurs in a formula converts the picture
    to JPEG and wraps it in an ``Image`` model instance.

    :param buffer: file-like object (or path) accepted by ``ZipFile``.
    :return: dict mapping ``'=' + formula`` to the ``Image`` built for it.
        When several formulas contain the picture name, the last one wins.
    :raises KeyError: from ``ZipFile.read`` when the archive has no
        ``xl/cellimages.xml`` member.
    """
    result = {}
    # The context manager closes the archive even when parsing fails midway.
    with ZipFile(buffer) as archive:
        # Parse the relationships of cellimages.xml and load the images.
        deps = get_dependents(archive, get_rels_path("xl/cellimages.xml"))
        image_rel = handle_images(deps=deps, archive=archive)
        # Formulas of each worksheet, keyed by the text that follows "sheet"
        # in the member's file name (e.g. "1" for sheet1.xml).
        sheet_list = {}
        for item in archive.namelist():
            if not item.startswith('xl/worksheets/sheet'):
                continue
            key = item.split('/')[-1].split('.')[0].split('sheet')[-1]
            sheet_list[key] = parse_element_sheet_xml(fromstring(archive.read(item)))
        cell_images_xml = parse_element(fromstring(archive.read("xl/cellimages.xml")))
        cell_images_rel = {image.embed: image for image in image_rel}
        # Flatten all sheet formulas once instead of once per picture, and
        # drop None entries so the substring test below cannot raise.
        formulas = [formula for sheet in sheet_list.values()
                    for formula in sheet if formula is not None]
        for key, embed in cell_images_xml.items():
            img = cell_images_rel.get(embed)
            if img is None:
                # The image failed to load earlier; skip it, keep the others.
                continue
            image_excel_id_list = [formula for formula in formulas if key in formula]
            if not image_excel_id_list:
                continue
            image_excel_id = image_excel_id_list[-1]
            img_byte = io.BytesIO()
            with archive.open(img.target) as f:
                # JPEG has no alpha channel, hence the RGB conversion.
                PILImage.open(f).convert('RGB').save(img_byte, format='JPEG')
            image = Image(id=uuid.uuid1(), image=img_byte.getvalue(), image_name=img.path)
            result['=' + image_excel_id] = image
    return result
|
||||
|
||||
|
||||
|
|
@ -671,7 +671,7 @@ class DocumentSerializers(ApiMixin, serializers.Serializer):
|
|||
get_buffer = FileBufferHandle().get_buffer
|
||||
for parse_table_handle in parse_table_handle_list:
|
||||
if parse_table_handle.support(file, get_buffer):
|
||||
return parse_table_handle.handle(file, get_buffer)
|
||||
return parse_table_handle.handle(file, get_buffer, save_image)
|
||||
raise AppApiException(500, '不支持的文件格式')
|
||||
|
||||
def save_qa(self, instance: Dict, with_valid=True):
|
||||
|
|
|
|||
Loading…
Reference in New Issue