From 83d97439e41a5ebfd57f26b13f791ac317223fa4 Mon Sep 17 00:00:00 2001 From: shaohuzhang1 Date: Mon, 28 Oct 2024 17:42:18 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=AF=BC=E5=85=A5word?= =?UTF-8?q?=E6=96=87=E6=A1=A3=EF=BC=8C=E6=9C=89=E7=9A=84=E5=9B=BE=E7=89=87?= =?UTF-8?q?=E5=AF=BC=E5=85=A5=E4=B8=8D=E8=BF=9B=E5=8E=BB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/common/handle/impl/doc_split_handle.py | 45 ++++++++++++++++----- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/apps/common/handle/impl/doc_split_handle.py b/apps/common/handle/impl/doc_split_handle.py index ed65b5b73..0e0356ed8 100644 --- a/apps/common/handle/impl/doc_split_handle.py +++ b/apps/common/handle/impl/doc_split_handle.py @@ -10,6 +10,7 @@ import io import re import traceback import uuid +from functools import reduce from typing import List from docx import Document, ImagePart @@ -31,6 +32,7 @@ default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'), old_docx_nsmap = {'v': 'urn:schemas-microsoft-com:vml'} combine_nsmap = {**ns.nsmap, **old_docx_nsmap} + def image_to_mode(image, doc: Document, images_list, get_image_id, is_new_docx=True): if is_new_docx: image_ids = image.xpath('.//a:blip/@r:embed') @@ -46,18 +48,31 @@ def image_to_mode(image, doc: Document, images_list, get_image_id, is_new_docx=T return f'![](/api/image/{image_uuid})' +def get_paragraph_element_images(paragraph_element, doc: Document, images_list, get_image_id): + images_xpath_list = [".//pic:pic", ".//w:pict"] + images = [] + for images_xpath in images_xpath_list: + try: + _images = paragraph_element.xpath(images_xpath) + if _images is not None and len(_images) > 0: + for image in _images: + images.append(image) + except Exception as e: + pass + return images + + +def images_to_string(images, doc: Document, images_list, get_image_id): + return "".join( + [item for item in [image_to_mode(image, doc, images_list, get_image_id) for image in images] if + item is not None]) + + def get_paragraph_element_txt(paragraph_element, doc: Document, images_list, get_image_id): try: - images = paragraph_element.xpath(".//pic:pic") - old_docx_images = paragraph_element.xpath(".//w:pict") + images = get_paragraph_element_images(paragraph_element, doc, images_list, get_image_id) if len(images) > 0: - return "".join( - [item for item in [image_to_mode(image, doc, images_list, get_image_id) for image in images] if - item is not None]) - elif len(old_docx_images) > 0: - return "".join( - [item for item in [image_to_mode(image, doc, images_list, get_image_id, is_new_docx=False) for image in old_docx_images] if - item is not None]) + return images_to_string(images, doc, images_list, get_image_id) elif paragraph_element.text is not None: return paragraph_element.text return "" @@ -101,8 +116,18 @@ class DocSplitHandle(BaseSplitHandle): try: psn = paragraph.style.name if psn.startswith('Heading'): - return "".join(["#" for i in range(int(psn.replace("Heading ", '')))]) + " " + paragraph.text + title = "".join(["#" for i in range(int(psn.replace("Heading ", '')))]) + " " + paragraph.text + images = reduce(lambda x, y: [*x, *y], + [get_paragraph_element_images(e, doc, images_list, get_image_id) for e in + paragraph._element], + []) + + if len(images) > 0: + return title + '\n' + images_to_string(images, doc, images_list, get_image_id) if len( + paragraph.text) > 0 else images_to_string(images, doc, images_list, get_image_id) + return title except Exception as e: + traceback.print_exc() return paragraph.text return get_paragraph_txt(paragraph, doc, images_list, get_image_id)