From 17af6033978e34bb277770c4d461f07f9a6724de Mon Sep 17 00:00:00 2001 From: CaptainB Date: Tue, 20 Aug 2024 16:17:42 +0800 Subject: [PATCH] =?UTF-8?q?refactor:=20=E4=BC=98=E5=8C=96pdf=E5=8A=A0?= =?UTF-8?q?=E8=BD=BD=EF=BC=8C=E4=BF=AE=E5=A4=8D=E9=83=A8=E5=88=86pdf?= =?UTF-8?q?=E4=B8=AD=E6=96=87=E4=B9=B1=E7=A0=81=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/common/handle/impl/pdf_split_handle.py | 25 ++++++++++++--------- pyproject.toml | 1 + 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/apps/common/handle/impl/pdf_split_handle.py b/apps/common/handle/impl/pdf_split_handle.py index 3e0d951e0..9f005fec2 100644 --- a/apps/common/handle/impl/pdf_split_handle.py +++ b/apps/common/handle/impl/pdf_split_handle.py @@ -9,7 +9,7 @@ import re from typing import List -from pypdf import PdfReader, PdfWriter +import fitz import os import tempfile import logging @@ -40,26 +40,28 @@ class PdfSplitHandle(BaseSplitHandle): # 获取临时文件的路径 temp_file_path = temp_file.name + pdf_document = fitz.open(temp_file_path) try: content = "" - reader = PdfReader(temp_file_path) - for page_num in range(len(reader.pages)): + for page_num in range(len(pdf_document)): start_time = time.time() - page = reader.pages[page_num] - text = page.extract_text() + page = pdf_document.load_page(page_num) + text = page.get_text() if text and text.strip(): # 如果页面中有文本内容 page_content = text else: try: - writer = PdfWriter() - writer.add_page(page) - with tempfile.NamedTemporaryFile(delete=False) as output_pdf: - writer.write(output_pdf) - loader = PyPDFLoader(output_pdf.name, extract_images=True) + new_doc = fitz.open() + new_doc.insert_pdf(pdf_document, from_page=page_num, to_page=page_num) + page_num_pdf = tempfile.gettempdir() + f"/{file.name}_{page_num}.pdf" + new_doc.save(page_num_pdf) + new_doc.close() + + loader = PyPDFLoader(page_num_pdf, extract_images=True) page_content = "\n" + loader.load()[0].page_content finally: - os.remove(output_pdf.name) + os.remove(page_num_pdf) content += page_content @@ -76,6 +78,7 @@ class PdfSplitHandle(BaseSplitHandle): return {'name': file.name, 'content': []} finally: + pdf_document.close() # 处理完后可以删除临时文件 os.remove(temp_file_path) diff --git a/pyproject.toml b/pyproject.toml index cec72dadb..62620afd3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ html2text = "^2024.2.26" langchain-openai = "^0.1.8" django-ipware = "^6.0.4" django-apscheduler = "^0.6.2" +pymupdf = "1.24.9" pypdf = "4.3.1" rapidocr-onnxruntime = "1.3.24" python-docx = "^1.1.0"