refactor: 支持解析pdf中的图片

2025-12-26 01:33:05 +00:00 · 2024-08-15 15:45:16 +08:00 · 2024-08-15 15:45:16 +08:00 · e266dd9d99
parent ebc49fd6e7
commit e266dd9d99
2 changed files with 44 additions and 8 deletions
--- a/apps/common/handle/impl/pdf_split_handle.py
+++ b/apps/common/handle/impl/pdf_split_handle.py
@ -9,7 +9,10 @@
 import re
 from typing import List
-import fitz
+import pypdf
 import os
 import tempfile
 from langchain_community.document_loaders import PyPDFLoader
 from common.handle.base_split_handle import BaseSplitHandle
 from common.util.split_model import SplitModel
@ -24,16 +27,44 @@ default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
 def number_to_text(pdf_document, page_number):
-    page = pdf_document.load_page(page_number)
+    return pdf_document[page_number].page_content
-    text = page.get_text()
+
-    return text
+
 def check_pdf_is_image(pdf_path):
     """Report whether a PDF appears to contain no extractable text.

     Every page is inspected. The previous version returned from inside the
     loop on the first page, so a document with a blank or image-only first
     page followed by text pages was misreported as image-only.

     Args:
         pdf_path: Filesystem path of the PDF to inspect.

     Returns:
         False as soon as any page yields non-blank text; True when no page
         yields text (likely a scanned or image-only document, including a
         document with zero pages); None if the file cannot be opened or
         parsed.
     """
     try:
         # Open the PDF in binary mode and let pypdf parse it.
         with open(pdf_path, "rb") as f:
             reader = pypdf.PdfReader(f)
             for page in reader.pages:
                 # Try to extract text from this page.
                 text = page.extract_text()
                 if text and text.strip():
                     # One page with real text is enough: not a pure image PDF.
                     return False
             # All pages were checked and none produced text.
             return True
     except Exception as e:
         # Best-effort probe: report the problem and let the caller decide.
         print(f"Error: {e}")
         return None
 class PdfSplitHandle(BaseSplitHandle):
-    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer,save_image):
+    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            # 将上传的文件保存到临时文件中
            for chunk in file.chunks():
                temp_file.write(chunk)
            # 获取临时文件的路径
            temp_file_path = temp_file.name
        try:
-            buffer = get_buffer(file)
+            if check_pdf_is_image(temp_file_path):
-            pdf_document = fitz.open(file.name, buffer)
+                loader = PyPDFLoader(temp_file_path, extract_images=True)
            else:
                loader = PyPDFLoader(temp_file_path, extract_images=False)
            pdf_document = loader.load()
            content = "\n".join([number_to_text(pdf_document, page_number) for page_number in range(len(pdf_document))])
            if pattern_list is not None and len(pattern_list) > 0:
                split_model = SplitModel(pattern_list, with_filter, limit)
@ -42,6 +73,10 @@ class PdfSplitHandle(BaseSplitHandle):
        except BaseException as e:
            return {'name': file.name,
                    'content': []}
        finally:
            # 处理完后可以删除临时文件
            os.remove(temp_file_path)
        return {'name': file.name,
                'content': split_model.parse(content)
                }
--- a/pyproject.toml
+++ b/pyproject.toml
@ -31,7 +31,8 @@ html2text = "^2024.2.26"
 langchain-openai = "^0.1.8"
 django-ipware = "^6.0.4"
 django-apscheduler = "^0.6.2"
-pymupdf = "1.24.1"
+pypdf = "4.3.1"
 rapidocr-onnxruntime = "1.3.24"
 python-docx = "^1.1.0"
 xlwt = "^1.3.0"
 dashscope = "^1.17.0"