refactor: 支持解析pdf中的图片

2025-12-25 17:22:55 +00:00 · 2024-08-15 15:45:16 +08:00 · 2024-08-15 15:45:16 +08:00 · e266dd9d99
parent ebc49fd6e7
commit e266dd9d99
2 changed files with 44 additions and 8 deletions
--- a/apps/common/handle/impl/pdf_split_handle.py
+++ b/apps/common/handle/impl/pdf_split_handle.py
@ -9,7 +9,10 @@
 import re
 from typing import List

-import fitz
+import pypdf
+import os
+import tempfile
+from langchain_community.document_loaders import PyPDFLoader

 from common.handle.base_split_handle import BaseSplitHandle
 from common.util.split_model import SplitModel
@ -24,16 +27,44 @@ default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),


 def number_to_text(pdf_document, page_number):
-    page = pdf_document.load_page(page_number)
-    text = page.get_text()
-    return text
+    return pdf_document[page_number].page_content  # page_content of the entry at index page_number
+
+
+def check_pdf_is_image(pdf_path):
+    """Return True only when no page of the PDF yields extractable text.
+
+    Returns False if any page has text or the PDF has no pages, and None
+    (after printing the error) if the file cannot be opened or parsed.
+    """
+    try:
+        with open(pdf_path, "rb") as f:
+            reader = pypdf.PdfReader(f)
+            # Scan every page: one page with text means not image-only.
+            for page in reader.pages:
+                text = page.extract_text()
+                if text and text.strip():
+                    return False
+            return len(reader.pages) > 0
+    except Exception as e:
+        print(f"Error: {e}")
+        return None


 class PdfSplitHandle(BaseSplitHandle):
-    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer,save_image):
+    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
+        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+            # Copy the uploaded file into the temporary file chunk by chunk
+            for chunk in file.chunks():
+                temp_file.write(chunk)
+            # Keep the temporary file's path; the loader below opens it by path
+            temp_file_path = temp_file.name
+
        try:
-            buffer = get_buffer(file)
-            pdf_document = fitz.open(file.name, buffer)
+            if check_pdf_is_image(temp_file_path):
+                loader = PyPDFLoader(temp_file_path, extract_images=True)
+            else:
+                loader = PyPDFLoader(temp_file_path, extract_images=False)
+            pdf_document = loader.load()
            content = "\n".join([number_to_text(pdf_document, page_number) for page_number in range(len(pdf_document))])
            if pattern_list is not None and len(pattern_list) > 0:
                split_model = SplitModel(pattern_list, with_filter, limit)
@ -42,6 +73,10 @@ class PdfSplitHandle(BaseSplitHandle):
        except BaseException as e:
            return {'name': file.name,
                    'content': []}
+        finally:
+            # Always remove the temporary file (it was created with delete=False)
+            os.remove(temp_file_path)
+
        return {'name': file.name,
                'content': split_model.parse(content)
                }
--- a/pyproject.toml
+++ b/pyproject.toml
@ -31,7 +31,8 @@ html2text = "^2024.2.26"
 langchain-openai = "^0.1.8"
 django-ipware = "^6.0.4"
 django-apscheduler = "^0.6.2"
-pymupdf = "1.24.1"
+pypdf = "4.3.1"
+rapidocr-onnxruntime = "1.3.24"
 python-docx = "^1.1.0"
 xlwt = "^1.3.0"
 dashscope = "^1.17.0"