refactor: 优化pdf加载,修复部分pdf中文乱码的问题
Some checks are pending
sync2gitee / repo-sync (push) Waiting to run
Typos Check / Spell Check with Typos (push) Waiting to run

This commit is contained in:
CaptainB 2024-08-20 16:17:42 +08:00 committed by 刘瑞斌
parent f9784dcbd1
commit 17af603397
2 changed files with 15 additions and 11 deletions

View File

@ -9,7 +9,7 @@
import re
from typing import List
from pypdf import PdfReader, PdfWriter
import fitz
import os
import tempfile
import logging
@ -40,26 +40,28 @@ class PdfSplitHandle(BaseSplitHandle):
# 获取临时文件的路径
temp_file_path = temp_file.name
pdf_document = fitz.open(temp_file_path)
try:
content = ""
reader = PdfReader(temp_file_path)
for page_num in range(len(reader.pages)):
for page_num in range(len(pdf_document)):
start_time = time.time()
page = reader.pages[page_num]
text = page.extract_text()
page = pdf_document.load_page(page_num)
text = page.get_text()
if text and text.strip(): # 如果页面中有文本内容
page_content = text
else:
try:
writer = PdfWriter()
writer.add_page(page)
with tempfile.NamedTemporaryFile(delete=False) as output_pdf:
writer.write(output_pdf)
loader = PyPDFLoader(output_pdf.name, extract_images=True)
new_doc = fitz.open()
new_doc.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)
page_num_pdf = tempfile.gettempdir() + f"/{file.name}_{page_num}.pdf"
new_doc.save(page_num_pdf)
new_doc.close()
loader = PyPDFLoader(page_num_pdf, extract_images=True)
page_content = "\n" + loader.load()[0].page_content
finally:
os.remove(output_pdf.name)
os.remove(page_num_pdf)
content += page_content
@ -76,6 +78,7 @@ class PdfSplitHandle(BaseSplitHandle):
return {'name': file.name,
'content': []}
finally:
pdf_document.close()
# 处理完后可以删除临时文件
os.remove(temp_file_path)

View File

@ -31,6 +31,7 @@ html2text = "^2024.2.26"
langchain-openai = "^0.1.8"
django-ipware = "^6.0.4"
django-apscheduler = "^0.6.2"
pymupdf = "1.24.9"
pypdf = "4.3.1"
rapidocr-onnxruntime = "1.3.24"
python-docx = "^1.1.0"