refactor: 优化pdf加载,修复部分pdf中文乱码的问题
Some checks are pending
sync2gitee / repo-sync (push) Waiting to run
Typos Check / Spell Check with Typos (push) Waiting to run

This commit is contained in:
CaptainB 2024-08-20 16:17:42 +08:00 committed by 刘瑞斌
parent f9784dcbd1
commit 17af603397
2 changed files with 15 additions and 11 deletions

View File

@@ -9,7 +9,7 @@
import re import re
from typing import List from typing import List
from pypdf import PdfReader, PdfWriter import fitz
import os import os
import tempfile import tempfile
import logging import logging
@@ -40,26 +40,28 @@ class PdfSplitHandle(BaseSplitHandle):
# 获取临时文件的路径 # 获取临时文件的路径
temp_file_path = temp_file.name temp_file_path = temp_file.name
pdf_document = fitz.open(temp_file_path)
try: try:
content = "" content = ""
reader = PdfReader(temp_file_path) for page_num in range(len(pdf_document)):
for page_num in range(len(reader.pages)):
start_time = time.time() start_time = time.time()
page = reader.pages[page_num] page = pdf_document.load_page(page_num)
text = page.extract_text() text = page.get_text()
if text and text.strip(): # 如果页面中有文本内容 if text and text.strip(): # 如果页面中有文本内容
page_content = text page_content = text
else: else:
try: try:
writer = PdfWriter() new_doc = fitz.open()
writer.add_page(page) new_doc.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)
with tempfile.NamedTemporaryFile(delete=False) as output_pdf: page_num_pdf = tempfile.gettempdir() + f"/{file.name}_{page_num}.pdf"
writer.write(output_pdf) new_doc.save(page_num_pdf)
loader = PyPDFLoader(output_pdf.name, extract_images=True) new_doc.close()
loader = PyPDFLoader(page_num_pdf, extract_images=True)
page_content = "\n" + loader.load()[0].page_content page_content = "\n" + loader.load()[0].page_content
finally: finally:
os.remove(output_pdf.name) os.remove(page_num_pdf)
content += page_content content += page_content
@@ -76,6 +78,7 @@ class PdfSplitHandle(BaseSplitHandle):
return {'name': file.name, return {'name': file.name,
'content': []} 'content': []}
finally: finally:
pdf_document.close()
# 处理完后可以删除临时文件 # 处理完后可以删除临时文件
os.remove(temp_file_path) os.remove(temp_file_path)

View File

@@ -31,6 +31,7 @@ html2text = "^2024.2.26"
langchain-openai = "^0.1.8" langchain-openai = "^0.1.8"
django-ipware = "^6.0.4" django-ipware = "^6.0.4"
django-apscheduler = "^0.6.2" django-apscheduler = "^0.6.2"
pymupdf = "1.24.9"
pypdf = "4.3.1" pypdf = "4.3.1"
rapidocr-onnxruntime = "1.3.24" rapidocr-onnxruntime = "1.3.24"
python-docx = "^1.1.0" python-docx = "^1.1.0"