mirror of
https://github.com/1Panel-dev/MaxKB.git
synced 2025-12-25 17:22:55 +00:00
refactor: 优化pdf加载,修复部分pdf中文乱码的问题
This commit is contained in:
parent
f9784dcbd1
commit
17af603397
|
|
@ -9,7 +9,7 @@
|
|||
import re
|
||||
from typing import List
|
||||
|
||||
from pypdf import PdfReader, PdfWriter
|
||||
import fitz
|
||||
import os
|
||||
import tempfile
|
||||
import logging
|
||||
|
|
@ -40,26 +40,28 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||
# 获取临时文件的路径
|
||||
temp_file_path = temp_file.name
|
||||
|
||||
pdf_document = fitz.open(temp_file_path)
|
||||
try:
|
||||
content = ""
|
||||
reader = PdfReader(temp_file_path)
|
||||
for page_num in range(len(reader.pages)):
|
||||
for page_num in range(len(pdf_document)):
|
||||
start_time = time.time()
|
||||
page = reader.pages[page_num]
|
||||
text = page.extract_text()
|
||||
page = pdf_document.load_page(page_num)
|
||||
text = page.get_text()
|
||||
|
||||
if text and text.strip(): # 如果页面中有文本内容
|
||||
page_content = text
|
||||
else:
|
||||
try:
|
||||
writer = PdfWriter()
|
||||
writer.add_page(page)
|
||||
with tempfile.NamedTemporaryFile(delete=False) as output_pdf:
|
||||
writer.write(output_pdf)
|
||||
loader = PyPDFLoader(output_pdf.name, extract_images=True)
|
||||
new_doc = fitz.open()
|
||||
new_doc.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)
|
||||
page_num_pdf = tempfile.gettempdir() + f"/{file.name}_{page_num}.pdf"
|
||||
new_doc.save(page_num_pdf)
|
||||
new_doc.close()
|
||||
|
||||
loader = PyPDFLoader(page_num_pdf, extract_images=True)
|
||||
page_content = "\n" + loader.load()[0].page_content
|
||||
finally:
|
||||
os.remove(output_pdf.name)
|
||||
os.remove(page_num_pdf)
|
||||
|
||||
content += page_content
|
||||
|
||||
|
|
@ -76,6 +78,7 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||
return {'name': file.name,
|
||||
'content': []}
|
||||
finally:
|
||||
pdf_document.close()
|
||||
# 处理完后可以删除临时文件
|
||||
os.remove(temp_file_path)
|
||||
|
||||
|
|
|
|||
|
|
@ -31,6 +31,7 @@ html2text = "^2024.2.26"
|
|||
langchain-openai = "^0.1.8"
|
||||
django-ipware = "^6.0.4"
|
||||
django-apscheduler = "^0.6.2"
|
||||
pymupdf = "1.24.9"
|
||||
pypdf = "4.3.1"
|
||||
rapidocr-onnxruntime = "1.3.24"
|
||||
python-docx = "^1.1.0"
|
||||
|
|
|
|||
Loading…
Reference in New Issue