mirror of
https://github.com/1Panel-dev/MaxKB.git
synced 2025-12-26 01:33:05 +00:00
refactor: 优化pdf加载,修复部分pdf中文乱码的问题
This commit is contained in:
parent
f9784dcbd1
commit
17af603397
|
|
@@ -9,7 +9,7 @@
|
||||||
import re
|
import re
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from pypdf import PdfReader, PdfWriter
|
import fitz
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
import logging
|
import logging
|
||||||
|
|
@@ -40,26 +40,28 @@ class PdfSplitHandle(BaseSplitHandle):
|
||||||
# 获取临时文件的路径
|
# 获取临时文件的路径
|
||||||
temp_file_path = temp_file.name
|
temp_file_path = temp_file.name
|
||||||
|
|
||||||
|
pdf_document = fitz.open(temp_file_path)
|
||||||
try:
|
try:
|
||||||
content = ""
|
content = ""
|
||||||
reader = PdfReader(temp_file_path)
|
for page_num in range(len(pdf_document)):
|
||||||
for page_num in range(len(reader.pages)):
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
page = reader.pages[page_num]
|
page = pdf_document.load_page(page_num)
|
||||||
text = page.extract_text()
|
text = page.get_text()
|
||||||
|
|
||||||
if text and text.strip(): # 如果页面中有文本内容
|
if text and text.strip(): # 如果页面中有文本内容
|
||||||
page_content = text
|
page_content = text
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
writer = PdfWriter()
|
new_doc = fitz.open()
|
||||||
writer.add_page(page)
|
new_doc.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)
|
||||||
with tempfile.NamedTemporaryFile(delete=False) as output_pdf:
|
page_num_pdf = tempfile.gettempdir() + f"/{file.name}_{page_num}.pdf"
|
||||||
writer.write(output_pdf)
|
new_doc.save(page_num_pdf)
|
||||||
loader = PyPDFLoader(output_pdf.name, extract_images=True)
|
new_doc.close()
|
||||||
|
|
||||||
|
loader = PyPDFLoader(page_num_pdf, extract_images=True)
|
||||||
page_content = "\n" + loader.load()[0].page_content
|
page_content = "\n" + loader.load()[0].page_content
|
||||||
finally:
|
finally:
|
||||||
os.remove(output_pdf.name)
|
os.remove(page_num_pdf)
|
||||||
|
|
||||||
content += page_content
|
content += page_content
|
||||||
|
|
||||||
|
|
@@ -76,6 +78,7 @@ class PdfSplitHandle(BaseSplitHandle):
|
||||||
return {'name': file.name,
|
return {'name': file.name,
|
||||||
'content': []}
|
'content': []}
|
||||||
finally:
|
finally:
|
||||||
|
pdf_document.close()
|
||||||
# 处理完后可以删除临时文件
|
# 处理完后可以删除临时文件
|
||||||
os.remove(temp_file_path)
|
os.remove(temp_file_path)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -31,6 +31,7 @@ html2text = "^2024.2.26"
|
||||||
langchain-openai = "^0.1.8"
|
langchain-openai = "^0.1.8"
|
||||||
django-ipware = "^6.0.4"
|
django-ipware = "^6.0.4"
|
||||||
django-apscheduler = "^0.6.2"
|
django-apscheduler = "^0.6.2"
|
||||||
|
pymupdf = "1.24.9"
|
||||||
pypdf = "4.3.1"
|
pypdf = "4.3.1"
|
||||||
rapidocr-onnxruntime = "1.3.24"
|
rapidocr-onnxruntime = "1.3.24"
|
||||||
python-docx = "^1.1.0"
|
python-docx = "^1.1.0"
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue