From 6cacb5be7168c49f55974c0f9fee73e412838b20 Mon Sep 17 00:00:00 2001 From: CaptainB Date: Tue, 24 Sep 2024 11:49:34 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E5=A4=84=E7=90=86=E4=B8=8D=E8=A7=84?= =?UTF-8?q?=E8=8C=83=E7=9A=84pdf=E4=B8=AD=E5=89=8D=E8=A8=80=E9=83=A8?= =?UTF-8?q?=E5=88=86=E6=B2=A1=E5=9C=A8=E7=9B=AE=E5=BD=95=E4=B8=AD=E6=A0=87?= =?UTF-8?q?=E8=AF=86=E5=87=BA=E6=9D=A5=EF=BC=8C=E5=AF=BC=E8=87=B4=E4=B8=8D?= =?UTF-8?q?=E8=83=BD=E6=AD=A3=E5=B8=B8=E8=AF=86=E5=88=AB=E7=9A=84=E9=97=AE?= =?UTF-8?q?=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/common/handle/impl/pdf_split_handle.py | 37 ++++++++++++++++----- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/apps/common/handle/impl/pdf_split_handle.py b/apps/common/handle/impl/pdf_split_handle.py index f64d7b0a9..ca6d3bbde 100644 --- a/apps/common/handle/impl/pdf_split_handle.py +++ b/apps/common/handle/impl/pdf_split_handle.py @@ -6,20 +6,19 @@ @date:2024/3/27 18:19 @desc: """ +import logging +import os import re +import tempfile +import time from typing import List import fitz -import os -import tempfile -import logging from langchain_community.document_loaders import PyPDFLoader from common.handle.base_split_handle import BaseSplitHandle from common.util.split_model import SplitModel -import time - default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'), re.compile('(?<=\\n)(? 0: return {'name': file.name, 'content': result} @@ -168,15 +167,21 @@ class PdfSplitHandle(BaseSplitHandle): return title @staticmethod - def handle_links(doc): + def handle_links(doc, pattern_list, with_filter, limit): # 创建存储章节内容的数组 chapters = [] - + toc_start_page = -1 + page_content = "" + handle_pre_toc = True # 遍历 PDF 的每一页,查找带有目录链接的页 for page_num in range(doc.page_count): page = doc.load_page(page_num) links = page.get_links() - + # 如果目录开始页码未设置,则设置为当前页码 + if len(links) > 0: + toc_start_page = page_num + if toc_start_page < 0: + page_content += page.get_text('text') # 检查该页是否包含内部链接(即指向文档内部的页面) for num in range(len(links)): link = links[num] @@ -184,6 +189,9 @@ class PdfSplitHandle(BaseSplitHandle): # 获取链接目标的页面 dest_page = link['page'] rect = link['from'] # 获取链接的矩形区域 + # 如果目录开始页码包括前言部分,则不处理前言部分 + if dest_page < toc_start_page: + handle_pre_toc = False # 提取链接区域的文本作为标题 link_title = page.get_text("text", clip=rect).strip().split("\n")[0].replace('.', '').strip() @@ -226,6 +234,17 @@ class PdfSplitHandle(BaseSplitHandle): "content": chapter_text }) + # 目录中没有前言部分,手动处理 + if handle_pre_toc: + if pattern_list is not None and len(pattern_list) > 0: + split_model = SplitModel(pattern_list, with_filter, limit) + else: + split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit) + # 插入目录前的部分 + page_content = re.sub(r'(?