def check_links_in_pdf(doc):
    """Return True if any page of *doc* has a link whose ``kind`` equals 1.

    ``PdfSplitHandle.handle_links`` uses this as a guard to decide whether
    the document contains internal links; that method returns immediately
    when this function reports False.

    Args:
        doc: Iterable of page objects, each exposing ``get_links()`` that
            returns a list of dicts carrying a ``'kind'`` key (presumably a
            PyMuPDF ``Document`` -- TODO confirm against the caller).

    Returns:
        bool: True as soon as one matching link is found, otherwise False.
        Pages after the first match are not inspected.

    Raises:
        KeyError: If a link dict has no ``'kind'`` entry.
    """
    # NOTE(review): 1 presumably corresponds to PyMuPDF's LINK_GOTO (a jump
    # to another place in the same document) -- confirm before replacing the
    # literal with the library constant.
    internal_link_kind = 1

    # `or ()` keeps the original tolerance for pages whose get_links()
    # returns an empty or falsy value.
    return any(
        link['kind'] == internal_link_kind
        for page in doc
        for link in page.get_links() or ()
    )