fix: 处理某些 PDF 因不包含目录和内部链接而无法完整导入的问题

(cherry picked from commit fb8b96779c)
2025-12-26 01:33:05 +00:00 · 2024-12-06 10:49:37 +08:00 · 2024-12-06 10:49:37 +08:00 · 9b1a497925
parent 1a5bb20871
commit 9b1a497925
1 changed files with 13 additions and 0 deletions
--- a/apps/common/handle/impl/pdf_split_handle.py
+++ b/apps/common/handle/impl/pdf_split_handle.py
@@ -31,6 +31,16 @@ default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
 max_kb = logging.getLogger("max_kb")


+def check_links_in_pdf(doc):  # Return True if any page of `doc` has a link whose 'kind' is 1, otherwise False.
+    for page_number in range(len(doc)):  # walk every page index of the document
+        page = doc[page_number]
+        links = page.get_links()  # link entries of this page; each entry is indexed by 'kind' below
+        if links:  # pages with no links are skipped
+            for link in links:
+                if link['kind'] == 1:  # treated as an internal link by the caller; presumably PyMuPDF's LINK_GOTO - TODO confirm
+                    return True  # the first match is enough, stop scanning
+    return False  # no page contained a link with kind == 1
+
 class PdfSplitHandle(BaseSplitHandle):
    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
@@ -175,6 +185,9 @@ class PdfSplitHandle(BaseSplitHandle):

    @staticmethod
    def handle_links(doc, pattern_list, with_filter, limit):
+        # 检查文档是否包含内部链接
+        if not check_links_in_pdf(doc):
+            return
        # 创建存储章节内容的数组
        chapters = []
        toc_start_page = -1