mirror of
https://github.com/1Panel-dev/MaxKB.git
synced 2025-12-26 01:33:05 +00:00
fix: 处理某些pdf中不包括目录和内部链接不能完整导入的问题
(cherry picked from commit fb8b96779c)
This commit is contained in:
parent
1a5bb20871
commit
9b1a497925
|
|
@ -31,6 +31,16 @@ default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
|
|||
# Module-level logger obtained under the "max_kb" logger name.
max_kb = logging.getLogger("max_kb")
|
||||
|
||||
|
||||
def check_links_in_pdf(doc):
    """Return True if any page of *doc* has a link whose ``kind`` equals 1.

    Args:
        doc: An iterable of page objects, each exposing ``get_links()``
            that returns a list of link dicts with a ``'kind'`` key
            (presumably a PyMuPDF document -- confirm against the caller).

    Returns:
        bool: True as soon as one link with ``kind == 1`` is found,
        False when no page has such a link (including an empty document).
    """
    # kind == 1 is what this file treats as an internal link (presumably
    # PyMuPDF's LINK_GOTO -- confirm).  ``or ()`` keeps the original guard
    # against pages whose get_links() result is empty or None.
    return any(
        link['kind'] == 1
        for page in doc
        for link in page.get_links() or ()
    )
|
||||
|
||||
class PdfSplitHandle(BaseSplitHandle):
|
||||
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
|
||||
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
|
||||
|
|
@ -175,6 +185,9 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||
|
||||
@staticmethod
|
||||
def handle_links(doc, pattern_list, with_filter, limit):
|
||||
# 检查文档是否包含内部链接
|
||||
if not check_links_in_pdf(doc):
|
||||
return
|
||||
# 创建存储章节内容的数组
|
||||
chapters = []
|
||||
toc_start_page = -1
|
||||
|
|
|
|||
Loading…
Reference in New Issue