mirror of
https://github.com/1Panel-dev/MaxKB.git
synced 2025-12-26 01:33:05 +00:00
refactor: PDF分段强制按字数限制
--bug=1047568 --user=刘瑞斌 【github#1363】pdf 文件高级分段默认分段长度为500,但生成的段落长度超过29000字符 https://www.tapd.cn/57709429/s/1600183
This commit is contained in:
parent
2cb8d26609
commit
834ccaa35b
|
|
@ -42,7 +42,7 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||
pdf_document = fitz.open(temp_file_path)
|
||||
try:
|
||||
# 处理有目录的pdf
|
||||
result = self.handle_toc(pdf_document)
|
||||
result = self.handle_toc(pdf_document, limit)
|
||||
if result is not None:
|
||||
return {'name': file.name, 'content': result}
|
||||
|
||||
|
|
@ -110,7 +110,7 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||
return content
|
||||
|
||||
@staticmethod
|
||||
def handle_toc(doc):
|
||||
def handle_toc(doc, limit):
|
||||
# 找到目录
|
||||
toc = doc.get_toc()
|
||||
if toc is None or len(toc) == 0:
|
||||
|
|
@ -155,17 +155,16 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||
text = text[:idx]
|
||||
|
||||
chapter_text += text # 提取文本
|
||||
|
||||
# 限制章节内容长度
|
||||
if 0 < limit < len(chapter_text):
|
||||
split_text = PdfSplitHandle.split_text(chapter_text, limit)
|
||||
for text in split_text:
|
||||
chapters.append({"title": chapter_title, "content": text})
|
||||
else:
|
||||
chapters.append({"title": chapter_title, "content": chapter_text if chapter_text else chapter_title})
|
||||
# 保存章节内容和章节标题
|
||||
chapters.append({"title": chapter_title, "content": chapter_text if chapter_text else chapter_title})
|
||||
return chapters
|
||||
|
||||
@staticmethod
|
||||
def handle_chapter_title(title):
    """Strip Chinese ordinal markers from a chapter title.

    Two kinds of marker are removed wherever they occur in ``title``:

    * an enumeration prefix such as "一、" or "十一、" (Chinese numerals,
      optional whitespace, then the "、" separator and trailing whitespace);
      a single whitespace or literal "*" character directly before "、" is
      removed as well;
    * a chapter prefix such as "第一章 " or "第十二章 ".

    :param title: raw chapter title.
    :return: the title with the markers removed.
    """
    # The numeral run is matched as a whole ("+") so that multi-character
    # ordinals like "十一、" are removed completely instead of leaving "十" behind.
    title = re.sub(r'(?:[一二三四五六七八九十]+\s*|[\s*])、\s*', '', title)
    # Accept multi-character chapter numbers ("第十二章"), not only single ones.
    title = re.sub(r'第[一二三四五六七八九十]+章\s*', '', title)
    return title
|
||||
|
||||
@staticmethod
|
||||
def handle_links(doc, pattern_list, with_filter, limit):
|
||||
# 创建存储章节内容的数组
|
||||
|
|
@ -228,11 +227,14 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||
text = text[:idx]
|
||||
chapter_text += text
|
||||
|
||||
# 保存章节信息
|
||||
chapters.append({
|
||||
"title": link_title,
|
||||
"content": chapter_text
|
||||
})
|
||||
# 限制章节内容长度
|
||||
if 0 < limit < len(chapter_text):
|
||||
split_text = PdfSplitHandle.split_text(chapter_text, limit)
|
||||
for text in split_text:
|
||||
chapters.append({"title": link_title, "content": text})
|
||||
else:
|
||||
# 保存章节信息
|
||||
chapters.append({"title": link_title, "content": chapter_text})
|
||||
|
||||
# 目录中没有前言部分,手动处理
|
||||
if handle_pre_toc:
|
||||
|
|
@ -261,6 +263,35 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||
chapters = pre_toc + chapters
|
||||
return chapters
|
||||
|
||||
@staticmethod
|
||||
def split_text(text, length):
    """Split ``text`` into consecutive segments of at most ``length`` characters.

    The text is consumed in windows of ``length`` characters. Inside each full
    window the cut is made right after the last sentence-ending period (ASCII
    "." or the CJK full stop "。"); if the window contains no period, the
    whole window becomes one segment. Whatever is left after the last full
    window is returned as the final segment. Concatenating the segments
    always reproduces ``text``.

    :param text: the text to split.
    :param length: maximum number of characters per segment. A value that is
        zero or negative means "no limit", and the text is returned as a
        single segment.
    :return: list of non-empty segments; an empty list for empty ``text``.
    """
    if not text:
        return []
    if length <= 0:
        # Without this guard every single character would become its own segment.
        return [text]

    segments = []
    total = len(text)
    start = 0

    # Work on index ranges instead of growing a string character by character.
    while total - start >= length:
        window_end = start + length
        # Find the closest period before the window end; Chinese documents
        # terminate sentences with "。", so both forms are searched.
        cut = max(text.rfind('.', start, window_end),
                  text.rfind('。', start, window_end))
        if cut == -1:
            # No sentence boundary in this window: hard cut at the limit.
            cut = window_end - 1
        segments.append(text[start:cut + 1])
        start = cut + 1

    # Handle the remaining part that is shorter than one window.
    if start < total:
        segments.append(text[start:])

    return segments
|
||||
|
||||
@staticmethod
|
||||
def handle_chapter_title(title):
    """Strip Chinese ordinal markers from a chapter title.

    Two kinds of marker are removed wherever they occur in ``title``:

    * an enumeration prefix such as "一、" or "十一、" (Chinese numerals,
      optional whitespace, then the "、" separator and trailing whitespace);
      a single whitespace or literal "*" character directly before "、" is
      removed as well;
    * a chapter prefix such as "第一章 " or "第十二章 ".

    :param title: raw chapter title.
    :return: the title with the markers removed.
    """
    # The numeral run is matched as a whole ("+") so that multi-character
    # ordinals like "十一、" are removed completely instead of leaving "十" behind.
    title = re.sub(r'(?:[一二三四五六七八九十]+\s*|[\s*])、\s*', '', title)
    # Accept multi-character chapter numbers ("第十二章"), not only single ones.
    title = re.sub(r'第[一二三四五六七八九十]+章\s*', '', title)
    return title
|
||||
|
||||
def support(self, file, get_buffer):
|
||||
file_name: str = file.name.lower()
|
||||
if file_name.endswith(".pdf") or file_name.endswith(".PDF"):
|
||||
|
|
|
|||
Loading…
Reference in New Issue