refactor: PDF分段强制按字数限制

--bug=1047568 --user=刘瑞斌 【github#1363】pdf 文件高级分段默认分段长度为500,但生成的段落长度超过29000字符 https://www.tapd.cn/57709429/s/1600183
This commit is contained in:
CaptainB 2024-10-29 11:39:35 +08:00 committed by 刘瑞斌
parent 2cb8d26609
commit 834ccaa35b

View File

@ -42,7 +42,7 @@ class PdfSplitHandle(BaseSplitHandle):
pdf_document = fitz.open(temp_file_path)
try:
# 处理有目录的pdf
result = self.handle_toc(pdf_document)
result = self.handle_toc(pdf_document, limit)
if result is not None:
return {'name': file.name, 'content': result}
@ -110,7 +110,7 @@ class PdfSplitHandle(BaseSplitHandle):
return content
@staticmethod
def handle_toc(doc):
def handle_toc(doc, limit):
# 找到目录
toc = doc.get_toc()
if toc is None or len(toc) == 0:
@ -155,17 +155,16 @@ class PdfSplitHandle(BaseSplitHandle):
text = text[:idx]
chapter_text += text # 提取文本
# 限制章节内容长度
if 0 < limit < len(chapter_text):
split_text = PdfSplitHandle.split_text(chapter_text, limit)
for text in split_text:
chapters.append({"title": chapter_title, "content": text})
else:
chapters.append({"title": chapter_title, "content": chapter_text if chapter_text else chapter_title})
# 保存章节内容和章节标题
chapters.append({"title": chapter_title, "content": chapter_text if chapter_text else chapter_title})
return chapters
@staticmethod
def handle_chapter_title(title):
    """Remove Chinese ordinal markers from a chapter title.

    Two kinds of marker are deleted wherever they occur in *title*:

    * an enumeration prefix such as "一、" or "十一、" (a run of Chinese
      numerals, or a single whitespace/asterisk character, followed by
      the enumeration comma "、" and any whitespace after it);
    * a chapter label such as "第一章" or "第十二章" and any whitespace
      after it.

    :param title: raw chapter title text.
    :return: the title with those markers removed.
    """
    # Accept a run of numerals so that "十一、" is removed as a whole instead
    # of leaving a stray "十" behind.  The single whitespace/asterisk
    # alternative is kept exactly as the earlier pattern matched it.
    title = re.sub(r'(?:[一二三四五六七八九十]+|[\s*])、\s*', '', title)
    # Likewise allow multi-character numerals in "第…章" (e.g. "第十二章").
    title = re.sub(r'第[一二三四五六七八九十]+章\s*', '', title)
    return title
@staticmethod
def handle_links(doc, pattern_list, with_filter, limit):
# 创建存储章节内容的数组
@ -228,11 +227,14 @@ class PdfSplitHandle(BaseSplitHandle):
text = text[:idx]
chapter_text += text
# 保存章节信息
chapters.append({
"title": link_title,
"content": chapter_text
})
# 限制章节内容长度
if 0 < limit < len(chapter_text):
split_text = PdfSplitHandle.split_text(chapter_text, limit)
for text in split_text:
chapters.append({"title": link_title, "content": text})
else:
# 保存章节信息
chapters.append({"title": link_title, "content": chapter_text})
# 目录中没有前言部分,手动处理
if handle_pre_toc:
@ -261,6 +263,35 @@ class PdfSplitHandle(BaseSplitHandle):
chapters = pre_toc + chapters
return chapters
@staticmethod
def split_text(text, length):
    """Split *text* into consecutive segments of at most *length* characters.

    The text is consumed in windows of *length* characters.  When a window
    contains a sentence-ending full stop (ASCII "." or the CJK "。"), the
    segment is cut right after the last one in the window so sentences are
    kept together where possible; otherwise the whole window becomes one
    segment.  Whatever is left after the last full window is returned as a
    final, shorter segment.  Concatenating the result always reproduces
    *text*.

    :param text: the text to split; empty or ``None`` yields ``[]``.
    :param length: maximum number of characters per segment.  A
        non-positive value means "no limit" and returns the text unsplit.
    :return: list of segment strings, in order.
    """
    if not text:
        return []
    if length <= 0:
        # Callers treat a non-positive limit as "unlimited"; do not
        # degenerate into one segment per character.
        return [text]

    segments = []
    total = len(text)
    start = 0
    # Work on index windows instead of growing a string one character at a
    # time; each window is scanned once by str.rfind.
    while total - start >= length:
        window_end = start + length
        # Position of the last full stop inside the current window, or -1.
        last_stop = max(text.rfind(mark, start, window_end) for mark in ('.', '。'))
        cut = last_stop + 1 if last_stop != -1 else window_end
        segments.append(text[start:cut])
        start = cut
    # Remaining tail shorter than one full window.
    if start < total:
        segments.append(text[start:])
    return segments
@staticmethod
def handle_chapter_title(title):
    """Remove Chinese ordinal markers from a chapter title.

    Two kinds of marker are deleted wherever they occur in *title*:

    * an enumeration prefix such as "一、" or "十一、" (a run of Chinese
      numerals, or a single whitespace/asterisk character, followed by
      the enumeration comma "、" and any whitespace after it);
    * a chapter label such as "第一章" or "第十二章" and any whitespace
      after it.

    :param title: raw chapter title text.
    :return: the title with those markers removed.
    """
    # Accept a run of numerals so that "十一、" is removed as a whole instead
    # of leaving a stray "十" behind.  The single whitespace/asterisk
    # alternative is kept exactly as the earlier pattern matched it.
    title = re.sub(r'(?:[一二三四五六七八九十]+|[\s*])、\s*', '', title)
    # Likewise allow multi-character numerals in "第…章" (e.g. "第十二章").
    title = re.sub(r'第[一二三四五六七八九十]+章\s*', '', title)
    return title
def support(self, file, get_buffer):
file_name: str = file.name.lower()
if file_name.endswith(".pdf") or file_name.endswith(".PDF"):