diff --git a/apps/common/handle/impl/pdf_split_handle.py b/apps/common/handle/impl/pdf_split_handle.py index d5eb779c3..52a33b0de 100644 --- a/apps/common/handle/impl/pdf_split_handle.py +++ b/apps/common/handle/impl/pdf_split_handle.py @@ -42,7 +42,7 @@ class PdfSplitHandle(BaseSplitHandle): pdf_document = fitz.open(temp_file_path) try: # 处理有目录的pdf - result = self.handle_toc(pdf_document) + result = self.handle_toc(pdf_document, limit) if result is not None: return {'name': file.name, 'content': result} @@ -110,7 +110,7 @@ class PdfSplitHandle(BaseSplitHandle): return content @staticmethod - def handle_toc(doc): + def handle_toc(doc, limit): # 找到目录 toc = doc.get_toc() if toc is None or len(toc) == 0: @@ -155,17 +155,16 @@ class PdfSplitHandle(BaseSplitHandle): text = text[:idx] chapter_text += text # 提取文本 - + # 限制章节内容长度 + if 0 < limit < len(chapter_text): + split_text = PdfSplitHandle.split_text(chapter_text, limit) + for text in split_text: + chapters.append({"title": chapter_title, "content": text}) + else: + chapters.append({"title": chapter_title, "content": chapter_text if chapter_text else chapter_title}) # 保存章节内容和章节标题 - chapters.append({"title": chapter_title, "content": chapter_text if chapter_text else chapter_title}) return chapters - @staticmethod - def handle_chapter_title(title): - title = re.sub(r'[一二三四五六七八九十\s*]、\s*', '', title) - title = re.sub(r'第[一二三四五六七八九十]章\s*', '', title) - return title - @staticmethod def handle_links(doc, pattern_list, with_filter, limit): # 创建存储章节内容的数组 @@ -228,11 +227,14 @@ class PdfSplitHandle(BaseSplitHandle): text = text[:idx] chapter_text += text - # 保存章节信息 - chapters.append({ - "title": link_title, - "content": chapter_text - }) + # 限制章节内容长度 + if 0 < limit < len(chapter_text): + split_text = PdfSplitHandle.split_text(chapter_text, limit) + for text in split_text: + chapters.append({"title": link_title, "content": text}) + else: + # 保存章节信息 + chapters.append({"title": link_title, "content": chapter_text}) # 
@staticmethod
def split_text(text, length, terminators=".。!?!?"):
    """Split *text* into consecutive segments of at most *length* characters.

    The text is consumed left to right in windows of *length* characters.
    Inside each window the segment is cut right after the last sentence
    terminator so that sentences stay intact where possible; when the
    window contains no terminator it is emitted whole (hard cut).
    Concatenating the returned segments always reproduces *text* exactly.

    Args:
        text: The chapter text to split.
        length: Maximum number of characters per segment. A non-positive
            value means "no limit" (matching the ``0 < limit`` guard used
            by the callers) and yields the text as a single segment.
        terminators: String of single characters that count as a sentence
            end. Defaults to the ASCII and full-width CJK period, question
            mark and exclamation mark, so Chinese text is no longer cut in
            the middle of a sentence.

    Returns:
        A list of non-empty string segments; empty for empty input.
    """
    if not text:
        return []
    if length <= 0:
        # Without this guard every single character would become a segment.
        return [text]

    segments = []
    start = 0
    total = len(text)
    # Only cut while a full window is available; the tail is kept as-is.
    while total - start >= length:
        window_end = start + length
        # Right-most terminator inside the current window, or -1 if none.
        last_end = max(
            (text.rfind(mark, start, window_end) for mark in terminators),
            default=-1,
        )
        cut = window_end if last_end == -1 else last_end + 1
        segments.append(text[start:cut])
        start = cut

    # Whatever is shorter than one window forms the final segment.
    if start < total:
        segments.append(text[start:])

    return segments


@staticmethod
def handle_chapter_title(title):
    """Strip Chinese chapter numbering from a table-of-contents title.

    Two patterns are removed wherever they occur in *title*:
    a single Chinese numeral, whitespace character or ``*`` followed by
    the enumeration comma ``、`` (e.g. ``一、``), and a ``第X章`` marker
    with a single Chinese numeral (e.g. ``第一章``), each together with
    any whitespace that follows.

    Args:
        title: The raw chapter title.

    Returns:
        The title with the numbering prefixes removed.
    """
    without_enumeration = re.sub(r'[一二三四五六七八九十\s*]、\s*', '', title)
    return re.sub(r'第[一二三四五六七八九十]章\s*', '', without_enumeration)