refactor: PDF分段强制按字数限制

--bug=1047568 --user=刘瑞斌【github#1363】pdf 文件高级分段默认分段长度为500，但生成的段落长度超过29000字符 https://www.tapd.cn/57709429/s/1600183
2025-12-26 01:33:05 +00:00 · 2024-10-29 11:39:35 +08:00 · 2024-10-29 11:39:35 +08:00 · 834ccaa35b
parent 2cb8d26609
commit 834ccaa35b
1 changed files with 46 additions and 15 deletions
--- a/apps/common/handle/impl/pdf_split_handle.py
+++ b/apps/common/handle/impl/pdf_split_handle.py
@ -42,7 +42,7 @@ class PdfSplitHandle(BaseSplitHandle):
        pdf_document = fitz.open(temp_file_path)
        try:
            # 处理有目录的pdf
-            result = self.handle_toc(pdf_document)
+            result = self.handle_toc(pdf_document, limit)
            if result is not None:
                return {'name': file.name, 'content': result}

@ -110,7 +110,7 @@ class PdfSplitHandle(BaseSplitHandle):
        return content

    @staticmethod
-    def handle_toc(doc):
+    def handle_toc(doc, limit):
        # 找到目录
        toc = doc.get_toc()
        if toc is None or len(toc) == 0:
@ -155,17 +155,16 @@ class PdfSplitHandle(BaseSplitHandle):
                        text = text[:idx]

                chapter_text += text  # 提取文本
-
+            # 限制章节内容长度
+            if 0 < limit < len(chapter_text):
+                split_text = PdfSplitHandle.split_text(chapter_text, limit)
+                for text in split_text:
+                    chapters.append({"title": chapter_title, "content": text})
+            else:
+                chapters.append({"title": chapter_title, "content": chapter_text if chapter_text else chapter_title})
            # 保存章节内容和章节标题
-            chapters.append({"title": chapter_title, "content": chapter_text if chapter_text else chapter_title})
        return chapters

-    @staticmethod
-    def handle_chapter_title(title):
-        title = re.sub(r'[一二三四五六七八九十\s*]、\s*', '', title)
-        title = re.sub(r'第[一二三四五六七八九十]章\s*', '', title)
-        return title
-
    @staticmethod
    def handle_links(doc, pattern_list, with_filter, limit):
        # 创建存储章节内容的数组
@ -228,11 +227,14 @@ class PdfSplitHandle(BaseSplitHandle):
                                text = text[:idx]
                        chapter_text += text

-                    # 保存章节信息
-                    chapters.append({
-                        "title": link_title,
-                        "content": chapter_text
-                    })
+                    # 限制章节内容长度
+                    if 0 < limit < len(chapter_text):
+                        split_text = PdfSplitHandle.split_text(chapter_text, limit)
+                        for text in split_text:
+                            chapters.append({"title": link_title, "content": text})
+                    else:
+                        # 保存章节信息
+                        chapters.append({"title": link_title, "content": chapter_text})

        # 目录中没有前言部分，手动处理
        if handle_pre_toc:
@ -261,6 +263,35 @@ class PdfSplitHandle(BaseSplitHandle):
            chapters = pre_toc + chapters
        return chapters

+    @staticmethod
+    def split_text(text, length):
+        segments = []
+        current_segment = ""
+
+        for char in text:
+            current_segment += char
+            if len(current_segment) >= length:
+                # 查找最近的句号
+                last_period_index = current_segment.rfind('.')
+                if last_period_index != -1:
+                    segments.append(current_segment[:last_period_index + 1])
+                    current_segment = current_segment[last_period_index + 1:]  # 更新当前段落
+                else:
+                    segments.append(current_segment)
+                    current_segment = ""
+
+        # 处理剩余的部分
+        if current_segment:
+            segments.append(current_segment)
+
+        return segments
+
+    @staticmethod
+    def handle_chapter_title(title):
+        title = re.sub(r'[一二三四五六七八九十\s*]、\s*', '', title)
+        title = re.sub(r'第[一二三四五六七八九十]章\s*', '', title)
+        return title
+
    def support(self, file, get_buffer):
        file_name: str = file.name.lower()
        if file_name.endswith(".pdf") or file_name.endswith(".PDF"):