From 4c9756839a46f217739925afdbd184647d63e49e Mon Sep 17 00:00:00 2001
From: CaptainB
Date: Thu, 10 Jul 2025 15:06:10 +0800
Subject: [PATCH] chore: normalize with_filter parameter to boolean in split handle files
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

--bug=1057879 --user=刘瑞斌 【知识库】高级分段中自动清洗功能未生效 https://www.tapd.cn/62980211/s/1727744
---
 apps/common/handle/impl/text/doc_split_handle.py  | 2 ++
 apps/common/handle/impl/text/html_split_handle.py | 2 ++
 apps/common/handle/impl/text/pdf_split_handle.py  | 2 ++
 apps/common/handle/impl/text/text_split_handle.py | 2 ++
 apps/common/handle/impl/text/zip_split_handle.py  | 2 ++
 5 files changed, 10 insertions(+)

diff --git a/apps/common/handle/impl/text/doc_split_handle.py b/apps/common/handle/impl/text/doc_split_handle.py
index b1ed58177..8d3c74767 100644
--- a/apps/common/handle/impl/text/doc_split_handle.py
+++ b/apps/common/handle/impl/text/doc_split_handle.py
@@ -197,6 +197,8 @@ class DocSplitHandle(BaseSplitHandle):
         try:
             if type(limit) is str:
                 limit = int(limit)
+            if type(with_filter) is str:
+                with_filter = with_filter.lower() == 'true'
             image_list = []
             buffer = get_buffer(file)
             doc = Document(io.BytesIO(buffer))
diff --git a/apps/common/handle/impl/text/html_split_handle.py b/apps/common/handle/impl/text/html_split_handle.py
index ca3ad66ab..a82cfdaec 100644
--- a/apps/common/handle/impl/text/html_split_handle.py
+++ b/apps/common/handle/impl/text/html_split_handle.py
@@ -48,6 +48,8 @@ class HTMLSplitHandle(BaseSplitHandle):
         buffer = get_buffer(file)
         if type(limit) is str:
             limit = int(limit)
+        if type(with_filter) is str:
+            with_filter = with_filter.lower() == 'true'
         if pattern_list is not None and len(pattern_list) > 0:
             split_model = SplitModel(pattern_list, with_filter, limit)
         else:
diff --git a/apps/common/handle/impl/text/pdf_split_handle.py b/apps/common/handle/impl/text/pdf_split_handle.py
index f7e41eb2d..d666796b9 100644
--- a/apps/common/handle/impl/text/pdf_split_handle.py
+++ b/apps/common/handle/impl/text/pdf_split_handle.py
@@ -54,6 +54,8 @@ class PdfSplitHandle(BaseSplitHandle):
         try:
             if type(limit) is str:
                 limit = int(limit)
+            if type(with_filter) is str:
+                with_filter = with_filter.lower() == 'true'
             # 处理有目录的pdf
             result = self.handle_toc(pdf_document, limit)
             if result is not None:
diff --git a/apps/common/handle/impl/text/text_split_handle.py b/apps/common/handle/impl/text/text_split_handle.py
index 6eb40f6a1..fab396320 100644
--- a/apps/common/handle/impl/text/text_split_handle.py
+++ b/apps/common/handle/impl/text/text_split_handle.py
@@ -43,6 +43,8 @@ class TextSplitHandle(BaseSplitHandle):
         buffer = get_buffer(file)
         if type(limit) is str:
             limit = int(limit)
+        if type(with_filter) is str:
+            with_filter = with_filter.lower() == 'true'
         if pattern_list is not None and len(pattern_list) > 0:
             split_model = SplitModel(pattern_list, with_filter, limit)
         else:
diff --git a/apps/common/handle/impl/text/zip_split_handle.py b/apps/common/handle/impl/text/zip_split_handle.py
index d448f28cb..5752fe0d7 100644
--- a/apps/common/handle/impl/text/zip_split_handle.py
+++ b/apps/common/handle/impl/text/zip_split_handle.py
@@ -121,6 +121,8 @@ class ZipSplitHandle(BaseSplitHandle):
     def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
         if type(limit) is str:
             limit = int(limit)
+        if type(with_filter) is str:
+            with_filter = with_filter.lower() == 'true'
         buffer = get_buffer(file)
         bytes_io = io.BytesIO(buffer)
         result = []