From 16d7316dca00f613b767ba8ea018c2edadc64a1b Mon Sep 17 00:00:00 2001 From: shaohuzhang1 <80892890+shaohuzhang1@users.noreply.github.com> Date: Tue, 30 Jul 2024 11:08:53 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=AE=B5=E8=90=BD=E5=88=86=E5=9D=97?= =?UTF-8?q?=E8=AE=BE=E7=BD=AE=E6=9C=80=E5=B0=8F=E5=88=86=E5=9D=97=E5=AD=97?= =?UTF-8?q?=E6=95=B0=20(#898)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/common/chunk/impl/mark_chunk_handle.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/apps/common/chunk/impl/mark_chunk_handle.py b/apps/common/chunk/impl/mark_chunk_handle.py index f86290a1f..9e5a66c0e 100644 --- a/apps/common/chunk/impl/mark_chunk_handle.py +++ b/apps/common/chunk/impl/mark_chunk_handle.py @@ -12,6 +12,7 @@ from typing import List from common.chunk.i_chunk_handle import IChunkHandle split_chunk_pattern = "!|。|\n|;|;" +min_chunk_len = 20 class MarkChunkHandle(IChunkHandle): @@ -20,5 +21,17 @@ class MarkChunkHandle(IChunkHandle): for chunk in chunk_list: base_chunk = re.split(split_chunk_pattern, chunk) base_chunk = [chunk.strip() for chunk in base_chunk if len(chunk.strip()) > 0] - result = [*result, *base_chunk] + result_chunk = [] + for c in base_chunk: + if len(result_chunk) == 0: + result_chunk.append(c) + else: + if len(result_chunk[-1]) < min_chunk_len: + result_chunk[-1] = result_chunk[-1] + c + else: + if len(c) < min_chunk_len: + result_chunk[-1] = result_chunk[-1] + c + else: + result_chunk.append(c) + result = [*result, *result_chunk] return result