diff --git a/apps/common/chunk/impl/mark_chunk_handle.py b/apps/common/chunk/impl/mark_chunk_handle.py
index f86290a1f..9e5a66c0e 100644
--- a/apps/common/chunk/impl/mark_chunk_handle.py
+++ b/apps/common/chunk/impl/mark_chunk_handle.py
@@ -12,6 +12,7 @@ from typing import List
 from common.chunk.i_chunk_handle import IChunkHandle
 
 split_chunk_pattern = "!|。|\n|;|;"
+min_chunk_len = 20
 
 
 class MarkChunkHandle(IChunkHandle):
@@ -20,5 +21,17 @@ class MarkChunkHandle(IChunkHandle):
         for chunk in chunk_list:
             base_chunk = re.split(split_chunk_pattern, chunk)
             base_chunk = [chunk.strip() for chunk in base_chunk if len(chunk.strip()) > 0]
-            result = [*result, *base_chunk]
+            result_chunk = []
+            for c in base_chunk:
+                if len(result_chunk) == 0:
+                    result_chunk.append(c)
+                else:
+                    if len(result_chunk[-1]) < min_chunk_len:
+                        result_chunk[-1] = result_chunk[-1] + c
+                    else:
+                        if len(c) < min_chunk_len:
+                            result_chunk[-1] = result_chunk[-1] + c
+                        else:
+                            result_chunk.append(c)
+            result = [*result, *result_chunk]
         return result