feat: 段落分块设置最小分块字数 (#898)

This commit is contained in:
shaohuzhang1 2024-07-30 11:08:53 +08:00 committed by GitHub
parent 61024a661e
commit 16d7316dca
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -12,6 +12,7 @@ from typing import List
from common.chunk.i_chunk_handle import IChunkHandle
# Regex alternation handed to re.split to cut each incoming chunk into
# smaller sentence-level pieces.
# NOTE(review): as displayed here the pattern contains empty alternatives
# (the leading "|" and the "||"), which would let re.split (Python 3.7+)
# split at every position. Presumably full-width punctuation characters were
# lost when this text was extracted -- confirm against the repository.
split_chunk_pattern = "|。|\n||;"
# Length threshold, measured with len(): a piece shorter than this is glued
# onto the preceding piece, and a preceding piece shorter than this absorbs
# the next piece, instead of each being emitted on its own.
min_chunk_len = 20
class MarkChunkHandle(IChunkHandle):
@ -20,5 +21,17 @@ class MarkChunkHandle(IChunkHandle):
for chunk in chunk_list:
base_chunk = re.split(split_chunk_pattern, chunk)
base_chunk = [chunk.strip() for chunk in base_chunk if len(chunk.strip()) > 0]
result = [*result, *base_chunk]
result_chunk = []
for c in base_chunk:
if len(result_chunk) == 0:
result_chunk.append(c)
else:
if len(result_chunk[-1]) < min_chunk_len:
result_chunk[-1] = result_chunk[-1] + c
else:
if len(c) < min_chunk_len:
result_chunk[-1] = result_chunk[-1] + c
else:
result_chunk.append(c)
result = [*result, *result_chunk]
return result