mirror of
https://github.com/1Panel-dev/MaxKB.git
synced 2025-12-26 01:33:05 +00:00
feat: 段落分块设置最小分块字数 (#898)
This commit is contained in:
parent
61024a661e
commit
16d7316dca
|
|
@ -12,6 +12,7 @@ from typing import List
|
|||
from common.chunk.i_chunk_handle import IChunkHandle
|
||||
|
||||
# Regex alternation of delimiters handed to re.split() to cut each chunk into
# smaller fragments: "!", "。", a newline, and ";".
# NOTE(review): ";" is listed twice as shown here, which is redundant in an
# alternation; presumably one (and possibly "!") was meant to be the
# full-width form -- confirm against the upstream file.
split_chunk_pattern = "!|。|\n|;|;"
|
||||
# Length threshold (measured with len(), i.e. characters) used when merging
# split fragments: a fragment is concatenated onto the previous merged piece
# when either that piece or the fragment itself is shorter than this value.
min_chunk_len = 20
|
||||
|
||||
|
||||
class MarkChunkHandle(IChunkHandle):
|
||||
|
|
@ -20,5 +21,17 @@ class MarkChunkHandle(IChunkHandle):
|
|||
for chunk in chunk_list:
|
||||
base_chunk = re.split(split_chunk_pattern, chunk)
|
||||
base_chunk = [chunk.strip() for chunk in base_chunk if len(chunk.strip()) > 0]
|
||||
result = [*result, *base_chunk]
|
||||
result_chunk = []
|
||||
for c in base_chunk:
|
||||
if len(result_chunk) == 0:
|
||||
result_chunk.append(c)
|
||||
else:
|
||||
if len(result_chunk[-1]) < min_chunk_len:
|
||||
result_chunk[-1] = result_chunk[-1] + c
|
||||
else:
|
||||
if len(c) < min_chunk_len:
|
||||
result_chunk[-1] = result_chunk[-1] + c
|
||||
else:
|
||||
result_chunk.append(c)
|
||||
result = [*result, *result_chunk]
|
||||
return result
|
||||
|
|
|
|||
Loading…
Reference in New Issue