fix: 修复上传文档时,高级分段设置分段长度为10万字符,生成预览仍按4096个字符一段的问题 (#884)

This commit is contained in:
shaohuzhang1 2024-07-29 14:08:40 +08:00 committed by GitHub
parent 485eeb6ac1
commit d935e9a836
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 5 additions and 8 deletions

View File

@ -30,9 +30,6 @@ class Command(BaseCommand):
def handle(self, *args, **options):
log_format = '%(h)s %(t)s %(L)ss "%(r)s" %(s)s %(b)s '
print(options.get('worker_connections'))
print(options.get('threads'))
print(options)
cmd = [
'gunicorn', 'smartdoc.wsgi:application',
'-b', options.get('b') if options.get('b') is not None else '0.0.0.0:8080',

View File

@ -280,11 +280,11 @@ def filter_special_char(content: str):
class SplitModel:
def __init__(self, content_level_pattern, with_filter=True, limit=4096):
def __init__(self, content_level_pattern, with_filter=True, limit=100000):
self.content_level_pattern = content_level_pattern
self.with_filter = with_filter
if limit is None or limit > 4096:
limit = 4096
if limit is None or limit > 100000:
limit = 100000
if limit < 50:
limit = 50
self.limit = limit
@ -375,7 +375,7 @@ default_split_pattern = {
}
def get_split_model(filename: str, with_filter: bool = False, limit: int = 4096):
def get_split_model(filename: str, with_filter: bool = False, limit: int = 100000):
"""
根据文件名称获取分段模型
:param limit: 每段大小

View File

@ -788,7 +788,7 @@ class DocumentSerializers(ApiMixin, serializers.Serializer):
file_list = self.data.get("file")
return list(
map(lambda f: file_to_paragraph(f, self.data.get("patterns", None), self.data.get("with_filter", None),
self.data.get("limit", None)), file_list))
self.data.get("limit", 4096)), file_list))
class SplitPattern(ApiMixin, serializers.Serializer):
@staticmethod