From d935e9a8367a47e2452f4608546651c312c6856f Mon Sep 17 00:00:00 2001 From: shaohuzhang1 <80892890+shaohuzhang1@users.noreply.github.com> Date: Mon, 29 Jul 2024 14:08:40 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E4=B8=8A=E4=BC=A0?= =?UTF-8?q?=E6=96=87=E6=A1=A3=EF=BC=8C=E9=AB=98=E7=BA=A7=E5=88=86=E6=AE=B5?= =?UTF-8?q?=E8=AE=BE=E7=BD=AE=E5=88=86=E6=AE=B5=E9=95=BF=E5=BA=A6=E4=B8=BA?= =?UTF-8?q?10w=E5=AD=97=E7=AC=A6=EF=BC=8C=E7=94=9F=E6=88=90=E9=A2=84?= =?UTF-8?q?=E8=A7=88=E8=BF=98=E6=98=AF4096=E4=B8=AA=E5=AD=97=E7=AC=A6?= =?UTF-8?q?=E4=B8=80=E6=AE=B5=20(#884)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/common/management/commands/gunicorn.py | 3 --- apps/common/util/split_model.py | 8 ++++---- apps/dataset/serializers/document_serializers.py | 2 +- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/apps/common/management/commands/gunicorn.py b/apps/common/management/commands/gunicorn.py index 0d4f86438..436a604b4 100644 --- a/apps/common/management/commands/gunicorn.py +++ b/apps/common/management/commands/gunicorn.py @@ -30,9 +30,6 @@ class Command(BaseCommand): def handle(self, *args, **options): log_format = '%(h)s %(t)s %(L)ss "%(r)s" %(s)s %(b)s ' - print(options.get('worker_connections')) - print(options.get('threads')) - print(options) cmd = [ 'gunicorn', 'smartdoc.wsgi:application', '-b', options.get('b') if options.get('b') is not None else '0.0.0.0:8080', diff --git a/apps/common/util/split_model.py b/apps/common/util/split_model.py index ce8a6946e..c747cb1fc 100644 --- a/apps/common/util/split_model.py +++ b/apps/common/util/split_model.py @@ -280,11 +280,11 @@ def filter_special_char(content: str): class SplitModel: - def __init__(self, content_level_pattern, with_filter=True, limit=4096): + def __init__(self, content_level_pattern, with_filter=True, limit=100000): self.content_level_pattern = content_level_pattern self.with_filter = with_filter - if limit is None or limit > 4096: - limit = 4096 + if limit is None or limit > 100000: + limit = 100000 if limit < 50: limit = 50 self.limit = limit @@ -375,7 +375,7 @@ default_split_pattern = { } -def get_split_model(filename: str, with_filter: bool = False, limit: int = 4096): +def get_split_model(filename: str, with_filter: bool = False, limit: int = 100000): """ 根据文件名称获取分段模型 :param limit: 每段大小 diff --git a/apps/dataset/serializers/document_serializers.py b/apps/dataset/serializers/document_serializers.py index f89c12dc0..0977db7e1 100644 --- a/apps/dataset/serializers/document_serializers.py +++ b/apps/dataset/serializers/document_serializers.py @@ -788,7 +788,7 @@ class DocumentSerializers(ApiMixin, serializers.Serializer): file_list = self.data.get("file") return list( map(lambda f: file_to_paragraph(f, self.data.get("patterns", None), self.data.get("with_filter", None), - self.data.get("limit", None)), file_list)) + self.data.get("limit", 4096)), file_list)) class SplitPattern(ApiMixin, serializers.Serializer): @staticmethod