fix: 修复上传文档，高级分段设置分段长度为10w字符，生成预览还是4096个字符一段 (#884)

2025-12-26 01:33:05 +00:00 · 2024-07-29 14:08:40 +08:00 · 2024-07-29 14:08:40 +08:00 · d935e9a836
parent 485eeb6ac1
commit d935e9a836
3 changed files with 5 additions and 8 deletions
--- a/apps/common/management/commands/gunicorn.py
+++ b/apps/common/management/commands/gunicorn.py
@@ -30,9 +30,6 @@ class Command(BaseCommand):

    def handle(self, *args, **options):
        log_format = '%(h)s %(t)s %(L)ss "%(r)s" %(s)s %(b)s '
-        print(options.get('worker_connections'))
-        print(options.get('threads'))
-        print(options)
        cmd = [
            'gunicorn', 'smartdoc.wsgi:application',
            '-b', options.get('b') if options.get('b') is not None else '0.0.0.0:8080',
--- a/apps/common/util/split_model.py
+++ b/apps/common/util/split_model.py
@@ -280,11 +280,11 @@ def filter_special_char(content: str):

 class SplitModel:

-    def __init__(self, content_level_pattern, with_filter=True, limit=4096):
+    def __init__(self, content_level_pattern, with_filter=True, limit=100000):
        self.content_level_pattern = content_level_pattern
        self.with_filter = with_filter
-        if limit is None or limit > 4096:
-            limit = 4096
+        if limit is None or limit > 100000:
+            limit = 100000
        if limit < 50:
            limit = 50
        self.limit = limit
@@ -375,7 +375,7 @@ default_split_pattern = {
 }


-def get_split_model(filename: str, with_filter: bool = False, limit: int = 4096):
+def get_split_model(filename: str, with_filter: bool = False, limit: int = 100000):
    """
    根据文件名称获取分段模型
    :param limit:        每段大小
--- a/apps/dataset/serializers/document_serializers.py
+++ b/apps/dataset/serializers/document_serializers.py
@@ -788,7 +788,7 @@ class DocumentSerializers(ApiMixin, serializers.Serializer):
            file_list = self.data.get("file")
            return list(
                map(lambda f: file_to_paragraph(f, self.data.get("patterns", None), self.data.get("with_filter", None),
-                                                self.data.get("limit", None)), file_list))
+                                                self.data.get("limit", 4096)), file_list))

    class SplitPattern(ApiMixin, serializers.Serializer):
        @staticmethod