feat: enhance Document Split Node with result processing and problem list generation

2026-01-01 02:52:48 +00:00 · 2025-11-21 15:24:40 +08:00 · 2025-11-21 15:24:40 +08:00 · 5922597775
parent 820b68071b
commit 5922597775
6 changed files with 43 additions and 37 deletions
--- a/apps/application/flow/step_node/document_extract_node/i_document_extract_node.py
+++ b/apps/application/flow/step_node/document_extract_node/i_document_extract_node.py
@@ -25,5 +25,5 @@ class IDocumentExtractNode(INode):
                                                       self.node_params_serializer.data.get('document_list')[1:])
        return self.execute(document=res, **self.flow_params_serializer.data)

-    def execute(self, document, chat_id, **kwargs) -> NodeResult:
+    def execute(self, document, chat_id=None, **kwargs) -> NodeResult:
        pass
--- a/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py
+++ b/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py
@@ -42,23 +42,28 @@ class BaseDocumentExtractNode(IDocumentExtractNode):
    def save_context(self, details, workflow_manage):
        self.context['content'] = details.get('content')

-    def execute(self, document, chat_id, **kwargs):
+    def execute(self, document, chat_id=None, **kwargs):
        get_buffer = FileBufferHandle().get_buffer

        self.context['document_list'] = document
        content = []
        if document is None or not isinstance(document, list):
-            return NodeResult({'content': ''}, {})
+            return NodeResult({'content': '', 'document_list': []}, {})

-        application = self.workflow_manage.work_flow_post_handler.chat_info.application
+        # 安全获取 application
+        application = None
+        if (self.workflow_manage and
+                self.workflow_manage.work_flow_post_handler and
+                self.workflow_manage.work_flow_post_handler.chat_info):
+            application = self.workflow_manage.work_flow_post_handler.chat_info.application

        # doc文件中的图片保存
        def save_image(image_list):
            for image in image_list:
                meta = {
-                    'debug': False if application.id else True,
+                    'debug': False if (application and application.id) else True,
                    'chat_id': chat_id,
-                    'application_id': str(application.id) if application.id else None,
+                    'application_id': str(application.id) if (application and application.id) else None,
                    'file_id': str(image.id)
                }
                file_bytes = image.meta.pop('content')
@@ -70,6 +75,7 @@ class BaseDocumentExtractNode(IDocumentExtractNode):
                    'source_type': FileSourceType.APPLICATION.value
                }).upload()

+        document_list = []
        for doc in document:
            file = QuerySet(File).filter(id=doc['file_id']).first()
            buffer = io.BytesIO(file.get_bytes())
@@ -81,9 +87,10 @@ class BaseDocumentExtractNode(IDocumentExtractNode):
                    buffer.seek(0)
                    file_content = split_handle.get_content(buffer, save_image)
                    content.append('### ' + doc['name'] + '\n' + file_content)
+                    document_list.append({'id': file.id, 'name': doc['name'], 'content': file_content})
                    break

-        return NodeResult({'content': splitter.join(content)}, {})
+        return NodeResult({'content': splitter.join(content), 'document_list': document_list}, {})

    def get_details(self, index: int, **kwargs):
        content = self.context.get('content', '').split(splitter)
--- a/apps/application/flow/step_node/document_split_node/i_document_split_node.py
+++ b/apps/application/flow/step_node/document_split_node/i_document_split_node.py
@@ -10,7 +10,7 @@ from application.flow.i_step_node import INode, NodeResult


 class DocumentSplitNodeSerializer(serializers.Serializer):
-    file_list = serializers.ListField(required=False, label=_("file list"))
+    document_list = serializers.ListField(required=False, label=_("document list"))
    split_strategy = serializers.ChoiceField(
        choices=['auto', 'custom', 'qa'], required=False, label=_("split strategy"), default='auto'
    )
@@ -53,11 +53,11 @@ class IDocumentSplitNode(INode):
        return DocumentSplitNodeSerializer

    def _run(self):
-        res = self.workflow_manage.get_reference_field(self.node_params_serializer.data.get('file_list')[0],
-                                                       self.node_params_serializer.data.get('file_list')[1:])
-        return self.execute(files=res, **self.node_params_serializer.data, **self.flow_params_serializer.data)
+        # res = self.workflow_manage.get_reference_field(self.node_params_serializer.data.get('file_list')[0],
+        #                                                self.node_params_serializer.data.get('file_list')[1:])
+        return self.execute(**self.node_params_serializer.data, **self.flow_params_serializer.data)

-    def execute(self, files, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
+    def execute(self, document_list, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
                paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
                document_name_relate_problem_type, document_name_relate_problem,
                document_name_relate_problem_reference, limit, patterns, with_filter, **kwargs) -> NodeResult:
--- a/apps/application/flow/step_node/document_split_node/impl/base_document_split_node.py
+++ b/apps/application/flow/step_node/document_split_node/impl/base_document_split_node.py
@@ -9,7 +9,7 @@ from django.db.models import QuerySet
 from application.flow.i_step_node import NodeResult
 from application.flow.step_node.document_split_node.i_document_split_node import IDocumentSplitNode
 from knowledge.models import File, FileSourceType
-from knowledge.serializers.document import split_handles, FileBufferHandle
+from knowledge.serializers.document import default_split_handle, FileBufferHandle


 def bytes_to_uploaded_file(file_bytes, file_name="file.txt"):
@@ -42,36 +42,31 @@ class BaseDocumentSplitNode(IDocumentSplitNode):
    def get_reference_content(self, fields: List[str]):
        return self.workflow_manage.get_reference_field(fields[0], fields[1:])

-    def execute(self, files, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
+    def execute(self, document_list, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
                paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
                document_name_relate_problem_type, document_name_relate_problem,
                document_name_relate_problem_reference, limit, patterns, with_filter, **kwargs) -> NodeResult:
-        get_buffer = FileBufferHandle().get_buffer
-        self.context['file_list'] = files
        self.context['knowledge_id'] = knowledge_id
-
+        file_list = self.workflow_manage.get_reference_field(document_list[0], document_list[1:])
        paragraph_list = []
-        for doc in files:
-            file = QuerySet(File).filter(id=doc['file_id']).first()
-            file_mem = bytes_to_uploaded_file(file.get_bytes(), file_name=file.file_name)
+        get_buffer = FileBufferHandle().get_buffer

-            for split_handle in split_handles:
-                if split_handle.support(file_mem, get_buffer):
-                    result = split_handle.handle(file_mem, patterns, with_filter, limit, get_buffer, self._save_image)
-                    # 统一处理结果为列表
-                    results = result if isinstance(result, list) else [result]
+        for doc in file_list:
+            file_mem = bytes_to_uploaded_file(doc['content'].encode('utf-8'), doc['name'])
+            result = default_split_handle.handle(file_mem, patterns, with_filter, limit, get_buffer, self._save_image)
+            # 统一处理结果为列表
+            results = result if isinstance(result, list) else [result]

-                    for item in results:
-                        self._process_split_result(
-                            item, knowledge_id, file.id, file.file_name,
-                            split_strategy, paragraph_title_relate_problem_type,
-                            paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
-                            document_name_relate_problem_type, document_name_relate_problem,
-                            document_name_relate_problem_reference
-                        )
+            for item in results:
+                self._process_split_result(
+                    item, knowledge_id, doc['id'], doc['name'],
+                    split_strategy, paragraph_title_relate_problem_type,
+                    paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
+                    document_name_relate_problem_type, document_name_relate_problem,
+                    document_name_relate_problem_reference
+                )

-                    paragraph_list = results
-                    break
+            paragraph_list = results

        self.context['paragraph_list'] = paragraph_list

--- a/ui/src/workflow/common/data.ts
+++ b/ui/src/workflow/common/data.ts
@@ -383,6 +383,10 @@ export const documentExtractNode = {
          label: t('views.applicationWorkflow.nodes.documentExtractNode.content'),
          value: 'content',
        },
+        {
+          label: t('views.applicationWorkflow.nodes.dataSourceWebNode.field_label'),
+          value: 'document_list',
+        },
      ],
    },
  },
--- a/ui/src/workflow/nodes/document-split-node/index.vue
+++ b/ui/src/workflow/nodes/document-split-node/index.vue
@@ -22,7 +22,7 @@
            :nodeModel="nodeModel"
            class="w-full"
            :placeholder="$t('views.chatLog.documentPlaceholder')"
-            v-model="form_data.file_list"
+            v-model="form_data.document_list"
          />
        </el-form-item>
        <el-form-item
@@ -207,7 +207,7 @@ const props = defineProps<{ nodeModel: any }>()
 const splitPatternList = ref<Array<KeyValue<string, string>>>([])

 const form = {
-  file_list: [],
+  document_list: [],
  split_strategy: 'auto',
  paragraph_title_relate_problem_type: 'custom',
  paragraph_title_relate_problem: false,