diff --git a/apps/application/flow/step_node/__init__.py b/apps/application/flow/step_node/__init__.py
index c323a46c9..523794868 100644
--- a/apps/application/flow/step_node/__init__.py
+++ b/apps/application/flow/step_node/__init__.py
@@ -39,6 +39,7 @@ from .variable_aggregation_node.impl.base_variable_aggregation_node import BaseV
 from .variable_assign_node import BaseVariableAssignNode
 from .variable_splitting_node import BaseVariableSplittingNode
 from .video_understand_step_node import BaseVideoUnderstandNode
+from .document_split_node import BaseDocumentSplitNode
 
 node_list = [BaseStartStepNode, BaseChatNode, BaseSearchKnowledgeNode, BaseSearchDocumentNode, BaseQuestionNode,
              BaseConditionNode, BaseReplyNode,
@@ -50,7 +51,7 @@ node_list = [BaseStartStepNode, BaseChatNode, BaseSearchKnowledgeNode, BaseSearc
              BaseIntentNode, BaseLoopNode, BaseLoopStartStepNode,
              BaseLoopContinueNode,
              BaseLoopBreakNode, BaseVariableSplittingNode, BaseParameterExtractionNode, BaseVariableAggregationNode,
-             BaseDataSourceLocalNode, BaseDataSourceWebNode, BaseKnowledgeWriteNode]
+             BaseDataSourceLocalNode, BaseDataSourceWebNode, BaseKnowledgeWriteNode, BaseDocumentSplitNode]
 
 node_map = {n.type: {w: n for w in n.support} for n in node_list}
 
diff --git a/apps/application/flow/step_node/document_split_node/__init__.py b/apps/application/flow/step_node/document_split_node/__init__.py
new file mode 100644
index 000000000..ce8f10f3e
--- /dev/null
+++ b/apps/application/flow/step_node/document_split_node/__init__.py
@@ -0,0 +1 @@
+from .impl import *
\ No newline at end of file
diff --git a/apps/application/flow/step_node/document_split_node/i_document_split_node.py b/apps/application/flow/step_node/document_split_node/i_document_split_node.py
new file mode 100644
index 000000000..540b1da9a
--- /dev/null
+++ b/apps/application/flow/step_node/document_split_node/i_document_split_node.py
@@ -0,0 +1,93 @@
+# coding=utf-8
+
+from typing import Type
+
+from django.utils.translation import gettext_lazy as _
+from rest_framework import serializers
+
+from application.flow.common import WorkflowMode
+from application.flow.i_step_node import INode, NodeResult
+
+
+class DocumentSplitNodeSerializer(serializers.Serializer):
+    """Validate the parameters configured on a document-split node."""
+
+    # Reference to the upstream value holding the files to split. ``_run`` passes
+    # element 0 and the remaining elements to ``get_reference_field``.
+    file_list = serializers.ListField(required=False, label=_("file list"))
+    split_strategy = serializers.ChoiceField(
+        choices=['auto', 'custom', 'qa'], required=False, label=_("split strategy"), default='auto'
+    )
+    paragraph_title_relate_problem_type = serializers.ChoiceField(
+        choices=['custom', 'referencing'], required=False, label=_("paragraph title relate problem type"),
+        default='custom'
+    )
+    paragraph_title_relate_problem = serializers.BooleanField(
+        required=False, label=_("paragraph title relate problem"), default=False
+    )
+    paragraph_title_relate_problem_reference = serializers.ListField(
+        required=False, label=_("paragraph title relate problem reference"), child=serializers.CharField()
+    )
+    document_name_relate_problem_type = serializers.ChoiceField(
+        choices=['custom', 'referencing'], required=False, label=_("document name relate problem type"),
+        default='custom'
+    )
+    document_name_relate_problem = serializers.BooleanField(
+        required=False, label=_("document name relate problem"), default=False
+    )
+    document_name_relate_problem_reference = serializers.ListField(
+        required=False, label=_("document name relate problem reference"), child=serializers.CharField()
+    )
+    limit = serializers.IntegerField(required=False, label=_("limit"), default=4096)
+    # A callable default builds a fresh list per use; a literal ``[]`` would be
+    # a single list object shared by every serializer instance.
+    patterns = serializers.ListField(
+        required=False, label=_("patterns"), child=serializers.CharField(), default=list
+    )
+    with_filter = serializers.BooleanField(
+        required=False, label=_("with filter"), default=False
+    )
+
+
+class IDocumentSplitNode(INode):
+    """Interface of the workflow node that splits files into paragraphs."""
+
+    type = 'document-split-node'
+    support = [
+        WorkflowMode.APPLICATION, WorkflowMode.APPLICATION_LOOP, WorkflowMode.KNOWLEDGE_LOOP, WorkflowMode.KNOWLEDGE
+    ]
+
+    def get_node_params_serializer_class(self) -> Type[serializers.Serializer]:
+        """Return the serializer class that validates this node's parameters."""
+        return DocumentSplitNodeSerializer
+
+    def _run(self):
+        """Resolve the referenced file list and call ``execute`` with all parameters."""
+        node_params = self.node_params_serializer.data
+        # ``file_list`` is optional in the serializer, so an unset reference
+        # means "no files" rather than a TypeError from indexing ``None``.
+        reference = node_params.get('file_list') or []
+        files = []
+        if reference:
+            files = self.workflow_manage.get_reference_field(reference[0], reference[1:]) or []
+
+        # ``execute`` names the node's own settings as parameters, so the node
+        # params must be forwarded together with the flow params.
+        params = {**node_params, **self.flow_params_serializer.data}
+        # These may be missing: the two reference lists are optional with no
+        # serializer default, and ``knowledge_id`` is not a node param at all
+        # (presumably supplied by the flow params of knowledge workflows only
+        # -- TODO confirm).
+        for name in ('knowledge_id', 'paragraph_title_relate_problem_reference',
+                     'document_name_relate_problem_reference'):
+            params.setdefault(name, None)
+        # The resolved files replace the raw reference stored under ``file_list``.
+        params['file_list'] = files
+        return self.execute(**params)
+
+    def execute(self, file_list, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
+                paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
+                document_name_relate_problem_type, document_name_relate_problem,
+                document_name_relate_problem_reference, limit, patterns, with_filter, **kwargs) -> NodeResult:
+        """Split ``file_list`` into paragraphs; the base implementation does nothing."""
+        pass
diff --git a/apps/application/flow/step_node/document_split_node/impl/__init__.py b/apps/application/flow/step_node/document_split_node/impl/__init__.py
new file mode 100644
index 000000000..cc7dc7dda
--- /dev/null
+++ b/apps/application/flow/step_node/document_split_node/impl/__init__.py
@@ -0,0 +1 @@
+from .base_document_split_node import BaseDocumentSplitNode
diff --git a/apps/application/flow/step_node/document_split_node/impl/base_document_split_node.py b/apps/application/flow/step_node/document_split_node/impl/base_document_split_node.py
new file mode 100644
index 000000000..89fcec523
--- /dev/null
+++ b/apps/application/flow/step_node/document_split_node/impl/base_document_split_node.py
@@ -0,0 +1,80 @@
+# coding=utf-8
+from django.db.models import QuerySet
+
+from application.flow.i_step_node import NodeResult
+from application.flow.step_node.document_split_node.i_document_split_node import IDocumentSplitNode
+from knowledge.models import File
+from knowledge.serializers.document import split_handles, FileBufferHandle
+
+
+class BaseDocumentSplitNode(IDocumentSplitNode):
+    """Document-split node that delegates the splitting to ``split_handles``."""
+
+    def save_context(self, details, workflow_manage):
+        """Restore this node's context from previously recorded ``details``."""
+        # ``get_details`` records ``file_list`` and ``paragraph_list`` (it has no
+        # ``content`` entry), so those are the keys that can be restored.
+        self.context['file_list'] = details.get('file_list')
+        self.context['paragraph_list'] = details.get('paragraph_list', [])
+
+    def execute(self, file_list, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
+                paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
+                document_name_relate_problem_type, document_name_relate_problem,
+                document_name_relate_problem_reference, limit, patterns, with_filter, **kwargs) -> NodeResult:
+        """Split every file in ``file_list`` and return the combined result.
+
+        Each entry of ``file_list`` must provide a ``file_id``. Everything a split
+        handler returns for a file is tagged with ``source_file_id`` and collected
+        into ``paragraph_list``, which is stored in the node context and returned
+        as the node result. Only ``file_list``, ``patterns``, ``with_filter`` and
+        ``limit`` are used here; the other parameters are accepted but ignored.
+
+        Raises:
+            ValueError: if a ``file_id`` does not match a stored ``File``.
+        """
+        get_buffer = FileBufferHandle().get_buffer
+
+        paragraph_list = []
+        for doc in file_list or []:
+            file = QuerySet(File).filter(id=doc['file_id']).first()
+            if file is None:
+                # ``first()`` returns None when nothing matches; fail with a
+                # clear message instead of an AttributeError on ``file.id``.
+                raise ValueError(f"File not found: {doc['file_id']}")
+            for split_handle in split_handles:
+                if not split_handle.support(file, get_buffer):
+                    continue
+                result = split_handle.handle(file, patterns, with_filter, limit, get_buffer, self.save_image)
+                # A handler returns either a single result or a list of them.
+                items = result if isinstance(result, list) else [result]
+                for item in items:
+                    item['source_file_id'] = file.id
+                # Accumulate across files: assigning here would keep only the
+                # output of the last file.
+                paragraph_list.extend(items)
+                # NOTE(review): stops at the first supporting handler so a file
+                # is split once; confirm no later handler should take priority.
+                break
+
+        self.context['file_list'] = file_list
+        self.context['paragraph_list'] = paragraph_list
+
+        return NodeResult({'paragraph_list': paragraph_list}, {})
+
+    def save_image(self, image_list):
+        """Callback passed to the split handlers; currently a no-op."""
+        # TODO: persist ``image_list``; nothing is saved yet.
+        pass
+
+    def get_details(self, index: int, **kwargs):
+        """Return the execution record of this node."""
+        return {
+            'name': self.node.properties.get('stepName'),
+            "index": index,
+            'run_time': self.context.get('run_time'),
+            'type': self.node.type,
+            'status': self.status,
+            'err_message': self.err_message,
+            'file_list': self.context.get('file_list'),
+            'paragraph_list': self.context.get('paragraph_list', []),
+        }
diff --git a/ui/src/enums/application.ts b/ui/src/enums/application.ts
index 479855e28..d0781273e 100644
--- a/ui/src/enums/application.ts
+++ b/ui/src/enums/application.ts
@@ -19,6 +19,7 @@ export enum WorkflowType {
   RerankerNode = 'reranker-node',
   Application = 'application-node',
   DocumentExtractNode = 'document-extract-node',
+  DocumentSplitNode = 'document-split-node',
   ImageUnderstandNode = 'image-understand-node',
   VariableAssignNode = 'variable-assign-node',
   FormNode = 'form-node',
diff --git a/ui/src/locales/lang/en-US/views/application-workflow.ts b/ui/src/locales/lang/en-US/views/application-workflow.ts
index 9bc8e206c..6c98edbcc 100644
--- a/ui/src/locales/lang/en-US/views/application-workflow.ts
+++ b/ui/src/locales/lang/en-US/views/application-workflow.ts
@@ -83,7 +83,7 @@ export default {
       chunk_length: 'Chunk length',
       text: 'Knowledge write',
       label: 'Knowledge write',
-    },
+    },
     dataSourceWebNode: {
       label: 'Web Site',
       text: 'Web Site',
@@ -250,6 +250,16 @@ You are a master of problem optimization, adept at accurately inferring user int
       text: 'Extract content from documents',
       content: 'Document Content',
     },
+    documentSplitNode: {
+      label: 'Document Splitting',
+      text: 'Split document content into smaller segments',
+      paragraph_list: 'List of split segments',
+      splitStrategy: {
+        label: 'Splitting Strategy',
+        placeholder: 'Please select a splitting strategy',
+        requiredMessage: 'Please select a splitting strategy',
+      },
+    },
     imageUnderstandNode: {
       label: 'Image Understanding',
       text: 'Analyze images to identify objects, scenes, and provide answers',
diff --git a/ui/src/locales/lang/zh-CN/views/application-workflow.ts b/ui/src/locales/lang/zh-CN/views/application-workflow.ts
index 9f6fea9c8..ea39b54fb 100644
--- a/ui/src/locales/lang/zh-CN/views/application-workflow.ts
+++ b/ui/src/locales/lang/zh-CN/views/application-workflow.ts
@@ -85,7 +85,7 @@ export default {
       chunk_length: '子分块长度',
       text: '知识库写入',
       label: '知识库写入',
-    },
+    },
     dataSourceWebNode: {
       label: 'Web站点',
       text: 'Web站点',
@@ -256,6 +256,16 @@ export default {
       text: '提取文档中的内容',
       content: '文档内容',
     },
+    documentSplitNode: {
+      label: '文档分段',
+      text: '将文档内容拆分为多个分段',
+      paragraph_list: '分段列表',
+      splitStrategy: {
+        label: '分段策略',
+        placeholder: '请选择分段策略',
+        requiredMessage: '请选择分段策略',
+      },
+    },
     imageUnderstandNode: {
       label: '图片理解',
       text: '识别出图片中的对象、场景等信息回答用户问题',
diff --git a/ui/src/locales/lang/zh-Hant/views/application-workflow.ts b/ui/src/locales/lang/zh-Hant/views/application-workflow.ts
index 0bf97a87d..cf34de3f3 100644
--- a/ui/src/locales/lang/zh-Hant/views/application-workflow.ts
+++ b/ui/src/locales/lang/zh-Hant/views/application-workflow.ts
@@ -84,7 +84,7 @@ export default {
       chunk_length: '子分塊長度',
       text: '知識庫寫入',
       label: '知識庫寫入',
-    },
+    },
     dataSourceWebNode: {
       label: 'Web網站',
       text: 'Web網站',
@@ -250,6 +250,16 @@ export default {
       text: '提取文檔中的內容',
       content: '文檔內容',
     },
+    documentSplitNode: {
+      label: '文檔拆分',
+      text: '將文檔內容拆分為多個分段',
+      paragraph_list: '分段列表',
+      splitStrategy: {
+        label: '分段策略',
+        placeholder: '請選擇分段策略',
+        requiredMessage: '請選擇分段策略',
+      },
+    },
     imageUnderstandNode: {
       label: '圖片理解',
       text: '識別出圖片中的物件、場景等信息回答用戶問題',
diff --git a/ui/src/workflow/common/data.ts b/ui/src/workflow/common/data.ts
index 7a3012298..9ced28264 100644
--- a/ui/src/workflow/common/data.ts
+++ b/ui/src/workflow/common/data.ts
@@ -387,6 +387,24 @@ export const documentExtractNode = {
     },
   },
 }
+export const documentSplitNode = {
+  type: WorkflowType.DocumentSplitNode,
+  text: t('views.applicationWorkflow.nodes.documentSplitNode.text'),
+  label: t('views.applicationWorkflow.nodes.documentSplitNode.label'),
+  height: 252,
+  properties: {
+    width: 500,
+    stepName: t('views.applicationWorkflow.nodes.documentSplitNode.label'),
+    config: {
+      fields: [
+        {
+          label: t('views.applicationWorkflow.nodes.documentSplitNode.paragraph_list'),
+          value: 'paragraph_list',
+        },
+      ],
+    },
+  },
+}
 export const imageUnderstandNode = {
   type: WorkflowType.ImageUnderstandNode,
   text: t('views.applicationWorkflow.nodes.imageUnderstandNode.text'),
@@ -724,7 +742,7 @@ export const knowledgeMenuNodes = [
   },
   {
     label: t('views.knowledge.title'),
-    list: [documentExtractNode, knowledgeWriteNode],
+    list: [documentExtractNode, documentSplitNode, knowledgeWriteNode],
   },
   {
     label: t('views.applicationWorkflow.nodes.classify.businessLogic'),
@@ -763,7 +781,7 @@ export const menuNodes = [
   },
   {
     label: t('views.knowledge.title'),
-    list: [searchKnowledgeNode, searchDocumentNode, rerankerNode, documentExtractNode],
+    list: [searchKnowledgeNode, searchDocumentNode, rerankerNode, documentExtractNode, documentSplitNode, knowledgeWriteNode],
   },
   {
     label: t('views.applicationWorkflow.nodes.classify.businessLogic'),
@@ -949,6 +967,7 @@ export const nodeDict: any = {
   [WorkflowType.FormNode]: formNode,
   [WorkflowType.Application]: applicationNode,
   [WorkflowType.DocumentExtractNode]: documentExtractNode,
+  [WorkflowType.DocumentSplitNode]: documentSplitNode,
   [WorkflowType.ImageUnderstandNode]: imageUnderstandNode,
   [WorkflowType.TextToSpeechNode]: textToSpeechNode,
   [WorkflowType.SpeechToTextNode]: speechToTextNode,
diff --git a/ui/src/workflow/common/validate.ts b/ui/src/workflow/common/validate.ts
index b41044918..aad2ccd4a 100644
--- a/ui/src/workflow/common/validate.ts
+++ b/ui/src/workflow/common/validate.ts
@@ -15,12 +15,12 @@ const end_nodes: Array = [
   WorkflowType.ImageGenerateNode,
   WorkflowType.ImageToVideoGenerateNode,
   WorkflowType.TextToVideoGenerateNode,
-  WorkflowType.ImageGenerateNode,
   WorkflowType.LoopBodyNode,
   WorkflowType.LoopNode,
   WorkflowType.LoopBreakNode,
   WorkflowType.VideoUnderstandNode,
   WorkflowType.VariableAssignNode,
+  WorkflowType.KnowledgeWriteNode,
 ]
 
 const loop_end_nodes: Array = [
@@ -36,7 +36,6 @@ const loop_end_nodes: Array = [
   WorkflowType.ImageGenerateNode,
   WorkflowType.ImageToVideoGenerateNode,
   WorkflowType.TextToVideoGenerateNode,
-  WorkflowType.ImageGenerateNode,
   WorkflowType.LoopBodyNode,
   WorkflowType.LoopNode,
   WorkflowType.LoopBreakNode,
diff --git a/ui/src/workflow/icons/document-split-node-icon.vue b/ui/src/workflow/icons/document-split-node-icon.vue
new file mode 100644
index 000000000..a081bb44f
--- /dev/null
+++ b/ui/src/workflow/icons/document-split-node-icon.vue
@@ -0,0 +1,6 @@
+
+
diff --git a/ui/src/workflow/nodes/document-split-node/index.ts b/ui/src/workflow/nodes/document-split-node/index.ts
new file mode 100644
index 000000000..ace4d3d14
--- /dev/null
+++ b/ui/src/workflow/nodes/document-split-node/index.ts
@@ -0,0 +1,14 @@
+import DocumentSplitNodeVue from './index.vue'
+import { AppNode, AppNodeModel } from '@/workflow/common/app-node'
+
+class DocumentSplitNode extends AppNode {
+  constructor(props: any) {
+    super(props, DocumentSplitNodeVue)
+  }
+}
+
+export default {
+  type: 'document-split-node',
+  model: AppNodeModel,
+  view: DocumentSplitNode
+}
diff --git a/ui/src/workflow/nodes/document-split-node/index.vue b/ui/src/workflow/nodes/document-split-node/index.vue
new file mode 100644
index 000000000..776aff0c8
--- /dev/null
+++ b/ui/src/workflow/nodes/document-split-node/index.vue
@@ -0,0 +1,275 @@
+
+
+
+
+