feat: add Document Split Node functionality and localization

This commit is contained in:
CaptainB 2025-11-20 14:44:58 +08:00
parent 9dc3f21406
commit b6c6e1b336
14 changed files with 487 additions and 8 deletions

View File

@ -39,6 +39,7 @@ from .variable_aggregation_node.impl.base_variable_aggregation_node import BaseV
from .variable_assign_node import BaseVariableAssignNode
from .variable_splitting_node import BaseVariableSplittingNode
from .video_understand_step_node import BaseVideoUnderstandNode
from .document_split_node import BaseDocumentSplitNode
node_list = [BaseStartStepNode, BaseChatNode, BaseSearchKnowledgeNode, BaseSearchDocumentNode, BaseQuestionNode,
BaseConditionNode, BaseReplyNode,
@ -50,7 +51,7 @@ node_list = [BaseStartStepNode, BaseChatNode, BaseSearchKnowledgeNode, BaseSearc
BaseIntentNode, BaseLoopNode, BaseLoopStartStepNode,
BaseLoopContinueNode,
BaseLoopBreakNode, BaseVariableSplittingNode, BaseParameterExtractionNode, BaseVariableAggregationNode,
BaseDataSourceLocalNode, BaseDataSourceWebNode, BaseKnowledgeWriteNode]
BaseDataSourceLocalNode, BaseDataSourceWebNode, BaseKnowledgeWriteNode, BaseDocumentSplitNode]
node_map = {n.type: {w: n for w in n.support} for n in node_list}

View File

@ -0,0 +1 @@
from .impl import *

View File

@ -0,0 +1,64 @@
# coding=utf-8
from typing import Type
from django.utils.translation import gettext_lazy as _
from rest_framework import serializers
from application.flow.common import WorkflowMode
from application.flow.i_step_node import INode, NodeResult
class DocumentSplitNodeSerializer(serializers.Serializer):
    """Validate the parameters configured on a document split node.

    Every field is optional; most fall back to the defaults declared below.
    The three fields without a default (``file_list`` and the two
    ``*_reference`` fields) are simply absent when not supplied.
    """
    # Reference to the files to split: ``IDocumentSplitNode._run`` passes the
    # first element and the remaining elements to
    # ``workflow_manage.get_reference_field``.
    file_list = serializers.ListField(required=False, label=_("file list"))
    # One of 'auto', 'custom' or 'qa'.
    split_strategy = serializers.ChoiceField(
        choices=['auto', 'custom', 'qa'], required=False, label=_("split strategy"), default='auto'
    )
    # Whether the "paragraph title as related problem" switch is a literal
    # boolean ('custom') or comes from a reference ('referencing').
    paragraph_title_relate_problem_type = serializers.ChoiceField(
        choices=['custom', 'referencing'], required=False, label=_("paragraph title relate problem type"),
        default='custom'
    )
    paragraph_title_relate_problem = serializers.BooleanField(
        required=False, label=_("paragraph title relate problem"), default=False
    )
    paragraph_title_relate_problem_reference = serializers.ListField(
        required=False, label=_("paragraph title relate problem reference"), child=serializers.CharField()
    )
    # Same literal/reference pair as above, for the document name.
    document_name_relate_problem_type = serializers.ChoiceField(
        choices=['custom', 'referencing'], required=False, label=_("document name relate problem type"),
        default='custom'
    )
    document_name_relate_problem = serializers.BooleanField(
        required=False, label=_("document name relate problem"), default=False
    )
    document_name_relate_problem_reference = serializers.ListField(
        required=False, label=_("document name relate problem reference"), child=serializers.CharField()
    )
    limit = serializers.IntegerField(required=False, label=_("limit"), default=4096)
    # A callable default gives every validation its own list; a literal ``[]``
    # would be one shared object handed out to all of them.
    patterns = serializers.ListField(
        required=False, label=_("patterns"), child=serializers.CharField(), default=list
    )
    with_filter = serializers.BooleanField(
        required=False, label=_("with filter"), default=False
    )
class IDocumentSplitNode(INode):
    """Interface of the workflow node that splits referenced files into paragraphs."""
    type = 'document-split-node'
    support = [
        WorkflowMode.APPLICATION, WorkflowMode.APPLICATION_LOOP, WorkflowMode.KNOWLEDGE_LOOP, WorkflowMode.KNOWLEDGE
    ]

    def get_node_params_serializer_class(self) -> Type[serializers.Serializer]:
        """Return the serializer class used to validate this node's parameters."""
        return DocumentSplitNodeSerializer

    def _run(self):
        """Resolve the referenced file list and call ``execute`` with all parameters.

        Raises:
            ValueError: if the node has no ``file_list`` reference configured.
        """
        node_params = self.node_params_serializer.data
        reference = node_params.get('file_list')
        if not reference:
            # Indexing an unset/empty reference would only produce an opaque
            # TypeError/IndexError.
            raise ValueError('document-split-node: file_list reference is not configured')
        file_list = self.workflow_manage.get_reference_field(reference[0], reference[1:])

        # ``execute`` takes the split settings (split_strategy, limit, patterns, ...)
        # as required arguments; they live in the node params, so merge them with
        # the flow params. Flow params win on a key clash, as they were the only
        # source before.
        params = {**node_params, **self.flow_params_serializer.data}
        params['file_list'] = file_list
        # These serializer fields declare no default, so they may be missing.
        params.setdefault('paragraph_title_relate_problem_reference', [])
        params.setdefault('document_name_relate_problem_reference', [])
        # NOTE(review): presumably only knowledge-mode flow params carry a
        # knowledge_id; default it so application modes can run - confirm.
        params.setdefault('knowledge_id', None)
        return self.execute(**params)

    def execute(self, file_list, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
                paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
                document_name_relate_problem_type, document_name_relate_problem,
                document_name_relate_problem_reference, limit, patterns, with_filter, **kwargs) -> NodeResult:
        """Split the given files; implemented by subclasses.

        ``file_list`` is the value resolved from the node's file reference; the
        remaining named arguments mirror the fields of
        ``DocumentSplitNodeSerializer`` plus ``knowledge_id``. Extra flow
        parameters arrive in ``kwargs``.
        """
        pass

View File

@ -0,0 +1 @@
from .base_document_split_node import BaseDocumentSplitNode

View File

@ -0,0 +1,68 @@
# coding=utf-8
from django.db.models import QuerySet
from application.flow.i_step_node import NodeResult
from application.flow.step_node.document_split_node.i_document_split_node import IDocumentSplitNode
from knowledge.models import File
from knowledge.serializers.document import split_handles, FileBufferHandle
class BaseDocumentSplitNode(IDocumentSplitNode):
    """Document split node that runs each referenced file through a split handler."""

    def save_context(self, details, workflow_manage):
        """Restore the node context from a previously recorded ``details`` dict."""
        self.context['content'] = details.get('content')
        # get_details() records these keys, so restore them for a faithful round trip.
        self.context['file_list'] = details.get('file_list')
        self.context['paragraph_list'] = details.get('paragraph_list', [])

    def execute(self, file_list, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
                paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
                document_name_relate_problem_type, document_name_relate_problem,
                document_name_relate_problem_reference, limit, patterns, with_filter, **kwargs) -> NodeResult:
        """Split every file in ``file_list`` and collect the results.

        Each entry of ``file_list`` must provide a ``file_id`` key. The first
        handler in ``split_handles`` that supports the file is called with
        ``patterns``, ``with_filter`` and ``limit``; every item it returns is
        tagged with ``source_file_id``. A file that no handler supports
        contributes nothing. The other parameters are currently unused here.

        Returns:
            NodeResult whose node variables contain ``paragraph_list``.

        Raises:
            ValueError: if a referenced file does not exist.
        """
        paragraph_list = []
        for doc in file_list or []:
            file = QuerySet(File).filter(id=doc['file_id']).first()
            if file is None:
                raise ValueError(f"File {doc['file_id']} does not exist")
            # NOTE(review): a fresh handle per file, in case FileBufferHandle caches
            # the first buffer it reads - confirm against its implementation.
            get_buffer = FileBufferHandle().get_buffer
            for split_handle in split_handles:
                if not split_handle.support(file, get_buffer):
                    continue
                result = split_handle.handle(file, patterns, with_filter, limit, get_buffer, self.save_image)
                items = result if isinstance(result, list) else [result]
                for item in items:
                    item['source_file_id'] = file.id
                # Accumulate across files instead of keeping only the last result.
                paragraph_list.extend(items)
                # One handler per file: later handlers must not process it again.
                break
        self.context['file_list'] = file_list
        self.context['paragraph_list'] = paragraph_list
        return NodeResult({'paragraph_list': paragraph_list}, {})

    def save_image(self, image_list):
        """Callback given to the split handlers; currently a deliberate no-op.

        TODO: persist ``image_list`` (skipping files that already exist) once
        the node has a knowledge base to attach the images to.
        """

    def get_details(self, index: int, **kwargs):
        """Return the execution record of this node for the given step ``index``."""
        return {
            'name': self.node.properties.get('stepName'),
            "index": index,
            'run_time': self.context.get('run_time'),
            'type': self.node.type,
            'status': self.status,
            'err_message': self.err_message,
            'file_list': self.context.get('file_list'),
            'paragraph_list': self.context.get('paragraph_list', []),
        }

View File

@ -19,6 +19,7 @@ export enum WorkflowType {
RerankerNode = 'reranker-node',
Application = 'application-node',
DocumentExtractNode = 'document-extract-node',
DocumentSplitNode = 'document-split-node',
ImageUnderstandNode = 'image-understand-node',
VariableAssignNode = 'variable-assign-node',
FormNode = 'form-node',

View File

@ -83,7 +83,7 @@ export default {
chunk_length: 'Chunk length',
text: 'Knowledge write',
label: 'Knowledge write',
},
},
dataSourceWebNode: {
label: 'Web Site',
text: 'Web Site',
@ -250,6 +250,16 @@ You are a master of problem optimization, adept at accurately inferring user int
text: 'Extract content from documents',
content: 'Document Content',
},
documentSplitNode: {
label: 'Document Splitting',
text: 'Split document content into smaller segments',
paragraph_list: 'List of split segments',
splitStrategy: {
label: 'Splitting Strategy',
placeholder: 'Please select a splitting strategy',
requiredMessage: 'Please select a splitting strategy',
},
},
imageUnderstandNode: {
label: 'Image Understanding',
text: 'Analyze images to identify objects, scenes, and provide answers',

View File

@ -85,7 +85,7 @@ export default {
chunk_length: '子分块长度',
text: '知识库写入',
label: '知识库写入',
},
},
dataSourceWebNode: {
label: 'Web站点',
text: 'Web站点',
@ -256,6 +256,16 @@ export default {
text: '提取文档中的内容',
content: '文档内容',
},
documentSplitNode: {
label: '文档分段',
text: '将文档内容拆分为多个分段',
paragraph_list: '分段列表',
splitStrategy: {
label: '分段策略',
placeholder: '请选择分段策略',
requiredMessage: '请选择分段策略',
},
},
imageUnderstandNode: {
label: '图片理解',
text: '识别出图片中的对象、场景等信息回答用户问题',

View File

@ -84,7 +84,7 @@ export default {
chunk_length: '子分塊長度',
text: '知識庫寫入',
label: '知識庫寫入',
},
},
dataSourceWebNode: {
label: 'Web網站',
text: 'Web網站',
@ -250,6 +250,16 @@ export default {
text: '提取文檔中的內容',
content: '文檔內容',
},
documentSplitNode: {
label: '文檔拆分',
text: '將文檔內容拆分為多個分段',
paragraph_list: '分段列表',
splitStrategy: {
label: '分段策略',
placeholder: '請選擇分段策略',
requiredMessage: '請選擇分段策略',
},
},
imageUnderstandNode: {
label: '圖片理解',
text: '識別出圖片中的物件、場景等信息回答用戶問題',

View File

@ -387,6 +387,24 @@ export const documentExtractNode = {
},
},
}
/**
 * Definition of the document split node: type, localized text/label,
 * default size (height 252, width 500) and step name.
 * `config.fields` lists a single entry, `paragraph_list`
 * (presumably the output other nodes can reference - confirm).
 */
export const documentSplitNode = {
type: WorkflowType.DocumentSplitNode,
text: t('views.applicationWorkflow.nodes.documentSplitNode.text'),
label: t('views.applicationWorkflow.nodes.documentSplitNode.label'),
height: 252,
properties: {
width: 500,
stepName: t('views.applicationWorkflow.nodes.documentSplitNode.label'),
config: {
fields: [
{
label: t('views.applicationWorkflow.nodes.documentSplitNode.paragraph_list'),
value: 'paragraph_list',
},
],
},
},
}
export const imageUnderstandNode = {
type: WorkflowType.ImageUnderstandNode,
text: t('views.applicationWorkflow.nodes.imageUnderstandNode.text'),
@ -724,7 +742,7 @@ export const knowledgeMenuNodes = [
},
{
label: t('views.knowledge.title'),
list: [documentExtractNode, knowledgeWriteNode],
list: [documentExtractNode, documentSplitNode, knowledgeWriteNode],
},
{
label: t('views.applicationWorkflow.nodes.classify.businessLogic'),
@ -763,7 +781,7 @@ export const menuNodes = [
},
{
label: t('views.knowledge.title'),
list: [searchKnowledgeNode, searchDocumentNode, rerankerNode, documentExtractNode],
list: [searchKnowledgeNode, searchDocumentNode, rerankerNode, documentExtractNode, documentSplitNode, knowledgeWriteNode],
},
{
label: t('views.applicationWorkflow.nodes.classify.businessLogic'),
@ -949,6 +967,7 @@ export const nodeDict: any = {
[WorkflowType.FormNode]: formNode,
[WorkflowType.Application]: applicationNode,
[WorkflowType.DocumentExtractNode]: documentExtractNode,
[WorkflowType.DocumentSplitNode]: documentSplitNode,
[WorkflowType.ImageUnderstandNode]: imageUnderstandNode,
[WorkflowType.TextToSpeechNode]: textToSpeechNode,
[WorkflowType.SpeechToTextNode]: speechToTextNode,

View File

@ -15,12 +15,12 @@ const end_nodes: Array<string> = [
WorkflowType.ImageGenerateNode,
WorkflowType.ImageToVideoGenerateNode,
WorkflowType.TextToVideoGenerateNode,
WorkflowType.ImageGenerateNode,
WorkflowType.LoopBodyNode,
WorkflowType.LoopNode,
WorkflowType.LoopBreakNode,
WorkflowType.VideoUnderstandNode,
WorkflowType.VariableAssignNode,
WorkflowType.KnowledgeWriteNode,
]
const loop_end_nodes: Array<string> = [
@ -36,7 +36,6 @@ const loop_end_nodes: Array<string> = [
WorkflowType.ImageGenerateNode,
WorkflowType.ImageToVideoGenerateNode,
WorkflowType.TextToVideoGenerateNode,
WorkflowType.ImageGenerateNode,
WorkflowType.LoopBodyNode,
WorkflowType.LoopNode,
WorkflowType.LoopBreakNode,

View File

@ -0,0 +1,6 @@
<!-- Square blue avatar wrapping the docs workflow icon; presumably the menu icon of the document split node. -->
<template>
<el-avatar shape="square" class="avatar-blue">
<img src="@/assets/workflow/icon_docs.svg" style="width: 65%" alt="" />
</el-avatar>
</template>
<script setup lang="ts"></script>

View File

@ -0,0 +1,14 @@
import DocumentSplitNodeVue from './index.vue'
import { AppNode, AppNodeModel } from '@/workflow/common/app-node'
/** Node view that hands the document split Vue component to the shared AppNode base. */
class DocumentSplitNode extends AppNode {
constructor(props: any) {
super(props, DocumentSplitNodeVue)
}
}
// Node registration: the type string pairs the shared AppNodeModel with the view above.
export default {
type: 'document-split-node',
model: AppNodeModel,
view: DocumentSplitNode
}

View File

@ -0,0 +1,275 @@
<template>
  <NodeContainer :nodeModel="nodeModel">
    <h5 class="title-decoration-1 mb-8">{{ $t('views.applicationWorkflow.nodeSetting') }}</h5>
    <el-card shadow="never" class="card-never">
      <el-form
        ref="aiChatNodeFormRef"
        @submit.prevent
        :model="form_data"
        label-position="top"
        require-asterisk-position="right"
        label-width="auto"
      >
        <!-- Files to split. `prop` is required for the form to evaluate the rules below. -->
        <el-form-item
          :label="$t('views.problem.relateParagraph.selectDocument')"
          prop="file_list"
          :rules="{
            type: 'array',
            required: true,
            message: $t('views.chatLog.documentPlaceholder'),
            trigger: 'change'
          }"
        >
          <NodeCascader
            ref="nodeCascaderRef"
            :nodeModel="nodeModel"
            class="w-full"
            :placeholder="$t('views.chatLog.documentPlaceholder')"
            v-model="form_data.file_list"
          />
        </el-form-item>
        <!-- Split strategy. The locale files define `requiredMessage`, not `required`. -->
        <el-form-item
          :label="$t('views.applicationWorkflow.nodes.documentSplitNode.splitStrategy.label')"
          prop="split_strategy"
          :rules="{
            required: true,
            message: $t(
              'views.applicationWorkflow.nodes.documentSplitNode.splitStrategy.requiredMessage'
            ),
            trigger: 'change'
          }"
        >
          <el-select
            v-model="form_data.split_strategy"
            :placeholder="
              $t('views.applicationWorkflow.nodes.documentSplitNode.splitStrategy.placeholder')
            "
          >
            <el-option :label="$t('views.document.setRules.intelligent.label')" value="auto" />
            <el-option :label="$t('views.document.setRules.advanced.label')" value="custom" />
            <el-option :label="$t('views.document.fileType.QA.label')" value="qa" />
          </el-select>
        </el-form-item>
        <!-- Extra split rules, shown only for the custom strategy. -->
        <div v-if="form_data.split_strategy === 'custom'">
          <div class="set-rules__form">
            <div class="form-item mb-16">
              <div class="title flex align-center mb-8">
                <span style="margin-right: 4px">{{
                  $t('views.document.setRules.patterns.label')
                }}</span>
                <el-tooltip
                  effect="dark"
                  :content="$t('views.document.setRules.patterns.tooltip')"
                  placement="right"
                >
                  <AppIcon iconName="app-warning" class="app-warning-icon"></AppIcon>
                </el-tooltip>
              </div>
              <div @click.stop>
                <el-select
                  v-model="form_data.patterns"
                  multiple
                  :reserve-keyword="false"
                  allow-create
                  default-first-option
                  filterable
                  :placeholder="$t('views.document.setRules.patterns.placeholder')"
                >
                  <el-option
                    v-for="(item, index) in splitPatternList"
                    :key="index"
                    :label="item.key"
                    :value="item.value"
                  >
                  </el-option>
                </el-select>
              </div>
            </div>
            <div class="form-item mb-16">
              <div class="title mb-8">
                {{ $t('views.document.setRules.limit.label') }}
              </div>
              <el-slider
                v-model="form_data.limit"
                show-input
                :show-input-controls="false"
                :min="50"
                :max="100000"
              />
            </div>
            <div class="form-item mb-16">
              <div class="title mb-8">
                {{ $t('views.document.setRules.with_filter.label') }}
              </div>
              <el-switch size="small" v-model="form_data.with_filter" />
              <div style="margin-top: 4px">
                <el-text type="info">
                  {{ $t('views.document.setRules.with_filter.text') }}
                </el-text>
              </div>
            </div>
          </div>
        </div>
        <!-- TODO: the label text below is hard-coded Chinese; move it to the i18n files. -->
        <el-form-item v-if="form_data.split_strategy !== 'qa'">
          <template #label>
            <div class="flex-between">
              <span>分段标题设置为分段的关联问题</span>
              <el-select
                v-model="form_data.paragraph_title_relate_problem_type"
                size="small"
                style="width: 100px"
              >
                <el-option
                  :label="$t('views.applicationWorkflow.nodes.searchDocumentNode.custom')"
                  value="custom"
                />
                <el-option
                  :label="$t('views.applicationWorkflow.variable.Referencing')"
                  value="referencing"
                />
              </el-select>
            </div>
          </template>
          <el-switch
            v-if="form_data.paragraph_title_relate_problem_type === 'custom'"
            size="small"
            v-model="form_data.paragraph_title_relate_problem"
          />
          <NodeCascader
            v-else
            ref="nodeCascaderRef2"
            :nodeModel="nodeModel"
            class="w-full"
            :placeholder="$t('views.chatLog.documentPlaceholder')"
            v-model="form_data.paragraph_title_relate_problem_reference"
          />
        </el-form-item>
        <!-- TODO: the label text below is hard-coded Chinese; move it to the i18n files. -->
        <el-form-item>
          <template #label>
            <div class="flex-between">
              <span>文档名称设置为分段的关联问题</span>
              <el-select
                v-model="form_data.document_name_relate_problem_type"
                size="small"
                style="width: 100px"
              >
                <el-option
                  :label="$t('views.applicationWorkflow.nodes.searchDocumentNode.custom')"
                  value="custom"
                />
                <el-option
                  :label="$t('views.applicationWorkflow.variable.Referencing')"
                  value="referencing"
                />
              </el-select>
            </div>
          </template>
          <el-switch
            v-if="form_data.document_name_relate_problem_type === 'custom'"
            size="small"
            v-model="form_data.document_name_relate_problem"
          />
          <NodeCascader
            v-else
            ref="nodeCascaderRef3"
            :nodeModel="nodeModel"
            class="w-full"
            :placeholder="$t('views.chatLog.documentPlaceholder')"
            v-model="form_data.document_name_relate_problem_reference"
          />
        </el-form-item>
      </el-form>
    </el-card>
  </NodeContainer>
</template>
<script setup lang="ts">
import NodeContainer from '@/workflow/common/NodeContainer.vue'
import { computed, onMounted, ref } from 'vue'
import { set } from 'lodash'
import NodeCascader from '@/workflow/common/NodeCascader.vue'
import type { FormInstance } from 'element-plus'
import type { KeyValue } from '@/api/type/common.ts'
import { loadSharedApi } from '@/utils/dynamics-api/shared-api.ts'
import { useRoute } from 'vue-router'

const props = defineProps<{ nodeModel: any }>()

const route = useRoute()
// The `id` query parameter is the knowledge ID.
const {
  query: { id }
} = route as any

// Which API flavour to load, derived from the current route path.
const apiType = computed(() => {
  if (route.path.includes('shared')) return 'systemShare'
  if (route.path.includes('resource-management')) return 'systemManage'
  return 'workspace'
})

// Split patterns offered in the "custom" strategy select.
const splitPatternList = ref<Array<KeyValue<string, string>>>([])
const patternLoading = ref<boolean>(false)

// Values written to `node_data` the first time the node is rendered.
const defaultNodeData = {
  file_list: [],
  split_strategy: 'auto',
  paragraph_title_relate_problem_type: 'custom',
  paragraph_title_relate_problem: false,
  paragraph_title_relate_problem_reference: [],
  document_name_relate_problem_type: 'custom',
  document_name_relate_problem: false,
  document_name_relate_problem_reference: [],
  limit: 4096,
  patterns: [],
  with_filter: false
}

// Form state lives on the node model; it is seeded with the defaults on first read.
const form_data = computed({
  get: () => {
    const properties = props.nodeModel.properties
    if (!properties.node_data) {
      set(properties, 'node_data', defaultNodeData)
    }
    return properties.node_data
  },
  set: (value) => {
    set(props.nodeModel.properties, 'node_data', value)
  }
})

const aiChatNodeFormRef = ref<FormInstance>()
const nodeCascaderRef = ref()
const nodeCascaderRef2 = ref()
const nodeCascaderRef3 = ref()

// Validates every mounted cascader plus the form; a failure is rejected together with the node.
const validate = () => {
  const cascaderChecks = [nodeCascaderRef, nodeCascaderRef2, nodeCascaderRef3].map((cascader) =>
    cascader.value ? cascader.value.validate() : Promise.resolve('')
  )
  return Promise.all([...cascaderChecks, aiChatNodeFormRef.value?.validate()]).catch(
    (err: any) => Promise.reject({ node: props.nodeModel, errMessage: err })
  )
}

// Loads the selectable split patterns for the current knowledge ID.
const initSplitPatternList = () => {
  const documentApi = loadSharedApi({ type: 'document', systemType: apiType.value })
  documentApi.listSplitPattern(id, patternLoading).then((res: any) => {
    splitPatternList.value = res.data
  })
}

onMounted(() => {
  initSplitPatternList()
  // Expose `validate` on the node model.
  set(props.nodeModel, 'validate', validate)
})
</script>
<style lang="scss" scoped>
</style>