feat: enhance Document Split Node with result processing and problem list generation

This commit is contained in:
CaptainB 2025-11-21 15:24:40 +08:00
parent 820b68071b
commit 5922597775
6 changed files with 43 additions and 37 deletions

View File

@ -25,5 +25,5 @@ class IDocumentExtractNode(INode):
self.node_params_serializer.data.get('document_list')[1:])
return self.execute(document=res, **self.flow_params_serializer.data)
def execute(self, document, chat_id, **kwargs) -> NodeResult:
def execute(self, document, chat_id=None, **kwargs) -> NodeResult:
pass

View File

@ -42,23 +42,28 @@ class BaseDocumentExtractNode(IDocumentExtractNode):
def save_context(self, details, workflow_manage):
self.context['content'] = details.get('content')
def execute(self, document, chat_id, **kwargs):
def execute(self, document, chat_id=None, **kwargs):
get_buffer = FileBufferHandle().get_buffer
self.context['document_list'] = document
content = []
if document is None or not isinstance(document, list):
return NodeResult({'content': ''}, {})
return NodeResult({'content': '', 'document_list': []}, {})
application = self.workflow_manage.work_flow_post_handler.chat_info.application
# Safely resolve application; any link in the lookup chain may be missing
application = None
if (self.workflow_manage and
self.workflow_manage.work_flow_post_handler and
self.workflow_manage.work_flow_post_handler.chat_info):
application = self.workflow_manage.work_flow_post_handler.chat_info.application
# Save the images embedded in the doc file
def save_image(image_list):
for image in image_list:
meta = {
'debug': False if application.id else True,
'debug': False if (application and application.id) else True,
'chat_id': chat_id,
'application_id': str(application.id) if application.id else None,
'application_id': str(application.id) if (application and application.id) else None,
'file_id': str(image.id)
}
file_bytes = image.meta.pop('content')
@ -70,6 +75,7 @@ class BaseDocumentExtractNode(IDocumentExtractNode):
'source_type': FileSourceType.APPLICATION.value
}).upload()
document_list = []
for doc in document:
file = QuerySet(File).filter(id=doc['file_id']).first()
buffer = io.BytesIO(file.get_bytes())
@ -81,9 +87,10 @@ class BaseDocumentExtractNode(IDocumentExtractNode):
buffer.seek(0)
file_content = split_handle.get_content(buffer, save_image)
content.append('### ' + doc['name'] + '\n' + file_content)
document_list.append({'id': file.id, 'name': doc['name'], 'content': file_content})
break
return NodeResult({'content': splitter.join(content)}, {})
return NodeResult({'content': splitter.join(content), 'document_list': document_list}, {})
def get_details(self, index: int, **kwargs):
content = self.context.get('content', '').split(splitter)

View File

@ -10,7 +10,7 @@ from application.flow.i_step_node import INode, NodeResult
class DocumentSplitNodeSerializer(serializers.Serializer):
file_list = serializers.ListField(required=False, label=_("file list"))
document_list = serializers.ListField(required=False, label=_("document list"))
split_strategy = serializers.ChoiceField(
choices=['auto', 'custom', 'qa'], required=False, label=_("split strategy"), default='auto'
)
@ -53,11 +53,11 @@ class IDocumentSplitNode(INode):
return DocumentSplitNodeSerializer
def _run(self):
res = self.workflow_manage.get_reference_field(self.node_params_serializer.data.get('file_list')[0],
self.node_params_serializer.data.get('file_list')[1:])
return self.execute(files=res, **self.node_params_serializer.data, **self.flow_params_serializer.data)
# res = self.workflow_manage.get_reference_field(self.node_params_serializer.data.get('file_list')[0],
# self.node_params_serializer.data.get('file_list')[1:])
return self.execute(**self.node_params_serializer.data, **self.flow_params_serializer.data)
def execute(self, files, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
def execute(self, document_list, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
document_name_relate_problem_type, document_name_relate_problem,
document_name_relate_problem_reference, limit, patterns, with_filter, **kwargs) -> NodeResult:

View File

@ -9,7 +9,7 @@ from django.db.models import QuerySet
from application.flow.i_step_node import NodeResult
from application.flow.step_node.document_split_node.i_document_split_node import IDocumentSplitNode
from knowledge.models import File, FileSourceType
from knowledge.serializers.document import split_handles, FileBufferHandle
from knowledge.serializers.document import default_split_handle, FileBufferHandle
def bytes_to_uploaded_file(file_bytes, file_name="file.txt"):
@ -42,36 +42,31 @@ class BaseDocumentSplitNode(IDocumentSplitNode):
def get_reference_content(self, fields: List[str]):
return self.workflow_manage.get_reference_field(fields[0], fields[1:])
def execute(self, files, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
def execute(self, document_list, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
document_name_relate_problem_type, document_name_relate_problem,
document_name_relate_problem_reference, limit, patterns, with_filter, **kwargs) -> NodeResult:
get_buffer = FileBufferHandle().get_buffer
self.context['file_list'] = files
self.context['knowledge_id'] = knowledge_id
file_list = self.workflow_manage.get_reference_field(document_list[0], document_list[1:])
paragraph_list = []
for doc in files:
file = QuerySet(File).filter(id=doc['file_id']).first()
file_mem = bytes_to_uploaded_file(file.get_bytes(), file_name=file.file_name)
get_buffer = FileBufferHandle().get_buffer
for split_handle in split_handles:
if split_handle.support(file_mem, get_buffer):
result = split_handle.handle(file_mem, patterns, with_filter, limit, get_buffer, self._save_image)
# Normalize the result to a list
results = result if isinstance(result, list) else [result]
for doc in file_list:
file_mem = bytes_to_uploaded_file(doc['content'].encode('utf-8'), doc['name'])
result = default_split_handle.handle(file_mem, patterns, with_filter, limit, get_buffer, self._save_image)
# Normalize the result to a list
results = result if isinstance(result, list) else [result]
for item in results:
self._process_split_result(
item, knowledge_id, file.id, file.file_name,
split_strategy, paragraph_title_relate_problem_type,
paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
document_name_relate_problem_type, document_name_relate_problem,
document_name_relate_problem_reference
)
for item in results:
self._process_split_result(
item, knowledge_id, doc['id'], doc['name'],
split_strategy, paragraph_title_relate_problem_type,
paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
document_name_relate_problem_type, document_name_relate_problem,
document_name_relate_problem_reference
)
paragraph_list = results
break
paragraph_list = results
self.context['paragraph_list'] = paragraph_list

View File

@ -383,6 +383,10 @@ export const documentExtractNode = {
label: t('views.applicationWorkflow.nodes.documentExtractNode.content'),
value: 'content',
},
{
label: t('views.applicationWorkflow.nodes.dataSourceWebNode.field_label'),
value: 'document_list',
},
],
},
},

View File

@ -22,7 +22,7 @@
:nodeModel="nodeModel"
class="w-full"
:placeholder="$t('views.chatLog.documentPlaceholder')"
v-model="form_data.file_list"
v-model="form_data.document_list"
/>
</el-form-item>
<el-form-item
@ -207,7 +207,7 @@ const props = defineProps<{ nodeModel: any }>()
const splitPatternList = ref<Array<KeyValue<string, string>>>([])
const form = {
file_list: [],
document_list: [],
split_strategy: 'auto',
paragraph_title_relate_problem_type: 'custom',
paragraph_title_relate_problem: false,