mirror of
https://github.com/1Panel-dev/MaxKB.git
synced 2026-01-01 02:52:48 +00:00
feat: enhance Document Split Node with result processing and problem list generation
This commit is contained in:
parent
820b68071b
commit
5922597775
|
|
@ -25,5 +25,5 @@ class IDocumentExtractNode(INode):
|
|||
self.node_params_serializer.data.get('document_list')[1:])
|
||||
return self.execute(document=res, **self.flow_params_serializer.data)
|
||||
|
||||
def execute(self, document, chat_id, **kwargs) -> NodeResult:
|
||||
def execute(self, document, chat_id=None, **kwargs) -> NodeResult:
|
||||
pass
|
||||
|
|
|
|||
|
|
@ -42,23 +42,28 @@ class BaseDocumentExtractNode(IDocumentExtractNode):
|
|||
def save_context(self, details, workflow_manage):
|
||||
self.context['content'] = details.get('content')
|
||||
|
||||
def execute(self, document, chat_id, **kwargs):
|
||||
def execute(self, document, chat_id=None, **kwargs):
|
||||
get_buffer = FileBufferHandle().get_buffer
|
||||
|
||||
self.context['document_list'] = document
|
||||
content = []
|
||||
if document is None or not isinstance(document, list):
|
||||
return NodeResult({'content': ''}, {})
|
||||
return NodeResult({'content': '', 'document_list': []}, {})
|
||||
|
||||
application = self.workflow_manage.work_flow_post_handler.chat_info.application
|
||||
# 安全获取 application
|
||||
application = None
|
||||
if (self.workflow_manage and
|
||||
self.workflow_manage.work_flow_post_handler and
|
||||
self.workflow_manage.work_flow_post_handler.chat_info):
|
||||
application = self.workflow_manage.work_flow_post_handler.chat_info.application
|
||||
|
||||
# doc文件中的图片保存
|
||||
def save_image(image_list):
|
||||
for image in image_list:
|
||||
meta = {
|
||||
'debug': False if application.id else True,
|
||||
'debug': False if (application and application.id) else True,
|
||||
'chat_id': chat_id,
|
||||
'application_id': str(application.id) if application.id else None,
|
||||
'application_id': str(application.id) if (application and application.id) else None,
|
||||
'file_id': str(image.id)
|
||||
}
|
||||
file_bytes = image.meta.pop('content')
|
||||
|
|
@ -70,6 +75,7 @@ class BaseDocumentExtractNode(IDocumentExtractNode):
|
|||
'source_type': FileSourceType.APPLICATION.value
|
||||
}).upload()
|
||||
|
||||
document_list = []
|
||||
for doc in document:
|
||||
file = QuerySet(File).filter(id=doc['file_id']).first()
|
||||
buffer = io.BytesIO(file.get_bytes())
|
||||
|
|
@ -81,9 +87,10 @@ class BaseDocumentExtractNode(IDocumentExtractNode):
|
|||
buffer.seek(0)
|
||||
file_content = split_handle.get_content(buffer, save_image)
|
||||
content.append('### ' + doc['name'] + '\n' + file_content)
|
||||
document_list.append({'id': file.id, 'name': doc['name'], 'content': file_content})
|
||||
break
|
||||
|
||||
return NodeResult({'content': splitter.join(content)}, {})
|
||||
return NodeResult({'content': splitter.join(content), 'document_list': document_list}, {})
|
||||
|
||||
def get_details(self, index: int, **kwargs):
|
||||
content = self.context.get('content', '').split(splitter)
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ from application.flow.i_step_node import INode, NodeResult
|
|||
|
||||
|
||||
class DocumentSplitNodeSerializer(serializers.Serializer):
|
||||
file_list = serializers.ListField(required=False, label=_("file list"))
|
||||
document_list = serializers.ListField(required=False, label=_("document list"))
|
||||
split_strategy = serializers.ChoiceField(
|
||||
choices=['auto', 'custom', 'qa'], required=False, label=_("split strategy"), default='auto'
|
||||
)
|
||||
|
|
@ -53,11 +53,11 @@ class IDocumentSplitNode(INode):
|
|||
return DocumentSplitNodeSerializer
|
||||
|
||||
def _run(self):
|
||||
res = self.workflow_manage.get_reference_field(self.node_params_serializer.data.get('file_list')[0],
|
||||
self.node_params_serializer.data.get('file_list')[1:])
|
||||
return self.execute(files=res, **self.node_params_serializer.data, **self.flow_params_serializer.data)
|
||||
# res = self.workflow_manage.get_reference_field(self.node_params_serializer.data.get('file_list')[0],
|
||||
# self.node_params_serializer.data.get('file_list')[1:])
|
||||
return self.execute(**self.node_params_serializer.data, **self.flow_params_serializer.data)
|
||||
|
||||
def execute(self, files, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
|
||||
def execute(self, document_list, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
|
||||
paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
|
||||
document_name_relate_problem_type, document_name_relate_problem,
|
||||
document_name_relate_problem_reference, limit, patterns, with_filter, **kwargs) -> NodeResult:
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ from django.db.models import QuerySet
|
|||
from application.flow.i_step_node import NodeResult
|
||||
from application.flow.step_node.document_split_node.i_document_split_node import IDocumentSplitNode
|
||||
from knowledge.models import File, FileSourceType
|
||||
from knowledge.serializers.document import split_handles, FileBufferHandle
|
||||
from knowledge.serializers.document import default_split_handle, FileBufferHandle
|
||||
|
||||
|
||||
def bytes_to_uploaded_file(file_bytes, file_name="file.txt"):
|
||||
|
|
@ -42,36 +42,31 @@ class BaseDocumentSplitNode(IDocumentSplitNode):
|
|||
def get_reference_content(self, fields: List[str]):
|
||||
return self.workflow_manage.get_reference_field(fields[0], fields[1:])
|
||||
|
||||
def execute(self, files, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
|
||||
def execute(self, document_list, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
|
||||
paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
|
||||
document_name_relate_problem_type, document_name_relate_problem,
|
||||
document_name_relate_problem_reference, limit, patterns, with_filter, **kwargs) -> NodeResult:
|
||||
get_buffer = FileBufferHandle().get_buffer
|
||||
self.context['file_list'] = files
|
||||
self.context['knowledge_id'] = knowledge_id
|
||||
|
||||
file_list = self.workflow_manage.get_reference_field(document_list[0], document_list[1:])
|
||||
paragraph_list = []
|
||||
for doc in files:
|
||||
file = QuerySet(File).filter(id=doc['file_id']).first()
|
||||
file_mem = bytes_to_uploaded_file(file.get_bytes(), file_name=file.file_name)
|
||||
get_buffer = FileBufferHandle().get_buffer
|
||||
|
||||
for split_handle in split_handles:
|
||||
if split_handle.support(file_mem, get_buffer):
|
||||
result = split_handle.handle(file_mem, patterns, with_filter, limit, get_buffer, self._save_image)
|
||||
# 统一处理结果为列表
|
||||
results = result if isinstance(result, list) else [result]
|
||||
for doc in file_list:
|
||||
file_mem = bytes_to_uploaded_file(doc['content'].encode('utf-8'), doc['name'])
|
||||
result = default_split_handle.handle(file_mem, patterns, with_filter, limit, get_buffer, self._save_image)
|
||||
# 统一处理结果为列表
|
||||
results = result if isinstance(result, list) else [result]
|
||||
|
||||
for item in results:
|
||||
self._process_split_result(
|
||||
item, knowledge_id, file.id, file.file_name,
|
||||
split_strategy, paragraph_title_relate_problem_type,
|
||||
paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
|
||||
document_name_relate_problem_type, document_name_relate_problem,
|
||||
document_name_relate_problem_reference
|
||||
)
|
||||
for item in results:
|
||||
self._process_split_result(
|
||||
item, knowledge_id, doc['id'], doc['name'],
|
||||
split_strategy, paragraph_title_relate_problem_type,
|
||||
paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
|
||||
document_name_relate_problem_type, document_name_relate_problem,
|
||||
document_name_relate_problem_reference
|
||||
)
|
||||
|
||||
paragraph_list = results
|
||||
break
|
||||
paragraph_list = results
|
||||
|
||||
self.context['paragraph_list'] = paragraph_list
|
||||
|
||||
|
|
|
|||
|
|
@ -383,6 +383,10 @@ export const documentExtractNode = {
|
|||
label: t('views.applicationWorkflow.nodes.documentExtractNode.content'),
|
||||
value: 'content',
|
||||
},
|
||||
{
|
||||
label: t('views.applicationWorkflow.nodes.dataSourceWebNode.field_label'),
|
||||
value: 'document_list',
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@
|
|||
:nodeModel="nodeModel"
|
||||
class="w-full"
|
||||
:placeholder="$t('views.chatLog.documentPlaceholder')"
|
||||
v-model="form_data.file_list"
|
||||
v-model="form_data.document_list"
|
||||
/>
|
||||
</el-form-item>
|
||||
<el-form-item
|
||||
|
|
@ -207,7 +207,7 @@ const props = defineProps<{ nodeModel: any }>()
|
|||
const splitPatternList = ref<Array<KeyValue<string, string>>>([])
|
||||
|
||||
const form = {
|
||||
file_list: [],
|
||||
document_list: [],
|
||||
split_strategy: 'auto',
|
||||
paragraph_title_relate_problem_type: 'custom',
|
||||
paragraph_title_relate_problem: false,
|
||||
|
|
|
|||
Loading…
Reference in New Issue