mirror of
https://github.com/1Panel-dev/MaxKB.git
synced 2025-12-26 01:33:05 +00:00
feat: enhance Document Split Node with result processing and problem list generation
This commit is contained in:
parent
1da372e4bd
commit
80f14f1e54
|
|
@ -1,6 +1,7 @@
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
import io
|
import io
|
||||||
import mimetypes
|
import mimetypes
|
||||||
|
from typing import List
|
||||||
|
|
||||||
from django.core.files.uploadedfile import InMemoryUploadedFile
|
from django.core.files.uploadedfile import InMemoryUploadedFile
|
||||||
from django.db.models import QuerySet
|
from django.db.models import QuerySet
|
||||||
|
|
@ -38,6 +39,9 @@ class BaseDocumentSplitNode(IDocumentSplitNode):
|
||||||
def save_context(self, details, workflow_manage):
|
def save_context(self, details, workflow_manage):
|
||||||
self.context['content'] = details.get('content')
|
self.context['content'] = details.get('content')
|
||||||
|
|
||||||
|
def get_reference_content(self, fields: List[str]):
|
||||||
|
return self.workflow_manage.get_reference_field(fields[0], fields[1:])
|
||||||
|
|
||||||
def execute(self, files, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
|
def execute(self, files, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
|
||||||
paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
|
paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
|
||||||
document_name_relate_problem_type, document_name_relate_problem,
|
document_name_relate_problem_type, document_name_relate_problem,
|
||||||
|
|
@ -53,21 +57,27 @@ class BaseDocumentSplitNode(IDocumentSplitNode):
|
||||||
|
|
||||||
for split_handle in split_handles:
|
for split_handle in split_handles:
|
||||||
if split_handle.support(file_mem, get_buffer):
|
if split_handle.support(file_mem, get_buffer):
|
||||||
result = split_handle.handle(file_mem, patterns, with_filter, limit, get_buffer, self.save_image)
|
result = split_handle.handle(file_mem, patterns, with_filter, limit, get_buffer, self._save_image)
|
||||||
if isinstance(result, list):
|
# 统一处理结果为列表
|
||||||
for item in result:
|
results = result if isinstance(result, list) else [result]
|
||||||
item['source_file_id'] = file.id
|
|
||||||
paragraph_list = result
|
for item in results:
|
||||||
else:
|
self._process_split_result(
|
||||||
result['source_file_id'] = file.id
|
item, knowledge_id, file.id, file.file_name,
|
||||||
paragraph_list = [result]
|
split_strategy, paragraph_title_relate_problem_type,
|
||||||
|
paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
|
||||||
|
document_name_relate_problem_type, document_name_relate_problem,
|
||||||
|
document_name_relate_problem_reference
|
||||||
|
)
|
||||||
|
|
||||||
|
paragraph_list = results
|
||||||
|
break
|
||||||
|
|
||||||
self.context['paragraph_list'] = paragraph_list
|
self.context['paragraph_list'] = paragraph_list
|
||||||
|
|
||||||
|
|
||||||
return NodeResult({'paragraph_list': paragraph_list}, {})
|
return NodeResult({'paragraph_list': paragraph_list}, {})
|
||||||
|
|
||||||
def save_image(self, image_list):
|
def _save_image(self, image_list):
|
||||||
if image_list is not None and len(image_list) > 0:
|
if image_list is not None and len(image_list) > 0:
|
||||||
exist_image_list = [str(i.get('id')) for i in
|
exist_image_list = [str(i.get('id')) for i in
|
||||||
QuerySet(File).filter(id__in=[i.id for i in image_list]).values('id')]
|
QuerySet(File).filter(id__in=[i.id for i in image_list]).values('id')]
|
||||||
|
|
@ -81,6 +91,58 @@ class BaseDocumentSplitNode(IDocumentSplitNode):
|
||||||
file.source_id = self.context.get('knowledge_id')
|
file.source_id = self.context.get('knowledge_id')
|
||||||
file.save(file_bytes)
|
file.save(file_bytes)
|
||||||
|
|
||||||
|
def _process_split_result(
|
||||||
|
self, item, knowledge_id, source_file_id, file_name,
|
||||||
|
split_strategy, paragraph_title_relate_problem_type,
|
||||||
|
paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
|
||||||
|
document_name_relate_problem_type, document_name_relate_problem,
|
||||||
|
document_name_relate_problem_reference
|
||||||
|
):
|
||||||
|
"""处理文档分割结果"""
|
||||||
|
item['meta'] = {
|
||||||
|
'knowledge_id': knowledge_id,
|
||||||
|
'source_file_id': source_file_id
|
||||||
|
}
|
||||||
|
item['paragraphs'] = item.pop('content', [])
|
||||||
|
|
||||||
|
for paragraph in item['paragraphs']:
|
||||||
|
paragraph['problem_list'] = self._generate_problem_list(
|
||||||
|
paragraph, file_name,
|
||||||
|
split_strategy, paragraph_title_relate_problem_type,
|
||||||
|
paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
|
||||||
|
document_name_relate_problem_type, document_name_relate_problem,
|
||||||
|
document_name_relate_problem_reference
|
||||||
|
)
|
||||||
|
paragraph['is_active'] = True
|
||||||
|
|
||||||
|
def _generate_problem_list(
|
||||||
|
self, paragraph, document_name, split_strategy, paragraph_title_relate_problem_type,
|
||||||
|
paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
|
||||||
|
document_name_relate_problem_type, document_name_relate_problem,
|
||||||
|
document_name_relate_problem_reference
|
||||||
|
):
|
||||||
|
if paragraph_title_relate_problem_type == 'referencing':
|
||||||
|
paragraph_title_relate_problem = self.get_reference_content(paragraph_title_relate_problem_reference)
|
||||||
|
if document_name_relate_problem_type == 'referencing':
|
||||||
|
document_name_relate_problem = self.get_reference_content(document_name_relate_problem_reference)
|
||||||
|
|
||||||
|
problem_list = []
|
||||||
|
if split_strategy == 'auto':
|
||||||
|
if paragraph_title_relate_problem and paragraph.get('title'):
|
||||||
|
problem_list.append(paragraph.get('title'))
|
||||||
|
if document_name_relate_problem and document_name:
|
||||||
|
problem_list.append(document_name)
|
||||||
|
elif split_strategy == 'custom':
|
||||||
|
if paragraph_title_relate_problem:
|
||||||
|
problem_list.extend(paragraph_title_relate_problem)
|
||||||
|
if document_name_relate_problem:
|
||||||
|
problem_list.extend(document_name_relate_problem)
|
||||||
|
elif split_strategy == 'qa':
|
||||||
|
if document_name_relate_problem and document_name:
|
||||||
|
problem_list.append(document_name)
|
||||||
|
|
||||||
|
return problem_list
|
||||||
|
|
||||||
def get_details(self, index: int, **kwargs):
|
def get_details(self, index: int, **kwargs):
|
||||||
return {
|
return {
|
||||||
'name': self.node.properties.get('stepName'),
|
'name': self.node.properties.get('stepName'),
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue