diff --git a/apps/application/flow/step_node/__init__.py b/apps/application/flow/step_node/__init__.py
index e95f16cfa..528f1b9c4 100644
--- a/apps/application/flow/step_node/__init__.py
+++ b/apps/application/flow/step_node/__init__.py
@@ -32,6 +32,7 @@ from .tool_lib_node import *
 from .tool_node import *
 from .variable_assign_node import BaseVariableAssignNode
 from .variable_splitting_node import BaseVariableSplittingNode
+from .video_understand_step_node import BaseVideoUnderstandNode
 
 node_list = [BaseStartStepNode, BaseChatNode, BaseSearchKnowledgeNode, BaseQuestionNode, BaseConditionNode,
              BaseReplyNode,
@@ -39,6 +40,7 @@ node_list = [BaseStartStepNode, BaseChatNode, BaseSearchKnowledgeNode, BaseQuest
              BaseDocumentExtractNode,
              BaseImageUnderstandNode, BaseFormNode, BaseSpeechToTextNode, BaseTextToSpeechNode,
              BaseImageGenerateNode, BaseVariableAssignNode, BaseMcpNode, BaseTextToVideoNode,
              BaseImageToVideoNode,
+             BaseVideoUnderstandNode,
              BaseIntentNode, BaseLoopNode, BaseLoopStartStepNode, BaseLoopContinueNode, BaseLoopBreakNode,
              BaseVariableSplittingNode]
diff --git a/apps/application/flow/step_node/image_understand_step_node/impl/base_image_understand_node.py b/apps/application/flow/step_node/image_understand_step_node/impl/base_image_understand_node.py
index b1cb3efc2..732057048 100644
--- a/apps/application/flow/step_node/image_understand_step_node/impl/base_image_understand_node.py
+++ b/apps/application/flow/step_node/image_understand_step_node/impl/base_image_understand_node.py
@@ -169,13 +169,16 @@ class BaseImageUnderstandNode(IImageUnderstandNode):
             # Process multiple images
             images = []
             for img in image:
-                file_id = img['file_id']
-                file = QuerySet(File).filter(id=file_id).first()
-                image_bytes = file.get_bytes()
-                base64_image = base64.b64encode(image_bytes).decode("utf-8")
-                image_format = what(None, image_bytes)
-                images.append(
-                    {'type': 'image_url', 'image_url': {'url': f'data:image/{image_format};base64,{base64_image}'}})
+                if isinstance(img, str) and img.startswith('http'):
+                    images.append({'type': 'image_url', 'image_url': {'url': img}})
+                else:
+                    file_id = img['file_id']
+                    file = QuerySet(File).filter(id=file_id).first()
+                    image_bytes = file.get_bytes()
+                    base64_image = base64.b64encode(image_bytes).decode("utf-8")
+                    image_format = what(None, image_bytes)
+                    images.append(
+                        {'type': 'image_url', 'image_url': {'url': f'data:image/{image_format};base64,{base64_image}'}})
             messages = [HumanMessage(
                 content=[
                     {'type': 'text', 'text': self.workflow_manage.generate_prompt(prompt)},
diff --git a/apps/application/flow/step_node/start_node/impl/base_start_node.py b/apps/application/flow/step_node/start_node/impl/base_start_node.py
index 29a64591d..af04225cd 100644
--- a/apps/application/flow/step_node/start_node/impl/base_start_node.py
+++ b/apps/application/flow/step_node/start_node/impl/base_start_node.py
@@ -48,6 +48,7 @@ class BaseStartStepNode(IStarNode):
         self.context['document'] = details.get('document_list')
         self.context['image'] = details.get('image_list')
         self.context['audio'] = details.get('audio_list')
+        self.context['video'] = details.get('video_list')
         self.context['other'] = details.get('other_list')
         self.status = details.get('status')
         self.err_message = details.get('err_message')
@@ -73,6 +74,7 @@ class BaseStartStepNode(IStarNode):
             'image': self.workflow_manage.image_list,
             'document': self.workflow_manage.document_list,
             'audio': self.workflow_manage.audio_list,
+            'video': self.workflow_manage.video_list,
             'other': self.workflow_manage.other_list,
         }
 
@@ -97,6 +99,7 @@ class BaseStartStepNode(IStarNode):
             'status': self.status,
             'err_message': self.err_message,
             'image_list': self.context.get('image'),
+            'video_list': self.context.get('video'),
             'document_list': self.context.get('document'),
             'audio_list': self.context.get('audio'),
             'other_list': self.context.get('other'),
diff --git a/apps/application/flow/step_node/video_understand_step_node/__init__.py b/apps/application/flow/step_node/video_understand_step_node/__init__.py
new file mode 100644
index 000000000..f3feecc9c
--- /dev/null
+++ b/apps/application/flow/step_node/video_understand_step_node/__init__.py
@@ -0,0 +1,3 @@
+# coding=utf-8
+
+from .impl import *
diff --git a/apps/application/flow/step_node/video_understand_step_node/i_video_understand_node.py b/apps/application/flow/step_node/video_understand_step_node/i_video_understand_node.py
new file mode 100644
index 000000000..3266a8e0b
--- /dev/null
+++ b/apps/application/flow/step_node/video_understand_step_node/i_video_understand_node.py
@@ -0,0 +1,47 @@
+# coding=utf-8
+
+from typing import Type
+
+from rest_framework import serializers
+
+from application.flow.i_step_node import INode, NodeResult
+
+from django.utils.translation import gettext_lazy as _
+
+
+class VideoUnderstandNodeSerializer(serializers.Serializer):
+    model_id = serializers.CharField(required=True, label=_("Model id"))
+    system = serializers.CharField(required=False, allow_blank=True, allow_null=True,
+                                   label=_("Role Setting"))
+    prompt = serializers.CharField(required=True, label=_("Prompt word"))
+    # Number of multi-round conversations
+    dialogue_number = serializers.IntegerField(required=True, label=_("Number of multi-round conversations"))
+
+    dialogue_type = serializers.CharField(required=True, label=_("Conversation storage type"))
+
+    is_result = serializers.BooleanField(required=False,
+                                         label=_('Whether to return content'))
+
+    video_list = serializers.ListField(required=False, label=_("video"))
+
+    model_params_setting = serializers.JSONField(required=False, default=dict,
+                                                 label=_("Model parameter settings"))
+
+
+class IVideoUnderstandNode(INode):
+    type = 'video-understand-node'
+
+    def get_node_params_serializer_class(self) -> Type[serializers.Serializer]:
+        return VideoUnderstandNodeSerializer
+
+    def _run(self):
+        res = self.workflow_manage.get_reference_field(self.node_params_serializer.data.get('video_list')[0],
+                                                       self.node_params_serializer.data.get('video_list')[1:])
+        return self.execute(video=res, **self.node_params_serializer.data, **self.flow_params_serializer.data)
+
+    def execute(self, model_id, system, prompt, dialogue_number, dialogue_type, history_chat_record, stream, chat_id,
+                model_params_setting,
+                chat_record_id,
+                video,
+                **kwargs) -> NodeResult:
+        pass
diff --git a/apps/application/flow/step_node/video_understand_step_node/impl/__init__.py b/apps/application/flow/step_node/video_understand_step_node/impl/__init__.py
new file mode 100644
index 000000000..555faa26b
--- /dev/null
+++ b/apps/application/flow/step_node/video_understand_step_node/impl/__init__.py
@@ -0,0 +1,3 @@
+# coding=utf-8
+
+from .base_video_understand_node import BaseVideoUnderstandNode
diff --git a/apps/application/flow/step_node/video_understand_step_node/impl/base_video_understand_node.py b/apps/application/flow/step_node/video_understand_step_node/impl/base_video_understand_node.py
new file mode 100644
index 000000000..48d764212
--- /dev/null
+++ b/apps/application/flow/step_node/video_understand_step_node/impl/base_video_understand_node.py
@@ -0,0 +1,229 @@
+# coding=utf-8
+import base64
+import time
+from functools import reduce
+from imghdr import what
+from typing import List, Dict
+
+from django.db.models import QuerySet
+from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage, AIMessage
+
+from application.flow.i_step_node import NodeResult, INode
+from application.flow.step_node.video_understand_step_node.i_video_understand_node import IVideoUnderstandNode
+from knowledge.models import File
+from models_provider.tools import get_model_instance_by_model_workspace_id
+
+
+def _write_context(node_variable: Dict, workflow_variable: Dict, node: INode, workflow, answer: str):
+    chat_model = node_variable.get('chat_model')
+    message_tokens = node_variable['usage_metadata']['output_tokens'] if 'usage_metadata' in node_variable else 0
+    answer_tokens = chat_model.get_num_tokens(answer)
+    node.context['message_tokens'] = message_tokens
+    node.context['answer_tokens'] = answer_tokens
+    node.context['answer'] = answer
+    node.context['history_message'] = node_variable['history_message']
+    node.context['question'] = node_variable['question']
+    node.context['run_time'] = time.time() - node.context['start_time']
+    if workflow.is_result(node, NodeResult(node_variable, workflow_variable)):
+        node.answer_text = answer
+
+
+def write_context_stream(node_variable: Dict, workflow_variable: Dict, node: INode, workflow):
+    """
+    Write context data (streaming)
+    @param node_variable: node data
+    @param workflow_variable: global data
+    @param node: node
+    @param workflow: workflow manager
+    """
+    response = node_variable.get('result')
+    answer = ''
+    for chunk in response:
+        answer += chunk.content
+        yield chunk.content
+    _write_context(node_variable, workflow_variable, node, workflow, answer)
+
+
+def write_context(node_variable: Dict, workflow_variable: Dict, node: INode, workflow):
+    """
+    Write context data
+    @param node_variable: node data
+    @param workflow_variable: global data
+    @param node: node instance
+    @param workflow: workflow manager
+    """
+    response = node_variable.get('result')
+    answer = response.content
+    _write_context(node_variable, workflow_variable, node, workflow, answer)
+
+
+def file_id_to_base64(file_id: str):
+    file = QuerySet(File).filter(id=file_id).first()
+    file_bytes = file.get_bytes()
+    base64_video = base64.b64encode(file_bytes).decode("utf-8")
+    return [base64_video, what(None, file_bytes)]
+
+
+class BaseVideoUnderstandNode(IVideoUnderstandNode):
+    def save_context(self, details, workflow_manage):
+        self.context['answer'] = details.get('answer')
+        self.context['question'] = details.get('question')
+        if self.node_params.get('is_result', False):
+            self.answer_text = details.get('answer')
+
+    def execute(self, model_id, system, prompt, dialogue_number, dialogue_type, history_chat_record, stream, chat_id,
+                model_params_setting,
+                chat_record_id,
+                video,
+                **kwargs) -> NodeResult:
+        # Handle invalid parameters
+        if video is None or not isinstance(video, list):
+            video = []
+        workspace_id = self.workflow_manage.get_body().get('workspace_id')
+        video_model = get_model_instance_by_model_workspace_id(model_id, workspace_id,
+                                                               **model_params_setting)
+        # History messages in the execution details do not need the video content
+        history_message = self.get_history_message_for_details(history_chat_record, dialogue_number)
+        self.context['history_message'] = history_message
+        question = self.generate_prompt_question(prompt)
+        self.context['question'] = question.content
+        # Generate the message list with the real history_message
+        message_list = self.generate_message_list(video_model, system, prompt,
+                                                  self.get_history_message(history_chat_record, dialogue_number),
+                                                  video)
+        self.context['message_list'] = message_list
+        self.context['video_list'] = video
+        self.context['dialogue_type'] = dialogue_type
+        if stream:
+            r = video_model.stream(message_list)
+            return NodeResult({'result': r, 'chat_model': video_model, 'message_list': message_list,
+                               'history_message': history_message, 'question': question.content}, {},
+                              _write_context=write_context_stream)
+        else:
+            r = video_model.invoke(message_list)
+            return NodeResult({'result': r, 'chat_model': video_model, 'message_list': message_list,
+                               'history_message': history_message, 'question': question.content}, {},
+                              _write_context=write_context)
+
+    def get_history_message_for_details(self, history_chat_record, dialogue_number):
+        start_index = len(history_chat_record) - dialogue_number
+        history_message = reduce(lambda x, y: [*x, *y], [
+            [self.generate_history_human_message_for_details(history_chat_record[index]),
+             self.generate_history_ai_message(history_chat_record[index])]
+            for index in
+            range(start_index if start_index > 0 else 0, len(history_chat_record))], [])
+        return history_message
+
+    def generate_history_ai_message(self, chat_record):
+        for val in chat_record.details.values():
+            if self.node.id == val['node_id'] and 'video_list' in val:
+                if val['dialogue_type'] == 'WORKFLOW':
+                    return chat_record.get_ai_message()
+                return AIMessage(content=val['answer'])
+        return chat_record.get_ai_message()
+
+    def generate_history_human_message_for_details(self, chat_record):
+        for data in chat_record.details.values():
+            if self.node.id == data['node_id'] and 'video_list' in data:
+                video_list = data['video_list']
+                if len(video_list) == 0 or data['dialogue_type'] == 'WORKFLOW':
+                    return HumanMessage(content=chat_record.problem_text)
+                file_id_list = [video.get('file_id') for video in video_list]
+                return HumanMessage(content=[
+                    {'type': 'text', 'text': data['question']},
+                    *[{'type': 'video_url', 'video_url': {'url': f'./oss/file/{file_id}'}} for file_id in
+                      file_id_list]
+                ])
+        return HumanMessage(content=chat_record.problem_text)
+
+    def get_history_message(self, history_chat_record, dialogue_number):
+        start_index = len(history_chat_record) - dialogue_number
+        history_message = reduce(lambda x, y: [*x, *y], [
+            [self.generate_history_human_message(history_chat_record[index]),
+             self.generate_history_ai_message(history_chat_record[index])]
+            for index in
+            range(start_index if start_index > 0 else 0, len(history_chat_record))], [])
+        return history_message
+
+    def generate_history_human_message(self, chat_record):
+        for data in chat_record.details.values():
+            if self.node.id == data['node_id'] and 'video_list' in data:
+                video_list = data['video_list']
+                if len(video_list) == 0 or data['dialogue_type'] == 'WORKFLOW':
+                    return HumanMessage(content=chat_record.problem_text)
+                video_base64_list = [file_id_to_base64(video.get('file_id')) for video in video_list]
+                return HumanMessage(
+                    content=[
+                        {'type': 'text', 'text': data['question']},
+                        *[{'type': 'video_url',
+                           'video_url': {'url': f'data:video/{base64_video[1]};base64,{base64_video[0]}'}} for
+                          base64_video in video_base64_list]
+                    ])
+        return HumanMessage(content=chat_record.problem_text)
+
+    def generate_prompt_question(self, prompt):
+        return HumanMessage(self.workflow_manage.generate_prompt(prompt))
+
+    def generate_message_list(self, video_model, system: str, prompt: str, history_message, video):
+        if video is not None and len(video) > 0:
+            # Process multiple videos
+            videos = []
+            for img in video:
+                if isinstance(img, str) and img.startswith('http'):
+                    videos.append({'type': 'video_url', 'video_url': {'url': img}})
+                else:
+                    file_id = img['file_id']
+                    file = QuerySet(File).filter(id=file_id).first()
+                    video_bytes = file.get_bytes()
+                    base64_video = base64.b64encode(video_bytes).decode("utf-8")
+                    video_format = what(None, video_bytes)
+                    videos.append(
+                        {'type': 'video_url', 'video_url': {'url': f'data:video/{video_format};base64,{base64_video}'}})
+            messages = [HumanMessage(
+                content=[
+                    {'type': 'text', 'text': self.workflow_manage.generate_prompt(prompt)},
+                    *videos
+                ])]
+        else:
+            messages = [HumanMessage(self.workflow_manage.generate_prompt(prompt))]
+
+        if system is not None and len(system) > 0:
+            return [
+                SystemMessage(self.workflow_manage.generate_prompt(system)),
+                *history_message,
+                *messages
+            ]
+        else:
+            return [
+                *history_message,
+                *messages
+            ]
+
+    @staticmethod
+    def reset_message_list(message_list: List[BaseMessage], answer_text):
+        result = [{'role': 'user' if isinstance(message, HumanMessage) else 'ai', 'content': message.content} for
+                  message
+                  in
+                  message_list]
+        result.append({'role': 'ai', 'content': answer_text})
+        return result
+
+    def get_details(self, index: int, **kwargs):
+        return {
+            'name': self.node.properties.get('stepName'),
+            "index": index,
+            'run_time': self.context.get('run_time'),
+            'system': self.node_params.get('system'),
+            'history_message': [{'content': message.content, 'role': message.type} for message in
+                                (self.context.get('history_message') if self.context.get(
+                                    'history_message') is not None else [])],
+            'question': self.context.get('question'),
+            'answer': self.context.get('answer'),
+            'type': self.node.type,
+            'message_tokens': self.context.get('message_tokens'),
+            'answer_tokens': self.context.get('answer_tokens'),
+            'status': self.status,
+            'err_message': self.err_message,
+            'video_list': self.context.get('video_list'),
+            'dialogue_type': self.context.get('dialogue_type')
+        }
diff --git a/apps/application/flow/workflow_manage.py b/apps/application/flow/workflow_manage.py
index 5fd82299c..271ddc795 100644
--- a/apps/application/flow/workflow_manage.py
+++ b/apps/application/flow/workflow_manage.py
@@ -94,6 +94,7 @@ class WorkflowManage:
                  base_to_response: BaseToResponse = SystemToResponse(),
                  form_data=None,
                  image_list=None, document_list=None, audio_list=None,
+                 video_list=None,
                  other_list=None,
                  start_node_id=None, start_node_data=None,
                  chat_record=None, child_node=None):
@@ -105,12 +106,15 @@ class WorkflowManage:
             document_list = []
         if audio_list is None:
             audio_list = []
+        if video_list is None:
+            video_list = []
         if other_list is None:
             other_list = []
         self.start_node_id = start_node_id
         self.start_node = None
         self.form_data = form_data
         self.image_list = image_list
+        self.video_list = video_list
         self.document_list = document_list
         self.audio_list = audio_list
         self.other_list = other_list
diff --git a/apps/chat/serializers/chat.py b/apps/chat/serializers/chat.py
index eec64e1ba..d064a0c9e 100644
--- a/apps/chat/serializers/chat.py
+++ b/apps/chat/serializers/chat.py
@@ -375,6 +375,7 @@ class ChatSerializers(serializers.Serializer):
         chat_user_type = self.data.get('chat_user_type')
         form_data = instance.get('form_data')
         image_list = instance.get('image_list')
+        video_list = instance.get('video_list')
         document_list = instance.get('document_list')
         audio_list = instance.get('audio_list')
         other_list = instance.get('other_list')
@@ -401,6 +402,7 @@ class ChatSerializers(serializers.Serializer):
                                            'application_id': str(chat_info.application_id)},
                                           WorkFlowPostHandler(chat_info), base_to_response, form_data, image_list,
                                           document_list, audio_list,
+                                          video_list,
                                           other_list,
                                           instance.get('runtime_node_id'), instance.get('node_data'),
                                           chat_record, instance.get('child_node'))
diff --git a/ui/src/assets/workflow/icon_file-video.svg b/ui/src/assets/workflow/icon_file-video.svg
new file mode 100644
index 000000000..4511c36f5
--- /dev/null
+++ b/ui/src/assets/workflow/icon_file-video.svg
@@ -0,0 +1,6 @@
+
diff --git a/ui/src/assets/workflow/icon_video.svg b/ui/src/assets/workflow/icon_video.svg
new file mode 100644
index 000000000..4696b5838
--- /dev/null
+++ b/ui/src/assets/workflow/icon_video.svg
@@ -0,0 +1,11 @@
+
diff --git a/ui/src/components/ai-chat/component/chat-input-operate/index.vue b/ui/src/components/ai-chat/component/chat-input-operate/index.vue
index 9bc13b328..b0e866732 100644
--- a/ui/src/components/ai-chat/component/chat-input-operate/index.vue
+++ b/ui/src/components/ai-chat/component/chat-input-operate/index.vue
@@ -3,7 +3,8 @@
+          {{ $t('common.fileUpload.video') }}
+
           {{ videoExtensions.join('、') }}
+
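For reference, the `generate_message_list` branch added in `base_video_understand_node.py` distinguishes plain URLs from uploaded files: an `http(s)` string is passed through as a `video_url` entry, while a `{'file_id': ...}` dict is loaded from storage and inlined as a base64 data URI. Below is a minimal standalone sketch of that payload construction, not part of the patch: `load_video_bytes` is a hypothetical stand-in for the `QuerySet(File)` lookup, and the container format is assumed to be `mp4`, since `imghdr.what` only recognizes image formats and returns `None` for video bytes.

```python
import base64

from langchain_core.messages import HumanMessage


def load_video_bytes(file_id: str) -> bytes:
    # Hypothetical stand-in for QuerySet(File).filter(id=file_id).first().get_bytes()
    raise NotImplementedError


def build_video_message(prompt: str, video: list) -> HumanMessage:
    parts = [{'type': 'text', 'text': prompt}]
    for item in video:
        if isinstance(item, str) and item.startswith('http'):
            # Remote video: pass the URL through untouched.
            parts.append({'type': 'video_url', 'video_url': {'url': item}})
        else:
            # Uploaded file: inline the bytes as a base64 data URI.
            raw = load_video_bytes(item['file_id'])
            b64 = base64.b64encode(raw).decode('utf-8')
            # Assumed mp4 container; imghdr.what() cannot identify video formats.
            parts.append({'type': 'video_url',
                          'video_url': {'url': f'data:video/mp4;base64,{b64}'}})
    return HumanMessage(content=parts)
```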