feat: add video message handling and improve video context processing

This commit is contained in:
wxg0103 2025-10-16 15:14:41 +08:00
parent 57b3aa1254
commit 5fdb6dc34b
8 changed files with 106 additions and 50 deletions

View File

@ -77,8 +77,6 @@ class BaseImageUnderstandNode(IImageUnderstandNode):
image,
**kwargs) -> NodeResult:
# 处理不正确的参数
if image is None or not isinstance(image, list):
image = []
workspace_id = self.workflow_manage.get_body().get('workspace_id')
image_model = get_model_instance_by_model_workspace_id(model_id, workspace_id,
**model_params_setting)
@ -91,7 +89,7 @@ class BaseImageUnderstandNode(IImageUnderstandNode):
message_list = self.generate_message_list(image_model, system, prompt,
self.get_history_message(history_chat_record, dialogue_number), image)
self.context['message_list'] = message_list
self.context['image_list'] = image
self.generate_context_image(image)
self.context['dialogue_type'] = dialogue_type
if stream:
r = image_model.stream(message_list)
@ -104,6 +102,12 @@ class BaseImageUnderstandNode(IImageUnderstandNode):
'history_message': history_message, 'question': question.content}, {},
_write_context=write_context)
def generate_context_image(self, image):
    """Record the image input on the node context for later rendering.

    A single http(s) URL string is stored as a one-element list of
    ``{'url': ...}`` dicts; a non-empty list is stored as-is.  Empty or
    ``None`` input leaves ``context['image_list']`` untouched.
    """
    if isinstance(image, str) and image.startswith('http'):
        self.context['image_list'] = [{'url': image}]
        return
    if image is not None and len(image) > 0:
        self.context['image_list'] = image
def get_history_message_for_details(self, history_chat_record, dialogue_number):
start_index = len(history_chat_record) - dialogue_number
history_message = reduce(lambda x, y: [*x, *y], [
@ -164,28 +168,32 @@ class BaseImageUnderstandNode(IImageUnderstandNode):
def generate_prompt_question(self, prompt):
    """Render the prompt template and wrap it as a human chat message."""
    rendered = self.workflow_manage.generate_prompt(prompt)
    return HumanMessage(rendered)
def generate_message_list(self, image_model, system: str, prompt: str, history_message, image):
if image is not None and len(image) > 0:
# 处理多张图片
images = []
def _process_images(self, image):
"""
处理图像数据转换为模型可识别的格式
"""
images = []
if isinstance(image, str) and image.startswith('http'):
images.append({'type': 'image_url', 'image_url': {'url': image}})
elif image is not None and len(image) > 0:
for img in image:
if isinstance(img, str) and img.startswith('http'):
images.append({'type': 'image_url', 'image_url': {'url': img}})
else:
file_id = img['file_id']
file = QuerySet(File).filter(id=file_id).first()
image_bytes = file.get_bytes()
base64_image = base64.b64encode(image_bytes).decode("utf-8")
image_format = what(None, image_bytes)
images.append(
{'type': 'image_url', 'image_url': {'url': f'data:image/{image_format};base64,{base64_image}'}})
messages = [HumanMessage(
content=[
{'type': 'text', 'text': self.workflow_manage.generate_prompt(prompt)},
*images
])]
file_id = img['file_id']
file = QuerySet(File).filter(id=file_id).first()
image_bytes = file.get_bytes()
base64_image = base64.b64encode(image_bytes).decode("utf-8")
image_format = what(None, image_bytes)
images.append(
{'type': 'image_url', 'image_url': {'url': f'data:image/{image_format};base64,{base64_image}'}})
return images
def generate_message_list(self, image_model, system: str, prompt: str, history_message, image):
prompt_text = self.workflow_manage.generate_prompt(prompt)
images = self._process_images(image)
if images:
messages = [HumanMessage(content=[{'type': 'text', 'text': prompt_text}, *images])]
else:
messages = [HumanMessage(self.workflow_manage.generate_prompt(prompt))]
messages = [HumanMessage(prompt_text)]
if system is not None and len(system) > 0:
return [

View File

@ -1,5 +1,6 @@
# coding=utf-8
import base64
import mimetypes
import time
from functools import reduce
from imghdr import what
@ -76,9 +77,6 @@ class BaseVideoUnderstandNode(IVideoUnderstandNode):
chat_record_id,
video,
**kwargs) -> NodeResult:
# 处理不正确的参数
if video is None or not isinstance(video, list):
video = []
workspace_id = self.workflow_manage.get_body().get('workspace_id')
video_model = get_model_instance_by_model_workspace_id(model_id, workspace_id,
**model_params_setting)
@ -91,7 +89,7 @@ class BaseVideoUnderstandNode(IVideoUnderstandNode):
message_list = self.generate_message_list(video_model, system, prompt,
self.get_history_message(history_chat_record, dialogue_number), video)
self.context['message_list'] = message_list
self.context['video_list'] = video
self.generate_context_video(video)
self.context['dialogue_type'] = dialogue_type
if stream:
r = video_model.stream(message_list)
@ -104,6 +102,12 @@ class BaseVideoUnderstandNode(IVideoUnderstandNode):
'history_message': history_message, 'question': question.content}, {},
_write_context=write_context)
def generate_context_video(self, video):
    """Record the video input on the node context for later rendering.

    A single http(s) URL string is stored as a one-element list of
    ``{'url': ...}`` dicts; a non-empty list is stored as-is.  Empty or
    ``None`` input leaves ``context['video_list']`` untouched.
    """
    if isinstance(video, str) and video.startswith('http'):
        self.context['video_list'] = [{'url': video}]
        return
    if video is not None and len(video) > 0:
        self.context['video_list'] = video
def get_history_message_for_details(self, history_chat_record, dialogue_number):
start_index = len(history_chat_record) - dialogue_number
history_message = reduce(lambda x, y: [*x, *y], [
@ -164,28 +168,29 @@ class BaseVideoUnderstandNode(IVideoUnderstandNode):
def generate_prompt_question(self, prompt):
    """Render the prompt template and wrap it as a human chat message."""
    rendered = self.workflow_manage.generate_prompt(prompt)
    return HumanMessage(rendered)
def _process_videos(self, image):
videos = []
if isinstance(image, str) and image.startswith('http'):
videos.append({'type': 'video_url', 'video_url': {'url': image}})
elif image is not None and len(image) > 0:
for img in image:
file_id = img['file_id']
file = QuerySet(File).filter(id=file_id).first()
video_bytes = file.get_bytes()
base64_video = base64.b64encode(video_bytes).decode("utf-8")
video_format = mimetypes.guess_type(file.file_name)[0] # 获取MIME类型
videos.append(
{'type': 'video_url', 'video_url': {'url': f'data:{video_format};base64,{base64_video}'}})
return videos
def generate_message_list(self, video_model, system: str, prompt: str, history_message, video):
if video is not None and len(video) > 0:
# 处理多张图片
videos = []
for img in video:
if isinstance(img, str) and img.startswith('http'):
videos.append({'type': 'video_url', 'video_url': {'url': img}})
else:
file_id = img['file_id']
file = QuerySet(File).filter(id=file_id).first()
video_bytes = file.get_bytes()
base64_video = base64.b64encode(video_bytes).decode("utf-8")
video_format = what(None, video_bytes)
videos.append(
{'type': 'video_url', 'video_url': {'url': f'data:video/{video_format};base64,{base64_video}'}})
messages = [HumanMessage(
content=[
{'type': 'text', 'text': self.workflow_manage.generate_prompt(prompt)},
*videos
])]
prompt_text = self.workflow_manage.generate_prompt(prompt)
videos = self._process_videos(video)
if videos:
messages = [HumanMessage(content=[{'type': 'text', 'text': prompt_text}, *videos])]
else:
messages = [HumanMessage(self.workflow_manage.generate_prompt(prompt))]
messages = [HumanMessage(prompt_text)]
if system is not None and len(system) > 0:
return [

View File

@ -807,6 +807,7 @@ const getQuestion = () => {
uploadImageList.value.length > 0,
uploadDocumentList.value.length > 0,
uploadAudioList.value.length > 0,
uploadVideoList.value.length > 0,
uploadOtherList.value.length > 0,
]
if (fileLength.filter((f) => f).length > 1) {
@ -818,6 +819,8 @@ const getQuestion = () => {
} else if (fileLength[2]) {
return t('chat.uploadFile.audioMessage')
} else if (fileLength[3]) {
return t('chat.uploadFile.videoMessage')
} else if (fileLength[4]) {
return t('chat.uploadFile.otherMessage')
}
}

View File

@ -102,6 +102,21 @@
</template>
</el-space>
</div>
<div v-if="data.video_list?.length > 0">
<p class="mb-8 color-secondary">{{ $t('common.fileUpload.image') }}:</p>
<el-space wrap>
<template v-for="(f, i) in data.video_list" :key="i">
<video
:src="f.url"
style="width: 170px; display: block"
controls
autoplay
class="border-r-6"
/>
</template>
</el-space>
</div>
<div v-if="data.other_list?.length > 0">
<p class="mb-8 color-secondary">{{ $t('common.fileUpload.document') }}:</p>
@ -581,8 +596,6 @@
<video
v-if="h.type === 'video_url'"
:src="h.video_url.url"
alt=""
fit="cover"
style="width: 40px; height: 40px; display: inline-block"
class="border-r-6 mr-8"
/>

View File

@ -76,6 +76,7 @@ export default {
imageMessage: 'Please process the image content',
documentMessage: 'Please understand the content of the document',
audioMessage: 'Please understand the audio content',
videoMessage: 'Please understand the video content',
otherMessage: 'Please understand the file content',
errorMessage: 'Upload Failed',
fileMessage: 'Please process the file content',

View File

@ -74,6 +74,7 @@ export default {
imageMessage: '请解析图片内容',
documentMessage: '请理解文档内容',
audioMessage: '请理解音频内容',
videoMessage: '请理解视频内容',
otherMessage: '请理解文件内容',
errorMessage: '上传失败',
fileMessage: '请解析文件内容',

View File

@ -74,6 +74,7 @@ export default {
imageMessage: '請解析圖片內容',
documentMessage: '請理解檔案內容',
audioMessage: '請理解音訊內容',
videoMessage: '請理解視頻內容',
otherMessage: '請理解檔案內容',
fileMessage: '請解析文件內容',
errorMessage: '上傳失敗',

View File

@ -92,6 +92,28 @@
v-model="form_data.audio_list"
/>
</el-form-item>
<el-form-item
v-if="form_data.hasOwnProperty('video_list') || 'video_list' in form_data"
:label="$t('views.applicationWorkflow.nodes.videoUnderstandNode.video.label')"
prop="video_list"
:rules="{
message: $t(
'views.applicationWorkflow.nodes.videoUnderstandNode.video.requiredMessage',
),
trigger: 'blur',
required: false,
}"
>
<NodeCascader
ref="nodeCascaderRef"
:nodeModel="nodeModel"
class="w-full"
:placeholder="
$t('views.applicationWorkflow.nodes.videoUnderstandNode.video.requiredMessage')
"
v-model="form_data.video_list"
/>
</el-form-item>
<div v-for="(field, index) in form_data.api_input_field_list" :key="'api-input-' + index">
<el-form-item
:label="field.variable"
@ -191,6 +213,7 @@ const form = {
document_list: ['start-node', 'document'],
image_list: ['start-node', 'image'],
audio_list: ['start-node', 'audio'],
video_list: ['start-node', 'video'],
}
const applicationNodeFormRef = ref<FormInstance>()
@ -294,8 +317,9 @@ const update_field = () => {
handleFileUpload('document', fileUploadSetting.document)
handleFileUpload('image', fileUploadSetting.image)
handleFileUpload('audio', fileUploadSetting.audio)
handleFileUpload('video', fileUploadSetting.video)
} else {
;['document_list', 'image_list', 'audio_list'].forEach((list) => {
;['document_list', 'image_list', 'audio_list', 'video_list'].forEach((list) => {
// eslint-disable-next-line vue/no-mutating-props
delete props.nodeModel.properties.node_data[list]
})