feat: Application text to speech and speech to text functions (#3433)

2025-12-26 01:33:05 +00:00 · 2025-06-30 18:03:11 +08:00 · 2025-06-30 18:03:11 +08:00 · d8a9c9ccdd
parent 2aa86ebfaa
commit d8a9c9ccdd
16 changed files with 506 additions and 79 deletions
--- a/apps/application/api/application_api.py
+++ b/apps/application/api/application_api.py
@ -12,7 +12,7 @@ from drf_spectacular.utils import OpenApiParameter
 from rest_framework import serializers

 from application.serializers.application import ApplicationCreateSerializer, ApplicationListResponse, \
-    ApplicationImportRequest, ApplicationEditSerializer
+    ApplicationImportRequest, ApplicationEditSerializer, TextToSpeechRequest, SpeechToTextRequest, PlayDemoTextRequest
 from common.mixins.api_mixin import APIMixin
 from common.result import ResultSerializer, ResultPageSerializer, DefaultResultSerializer

@ -167,3 +167,45 @@ class ApplicationEditAPI(APIMixin):
    @staticmethod
    def get_request():
        return ApplicationEditSerializer
+
+
+class TextToSpeechAPI(APIMixin):
+    """Schema pieces (parameters, request, response) for the text-to-speech endpoint."""
+    @staticmethod
+    def get_parameters():
+        # Same parameter list as ApplicationOperateAPI.
+        return ApplicationOperateAPI.get_parameters()
+
+    @staticmethod
+    def get_request():
+        # Request body is described by TextToSpeechRequest.
+        return TextToSpeechRequest
+
+    @staticmethod
+    def get_response():
+        return DefaultResultSerializer
+
+
+class SpeechToTextAPI(APIMixin):
+    """Schema pieces (parameters, request, response) for the speech-to-text endpoint."""
+    @staticmethod
+    def get_parameters():
+        # Same parameter list as ApplicationOperateAPI.
+        return ApplicationOperateAPI.get_parameters()
+
+    @staticmethod
+    def get_request():
+        # Request body is described by SpeechToTextRequest.
+        return SpeechToTextRequest
+
+    @staticmethod
+    def get_response():
+        return DefaultResultSerializer
+
+
+class PlayDemoTextAPI(APIMixin):
+    """Schema pieces (parameters, request, response) for the play-demo-text endpoint."""
+    @staticmethod
+    def get_parameters():
+        # Same parameter list as ApplicationOperateAPI.
+        return ApplicationOperateAPI.get_parameters()
+
+    @staticmethod
+    def get_request():
+        # Request body is described by PlayDemoTextRequest.
+        return PlayDemoTextRequest
+
+    @staticmethod
+    def get_response():
+        return DefaultResultSerializer
--- a/apps/application/chat_pipeline/step/search_dataset_step/impl/base_search_dataset_step.py
+++ b/apps/application/chat_pipeline/step/search_dataset_step/impl/base_search_dataset_step.py
@ -103,7 +103,7 @@ class BaseSearchDatasetStep(ISearchDatasetStep):
        paragraph_list = native_search(QuerySet(Paragraph).filter(id__in=paragraph_id_list),
                                       get_file_content(
                                           os.path.join(PROJECT_DIR, "apps", "application", 'sql',
-                                                        'list_dataset_paragraph_by_paragraph_id.sql')),
+                                                        'list_knowledge_paragraph_by_paragraph_id.sql')),
                                       with_table_name=True)
        # 如果向量库中存在脏数据 直接删除
        if len(paragraph_list) != len(paragraph_id_list):
--- a/apps/application/serializers/application.py
+++ b/apps/application/serializers/application.py
@ -6,8 +6,10 @@
    @date：2025/5/26 17:03
    @desc:
 """
+import asyncio
 import datetime
 import hashlib
+import json
 import os
 import pickle
 import re
@ -19,6 +21,7 @@ from django.db import models, transaction
 from django.db.models import QuerySet, Q
 from django.http import HttpResponse
 from django.utils.translation import gettext_lazy as _
+from langchain_mcp_adapters.client import MultiServerMCPClient
 from rest_framework import serializers, status
 from rest_framework.utils.formatting import lazy_format

@ -36,6 +39,7 @@ from knowledge.models import Knowledge, KnowledgeScope
 from knowledge.serializers.knowledge import KnowledgeSerializer, KnowledgeModelSerializer
 from maxkb.conf import PROJECT_DIR
 from models_provider.models import Model
+from models_provider.tools import get_model_instance_by_model_workspace_id
 from system_manage.models import WorkspaceUserResourcePermission
 from tools.models import Tool, ToolScope
 from tools.serializers.tool import ToolModelSerializer
@ -384,9 +388,9 @@ class ApplicationEditSerializer(serializers.Serializer):
                                               label=_("Historical chat records"))
    prologue = serializers.CharField(required=False, allow_null=True, allow_blank=True, max_length=102400,
                                     label=_("Opening remarks"))
-    dataset_id_list = serializers.ListSerializer(required=False, child=serializers.UUIDField(required=True),
-                                                 label=_("Related Knowledge Base")
-                                                 )
+    knowledge_id_list = serializers.ListSerializer(required=False, child=serializers.UUIDField(required=True),
+                                                   label=_("Related Knowledge Base")
+                                                   )
    # 数据集相关设置
    knowledge_setting = KnowledgeSettingSerializer(required=False, allow_null=True,
                                                   label=_("Dataset settings"))
@ -441,8 +445,8 @@ class ApplicationSerializer(serializers.Serializer):
        return ApplicationCreateSerializer.ApplicationResponse(application_model).data

    @staticmethod
-    def to_application_knowledge_mapping(application_id: str, dataset_id: str):
-        return ApplicationKnowledgeMapping(id=uuid.uuid7(), application_id=application_id, dataset_id=dataset_id)
+    def to_application_knowledge_mapping(application_id: str, knowledge_id: str):
+        return ApplicationKnowledgeMapping(id=uuid.uuid7(), application_id=application_id, knowledge_id=knowledge_id)

    def insert_simple(self, instance: Dict):
        self.is_valid(raise_exception=True)
@ -451,10 +455,10 @@ class ApplicationSerializer(serializers.Serializer):
        ApplicationCreateSerializer.SimplateRequest(data=instance).is_valid(user_id=user_id, raise_exception=True)
        application_model = ApplicationCreateSerializer.SimplateRequest.to_application_model(user_id, workspace_id,
                                                                                             instance)
-        dataset_id_list = instance.get('knowledge_id_list', [])
+        knowledge_id_list = instance.get('knowledge_id_list', [])
        application_knowledge_mapping_model_list = [
-            self.to_application_knowledge_mapping(application_model.id, dataset_id) for
-            dataset_id in dataset_id_list]
+            self.to_application_knowledge_mapping(application_model.id, knowledge_id) for
+            knowledge_id in knowledge_id_list]
        # 插入应用
        application_model.save()
        # 插入认证信息
@ -519,15 +523,15 @@ class ApplicationSerializer(serializers.Serializer):
    def to_application(application, workspace_id, user_id):
        work_flow = application.get('work_flow')
        for node in work_flow.get('nodes', []):
-            if node.get('type') == 'search-dataset-node':
-                node.get('properties', {}).get('node_data', {})['dataset_id_list'] = []
+            if node.get('type') == 'search-knowledge-node':
+                node.get('properties', {}).get('node_data', {})['knowledge_id_list'] = []
        return Application(id=uuid.uuid7(),
                           user_id=user_id,
                           name=application.get('name'),
                           workspace_id=workspace_id,
                           desc=application.get('desc'),
                           prologue=application.get('prologue'), dialogue_number=application.get('dialogue_number'),
-                           dataset_setting=application.get('dataset_setting'),
+                           knowledge_setting=application.get('knowledge_setting'),
                           model_setting=application.get('model_setting'),
                           model_params_setting=application.get('model_params_setting'),
                           tts_model_params_setting=application.get('tts_model_params_setting'),
@ -545,6 +549,27 @@ class ApplicationSerializer(serializers.Serializer):
                           )


+class TextToSpeechRequest(serializers.Serializer):
+    """Text-to-speech request payload: a required `text` field."""
+    text = serializers.CharField(required=True, label=_('Text'))
+
+
+class SpeechToTextRequest(serializers.Serializer):
+    """Speech-to-text request payload: a required uploaded `file`."""
+    file = UploadedFileField(required=True, label=_("file"))
+
+
+class PlayDemoTextRequest(serializers.Serializer):
+    """Demo playback request payload: the required UUID of the text-to-speech model."""
+    tts_model_id = serializers.UUIDField(required=True, label=_('Text to speech model ID'))
+
+
+async def get_mcp_tools(servers):
+    """Open a MultiServerMCPClient for `servers` and return what its get_tools() yields."""
+    # NOTE(review): relies on the client supporting `async with` and on get_tools() being
+    # callable without `await` — confirm against the pinned langchain-mcp-adapters version.
+    async with MultiServerMCPClient(servers) as client:
+        return client.get_tools()
+
+
+class McpServersSerializer(serializers.Serializer):
+    """Validates that a payload carries the required `mcp_servers` JSON field."""
+    mcp_servers = serializers.JSONField(required=True)
+
+
 class ApplicationOperateSerializer(serializers.Serializer):
    application_id = serializers.UUIDField(required=True, label=_("Application ID"))
    user_id = serializers.UUIDField(required=True, label=_("User ID"))
@ -559,6 +584,23 @@ class ApplicationOperateSerializer(serializers.Serializer):
        if not query_set.exists():
            raise AppApiException(500, _('Application id does not exist'))

+    def get_mcp_servers(self, instance, with_valid=True):
+        """
+        List the tools offered by every MCP server named in instance['mcp_servers'].
+        @param instance:   payload whose 'mcp_servers' entry is JSON text; it is decoded and then
+                           iterated and indexed by server name
+        @param with_valid: validate this serializer and the payload before doing any work
+        @return: list of dicts with the keys 'server', 'name', 'description' and 'args_schema'
+        """
+        if with_valid:
+            self.is_valid(raise_exception=True)
+            McpServersSerializer(data=instance).is_valid(raise_exception=True)
+        server_config = json.loads(instance.get('mcp_servers'))
+        tool_list = []
+        for server_name in server_config:
+            # Query one server at a time so each tool can be tagged with the server it came from.
+            server_tools = asyncio.run(get_mcp_tools({server_name: server_config[server_name]}))
+            for mcp_tool in server_tools:
+                tool_list.append({
+                    'server': server_name,
+                    'name': mcp_tool.name,
+                    'description': mcp_tool.description,
+                    'args_schema': mcp_tool.args_schema,
+                })
+        return tool_list
+
    def delete(self, with_valid=True):
        if with_valid:
            self.is_valid()
@ -691,7 +733,7 @@ class ApplicationOperateSerializer(serializers.Serializer):
        if application.type == ApplicationTypeChoices.SIMPLE.value:
            application.is_publish = True
        update_keys = ['name', 'desc', 'model_id', 'multiple_rounds_dialogue', 'prologue', 'status',
-                       'dataset_setting', 'model_setting', 'problem_optimization', 'dialogue_number',
+                       'knowledge_setting', 'model_setting', 'problem_optimization', 'dialogue_number',
                       'stt_model_id', 'tts_model_id', 'tts_model_enable', 'stt_model_enable', 'tts_type',
                       'tts_autoplay', 'stt_autosend', 'file_upload_enable', 'file_upload_setting',
                       'api_key_is_active', 'icon', 'work_flow', 'model_params_setting', 'tts_model_params_setting',
@ -746,7 +788,7 @@ class ApplicationOperateSerializer(serializers.Serializer):
        """
        修改知识库检索节点 数据
        定义 all_knowledge_id_list:    所有的关联知识库
-            dataset_id_list:          当前用户可看到的关联知识库列表
+            knowledge_id_list:          当前用户可看到的关联知识库列表
            knowledge_list:           用户
        @param workflow:              知识库
        @param available_knowledge_dict:   当前用户可用的知识库
@ -802,3 +844,35 @@ class ApplicationOperateSerializer(serializers.Serializer):
        QuerySet(ApplicationKnowledgeMapping).bulk_create(
            [ApplicationKnowledgeMapping(application_id=application_id, knowledge_id=knowledge_id) for knowledge_id in
             knowledge_id_list]) if len(knowledge_id_list) > 0 else None
+
+    def speech_to_text(self, instance, with_valid=True):
+        """
+        Transcribe instance['file'] with the application's speech-to-text model.
+        @param instance:   payload holding the uploaded audio under 'file'
+        @param with_valid: validate this serializer and the payload first
+        @return: whatever the model's speech_to_text returns, or None when stt_model_enable is falsy
+        """
+        if with_valid:
+            self.is_valid(raise_exception=True)
+            SpeechToTextRequest(data=instance).is_valid(raise_exception=True)
+        application = QuerySet(Application).filter(id=self.data.get('application_id')).first()
+        if not application.stt_model_enable:
+            return None
+        stt_model = get_model_instance_by_model_workspace_id(application.stt_model_id, application.workspace_id)
+        return stt_model.speech_to_text(instance.get('file'))
+
+    def text_to_speech(self, instance, with_valid=True):
+        """
+        Synthesize instance['text'] with the application's text-to-speech model.
+        @param instance:   payload holding the text under 'text'
+        @param with_valid: validate this serializer and the payload first
+        @return: whatever the model's text_to_speech returns, or None when tts_model_enable is falsy
+        """
+        if with_valid:
+            self.is_valid(raise_exception=True)
+            TextToSpeechRequest(data=instance).is_valid(raise_exception=True)
+        application = QuerySet(Application).filter(id=self.data.get('application_id')).first()
+        if not application.tts_model_enable:
+            return None
+        # The application's stored TTS parameters are forwarded as keyword arguments.
+        tts_model = get_model_instance_by_model_workspace_id(
+            application.tts_model_id, application.workspace_id, **application.tts_model_params_setting)
+        return tts_model.text_to_speech(instance.get('text'))
+
+    def play_demo_text(self, instance, with_valid=True):
+        """
+        Synthesize a fixed demo sentence with the text-to-speech model chosen in the payload.
+        @param instance:   payload holding 'tts_model_id'; every other entry is forwarded to the
+                           model factory as a keyword argument
+        @param with_valid: validate this serializer and the payload first
+        @return: whatever the model's text_to_speech returns for the demo sentence
+        """
+        text = '你好，这里是语音播放测试'
+        if with_valid:
+            self.is_valid(raise_exception=True)
+            PlayDemoTextRequest(data=instance).is_valid(raise_exception=True)
+        tts_model_id = instance['tts_model_id']
+        # Build the model kwargs from a copy: popping from `instance` would mutate the caller's
+        # payload (the view passes request.data) and fails outright on an immutable QueryDict.
+        model_params = {key: value for key, value in instance.items() if key != 'tts_model_id'}
+        model = get_model_instance_by_model_workspace_id(tts_model_id, self.data.get('workspace_id'), **model_params)
+        return model.text_to_speech(text)
--- a/apps/application/urls.py
+++ b/apps/application/urls.py
@ -30,6 +30,10 @@ urlpatterns = [
    path('workspace/<str:workspace_id>/application/<str:application_id>/work_flow_version/<int:current_page>/<int:page_size>', views.ApplicationVersionView.Page.as_view()),
    path('workspace/<str:workspace_id>/application/<str:application_id>/work_flow_version/<str:work_flow_version_id>', views.ApplicationVersionView.Operate.as_view()),
    path('workspace/<str:workspace_id>/application/<str:application_id>/open', views.OpenView.as_view()),
+    path('workspace/<str:workspace_id>/application/<str:application_id>/text_to_speech', views.TextToSpeech.as_view()),
+    path('workspace/<str:workspace_id>/application/<str:application_id>/speech_to_text', views.SpeechToText.as_view()),
+    path('workspace/<str:workspace_id>/application/<str:application_id>/play_demo_text', views.PlayDemoText.as_view()),
+    path('workspace/<str:workspace_id>/application/<str:application_id>/mcp_tools', views.McpServers.as_view()),
    path('chat_message/<str:chat_id>', views.ChatView.as_view()),

 ]
--- a/apps/application/views/application.py
+++ b/apps/application/views/application.py
@ -7,6 +7,7 @@
    @desc:
 """
 from django.db.models import QuerySet
+from django.http import HttpResponse
 from django.utils.translation import gettext_lazy as _
 from drf_spectacular.utils import extend_schema
 from rest_framework.parsers import MultiPartParser
@ -14,13 +15,14 @@ from rest_framework.request import Request
 from rest_framework.views import APIView

 from application.api.application_api import ApplicationCreateAPI, ApplicationQueryAPI, ApplicationImportAPI, \
-    ApplicationExportAPI, ApplicationOperateAPI, ApplicationEditAPI
+    ApplicationExportAPI, ApplicationOperateAPI, ApplicationEditAPI, TextToSpeechAPI, SpeechToTextAPI, PlayDemoTextAPI
 from application.models import Application
-from application.serializers.application import ApplicationSerializer, Query, ApplicationOperateSerializer
+from application.serializers.application import ApplicationSerializer, Query, ApplicationOperateSerializer, \
+    McpServersSerializer
 from common import result
 from common.auth import TokenAuth
 from common.auth.authentication import has_permissions
-from common.constants.permission_constants import PermissionConstants, RoleConstants
+from common.constants.permission_constants import PermissionConstants, RoleConstants, CompareConstants
 from common.log.log import log


@ -233,3 +235,101 @@ class ApplicationAPI(APIView):
                ApplicationOperateSerializer(
                    data={'application_id': application_id, 'user_id': request.user.id,
                          'workspace_id': workspace_id, }).publish(request.data))
+
+
+class McpServers(APIView):
+    """List the tools exposed by the MCP servers passed in the `mcp_servers` query parameter."""
+    authentication_classes = [TokenAuth]
+
+    @extend_schema(
+        methods=['GET'],
+        # Own description/operation id: the previous values were copied from the speech-to-text
+        # endpoint, which duplicated its operation id and advertised a file-upload body on a GET.
+        description=_("Get MCP tools"),
+        summary=_("Get MCP tools"),
+        operation_id=_("Get MCP tools"),  # type: ignore
+        parameters=ApplicationOperateAPI.get_parameters(),
+        responses=SpeechToTextAPI.get_response(),
+        tags=[_('Application')]  # type: ignore
+    )
+    @has_permissions(PermissionConstants.APPLICATION_READ.get_workspace_application_permission(),
+                     PermissionConstants.APPLICATION_READ.get_workspace_permission_workspace_manage_role(),
+                     RoleConstants.USER.get_workspace_role(),
+                     RoleConstants.WORKSPACE_MANAGE.get_workspace_role())
+    def get(self, request: Request, workspace_id, application_id: str):
+        # The serializer validates application_id/user_id, so they belong in its data; the MCP
+        # configuration is the `instance` argument that get_mcp_servers requires.
+        operate = ApplicationOperateSerializer(
+            data={'application_id': application_id, 'workspace_id': workspace_id, 'user_id': request.user.id})
+        return result.success(operate.get_mcp_servers({'mcp_servers': request.query_params.get('mcp_servers')}))
+
+
+class SpeechToText(APIView):
+    """Workspace endpoint that passes an uploaded audio file to ApplicationOperateSerializer.speech_to_text."""
+    authentication_classes = [TokenAuth]
+
+    @extend_schema(
+        methods=['POST'],
+        description=_("speech to text"),
+        summary=_("speech to text"),
+        operation_id=_("speech to text"),  # type: ignore
+        parameters=SpeechToTextAPI.get_parameters(),
+        request=SpeechToTextAPI.get_request(),
+        responses=SpeechToTextAPI.get_response(),
+        tags=[_('Application')]  # type: ignore
+    )
+    @has_permissions(PermissionConstants.APPLICATION_EDIT.get_workspace_application_permission(),
+                     PermissionConstants.APPLICATION_EDIT.get_workspace_permission_workspace_manage_role(),
+                     RoleConstants.USER.get_workspace_role(),
+                     RoleConstants.WORKSPACE_MANAGE.get_workspace_role())
+    def post(self, request: Request, workspace_id: str, application_id: str):
+        # The audio is read from the multipart field named 'file'; the result is wrapped by result.success.
+        return result.success(
+            ApplicationOperateSerializer(
+                data={'application_id': application_id, 'workspace_id': workspace_id, 'user_id': request.user.id})
+            .speech_to_text({'file': request.FILES.get('file')}))
+
+
+class TextToSpeech(APIView):
+    """Workspace endpoint that returns the output of ApplicationOperateSerializer.text_to_speech as an audio attachment."""
+    authentication_classes = [TokenAuth]
+
+    @extend_schema(
+        methods=['POST'],
+        description=_("text to speech"),
+        summary=_("text to speech"),
+        operation_id=_("text to speech"),  # type: ignore
+        parameters=TextToSpeechAPI.get_parameters(),
+        request=TextToSpeechAPI.get_request(),
+        responses=TextToSpeechAPI.get_response(),
+        tags=[_('Application')]  # type: ignore
+    )
+    @has_permissions(PermissionConstants.APPLICATION_EDIT.get_workspace_application_permission(),
+                     PermissionConstants.APPLICATION_EDIT.get_workspace_permission_workspace_manage_role(),
+                     RoleConstants.USER.get_workspace_role(),
+                     RoleConstants.WORKSPACE_MANAGE.get_workspace_role())
+    def post(self, request: Request, workspace_id: str, application_id: str):
+        # request.data is handed to the serializer unchanged; it reads the 'text' entry.
+        byte_data = ApplicationOperateSerializer(
+            data={'application_id': application_id, 'workspace_id': workspace_id,
+                  'user_id': request.user.id}).text_to_speech(request.data)
+        # The payload is returned raw (not wrapped in a result envelope) with an mp3 content type.
+        return HttpResponse(byte_data, status=200, headers={'Content-Type': 'audio/mp3',
+                                                            'Content-Disposition': 'attachment; filename="abc.mp3"'})
+
+
+class PlayDemoText(APIView):
+    """Workspace endpoint that returns the output of ApplicationOperateSerializer.play_demo_text as an audio attachment."""
+    authentication_classes = [TokenAuth]
+
+    @extend_schema(
+        methods=['POST'],
+        description=_("PlayDemo"),
+        summary=_("PlayDemo"),
+        operation_id=_("PlayDemo"),  # type: ignore
+        parameters=PlayDemoTextAPI.get_parameters(),
+        request=PlayDemoTextAPI.get_request(),
+        responses=PlayDemoTextAPI.get_response(),
+        tags=[_('Application')]  # type: ignore
+    )
+    @has_permissions(PermissionConstants.APPLICATION_EDIT.get_workspace_application_permission(),
+                     PermissionConstants.APPLICATION_EDIT.get_workspace_permission_workspace_manage_role(),
+                     RoleConstants.USER.get_workspace_role(),
+                     RoleConstants.WORKSPACE_MANAGE.get_workspace_role())
+    @log(menu='Application', operate="trial listening",
+         get_operation_object=lambda r, k: get_application_operation_object(k.get('application_id')))
+    def post(self, request: Request, workspace_id: str, application_id: str):
+        # request.data must carry 'tts_model_id'; the serializer forwards the remaining entries to the model.
+        byte_data = ApplicationOperateSerializer(
+            data={'application_id': application_id, 'workspace_id': workspace_id,
+                  'user_id': request.user.id}).play_demo_text(request.data)
+        # The payload is returned raw (not wrapped in a result envelope) with an mp3 content type.
+        return HttpResponse(byte_data, status=200, headers={'Content-Type': 'audio/mp3',
+                                                            'Content-Disposition': 'attachment; filename="abc.mp3"'})
--- a/apps/chat/serializers/chat.py
+++ b/apps/chat/serializers/chat.py
@ -27,6 +27,7 @@ from application.flow.i_step_node import WorkFlowPostHandler
 from application.flow.workflow_manage import WorkflowManage
 from application.models import Application, ApplicationTypeChoices, WorkFlowVersion, ApplicationKnowledgeMapping, \
    ChatUserType, ApplicationChatUserStats, ApplicationAccessToken, ChatRecord, Chat
+from application.serializers.application import ApplicationOperateSerializer
 from application.serializers.common import ChatInfo
 from common.exception.app_exception import AppApiException, AppChatNumOutOfBoundsFailed, ChatException
 from common.handle.base_to_response import BaseToResponse
@ -282,7 +283,7 @@ class ChatSerializers(serializers.Serializer):

    def re_open_chat_simple(self, chat_id, application):
        # 数据集id列表
-        knowledge_id_list = [str(row.dataset_id) for row in
+        knowledge_id_list = [str(row.knowledge_id) for row in
                             QuerySet(ApplicationKnowledgeMapping).filter(
                                 application_id=application.id)]

@ -292,7 +293,7 @@ class ChatSerializers(serializers.Serializer):
                                        knowledge_id__in=knowledge_id_list,
                                        is_active=False)]
        chat_info = ChatInfo(chat_id, self.data.get('chat_user_id'), self.data.get('chat_user_type'), knowledge_id_list,
-                             exclude_document_id_list, application)
+                             exclude_document_id_list, application.id, application)
        chat_record_list = list(QuerySet(ChatRecord).filter(chat_id=chat_id).order_by('-create_time')[0:5])
        chat_record_list.sort(key=lambda r: r.create_time)
        for chat_record in chat_record_list:
@ -378,3 +379,27 @@ class OpenChatSerializers(serializers.Serializer):
                 application_id,
                 application, debug=debug).set_cache()
        return chat_id
+
+
+class TextToSpeechSerializers(serializers.Serializer):
+    """Chat-side text-to-speech entry point; delegates to ApplicationOperateSerializer as the application's owner."""
+    application_id = serializers.UUIDField(required=True, label=_("Application ID"))
+
+    def text_to_speech(self, instance):
+        """
+        Run text-to-speech for the application identified by `application_id`.
+        @param instance: payload forwarded to ApplicationOperateSerializer.text_to_speech
+        @return: the value returned by ApplicationOperateSerializer.text_to_speech
+        """
+        self.is_valid(raise_exception=True)
+        application_id = self.data.get('application_id')
+        application = QuerySet(Application).filter(id=application_id).first()
+        if application is None:
+            # Report a missing application as an API error rather than an AttributeError on `.user_id`.
+            raise AppApiException(500, _('Application id does not exist'))
+        return ApplicationOperateSerializer(
+            data={'application_id': application_id,
+                  'user_id': application.user_id}).text_to_speech(instance)
+
+
+class SpeechToTextSerializers(serializers.Serializer):
+    """Chat-side speech-to-text entry point; delegates to ApplicationOperateSerializer as the application's owner."""
+    application_id = serializers.UUIDField(required=True, label=_("Application ID"))
+
+    def speech_to_text(self, instance):
+        """
+        Run speech-to-text for the application identified by `application_id`.
+        @param instance: payload forwarded to ApplicationOperateSerializer.speech_to_text
+        @return: the value returned by ApplicationOperateSerializer.speech_to_text
+        """
+        self.is_valid(raise_exception=True)
+        application_id = self.data.get('application_id')
+        application = QuerySet(Application).filter(id=application_id).first()
+        if application is None:
+            # Report a missing application as an API error rather than an AttributeError on `.user_id`.
+            raise AppApiException(500, _('Application id does not exist'))
+        return ApplicationOperateSerializer(
+            data={'application_id': application_id,
+                  'user_id': application.user_id}).speech_to_text(instance)
--- a/apps/chat/urls.py
+++ b/apps/chat/urls.py
@ -11,6 +11,8 @@ urlpatterns = [
    path('application/profile', views.ApplicationProfile.as_view()),
    path('chat_message/<str:chat_id>', views.ChatView.as_view()),
    path('open', views.OpenView.as_view()),
+    path('text_to_speech', views.TextToSpeech.as_view()),
+    path('speech_to_text', views.SpeechToText.as_view()),
    path('captcha', views.CaptchaView.as_view(), name='captcha'),
    path('vote/chat/<str:chat_id>/chat_record/<str:chat_record_id>', views.VoteView.as_view(), name='vote'),
    path('historical_conversation', views.HistoricalConversationView.as_view(), name='historical_conversation'),
--- a/apps/chat/views/chat.py
+++ b/apps/chat/views/chat.py
@ -12,12 +12,15 @@ from drf_spectacular.utils import extend_schema
 from rest_framework.request import Request
 from rest_framework.views import APIView

+from application.api.application_api import SpeechToTextAPI, TextToSpeechAPI
+from application.serializers.application import ApplicationOperateSerializer
 from chat.api.chat_api import ChatAPI
 from chat.api.chat_authentication_api import ChatAuthenticationAPI, ChatAuthenticationProfileAPI, ChatOpenAPI
-from chat.serializers.chat import OpenChatSerializers, ChatSerializers
+from chat.serializers.chat import OpenChatSerializers, ChatSerializers, SpeechToTextSerializers, TextToSpeechSerializers
 from chat.serializers.chat_authentication import AnonymousAuthenticationSerializer, ApplicationProfileSerializer, \
    AuthProfileSerializer
 from common.auth import TokenAuth
+from common.auth.authentication import has_permissions
 from common.constants.permission_constants import ChatAuth
 from common.exception.app_exception import AppAuthenticationFailed
 from common.result import result
@ -135,3 +138,41 @@ class CaptchaView(APIView):
                   responses=CaptchaAPI.get_response())
    def get(self, request: Request):
        return result.success(CaptchaSerializer().generate())
+
+
+class SpeechToText(APIView):
+    """Chat endpoint that passes an uploaded audio file to SpeechToTextSerializers.speech_to_text."""
+    authentication_classes = [TokenAuth]
+
+    @extend_schema(
+        methods=['POST'],
+        description=_("speech to text"),
+        summary=_("speech to text"),
+        operation_id=_("speech to text"),  # type: ignore
+        request=SpeechToTextAPI.get_request(),
+        responses=SpeechToTextAPI.get_response(),
+        tags=[_('Application')]  # type: ignore
+    )
+    def post(self, request: Request):
+        # The application comes from the authenticated token (request.auth), not from the URL.
+        return result.success(
+            SpeechToTextSerializers(
+                data={'application_id': request.auth.application_id})
+            .speech_to_text({'file': request.FILES.get('file')}))
+
+
+class TextToSpeech(APIView):
+    """Chat endpoint that returns the output of TextToSpeechSerializers.text_to_speech as an audio attachment."""
+    authentication_classes = [TokenAuth]
+
+    @extend_schema(
+        methods=['POST'],
+        description=_("text to speech"),
+        summary=_("text to speech"),
+        operation_id=_("text to speech"),  # type: ignore
+        request=TextToSpeechAPI.get_request(),
+        responses=TextToSpeechAPI.get_response(),
+        tags=[_('Application')]  # type: ignore
+    )
+    def post(self, request: Request):
+        # The application comes from the authenticated token (request.auth), not from the URL.
+        byte_data = TextToSpeechSerializers(
+            data={'application_id': request.auth.application_id}).text_to_speech(request.data)
+        # NOTE(review): the HttpResponse import is not visible in this hunk — confirm this module imports it.
+        return HttpResponse(byte_data, status=200, headers={'Content-Type': 'audio/mp3',
+                                                            'Content-Disposition': 'attachment; filename="abc.mp3"'})
--- a/ui/public/tipIMG.jpg
+++ b/ui/public/tipIMG.jpg
--- a/ui/src/api/application/application.ts
+++ b/ui/src/api/application/application.ts
@ -1,14 +1,14 @@
-import {Result} from '@/request/Result'
-import {get, post, postStream, del, put, request, download, exportFile} from '@/request/index'
-import type {pageRequest} from '@/api/type/common'
-import type {ApplicationFormType} from '@/api/type/application'
-import {type Ref} from 'vue'
+import { Result } from '@/request/Result'
+import { get, post, postStream, del, put, request, download, exportFile } from '@/request/index'
+import type { pageRequest } from '@/api/type/common'
+import type { ApplicationFormType } from '@/api/type/application'
+import { type Ref } from 'vue'
 import useStore from '@/stores'

-const prefix: any = {_value: '/workspace/'}
+const prefix: any = { _value: '/workspace/' }
 Object.defineProperty(prefix, 'value', {
  get: function () {
-    const {user} = useStore()
+    const { user } = useStore()
    return this._value + user.getWorkspaceId() + '/application'
  },
 })
@ -219,7 +219,7 @@ const updatePlatformConfig: (
  application_id: string,
  type: string,
  data: any,
-  loading?: Ref<boolean>
+  loading?: Ref<boolean>,
 ) => Promise<Result<any>> = (application_id, type, data, loading) => {
  return post(`${prefix.value}/${application_id}/platform/${type}`, data, undefined, loading)
 }
@ -236,6 +236,55 @@ const publish: (
 ) => Promise<Result<any>> = (application_id, data, loading) => {
  return put(`${prefix.value}/${application_id}/publish`, data, {}, loading)
 }
+
+/**
+ * Request demo audio from the application's `play_demo_text` endpoint (POST via `download`).
+ * @param application_id application ID used in the request path
+ * @param data request body
+ * @param loading optional loading flag forwarded to `download`
+ * @returns the promise returned by `download`
+ */
+const playDemoText: (application_id: string, data: any, loading?: Ref<boolean>) => Promise<any> = (
+  application_id,
+  data,
+  loading,
+) => {
+  return download(
+    `${prefix.value}/${application_id}/play_demo_text`,
+    'post',
+    data,
+    undefined,
+    loading,
+  )
+}
+
+/**
+ * Text to speech: POST `data` to the application's `text_to_speech` endpoint via `download`.
+ * @param application_id application ID used in the request path (primitive `string`, matching `playDemoText`)
+ * @param data request body
+ * @param loading optional loading flag forwarded to `download`
+ */
+const textToSpeech: (
+  application_id: string,
+  data: any,
+  loading?: Ref<boolean>,
+) => Promise<Result<any>> = (application_id, data, loading) => {
+  return download(
+    `${prefix.value}/${application_id}/text_to_speech`,
+    'post',
+    data,
+    undefined,
+    loading,
+  )
+}
+/**
+ * Speech to text: POST `data` to the application's `speech_to_text` endpoint.
+ * @param application_id application ID used in the request path (primitive `string`, matching `playDemoText`)
+ * @param data request body
+ * @param loading optional loading flag forwarded to `post`
+ */
+const speechToText: (
+  application_id: string,
+  data: any,
+  loading?: Ref<boolean>,
+) => Promise<Result<any>> = (application_id, data, loading) => {
+  return post(`${prefix.value}/${application_id}/speech_to_text`, data, undefined, loading)
+}
+
 export default {
  getAllApplication,
  getApplication,
@ -256,5 +305,8 @@ export default {
  updatePlatformStatus,
  getPlatformConfig,
  publish,
-  updatePlatformConfig
+  updatePlatformConfig,
+  playDemoText,
+  textToSpeech,
+  speechToText,
 }
--- a/ui/src/api/chat/chat.ts
+++ b/ui/src/api/chat/chat.ts
@ -244,7 +244,25 @@ const getChatRecord: (
 ) => Promise<Result<any>> = (chat_id, chat_record_id, loading) => {
  return get(`historical_conversation/${chat_id}/record/${chat_record_id}`, {}, loading)
 }
+/**
+ * Text to speech: POST `data` to the chat `text_to_speech` endpoint via `download`.
+ * NOTE(review): the `download` import is not visible in this hunk — confirm this module imports it.
+ */
+const textToSpeech: (data: any, loading?: Ref<boolean>) => Promise<Result<any>> = (
+  data,
+  loading,
+) => {
+  return download(`text_to_speech`, 'post', data, undefined, loading)
+}

+/**
+ * Speech to text: POST `data` to the chat `speech_to_text` endpoint.
+ */
+const speechToText: (data: any, loading?: Ref<boolean>) => Promise<Result<any>> = (
+  data,
+  loading,
+) => {
+  return post(`speech_to_text`, data, undefined, loading)
+}
 export default {
  open,
  chat,
@ -269,4 +287,6 @@ export default {
  resetCurrentPassword,
  getChatUserProfile,
  getChatRecord,
+  textToSpeech,
+  speechToText,
 }
--- a/ui/src/components/ai-chat/component/chat-input-operate/index.vue
+++ b/ui/src/components/ai-chat/component/chat-input-operate/index.vue
@ -301,7 +301,7 @@
  </div>
 </template>
 <script setup lang="ts">
-import { ref, computed, onMounted, nextTick, watch } from 'vue'
+import { ref, computed, onMounted, nextTick, watch, type Ref } from 'vue'
 import Recorder from 'recorder-core'
 import TouchChat from './TouchChat.vue'
 import applicationApi from '@/api/application/application'
@ -314,6 +314,7 @@ import 'recorder-core/src/engine/mp3'
 import 'recorder-core/src/engine/mp3-engine'
 import { MsgWarning } from '@/utils/message'
 import { t } from '@/locales'
+import chatAPI from '@/api/chat/chat'
 const router = useRouter()
 const route = useRoute()
 const {
@ -687,7 +688,7 @@ class RecorderManage {
        `${err}
        <div style="width: 100%;height:1px;border-top:1px var(--el-border-color) var(--el-border-style);margin:10px 0;"></div>
        ${t('chat.tip.recorderTip')}
-    <img src="${new URL(`@/assets/tipIMG.jpg`, import.meta.url).href}" style="width: 100%;" />`,
+    <img src="${new URL(`/tipIMG.jpg`, import.meta.url).href}" style="width: 100%;" />`,
        {
          confirmButtonText: t('chat.tip.confirm'),
          dangerouslyUseHTMLString: true,
@ -697,6 +698,16 @@ class RecorderManage {
    }
  }
 }
+/**
+ * Resolve the speech-to-text request function for the current chat type.
+ *
+ * For `ai-chat` the chat API is used. It takes no application id, so the
+ * first argument is accepted only to keep one call shape for both branches
+ * and is ignored. For every other type the application API is used.
+ */
+const getSpeechToTextAPI = () => {
+  if (props.type === 'ai-chat') {
+    return (application_id?: string, data?: any, loading?: Ref<boolean>) => {
+      return chatAPI.speechToText(data, loading)
+    }
+  } else {
+    // This must be the speech-to-text call: returning `applicationApi.textToSpeech`
+    // here would hand the recorded audio form data to the text-to-speech request.
+    // NOTE(review): assumes applicationApi exports `speechToText` (the former
+    // `postSpeechToText`, renamed like `textToSpeech`) — confirm in the API module.
+    return applicationApi.speechToText
+  }
+}
+const speechToTextAPI = getSpeechToTextAPI()
 // 上传录音文件
 const uploadRecording = async (audioBlob: Blob) => {
  try {
@ -710,8 +721,7 @@ const uploadRecording = async (audioBlob: Blob) => {
    if (props.applicationDetails.stt_autosend) {
      bus.emit('on:transcribing', true)
    }
-    applicationApi
-      .postSpeechToText(props.applicationDetails.id as string, formData, localLoading)
+    speechToTextAPI(props.applicationDetails.id as string, formData, localLoading)
      .then((response) => {
        inputValue.value = typeof response.data === 'string' ? response.data : ''
        // 自动发送
--- a/ui/src/components/ai-chat/component/operation-button/ChatOperationButton.vue
+++ b/ui/src/components/ai-chat/component/operation-button/ChatOperationButton.vue
@ -99,7 +99,7 @@
  </div>
 </template>
 <script setup lang="ts">
-import { nextTick, onMounted, ref, onBeforeUnmount } from 'vue'
+import { nextTick, onMounted, ref, onBeforeUnmount, type Ref } from 'vue'
 import { useRoute } from 'vue-router'
 import { copyClick } from '@/utils/clipboard'
 import applicationApi from '@/api/application/application'
@ -262,6 +262,16 @@ enum AudioStatus {
   */
  ERROR = 'ERROR',
 }
+/**
+ * Resolve the text-to-speech request function for the current chat type.
+ *
+ * Any type other than `ai-chat` uses the application API directly. For
+ * `ai-chat` the chat API is wrapped so both variants share the
+ * `(application_id, data, loading)` call shape; the id is ignored there.
+ */
+const getTextToSpeechAPI = () => {
+  if (props.type !== 'ai-chat') {
+    return applicationApi.textToSpeech
+  }
+  return (application_id?: string, data?: any, loading?: Ref<boolean>) =>
+    chatAPI.textToSpeech(data, loading)
+}
+const textToSpeechAPI = getTextToSpeechAPI()
 class AudioManage {
  textList: Array<string>
  statusList: Array<AudioStatus>
@ -313,12 +323,11 @@ class AudioManage {
          audioElement.src = text.match(/src="([^"]*)"/)?.[1] || ''
          this.statusList[index] = AudioStatus.READY
        } else {
-          applicationApi
-            .postTextToSpeech(
-              (props.applicationId as string) || (id as string),
-              { text: text },
-              loading,
-            )
+          textToSpeechAPI(
+            (props.applicationId as string) || (id as string),
+            { text: text },
+            loading,
+          )
            .then(async (res: any) => {
              if (res.type === 'application/json') {
                const text = await res.text()
@ -376,12 +385,11 @@ class AudioManage {
        if (audioElement instanceof HTMLAudioElement) {
          const text = this.textList[index]
          this.statusList[index] = AudioStatus.MOUNTED
-          applicationApi
-            .postTextToSpeech(
-              (props.applicationId as string) || (id as string),
-              { text: text },
-              loading,
-            )
+          textToSpeechAPI(
+            (props.applicationId as string) || (id as string),
+            { text: text },
+            loading,
+          )
            .then(async (res: any) => {
              if (res.type === 'application/json') {
                const text = await res.text()
--- a/ui/src/components/app-icon/icons/application.ts
+++ b/ui/src/components/app-icon/icons/application.ts
@ -500,4 +500,54 @@ export default {
      ])
    },
  },
+  'app-video-play': {
+    // Builds an <i> element wrapping an inline SVG made of two filled paths.
+    iconReader: () => {
+      // Attributes of the <svg> root: it fills its container on a 1024x1024 view box.
+      const svgAttrs = {
+        style: { height: '100%', width: '100%' },
+        viewBox: '0 0 1024 1024',
+        version: '1.1',
+        xmlns: 'http://www.w3.org/2000/svg',
+      }
+      // Path data in render order; each path is filled with the current text color.
+      const outlines = [
+        'M512 896a384 384 0 1 0 0-768 384 384 0 0 0 0 768z m469.333333-384c0 259.2-210.133333 469.333333-469.333333 469.333333S42.666667 771.2 42.666667 512 252.8 42.666667 512 42.666667s469.333333 210.133333 469.333333 469.333333z',
+        'M686.890667 539.776l-253.141334 159.274667a32.298667 32.298667 0 0 1-44.8-10.453334 32.896 32.896 0 0 1-4.949333-17.322666V352.768a32.64 32.64 0 0 1 32.512-32.768c6.101333 0 12.074667 1.706667 17.28 4.992l253.098667 159.232a32.853333 32.853333 0 0 1 0 55.552z',
+      ]
+      const pathNodes = outlines.map((d) => h('path', { d, fill: 'currentColor' }))
+      return h('i', [h('svg', svgAttrs, pathNodes)])
+    },
+  },
+  'app-video-pause': {
+    // Builds an <i> element wrapping an inline SVG made of two filled paths.
+    iconReader: () => {
+      // Attributes of the <svg> root: it fills its container on a 1024x1024 view box.
+      const svgAttrs = {
+        style: { height: '100%', width: '100%' },
+        viewBox: '0 0 1024 1024',
+        version: '1.1',
+        xmlns: 'http://www.w3.org/2000/svg',
+      }
+      // Path data in render order; each path is filled with the current text color.
+      const outlines = [
+        'M405.333333 341.333333a21.333333 21.333333 0 0 0-21.333333 21.333334v298.666666a21.333333 21.333333 0 0 0 21.333333 21.333334h42.666667a21.333333 21.333333 0 0 0 21.333333-21.333334v-298.666666a21.333333 21.333333 0 0 0-21.333333-21.333334h-42.666667zM576 341.333333a21.333333 21.333333 0 0 0-21.333333 21.333334v298.666666a21.333333 21.333333 0 0 0 21.333333 21.333334h42.666667a21.333333 21.333333 0 0 0 21.333333-21.333334v-298.666666a21.333333 21.333333 0 0 0-21.333333-21.333334h-42.666667z',
+        'M512 42.666667C252.8 42.666667 42.666667 252.8 42.666667 512s210.133333 469.333333 469.333333 469.333333 469.333333-210.133333 469.333333-469.333333S771.2 42.666667 512 42.666667zM128 512a384 384 0 1 1 768 0 384 384 0 0 1-768 0z',
+      ]
+      const pathNodes = outlines.map((d) => h('path', { d, fill: 'currentColor' }))
+      return h('i', [h('svg', svgAttrs, pathNodes)])
+    },
+  },
 }
--- a/ui/src/router/chat/index.ts
+++ b/ui/src/router/chat/index.ts
@ -42,6 +42,10 @@ router.beforeEach(
      })
      return
    }
+    const p_token = to.query.token
+    if (p_token) {
+      chatUser.setToken(p_token)
+    }
    const token = chatUser.getToken()
    if (authentication) {
      if (!token && to.name != 'login') {
--- a/ui/src/views/application/component/TTSModeParamSettingDialog.vue
+++ b/ui/src/views/application/component/TTSModeParamSettingDialog.vue
@ -47,6 +47,7 @@ import ModelAPI from '@/api/model/model'
 import applicationApi from '@/api/application/application'
 import DynamicsForm from '@/components/dynamics-form/index.vue'
 import { useRoute } from 'vue-router'
+import { MsgError } from '@/utils/message'
 const route = useRoute()
 const {
  params: { id },
@ -60,16 +61,11 @@ const form_data = ref<any>({})
 const dialogVisible = ref(false)
 const loading = ref(false)
 const playLoading = ref(false)
-const getApi = (model_id: string, application_id?: string) => {
-  return application_id
-    ? applicationApi.getModelParamsForm(application_id, model_id, loading)
-    : ModelAPI.getModelParamsForm(model_id, loading)
-}
+
 const open = (model_id: string, application_id?: string, model_setting_data?: any) => {
  form_data.value = {}
  tts_model_id.value = model_id
-  const api = getApi(model_id, application_id)
-  api.then((ok) => {
+  ModelAPI.getModelParamsForm(model_id, loading).then((ok) => {
    model_form_field.value = ok.data
    const resp = ok.data
      .map((item: any) => ({
@ -92,8 +88,7 @@ const open = (model_id: string, application_id?: string, model_setting_data?: an
 }

 const reset_default = (model_id: string, application_id?: string) => {
-  const api = getApi(model_id, application_id)
-  api.then((ok) => {
+  ModelAPI.getModelParamsForm(model_id, loading).then((ok) => {
    model_form_field.value = ok.data
    const model_setting_data = ok.data
      .map((item) => ({
@ -118,31 +113,31 @@ const testPlay = () => {
    ...form_data.value,
    tts_model_id: tts_model_id.value,
  }
-  // applicationApi
-  //   .playDemoText(id as string, data, playLoading)
-  //   .then(async (res: any) => {
-  //     if (res.type === 'application/json') {
-  //       const text = await res.text()
-  //       MsgError(text)
-  //       return
-  //     }
-  //     // 创建 Blob 对象
-  //     const blob = new Blob([res], { type: 'audio/mp3' })
+  applicationApi
+    .playDemoText(id as string, data, playLoading)
+    .then(async (res: any) => {
+      if (res.type === 'application/json') {
+        const text = await res.text()
+        MsgError(text)
+        return
+      }
+      // 创建 Blob 对象
+      const blob = new Blob([res], { type: 'audio/mp3' })

-  //     // 创建对象 URL
-  //     const url = URL.createObjectURL(blob)
+      // 创建对象 URL
+      const url = URL.createObjectURL(blob)

-  //     // 检查 audioPlayer 是否已经引用了 DOM 元素
-  //     if (audioPlayer.value instanceof HTMLAudioElement) {
-  //       audioPlayer.value.src = url
-  //       audioPlayer.value.play() // 自动播放音频
-  //     } else {
-  //       console.error('audioPlayer.value is not an instance of HTMLAudioElement')
-  //     }
-  //   })
-  //   .catch((err) => {
-  //     console.log('err: ', err)
-  //   })
+      // 检查 audioPlayer 是否已经引用了 DOM 元素
+      if (audioPlayer.value instanceof HTMLAudioElement) {
+        audioPlayer.value.src = url
+        audioPlayer.value.play() // 自动播放音频
+      } else {
+        console.error('audioPlayer.value is not an instance of HTMLAudioElement')
+      }
+    })
+    .catch((err) => {
+      console.log('err: ', err)
+    })
 }

 defineExpose({ open, reset_default })