diff --git a/apps/application/flow/step_node/__init__.py b/apps/application/flow/step_node/__init__.py index 535560b5f..5e992079b 100644 --- a/apps/application/flow/step_node/__init__.py +++ b/apps/application/flow/step_node/__init__.py @@ -21,12 +21,14 @@ from .image_understand_step_node import * from .image_generate_step_node import * from .search_dataset_node import * +from .speech_to_text_step_node import BaseSpeechToTextNode from .start_node import * +from .text_to_speech_step_node.impl.base_text_to_speech_node import BaseTextToSpeechNode node_list = [BaseStartStepNode, BaseChatNode, BaseSearchDatasetNode, BaseQuestionNode, BaseConditionNode, BaseReplyNode, BaseFunctionNodeNode, BaseFunctionLibNodeNode, BaseRerankerNode, BaseApplicationNode, BaseDocumentExtractNode, - BaseImageUnderstandNode, BaseImageGenerateNode, BaseFormNode] + BaseImageUnderstandNode, BaseFormNode, BaseSpeechToTextNode, BaseTextToSpeechNode,BaseImageGenerateNode] def get_node(node_type): diff --git a/apps/application/flow/step_node/application_node/i_application_node.py b/apps/application/flow/step_node/application_node/i_application_node.py index 8c4675ea7..c0fb158fd 100644 --- a/apps/application/flow/step_node/application_node/i_application_node.py +++ b/apps/application/flow/step_node/application_node/i_application_node.py @@ -14,6 +14,7 @@ class ApplicationNodeSerializer(serializers.Serializer): user_input_field_list = serializers.ListField(required=False, error_messages=ErrMessage.uuid("用户输入字段")) image_list = serializers.ListField(required=False, error_messages=ErrMessage.list("图片")) document_list = serializers.ListField(required=False, error_messages=ErrMessage.list("文档")) + audio_list = serializers.ListField(required=False, error_messages=ErrMessage.list("音频")) child_node = serializers.DictField(required=False, allow_null=True, error_messages=ErrMessage.dict("子节点")) node_data = serializers.DictField(required=False, allow_null=True, error_messages=ErrMessage.dict("表单数据")) @@ -43,7 
+44,7 @@ class IApplicationNode(INode):
                     app_document_list[1:])
             for document in app_document_list:
                 if 'file_id' not in document:
-                    raise ValueError("参数值错误: 上传的文档中缺少file_id")
+                    raise ValueError("参数值错误: 上传的文档中缺少file_id,文档上传失败")
         app_image_list = self.node_params_serializer.data.get('image_list', [])
         if app_image_list and len(app_image_list) > 0:
             app_image_list = self.workflow_manage.get_reference_field(
@@ -51,11 +52,23 @@ class IApplicationNode(INode):
                 app_image_list[1:])
             for image in app_image_list:
                 if 'file_id' not in image:
-                    raise ValueError("参数值错误: 上传的图片中缺少file_id")
+                    raise ValueError("参数值错误: 上传的图片中缺少file_id,图片上传失败")
+
+        # Resolve the referenced audio list; every resolved item must carry a file_id.
+        app_audio_list = self.node_params_serializer.data.get('audio_list', [])
+        if app_audio_list and len(app_audio_list) > 0:
+            app_audio_list = self.workflow_manage.get_reference_field(
+                app_audio_list[0],
+                app_audio_list[1:])
+            for audio in app_audio_list:
+                if 'file_id' not in audio:
+                    raise ValueError("参数值错误: 上传的音频中缺少file_id,音频上传失败")
         return self.execute(**self.node_params_serializer.data, **self.flow_params_serializer.data,
                             app_document_list=app_document_list, app_image_list=app_image_list,
+                            app_audio_list=app_audio_list,
                             message=str(question), **kwargs)
 
     def execute(self, application_id, message, chat_id, chat_record_id, stream, re_chat, client_id, client_type,
-                app_document_list=None, app_image_list=None, child_node=None, node_data=None, **kwargs) -> NodeResult:
+                app_document_list=None, app_image_list=None, app_audio_list=None, child_node=None, node_data=None,
+                **kwargs) -> NodeResult:
         pass
diff --git a/apps/application/flow/step_node/application_node/impl/base_application_node.py b/apps/application/flow/step_node/application_node/impl/base_application_node.py
index 76f92f878..c6bb29be5 100644
--- a/apps/application/flow/step_node/application_node/impl/base_application_node.py
+++ b/apps/application/flow/step_node/application_node/impl/base_application_node.py
@@ -154,7 +154,7 @@ class BaseApplicationNode(IApplicationNode):
         self.answer_text
= details.get('answer') def execute(self, application_id, message, chat_id, chat_record_id, stream, re_chat, client_id, client_type, - app_document_list=None, app_image_list=None, child_node=None, node_data=None, + app_document_list=None, app_image_list=None, app_audio_list=None, child_node=None, node_data=None, **kwargs) -> NodeResult: from application.serializers.chat_message_serializers import ChatMessageSerializer # 生成嵌入应用的chat_id @@ -167,6 +167,8 @@ class BaseApplicationNode(IApplicationNode): app_document_list = [] if app_image_list is None: app_image_list = [] + if app_audio_list is None: + app_audio_list = [] runtime_node_id = None record_id = None child_node_value = None @@ -186,6 +188,7 @@ class BaseApplicationNode(IApplicationNode): 'client_type': client_type, 'document_list': app_document_list, 'image_list': app_image_list, + 'audio_list': app_audio_list, 'runtime_node_id': runtime_node_id, 'chat_record_id': record_id, 'child_node': child_node_value, @@ -234,5 +237,6 @@ class BaseApplicationNode(IApplicationNode): 'global_fields': global_fields, 'document_list': self.workflow_manage.document_list, 'image_list': self.workflow_manage.image_list, + 'audio_list': self.workflow_manage.audio_list, 'application_node_dict': self.context.get('application_node_dict') } diff --git a/apps/application/flow/step_node/speech_to_text_step_node/__init__.py b/apps/application/flow/step_node/speech_to_text_step_node/__init__.py new file mode 100644 index 000000000..f3feecc9c --- /dev/null +++ b/apps/application/flow/step_node/speech_to_text_step_node/__init__.py @@ -0,0 +1,3 @@ +# coding=utf-8 + +from .impl import * diff --git a/apps/application/flow/step_node/speech_to_text_step_node/i_speech_to_text_node.py b/apps/application/flow/step_node/speech_to_text_step_node/i_speech_to_text_node.py new file mode 100644 index 000000000..7e2a79b56 --- /dev/null +++ b/apps/application/flow/step_node/speech_to_text_step_node/i_speech_to_text_node.py @@ -0,0 +1,37 @@ +# coding=utf-8 
+
+from typing import Type
+
+from rest_framework import serializers
+
+from application.flow.i_step_node import INode, NodeResult
+from common.util.field_message import ErrMessage
+
+
+class SpeechToTextNodeSerializer(serializers.Serializer):
+    stt_model_id = serializers.CharField(required=True, error_messages=ErrMessage.char("模型id"))
+
+    is_result = serializers.BooleanField(required=False, error_messages=ErrMessage.boolean('是否返回内容'))
+
+    audio_list = serializers.ListField(required=False, error_messages=ErrMessage.list("音频"))
+
+
+class ISpeechToTextNode(INode):
+    """Interface of the speech-to-text workflow node; subclasses implement ``execute``."""
+    type = 'speech-to-text-node'
+
+    def get_node_params_serializer_class(self) -> Type[serializers.Serializer]:
+        return SpeechToTextNodeSerializer
+
+    def _run(self):
+        """Resolve the referenced audio list, require a file_id on each item, then call ``execute``."""
+        res = self.workflow_manage.get_reference_field(self.node_params_serializer.data.get('audio_list')[0],
+                                                       self.node_params_serializer.data.get('audio_list')[1:])
+        for audio in res:
+            if 'file_id' not in audio:
+                raise ValueError("参数值错误: 上传的音频中缺少file_id,音频上传失败")
+
+        return self.execute(audio=res, **self.node_params_serializer.data, **self.flow_params_serializer.data)
+
+    def execute(self, stt_model_id, chat_id, audio, **kwargs) -> NodeResult:
+        pass
diff --git a/apps/application/flow/step_node/speech_to_text_step_node/impl/__init__.py b/apps/application/flow/step_node/speech_to_text_step_node/impl/__init__.py
new file mode 100644
index 000000000..9d2da6158
--- /dev/null
+++ b/apps/application/flow/step_node/speech_to_text_step_node/impl/__init__.py
@@ -0,0 +1,3 @@
+# coding=utf-8
+
+from .base_speech_to_text_node import BaseSpeechToTextNode
diff --git a/apps/application/flow/step_node/speech_to_text_step_node/impl/base_speech_to_text_node.py b/apps/application/flow/step_node/speech_to_text_step_node/impl/base_speech_to_text_node.py
new file mode 100644
index 000000000..ed2ca9a68
--- /dev/null
+++ b/apps/application/flow/step_node/speech_to_text_step_node/impl/base_speech_to_text_node.py
@@ -0,0 +1,71 @@
+# coding=utf-8
+import os
+import tempfile
+import time
+import io
+from typing import List, Dict
+
+from django.db.models import QuerySet
+from pydub import AudioSegment
+from concurrent.futures import ThreadPoolExecutor
+from application.flow.i_step_node import NodeResult, INode
+from application.flow.step_node.speech_to_text_step_node.i_speech_to_text_node import ISpeechToTextNode
+from common.util.common import split_and_transcribe
+from dataset.models import File
+from setting.models_provider.tools import get_model_instance_by_model_user_id
+
+
+class BaseSpeechToTextNode(ISpeechToTextNode):
+    """Workflow node that passes uploaded audio files to a speech-to-text model."""
+
+    def save_context(self, details, workflow_manage):
+        """Copy the recorded answer from ``details`` into the context and ``answer_text``."""
+        self.context['answer'] = details.get('answer')
+        self.answer_text = details.get('answer')
+
+    def execute(self, stt_model_id, chat_id, audio, **kwargs) -> NodeResult:
+        """Transcribe every item in ``audio`` and join the texts with blank lines.
+
+        Each item must provide a ``file_id``; the matching ``File`` row is written
+        to a temporary file and handed to ``split_and_transcribe``.
+
+        Raises:
+            ValueError: if no ``File`` exists for an item's ``file_id``.
+        """
+        stt_model = get_model_instance_by_model_user_id(stt_model_id, self.flow_params_serializer.data.get('user_id'))
+        self.context['audio_list'] = audio
+
+        def process_audio_item(audio_item):
+            file = QuerySet(File).filter(id=audio_item['file_id']).first()
+            if file is None:
+                # first() returns None when nothing matches; report that instead of
+                # failing later with an AttributeError.
+                raise ValueError("参数值错误: 未找到file_id对应的音频文件")
+            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
+            try:
+                # Leave the with-block (closing the file) before transcribing.
+                with temp_file:
+                    temp_file.write(file.get_byte().tobytes())
+                return split_and_transcribe(temp_file.name, stt_model)
+            finally:
+                # Also runs when the write fails, so the temp file never leaks.
+                os.remove(temp_file.name)
+
+        # executor.map returns the results in the order of the inputs.
+        with ThreadPoolExecutor(max_workers=5) as executor:
+            results = list(executor.map(process_audio_item, audio))
+        result = '\n\n'.join(results)
+        return NodeResult({'answer': result, 'result': result}, {})
+
+    def get_details(self, index: int, **kwargs):
+        """Return the execution details recorded for this node."""
+        return {
+            'name': self.node.properties.get('stepName'),
+            "index": index,
+            'run_time': self.context.get('run_time'),
+            'answer': self.context.get('answer'),
+            'type': self.node.type,
+            'status': self.status,
+            'err_message': self.err_message,
+            'audio_list': self.context.get('audio_list'),
+        }
diff --git a/apps/application/flow/step_node/start_node/impl/base_start_node.py b/apps/application/flow/step_node/start_node/impl/base_start_node.py
index 59f875fcc..bf5203274 100644
--- a/apps/application/flow/step_node/start_node/impl/base_start_node.py
+++ b/apps/application/flow/step_node/start_node/impl/base_start_node.py
@@ -39,6 +39,7 @@ class BaseStartStepNode(IStarNode):
         self.context['run_time'] = details.get('run_time')
         self.context['document'] = details.get('document_list')
         self.context['image'] = details.get('image_list')
+        self.context['audio'] = details.get('audio_list')
         self.status = details.get('status')
         self.err_message = details.get('err_message')
         for key, value in workflow_variable.items():
@@ -57,7 +58,8 @@ class BaseStartStepNode(IStarNode):
         node_variable = {
             'question': question,
             'image': self.workflow_manage.image_list,
-            'document': self.workflow_manage.document_list
+            'document': self.workflow_manage.document_list,
+            'audio': self.workflow_manage.audio_list
         }
         return NodeResult(node_variable, workflow_variable)
 
@@ -80,5 +82,6 @@ class BaseStartStepNode(IStarNode):
             'err_message': self.err_message,
             'image_list': self.context.get('image'),
             'document_list': self.context.get('document'),
+            'audio_list': self.context.get('audio'),
             'global_fields': global_fields
         }
diff --git a/apps/application/flow/step_node/text_to_speech_step_node/__init__.py b/apps/application/flow/step_node/text_to_speech_step_node/__init__.py
new file mode 100644
index 000000000..f3feecc9c
--- /dev/null
+++ b/apps/application/flow/step_node/text_to_speech_step_node/__init__.py
@@ -0,0 +1,3 @@
+# coding=utf-8
+
+from .impl import *
diff --git a/apps/application/flow/step_node/text_to_speech_step_node/i_text_to_speech_node.py b/apps/application/flow/step_node/text_to_speech_step_node/i_text_to_speech_node.py
new file mode 100644
index 000000000..8b16301a5
--- /dev/null
+++
b/apps/application/flow/step_node/text_to_speech_step_node/i_text_to_speech_node.py
@@ -0,0 +1,41 @@
+# coding=utf-8
+
+from typing import Type
+
+from rest_framework import serializers
+
+from application.flow.i_step_node import INode, NodeResult
+from common.util.field_message import ErrMessage
+
+
+class TextToSpeechNodeSerializer(serializers.Serializer):
+    """Parameters accepted by the text-to-speech node."""
+    tts_model_id = serializers.CharField(required=True, error_messages=ErrMessage.char("模型id"))
+
+    is_result = serializers.BooleanField(required=False, error_messages=ErrMessage.boolean('是否返回内容'))
+
+    content_list = serializers.ListField(required=False, error_messages=ErrMessage.list("文本内容"))
+    # This is a dict field, so it takes the dict error messages rather than the integer ones.
+    model_params_setting = serializers.DictField(required=False,
+                                                 error_messages=ErrMessage.dict("模型参数相关设置"))
+
+
+class ITextToSpeechNode(INode):
+    """Interface of the text-to-speech workflow node; subclasses implement ``execute``."""
+    type = 'text-to-speech-node'
+
+    def get_node_params_serializer_class(self) -> Type[serializers.Serializer]:
+        """Return the serializer class for this node's parameters."""
+        return TextToSpeechNodeSerializer
+
+    def _run(self):
+        """Resolve the referenced content and pass it to ``execute`` with the node and flow parameters."""
+        content = self.workflow_manage.get_reference_field(self.node_params_serializer.data.get('content_list')[0],
+                                                           self.node_params_serializer.data.get('content_list')[1:])
+        return self.execute(content=content, **self.node_params_serializer.data, **self.flow_params_serializer.data)
+
+    def execute(self, tts_model_id, chat_id,
+                content, model_params_setting=None,
+                **kwargs) -> NodeResult:
+        """Hook that receives the resolved ``content``; a no-op in this interface class."""
+        pass
diff --git a/apps/application/flow/step_node/text_to_speech_step_node/impl/__init__.py b/apps/application/flow/step_node/text_to_speech_step_node/impl/__init__.py
new file mode 100644
index 000000000..385b9718f
--- /dev/null
+++ b/apps/application/flow/step_node/text_to_speech_step_node/impl/__init__.py
@@ -0,0 +1,3 @@
+# coding=utf-8
+
+from .base_text_to_speech_node import BaseTextToSpeechNode
diff --git a/apps/application/flow/step_node/text_to_speech_step_node/impl/base_text_to_speech_node.py b/apps/application/flow/step_node/text_to_speech_step_node/impl/base_text_to_speech_node.py
new file mode 100644
index
000000000..9bd36fc22
--- /dev/null
+++ b/apps/application/flow/step_node/text_to_speech_step_node/impl/base_text_to_speech_node.py
@@ -0,0 +1,81 @@
+# coding=utf-8
+import io
+import mimetypes
+
+from django.core.files.uploadedfile import InMemoryUploadedFile
+
+from application.flow.i_step_node import NodeResult, INode
+from application.flow.step_node.image_understand_step_node.i_image_understand_node import IImageUnderstandNode
+from application.flow.step_node.text_to_speech_step_node.i_text_to_speech_node import ITextToSpeechNode
+from dataset.models import File
+from dataset.serializers.file_serializers import FileSerializer
+from setting.models_provider.tools import get_model_instance_by_model_user_id
+
+
+def bytes_to_uploaded_file(file_bytes, file_name="generated_audio.mp3"):
+    """Wrap raw bytes in an ``InMemoryUploadedFile`` named ``file_name``.
+
+    The content type is guessed from ``file_name`` and falls back to
+    ``application/octet-stream`` when it cannot be determined.
+    """
+    content_type, _ = mimetypes.guess_type(file_name)
+    if content_type is None:
+        # Unrecognised file name: fall back to the generic binary content type.
+        content_type = "application/octet-stream"
+    # In-memory byte stream that backs the uploaded file.
+    file_stream = io.BytesIO(file_bytes)
+
+    # Size of the payload in bytes.
+    file_size = len(file_bytes)
+
+    uploaded_file = InMemoryUploadedFile(
+        file=file_stream,
+        field_name=None,
+        name=file_name,
+        content_type=content_type,
+        size=file_size,
+        charset=None,
+    )
+    return uploaded_file
+
+
+class BaseTextToSpeechNode(ITextToSpeechNode):
+    """Workflow node that sends text content to a text-to-speech model."""
+    def save_context(self, details, workflow_manage):
+        """Copy the recorded answer from ``details`` into the context and ``answer_text``."""
+        self.context['answer'] = details.get('answer')
+        self.answer_text = details.get('answer')
+
+    def execute(self, tts_model_id, chat_id,
+                content, model_params_setting=None,
+                **kwargs) -> NodeResult:
+        self.context['content'] = content
+        # model_params_setting defaults to None, which cannot be unpacked with **.
+        model = get_model_instance_by_model_user_id(tts_model_id, self.flow_params_serializer.data.get('user_id'),
+                                                    **(model_params_setting or {}))
+        audio_byte = model.text_to_speech(content)
+        # The generated audio has to be stored in the database.
+        file_name = 'generated_audio.mp3'
+        file = bytes_to_uploaded_file(audio_byte, file_name)
+        application = self.workflow_manage.work_flow_post_handler.chat_info.application
+        meta = {
+            'debug': False if application.id else
True, + 'chat_id': chat_id, + 'application_id': str(application.id) if application.id else None, + } + file_url = FileSerializer(data={'file': file, 'meta': meta}).upload() + # 拼接一个audio标签的src属性 + audio_label = f'