diff --git a/apps/locales/en_US/LC_MESSAGES/django.po b/apps/locales/en_US/LC_MESSAGES/django.po index 63134d6df..0ad37986d 100644 --- a/apps/locales/en_US/LC_MESSAGES/django.po +++ b/apps/locales/en_US/LC_MESSAGES/django.po @@ -8660,4 +8660,7 @@ msgid "The Qwen Omni series model supports inputting multiple modalities of data msgstr "" msgid "resource authorization" +msgstr "" + +msgid "The Qwen Audio based end-to-end speech recognition model supports audio recognition within 3 minutes. At present, it mainly supports Chinese and English recognition." msgstr "" \ No newline at end of file diff --git a/apps/locales/zh_CN/LC_MESSAGES/django.po b/apps/locales/zh_CN/LC_MESSAGES/django.po index 8abdf653b..833af94a7 100644 --- a/apps/locales/zh_CN/LC_MESSAGES/django.po +++ b/apps/locales/zh_CN/LC_MESSAGES/django.po @@ -8786,4 +8786,7 @@ msgid "The Qwen Omni series model supports inputting multiple modalities of data msgstr "Qwen-Omni 系列模型支持输入多种模态的数据,包括视频、音频、图片、文本,并输出音频与文本" msgid "resource authorization" -msgstr "资源授权" \ No newline at end of file +msgstr "资源授权" + +msgid "The Qwen Audio based end-to-end speech recognition model supports audio recognition within 3 minutes. At present, it mainly supports Chinese and English recognition." +msgstr "基于Qwen-Audio的端到端语音识别大模型,支持3分钟以内的音频识别,目前主要支持中英文识别。" \ No newline at end of file diff --git a/apps/locales/zh_Hant/LC_MESSAGES/django.po b/apps/locales/zh_Hant/LC_MESSAGES/django.po index 06ec56659..2ca5cbf59 100644 --- a/apps/locales/zh_Hant/LC_MESSAGES/django.po +++ b/apps/locales/zh_Hant/LC_MESSAGES/django.po @@ -8786,4 +8786,7 @@ msgid "The Qwen Omni series model supports inputting multiple modalities of data msgstr "Qwen-Omni系列模型支持輸入多種模態的數據,包括視頻、音訊、圖片、文字,並輸出音訊與文字" msgid "resource authorization" -msgstr "資源授權" \ No newline at end of file +msgstr "資源授權" + +msgid "The Qwen Audio based end-to-end speech recognition model supports audio recognition within 3 minutes. At present, it mainly supports Chinese and English recognition." 
+msgstr "基於Qwen-Audio的端到端語音辨識大模型,支持3分鐘以內的音訊識別,現時主要支持中英文識別。" \ No newline at end of file diff --git a/apps/models_provider/impl/aliyun_bai_lian_model_provider/aliyun_bai_lian_model_provider.py b/apps/models_provider/impl/aliyun_bai_lian_model_provider/aliyun_bai_lian_model_provider.py index 7264001b2..d402ae72d 100644 --- a/apps/models_provider/impl/aliyun_bai_lian_model_provider/aliyun_bai_lian_model_provider.py +++ b/apps/models_provider/impl/aliyun_bai_lian_model_provider/aliyun_bai_lian_model_provider.py @@ -11,6 +11,7 @@ import os from common.utils.common import get_file_content from models_provider.base_model_provider import ModelProvideInfo, ModelTypeConst, ModelInfo, IModelProvider, \ ModelInfoManage +from models_provider.impl.aliyun_bai_lian_model_provider.credential.asr_stt import AliyunBaiLianAsrSTTModelCredential from models_provider.impl.aliyun_bai_lian_model_provider.credential.embedding import \ AliyunBaiLianEmbeddingCredential from models_provider.impl.aliyun_bai_lian_model_provider.credential.image import QwenVLModelCredential @@ -21,6 +22,7 @@ from models_provider.impl.aliyun_bai_lian_model_provider.credential.reranker imp from models_provider.impl.aliyun_bai_lian_model_provider.credential.stt import AliyunBaiLianSTTModelCredential from models_provider.impl.aliyun_bai_lian_model_provider.credential.tti import QwenTextToImageModelCredential from models_provider.impl.aliyun_bai_lian_model_provider.credential.tts import AliyunBaiLianTTSModelCredential +from models_provider.impl.aliyun_bai_lian_model_provider.model.asr_stt import AliyunBaiLianAsrSpeechToText from models_provider.impl.aliyun_bai_lian_model_provider.model.embedding import AliyunBaiLianEmbedding from models_provider.impl.aliyun_bai_lian_model_provider.model.image import QwenVLChatModel from models_provider.impl.aliyun_bai_lian_model_provider.model.llm import BaiLianChatModel @@ -36,6 +38,7 @@ aliyun_bai_lian_model_credential = AliyunBaiLianRerankerCredential() 
aliyun_bai_lian_tts_model_credential = AliyunBaiLianTTSModelCredential() aliyun_bai_lian_stt_model_credential = AliyunBaiLianSTTModelCredential() aliyun_bai_lian_omi_stt_model_credential = AliyunBaiLianOmiSTTModelCredential() +aliyun_bai_lian_asr_stt_model_credential = AliyunBaiLianAsrSTTModelCredential() aliyun_bai_lian_embedding_model_credential = AliyunBaiLianEmbeddingCredential() aliyun_bai_lian_llm_model_credential = BaiLianLLMModelCredential() qwenvl_model_credential = QwenVLModelCredential() @@ -79,10 +82,16 @@ model_info_list = [ModelInfo('gte-rerank', BaiLianChatModel), ModelInfo('qwen-omni-turbo', _('The Qwen Omni series model supports inputting multiple modalities of data, including video, audio, images, and text, and outputting audio and text.'), - ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential, AliyunBaiLianOmiSpeechToText), + ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential, + AliyunBaiLianOmiSpeechToText), ModelInfo('qwen2.5-omni-7b', _('The Qwen Omni series model supports inputting multiple modalities of data, including video, audio, images, and text, and outputting audio and text.'), - ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential, AliyunBaiLianOmiSpeechToText), + ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential, + AliyunBaiLianOmiSpeechToText), + ModelInfo('qwen-audio-asr', + _('The Qwen Audio based end-to-end speech recognition model supports audio recognition within 3 minutes. 
At present, it mainly supports Chinese and English recognition.'), + ModelTypeConst.STT, aliyun_bai_lian_asr_stt_model_credential, + AliyunBaiLianAsrSpeechToText), ] module_info_vl_list = [ diff --git a/apps/models_provider/impl/aliyun_bai_lian_model_provider/credential/asr_stt.py b/apps/models_provider/impl/aliyun_bai_lian_model_provider/credential/asr_stt.py new file mode 100644 index 000000000..e9d4f2308 --- /dev/null +++ b/apps/models_provider/impl/aliyun_bai_lian_model_provider/credential/asr_stt.py @@ -0,0 +1,67 @@ +# coding=utf-8 +import traceback +from typing import Dict, Any + +from common import forms +from common.exception.app_exception import AppApiException +from common.forms import BaseForm +from models_provider.base_model_provider import BaseModelCredential, ValidCode +from django.utils.translation import gettext as _ + +from models_provider.impl.aliyun_bai_lian_model_provider.credential.omni_stt import AliyunBaiLianOmiSTTModelParams + + +class AliyunBaiLianAsrSTTModelCredential(BaseForm, BaseModelCredential): + api_url = forms.TextInputField(_('API URL'), required=True) + api_key = forms.PasswordInputField(_('API Key'), required=True) + + def is_valid(self, + model_type: str, + model_name: str, + model_credential: Dict[str, Any], + model_params: Dict[str, Any], + provider, + raise_exception: bool = False + ) -> bool: + model_type_list = provider.get_model_type_list() + if not any(mt.get('value') == model_type for mt in model_type_list): + raise AppApiException( + ValidCode.valid_error.value, + _('{model_type} Model type is not supported').format(model_type=model_type) + ) + + required_keys = ['api_key'] + for key in required_keys: + if key not in model_credential: + if raise_exception: + raise AppApiException( + ValidCode.valid_error.value, + _('{key} is required').format(key=key) + ) + return False + + try: + model = provider.get_model(model_type, model_name, model_credential) + except Exception as e: + traceback.print_exc() + if isinstance(e, 
AppApiException): + raise e + if raise_exception: + raise AppApiException( + ValidCode.valid_error.value, + _('Verification failed, please check whether the parameters are correct: {error}').format( + error=str(e)) + ) + return False + return True + + def encryption_dict(self, model: Dict[str, object]) -> Dict[str, object]: + + return { + **model, + 'api_key': super().encryption(model.get('api_key', '')) + } + + def get_model_params_setting_form(self, model_name): + + return AliyunBaiLianOmiSTTModelParams() diff --git a/apps/models_provider/impl/aliyun_bai_lian_model_provider/model/asr_stt.py b/apps/models_provider/impl/aliyun_bai_lian_model_provider/model/asr_stt.py new file mode 100644 index 000000000..c326e452e --- /dev/null +++ b/apps/models_provider/impl/aliyun_bai_lian_model_provider/model/asr_stt.py @@ -0,0 +1,70 @@ +import base64 +import os.path +import traceback +from typing import Dict + +import dashscope + +from common.utils.logger import maxkb_logger +from models_provider.base_model_provider import MaxKBBaseModel +from models_provider.impl.base_stt import BaseSpeechToText + + +class AliyunBaiLianAsrSpeechToText(MaxKBBaseModel, BaseSpeechToText): + api_key: str + api_url: str + model: str + params: dict + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.api_key = kwargs.get('api_key') + self.model = kwargs.get('model') + self.params = kwargs.get('params') + self.api_url = kwargs.get('api_url') + + @staticmethod + def is_cache_model(): + return False + + @staticmethod + def new_instance(model_type, model_name, model_credential: Dict[str, object], **model_kwargs): + return AliyunBaiLianAsrSpeechToText( + model=model_name, + api_key=model_credential.get('api_key'), + api_url=model_credential.get('api_url'), + params=model_kwargs, + **model_kwargs + ) + + def check_auth(self): + cwd = os.path.dirname(os.path.abspath(__file__)) + with open(f'{cwd}/iat_mp3_16k.mp3', 'rb') as audio_file: + self.speech_to_text(audio_file) + + def 
speech_to_text(self, audio_file): + try: + + base64_audio = base64.b64encode(audio_file.read()).decode("utf-8") + + messages = [ + { + "role": "user", + "content": [ + {"audio": f"data:audio/mp3;base64,{base64_audio}"}, + ] + } + ] + response = dashscope.MultiModalConversation.call( + api_key=self.api_key, + model=self.model, + messages=messages, + result_format="message", + ) + + text = response["output"]["choices"][0]["message"].content[0]["text"] + + return text + + except Exception as err: + maxkb_logger.error(f"Error: {str(err)}: {traceback.format_exc()}")