feat: Qwen asr speech recognition

This commit is contained in:
zhangzhanwei 2025-08-19 15:39:33 +08:00 committed by zhanweizhang7
parent 9461ca8071
commit 354f85ddb6
6 changed files with 159 additions and 4 deletions

View File

@ -8660,4 +8660,7 @@ msgid "The Qwen Omni series model supports inputting multiple modalities of data
msgstr ""
msgid "resource authorization"
msgstr ""
msgid "The Qwen Audio based end-to-end speech recognition model supports audio recognition within 3 minutes. At present, it mainly supports Chinese and English recognition."
msgstr ""

View File

@ -8786,4 +8786,7 @@ msgid "The Qwen Omni series model supports inputting multiple modalities of data
msgstr "Qwen-Omni 系列模型支持输入多种模态的数据,包括视频、音频、图片、文本,并输出音频与文本"
msgid "resource authorization"
msgstr "资源授权"
msgid "The Qwen Audio based end-to-end speech recognition model supports audio recognition within 3 minutes. At present, it mainly supports Chinese and English recognition."
msgstr "基于Qwen-Audio的端到端语音识别大模型,支持3分钟以内的音频识别,目前主要支持中英文识别。"

View File

@ -8786,4 +8786,7 @@ msgid "The Qwen Omni series model supports inputting multiple modalities of data
msgstr "Qwen-Omni系列模型支持輸入多種模態的數據,包括視頻、音訊、圖片、文字,並輸出音訊與文字"
msgid "resource authorization"
msgstr "資源授權"
msgid "The Qwen Audio based end-to-end speech recognition model supports audio recognition within 3 minutes. At present, it mainly supports Chinese and English recognition."
msgstr "基於Qwen-Audio的端到端語音辨識大模型,支持3分鐘以內的音訊識別,現時主要支持中英文識別。"

View File

@ -11,6 +11,7 @@ import os
from common.utils.common import get_file_content
from models_provider.base_model_provider import ModelProvideInfo, ModelTypeConst, ModelInfo, IModelProvider, \
ModelInfoManage
from models_provider.impl.aliyun_bai_lian_model_provider.credential.asr_stt import AliyunBaiLianAsrSTTModelCredential
from models_provider.impl.aliyun_bai_lian_model_provider.credential.embedding import \
AliyunBaiLianEmbeddingCredential
from models_provider.impl.aliyun_bai_lian_model_provider.credential.image import QwenVLModelCredential
@ -21,6 +22,7 @@ from models_provider.impl.aliyun_bai_lian_model_provider.credential.reranker imp
from models_provider.impl.aliyun_bai_lian_model_provider.credential.stt import AliyunBaiLianSTTModelCredential
from models_provider.impl.aliyun_bai_lian_model_provider.credential.tti import QwenTextToImageModelCredential
from models_provider.impl.aliyun_bai_lian_model_provider.credential.tts import AliyunBaiLianTTSModelCredential
from models_provider.impl.aliyun_bai_lian_model_provider.model.asr_stt import AliyunBaiLianAsrSpeechToText
from models_provider.impl.aliyun_bai_lian_model_provider.model.embedding import AliyunBaiLianEmbedding
from models_provider.impl.aliyun_bai_lian_model_provider.model.image import QwenVLChatModel
from models_provider.impl.aliyun_bai_lian_model_provider.model.llm import BaiLianChatModel
@ -36,6 +38,7 @@ aliyun_bai_lian_model_credential = AliyunBaiLianRerankerCredential()
aliyun_bai_lian_tts_model_credential = AliyunBaiLianTTSModelCredential()
aliyun_bai_lian_stt_model_credential = AliyunBaiLianSTTModelCredential()
aliyun_bai_lian_omi_stt_model_credential = AliyunBaiLianOmiSTTModelCredential()
aliyun_bai_lian_asr_stt_model_credential = AliyunBaiLianAsrSTTModelCredential()
aliyun_bai_lian_embedding_model_credential = AliyunBaiLianEmbeddingCredential()
aliyun_bai_lian_llm_model_credential = BaiLianLLMModelCredential()
qwenvl_model_credential = QwenVLModelCredential()
@ -79,10 +82,16 @@ model_info_list = [ModelInfo('gte-rerank',
BaiLianChatModel),
ModelInfo('qwen-omni-turbo',
_('The Qwen Omni series model supports inputting multiple modalities of data, including video, audio, images, and text, and outputting audio and text.'),
ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential, AliyunBaiLianOmiSpeechToText),
ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential,
AliyunBaiLianOmiSpeechToText),
ModelInfo('qwen2.5-omni-7b',
_('The Qwen Omni series model supports inputting multiple modalities of data, including video, audio, images, and text, and outputting audio and text.'),
ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential, AliyunBaiLianOmiSpeechToText),
ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential,
AliyunBaiLianOmiSpeechToText),
ModelInfo('qwen-audio-asr',
_('The Qwen Audio based end-to-end speech recognition model supports audio recognition within 3 minutes. At present, it mainly supports Chinese and English recognition.'),
ModelTypeConst.STT, aliyun_bai_lian_asr_stt_model_credential,
AliyunBaiLianAsrSpeechToText),
]
module_info_vl_list = [

View File

@ -0,0 +1,67 @@
# coding=utf-8
import traceback
from typing import Dict, Any
from common import forms
from common.exception.app_exception import AppApiException
from common.forms import BaseForm
from models_provider.base_model_provider import BaseModelCredential, ValidCode
from django.utils.translation import gettext as _
from models_provider.impl.aliyun_bai_lian_model_provider.credential.omni_stt import AliyunBaiLianOmiSTTModelParams
class AliyunBaiLianAsrSTTModelCredential(BaseForm, BaseModelCredential):
    """Credential form for the Qwen ASR speech-to-text model on Aliyun BaiLian.

    Declares the two credential fields shown in the UI and implements the
    validation / encryption hooks required by ``BaseModelCredential``.
    """

    api_url = forms.TextInputField(_('API URL'), required=True)
    api_key = forms.PasswordInputField(_('API Key'), required=True)

    def is_valid(self,
                 model_type: str,
                 model_name: str,
                 model_credential: Dict[str, Any],
                 model_params: Dict[str, Any],
                 provider,
                 raise_exception: bool = False
                 ) -> bool:
        """Validate the credential.

        Checks that *model_type* is supported by *provider*, that the
        required credential keys are present, and that a model instance can
        actually be constructed from the credential.

        Returns ``True`` on success; on failure either raises
        ``AppApiException`` (when *raise_exception* is set, or for an
        unsupported model type) or returns ``False``.
        """
        supported_types = [item.get('value') for item in provider.get_model_type_list()]
        if model_type not in supported_types:
            # Unsupported type always raises, regardless of raise_exception.
            raise AppApiException(
                ValidCode.valid_error.value,
                _('{model_type} Model type is not supported').format(model_type=model_type)
            )

        # api_url is optional for validation purposes; only api_key is required.
        for required_key in ('api_key',):
            if required_key in model_credential:
                continue
            if raise_exception:
                raise AppApiException(
                    ValidCode.valid_error.value,
                    _('{key} is required').format(key=required_key)
                )
            return False

        try:
            provider.get_model(model_type, model_name, model_credential)
        except AppApiException:
            # Domain errors propagate unchanged (traceback still logged).
            traceback.print_exc()
            raise
        except Exception as e:
            traceback.print_exc()
            if raise_exception:
                raise AppApiException(
                    ValidCode.valid_error.value,
                    _('Verification failed, please check whether the parameters are correct: {error}').format(
                        error=str(e))
                )
            return False
        return True

    def encryption_dict(self, model: Dict[str, object]) -> Dict[str, object]:
        """Return a copy of *model* with the API key masked for safe display/storage."""
        masked = dict(model)
        masked['api_key'] = super().encryption(model.get('api_key', ''))
        return masked

    def get_model_params_setting_form(self, model_name):
        # Parameter form is shared with the Omni STT credential — presumably
        # the ASR model exposes the same runtime params; confirm against the API.
        return AliyunBaiLianOmiSTTModelParams()

View File

@ -0,0 +1,70 @@
import base64
import os.path
import traceback
from typing import Dict
import dashscope
from common.utils.logger import maxkb_logger
from models_provider.base_model_provider import MaxKBBaseModel
from models_provider.impl.base_stt import BaseSpeechToText
class AliyunBaiLianAsrSpeechToText(MaxKBBaseModel, BaseSpeechToText):
    """Qwen audio ASR speech-to-text model backed by the DashScope multimodal API."""

    api_key: str
    api_url: str
    model: str
    params: dict

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.api_key = kwargs.get('api_key')
        self.model = kwargs.get('model')
        self.params = kwargs.get('params')
        self.api_url = kwargs.get('api_url')

    @staticmethod
    def is_cache_model():
        # Instances are cheap to rebuild, so the provider does not cache them.
        return False

    @staticmethod
    def new_instance(model_type, model_name, model_credential: Dict[str, object], **model_kwargs):
        """Factory used by the provider registry to build a model from stored credentials."""
        return AliyunBaiLianAsrSpeechToText(
            model=model_name,
            api_key=model_credential.get('api_key'),
            api_url=model_credential.get('api_url'),
            params=model_kwargs,
            **model_kwargs
        )

    def check_auth(self):
        """Verify the credential by transcribing a bundled sample audio file.

        Any failure in ``speech_to_text`` propagates, so invalid credentials
        surface as a validation error instead of silently passing.
        """
        cwd = os.path.dirname(os.path.abspath(__file__))
        with open(f'{cwd}/iat_mp3_16k.mp3', 'rb') as audio_file:
            self.speech_to_text(audio_file)

    def speech_to_text(self, audio_file):
        """Transcribe *audio_file* (a binary file-like object) and return the text.

        The audio is sent inline as a base64 ``data:`` URI in a single user
        message and the transcript is read from the first choice of the
        response.

        Raises:
            Exception: any API or response-parsing failure is logged and
                re-raised so callers such as ``check_auth`` can detect it.
        """
        try:
            base64_audio = base64.b64encode(audio_file.read()).decode("utf-8")
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"audio": f"data:audio/mp3;base64,{base64_audio}"},
                    ]
                }
            ]
            response = dashscope.MultiModalConversation.call(
                api_key=self.api_key,
                model=self.model,
                messages=messages,
                result_format="message",
            )
            text = response["output"]["choices"][0]["message"].content[0]["text"]
            return text
        except Exception as err:
            maxkb_logger.error(f":Error: {str(err)}: {traceback.format_exc()}")
            # Bug fix: the error was previously swallowed and None returned,
            # which made check_auth() succeed even with bad credentials.
            raise