mirror of
https://github.com/1Panel-dev/MaxKB.git
synced 2025-12-26 01:33:05 +00:00
feat: Qwen asr speech recognition
This commit is contained in:
parent
9461ca8071
commit
354f85ddb6
|
|
@ -8660,4 +8660,7 @@ msgid "The Qwen Omni series model supports inputting multiple modalities of data
|
|||
msgstr ""
|
||||
|
||||
msgid "resource authorization"
|
||||
msgstr ""
|
||||
|
||||
msgid "The Qwen Audio based end-to-end speech recognition model supports audio recognition within 3 minutes. At present, it mainly supports Chinese and English recognition."
|
||||
msgstr ""
|
||||
|
|
@ -8786,4 +8786,7 @@ msgid "The Qwen Omni series model supports inputting multiple modalities of data
|
|||
msgstr "Qwen-Omni 系列模型支持输入多种模态的数据,包括视频、音频、图片、文本,并输出音频与文本"
|
||||
|
||||
msgid "resource authorization"
|
||||
msgstr "资源授权"
|
||||
|
||||
msgid "The Qwen Audio based end-to-end speech recognition model supports audio recognition within 3 minutes. At present, it mainly supports Chinese and English recognition."
|
||||
msgstr "基于Qwen-Audio的端到端语音识别大模型,支持3分钟以内的音频识别,目前主要支持中英文识别。"
|
||||
|
|
@ -8786,4 +8786,7 @@ msgid "The Qwen Omni series model supports inputting multiple modalities of data
|
|||
msgstr "Qwen-Omni系列模型支持輸入多種模態的數據,包括視頻、音訊、圖片、文字,並輸出音訊與文字"
|
||||
|
||||
msgid "resource authorization"
|
||||
msgstr "資源授權"
|
||||
|
||||
msgid "The Qwen Audio based end-to-end speech recognition model supports audio recognition within 3 minutes. At present, it mainly supports Chinese and English recognition."
|
||||
msgstr "基於Qwen-Audio的端到端語音辨識大模型,支持3分鐘以內的音訊識別,現時主要支持中英文識別。"
|
||||
|
|
@ -11,6 +11,7 @@ import os
|
|||
from common.utils.common import get_file_content
|
||||
from models_provider.base_model_provider import ModelProvideInfo, ModelTypeConst, ModelInfo, IModelProvider, \
|
||||
ModelInfoManage
|
||||
from models_provider.impl.aliyun_bai_lian_model_provider.credential.asr_stt import AliyunBaiLianAsrSTTModelCredential
|
||||
from models_provider.impl.aliyun_bai_lian_model_provider.credential.embedding import \
|
||||
AliyunBaiLianEmbeddingCredential
|
||||
from models_provider.impl.aliyun_bai_lian_model_provider.credential.image import QwenVLModelCredential
|
||||
|
|
@ -21,6 +22,7 @@ from models_provider.impl.aliyun_bai_lian_model_provider.credential.reranker imp
|
|||
from models_provider.impl.aliyun_bai_lian_model_provider.credential.stt import AliyunBaiLianSTTModelCredential
|
||||
from models_provider.impl.aliyun_bai_lian_model_provider.credential.tti import QwenTextToImageModelCredential
|
||||
from models_provider.impl.aliyun_bai_lian_model_provider.credential.tts import AliyunBaiLianTTSModelCredential
|
||||
from models_provider.impl.aliyun_bai_lian_model_provider.model.asr_stt import AliyunBaiLianAsrSpeechToText
|
||||
from models_provider.impl.aliyun_bai_lian_model_provider.model.embedding import AliyunBaiLianEmbedding
|
||||
from models_provider.impl.aliyun_bai_lian_model_provider.model.image import QwenVLChatModel
|
||||
from models_provider.impl.aliyun_bai_lian_model_provider.model.llm import BaiLianChatModel
|
||||
|
|
@ -36,6 +38,7 @@ aliyun_bai_lian_model_credential = AliyunBaiLianRerankerCredential()
|
|||
aliyun_bai_lian_tts_model_credential = AliyunBaiLianTTSModelCredential()
|
||||
aliyun_bai_lian_stt_model_credential = AliyunBaiLianSTTModelCredential()
|
||||
aliyun_bai_lian_omi_stt_model_credential = AliyunBaiLianOmiSTTModelCredential()
|
||||
aliyun_bai_lian_asr_stt_model_credential = AliyunBaiLianAsrSTTModelCredential()
|
||||
aliyun_bai_lian_embedding_model_credential = AliyunBaiLianEmbeddingCredential()
|
||||
aliyun_bai_lian_llm_model_credential = BaiLianLLMModelCredential()
|
||||
qwenvl_model_credential = QwenVLModelCredential()
|
||||
|
|
@ -79,10 +82,16 @@ model_info_list = [ModelInfo('gte-rerank',
|
|||
BaiLianChatModel),
|
||||
ModelInfo('qwen-omni-turbo',
|
||||
_('The Qwen Omni series model supports inputting multiple modalities of data, including video, audio, images, and text, and outputting audio and text.'),
|
||||
ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential, AliyunBaiLianOmiSpeechToText),
|
||||
ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential,
|
||||
AliyunBaiLianOmiSpeechToText),
|
||||
ModelInfo('qwen2.5-omni-7b',
|
||||
_('The Qwen Omni series model supports inputting multiple modalities of data, including video, audio, images, and text, and outputting audio and text.'),
|
||||
ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential, AliyunBaiLianOmiSpeechToText),
|
||||
ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential,
|
||||
AliyunBaiLianOmiSpeechToText),
|
||||
ModelInfo('qwen-audio-asr',
|
||||
_('The Qwen Audio based end-to-end speech recognition model supports audio recognition within 3 minutes. At present, it mainly supports Chinese and English recognition.'),
|
||||
ModelTypeConst.STT, aliyun_bai_lian_asr_stt_model_credential,
|
||||
AliyunBaiLianAsrSpeechToText),
|
||||
]
|
||||
|
||||
module_info_vl_list = [
|
||||
|
|
|
|||
|
|
@ -0,0 +1,67 @@
|
|||
# coding=utf-8
|
||||
import traceback
|
||||
from typing import Dict, Any
|
||||
|
||||
from common import forms
|
||||
from common.exception.app_exception import AppApiException
|
||||
from common.forms import BaseForm
|
||||
from models_provider.base_model_provider import BaseModelCredential, ValidCode
|
||||
from django.utils.translation import gettext as _
|
||||
|
||||
from models_provider.impl.aliyun_bai_lian_model_provider.credential.omni_stt import AliyunBaiLianOmiSTTModelParams
|
||||
|
||||
|
||||
class AliyunBaiLianAsrSTTModelCredential(BaseForm, BaseModelCredential):
    """Credential form for the Aliyun BaiLian Qwen-ASR speech-to-text model.

    Collects the API endpoint and key from the user and validates them by
    instantiating the model through the provider.
    """

    # Fields rendered in the model-configuration form.
    api_url = forms.TextInputField(_('API URL'), required=True)
    api_key = forms.PasswordInputField(_('API Key'), required=True)

    def is_valid(self,
                 model_type: str,
                 model_name: str,
                 model_credential: Dict[str, Any],
                 model_params: Dict[str, Any],
                 provider,
                 raise_exception: bool = False
                 ) -> bool:
        """Validate the supplied credential.

        Checks that ``model_type`` is supported by ``provider``, that the
        required credential keys are present, and that the provider can build
        the model from them.

        :param model_type: model type value to validate (e.g. ``'STT'``).
        :param model_name: name of the model being configured.
        :param model_credential: credential dict entered by the user.
        :param model_params: extra model parameters (unused here).
        :param provider: model provider used to list types and build the model.
        :param raise_exception: when True, raise ``AppApiException`` on
            missing keys / build failures instead of returning False.
        :return: True when the credential is valid, False otherwise.
        :raises AppApiException: always for an unsupported model type;
            otherwise only when ``raise_exception`` is True.
        """
        # Unsupported model type is always an error, regardless of
        # raise_exception (mirrors the other BaiLian credential classes).
        supported_types = [item.get('value') for item in provider.get_model_type_list()]
        if model_type not in supported_types:
            raise AppApiException(
                ValidCode.valid_error.value,
                _('{model_type} Model type is not supported').format(model_type=model_type)
            )

        # Only api_key is strictly required for validation.
        for credential_key in ('api_key',):
            if credential_key in model_credential:
                continue
            if raise_exception:
                raise AppApiException(
                    ValidCode.valid_error.value,
                    _('{key} is required').format(key=credential_key)
                )
            return False

        # Building the model is the actual end-to-end credential check.
        try:
            provider.get_model(model_type, model_name, model_credential)
        except AppApiException:
            traceback.print_exc()
            raise
        except Exception as error:
            traceback.print_exc()
            if not raise_exception:
                return False
            raise AppApiException(
                ValidCode.valid_error.value,
                _('Verification failed, please check whether the parameters are correct: {error}').format(
                    error=str(error))
            )
        return True

    def encryption_dict(self, model: Dict[str, object]) -> Dict[str, object]:
        """Return a copy of *model* with the api_key masked for display."""
        encrypted = dict(model)
        encrypted['api_key'] = super().encryption(model.get('api_key', ''))
        return encrypted

    def get_model_params_setting_form(self, model_name):
        """Return the runtime parameter form (shared with the Omni STT model)."""
        return AliyunBaiLianOmiSTTModelParams()
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
import base64
|
||||
import os.path
|
||||
import traceback
|
||||
from typing import Dict
|
||||
|
||||
import dashscope
|
||||
|
||||
from common.utils.logger import maxkb_logger
|
||||
from models_provider.base_model_provider import MaxKBBaseModel
|
||||
from models_provider.impl.base_stt import BaseSpeechToText
|
||||
|
||||
|
||||
class AliyunBaiLianAsrSpeechToText(MaxKBBaseModel, BaseSpeechToText):
    """Speech-to-text model backed by Aliyun BaiLian's Qwen audio ASR
    (end-to-end recognition via ``dashscope.MultiModalConversation``).
    """

    api_key: str   # DashScope API key
    api_url: str   # stored from the credential form; not passed to dashscope here
    model: str     # model name, e.g. 'qwen-audio-asr'
    params: dict   # extra model parameters from the settings form

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.api_key = kwargs.get('api_key')
        self.model = kwargs.get('model')
        self.params = kwargs.get('params')
        self.api_url = kwargs.get('api_url')

    @staticmethod
    def is_cache_model():
        # A fresh instance is built per request; nothing worth caching.
        return False

    @staticmethod
    def new_instance(model_type, model_name, model_credential: Dict[str, object], **model_kwargs):
        """Factory used by the provider to build a configured instance."""
        # NOTE(review): model_kwargs is forwarded both as `params=` and
        # expanded via **; a 'params' key inside model_kwargs would raise a
        # duplicate-keyword TypeError — pattern kept for consistency with the
        # sibling BaiLian models.
        return AliyunBaiLianAsrSpeechToText(
            model=model_name,
            api_key=model_credential.get('api_key'),
            api_url=model_credential.get('api_url'),
            params=model_kwargs,
            **model_kwargs
        )

    def check_auth(self):
        """Verify the credential by transcribing a bundled sample mp3.

        Relies on speech_to_text() raising on failure so that invalid
        credentials are reported back to the caller.
        """
        cwd = os.path.dirname(os.path.abspath(__file__))
        with open(f'{cwd}/iat_mp3_16k.mp3', 'rb') as audio_file:
            self.speech_to_text(audio_file)

    def speech_to_text(self, audio_file):
        """Transcribe *audio_file* (a binary file-like object) to text.

        :param audio_file: opened binary file/stream containing mp3 audio.
        :return: recognized text from the first choice of the response.
        :raises Exception: any failure (API error, malformed response) is
            logged and re-raised so callers such as check_auth() can detect it.
        """
        try:
            base64_audio = base64.b64encode(audio_file.read()).decode("utf-8")

            messages = [
                {
                    "role": "user",
                    "content": [
                        {"audio": f"data:audio/mp3;base64,{base64_audio}"},
                    ]
                }
            ]
            response = dashscope.MultiModalConversation.call(
                api_key=self.api_key,
                model=self.model,
                messages=messages,
                result_format="message",
            )

            # A failed call yields output=None / missing keys, which raises
            # here and is surfaced below instead of being swallowed.
            text = response["output"]["choices"][0]["message"].content[0]["text"]

            return text

        except Exception as err:
            maxkb_logger.error(f"Error: {str(err)}: {traceback.format_exc()}")
            # Fix: previously the exception was swallowed and None returned,
            # so check_auth()/is_valid() could never detect bad credentials.
            raise
|
||||
Loading…
Reference in New Issue