feat: add qwen2.5-omni-7b full-modal speech recognition

Author: zhangzhanwei, 2025-08-18 11:01:54 +08:00 (committed by zhanweizhang7)
Parent: a85c36f289
Commit: b32b06391f
4 changed files with 15 additions and 11 deletions

File: models_provider/impl/aliyun_bai_lian_model_provider (provider registration module)

@@ -15,7 +15,7 @@ from models_provider.impl.aliyun_bai_lian_model_provider.credential.embedding im
     AliyunBaiLianEmbeddingCredential
 from models_provider.impl.aliyun_bai_lian_model_provider.credential.image import QwenVLModelCredential
 from models_provider.impl.aliyun_bai_lian_model_provider.credential.llm import BaiLianLLMModelCredential
-from models_provider.impl.aliyun_bai_lian_model_provider.credential.omi_stt import AliyunBaiLianOmiSTTModelCredential
+from models_provider.impl.aliyun_bai_lian_model_provider.credential.omni_stt import AliyunBaiLianOmiSTTModelCredential
 from models_provider.impl.aliyun_bai_lian_model_provider.credential.reranker import \
     AliyunBaiLianRerankerCredential
 from models_provider.impl.aliyun_bai_lian_model_provider.credential.stt import AliyunBaiLianSTTModelCredential
@@ -24,7 +24,7 @@ from models_provider.impl.aliyun_bai_lian_model_provider.credential.tts import A
 from models_provider.impl.aliyun_bai_lian_model_provider.model.embedding import AliyunBaiLianEmbedding
 from models_provider.impl.aliyun_bai_lian_model_provider.model.image import QwenVLChatModel
 from models_provider.impl.aliyun_bai_lian_model_provider.model.llm import BaiLianChatModel
-from models_provider.impl.aliyun_bai_lian_model_provider.model.omi_stt import AliyunBaiLianOmiSpeechToText
+from models_provider.impl.aliyun_bai_lian_model_provider.model.omni_stt import AliyunBaiLianOmiSpeechToText
 from models_provider.impl.aliyun_bai_lian_model_provider.model.reranker import AliyunBaiLianReranker
 from models_provider.impl.aliyun_bai_lian_model_provider.model.stt import AliyunBaiLianSpeechToText
 from models_provider.impl.aliyun_bai_lian_model_provider.model.tti import QwenTextToImageModel
@@ -80,6 +80,9 @@ model_info_list = [ModelInfo('gte-rerank',
                    ModelInfo('qwen-omni-turbo',
                              _('The Qwen Omni series model supports inputting multiple modalities of data, including video, audio, images, and text, and outputting audio and text.'),
                              ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential, AliyunBaiLianOmiSpeechToText),
+                   ModelInfo('qwen2.5-omni-7b',
+                             _('The Qwen Omni series model supports inputting multiple modalities of data, including video, audio, images, and text, and outputting audio and text.'),
+                             ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential, AliyunBaiLianOmiSpeechToText),
                    ]
 module_info_vl_list = [
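Note: the new qwen2.5-omni-7b entry is purely additive and reuses the same credential form and model class as qwen-omni-turbo. A minimal sketch of the registration pattern follows; the base_model_provider import path, the gettext import, and the list name are assumptions, while the ModelInfo arguments mirror the entries in this file.

# Sketch of the Omni STT registrations (illustrative excerpt, not the full file).
# Assumption: ModelInfo and ModelTypeConst live in models_provider.base_model_provider.
from django.utils.translation import gettext_lazy as _  # assumed translation helper

from models_provider.base_model_provider import ModelInfo, ModelTypeConst
from models_provider.impl.aliyun_bai_lian_model_provider.credential.omni_stt import AliyunBaiLianOmiSTTModelCredential
from models_provider.impl.aliyun_bai_lian_model_provider.model.omni_stt import AliyunBaiLianOmiSpeechToText

aliyun_bai_lian_omi_stt_model_credential = AliyunBaiLianOmiSTTModelCredential()

# Both Omni models share one credential form and one SpeechToText implementation.
omni_stt_model_info = [
    ModelInfo('qwen-omni-turbo', _('Qwen Omni series model ...'), ModelTypeConst.STT,
              aliyun_bai_lian_omi_stt_model_credential, AliyunBaiLianOmiSpeechToText),
    ModelInfo('qwen2.5-omni-7b', _('Qwen Omni series model ...'), ModelTypeConst.STT,
              aliyun_bai_lian_omi_stt_model_credential, AliyunBaiLianOmiSpeechToText),
]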

File: models_provider/impl/aliyun_bai_lian_model_provider/credential/omni_stt.py

@@ -17,7 +17,8 @@ class AliyunBaiLianOmiSTTModelParams(BaseForm):
 class AliyunBaiLianOmiSTTModelCredential(BaseForm, BaseModelCredential):
-    api_key = PasswordInputField("API key", required=True)
+    api_url = forms.TextInputField(_('API URL'), required=True)
+    api_key = forms.PasswordInputField(_('API Key'), required=True)
     def is_valid(self,
                  model_type: str,
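Note: with the extra field, the stored credential for an Omni STT model now carries both the endpoint and the key. A rough example of the resulting credential dict; the URL is the DashScope compatible-mode endpoint that was previously hard-coded in the model class, and the key is a placeholder.

# Example credential payload collected by this form (illustrative values).
model_credential = {
    'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',  # formerly hard-coded endpoint
    'api_key': 'sk-xxx',  # placeholder key
}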

File: models_provider/impl/aliyun_bai_lian_model_provider/model/omni_stt.py

@@ -12,6 +12,7 @@ from models_provider.impl.base_stt import BaseSpeechToText
 class AliyunBaiLianOmiSpeechToText(MaxKBBaseModel, BaseSpeechToText):
     api_key: str
+    api_url: str
     model: str
     params: dict
@@ -20,6 +21,7 @@ class AliyunBaiLianOmiSpeechToText(MaxKBBaseModel, BaseSpeechToText):
         self.api_key = kwargs.get('api_key')
         self.model = kwargs.get('model')
         self.params = kwargs.get('params')
+        self.api_url = kwargs.get('api_url')
     @staticmethod
     def is_cache_model():
@@ -30,6 +32,7 @@ class AliyunBaiLianOmiSpeechToText(MaxKBBaseModel, BaseSpeechToText):
         return AliyunBaiLianOmiSpeechToText(
             model=model_name,
             api_key=model_credential.get('api_key'),
+            api_url=model_credential.get('api_url'),
             params=model_kwargs,
             **model_kwargs
         )
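Note: since __init__ now also picks up api_url, an instance can be built straight from the stored credential. A minimal usage sketch under the assumption that the base class exposes a speech_to_text(audio_file) method (that method is not shown in this diff); keyword names follow the kwargs.get() calls above, and the file name and params are illustrative.

# Sketch: constructing the Omni STT model from a saved credential and transcribing a file.
stt_model = AliyunBaiLianOmiSpeechToText(
    model='qwen2.5-omni-7b',
    api_key=model_credential.get('api_key'),
    api_url=model_credential.get('api_url'),
    params={},  # illustrative; populated from model_kwargs in MaxKB
)
with open('sample.mp3', 'rb') as audio_file:
    text = stt_model.speech_to_text(audio_file)  # assumed BaseSpeechToText entry point
print(text)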
@@ -47,13 +50,13 @@ class AliyunBaiLianOmiSpeechToText(MaxKBBaseModel, BaseSpeechToText):
             client = OpenAI(
                 # If the environment variable is not configured, replace the line below with your Alibaba Cloud Bailian API Key, e.g. api_key="sk-xxx",
                 api_key=self.api_key,
-                base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
+                base_url=self.api_url,
             )
             base64_audio = base64.b64encode(audio_file.read()).decode("utf-8")
             completion = client.chat.completions.create(
-                model="qwen-omni-turbo-0119",
+                model=self.model,
                 messages=[
                     {
                         "role": "user",
@@ -71,16 +74,15 @@ class AliyunBaiLianOmiSpeechToText(MaxKBBaseModel, BaseSpeechToText):
                 ],
                 # Set the output modalities; two options are currently supported: ["text","audio"] and ["text"]
                 modalities=["text"],
-                audio={"voice": "Cherry", "format": "mp3"},
                 # stream must be set to True, otherwise an error is raised
                 stream=True,
                 stream_options={"include_usage": True},
             )
             result = []
             for chunk in completion:
-                if chunk.choices and hasattr(chunk.choices[0].delta, 'audio'):
-                    transcript = chunk.choices[0].delta.audio.get('transcript')
-                    result.append(transcript)
+                if chunk.choices and hasattr(chunk.choices[0].delta, 'content'):
+                    content = chunk.choices[0].delta.content
+                    result.append(content)
             return "".join(result)
         except Exception as err:
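Note: for reference, a condensed standalone sketch of the call pattern this hunk settles on: an OpenAI-compatible client pointed at the configured endpoint, base64 audio in, text-only streaming out, transcription read from delta.content. The input_audio message part is an assumption (that section of the file falls between the hunks above and is not shown); the endpoint, flags, and streaming loop mirror the diff, and the key and file name are placeholders.

import base64
from openai import OpenAI

# Illustrative values: the endpoint is the one previously hard-coded above, the key is a placeholder.
client = OpenAI(api_key="sk-xxx", base_url="https://dashscope.aliyuncs.com/compatible-mode/v1")

with open("sample.mp3", "rb") as audio_file:
    base64_audio = base64.b64encode(audio_file.read()).decode("utf-8")

completion = client.chat.completions.create(
    model="qwen2.5-omni-7b",
    messages=[{
        "role": "user",
        "content": [
            # Assumed OpenAI-style audio part; the exact payload MaxKB sends is not shown in this diff.
            {"type": "input_audio", "input_audio": {"data": base64_audio, "format": "mp3"}},
        ],
    }],
    modalities=["text"],                     # text-only output
    stream=True,                             # this endpoint requires streaming
    stream_options={"include_usage": True},
)

result = []
for chunk in completion:
    # Transcribed text arrives incrementally on delta.content; usage-only chunks have empty choices.
    if chunk.choices and chunk.choices[0].delta.content:
        result.append(chunk.choices[0].delta.content)
print("".join(result))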

File: models_provider/impl/aliyun_bai_lian_model_provider/model/stt.py

@@ -30,8 +30,6 @@ class AliyunBaiLianSpeechToText(MaxKBBaseModel, BaseSpeechToText):
             optional_params['max_tokens'] = model_kwargs['max_tokens']
         if 'temperature' in model_kwargs and model_kwargs['temperature'] is not None:
             optional_params['temperature'] = model_kwargs['temperature']
-        if model_name == 'qwen-omni-turbo':
-            optional_params['streaming'] = True
         return AliyunBaiLianSpeechToText(
             model=model_name,
             api_key=model_credential.get('api_key'),