From b32b06391f08034450c1d700d6641c4ac12b8cac Mon Sep 17 00:00:00 2001 From: zhangzhanwei Date: Mon, 18 Aug 2025 11:01:54 +0800 Subject: [PATCH] feat: Qwen2.5-omni-7b full modal speech recognition --- .../aliyun_bai_lian_model_provider.py | 7 +++++-- .../credential/{omi_stt.py => omni_stt.py} | 3 ++- .../model/{omi_stt.py => omni_stt.py} | 14 ++++++++------ .../aliyun_bai_lian_model_provider/model/stt.py | 2 -- 4 files changed, 15 insertions(+), 11 deletions(-) rename apps/models_provider/impl/aliyun_bai_lian_model_provider/credential/{omi_stt.py => omni_stt.py} (95%) rename apps/models_provider/impl/aliyun_bai_lian_model_provider/model/{omi_stt.py => omni_stt.py} (89%) diff --git a/apps/models_provider/impl/aliyun_bai_lian_model_provider/aliyun_bai_lian_model_provider.py b/apps/models_provider/impl/aliyun_bai_lian_model_provider/aliyun_bai_lian_model_provider.py index 56d570c04..7264001b2 100644 --- a/apps/models_provider/impl/aliyun_bai_lian_model_provider/aliyun_bai_lian_model_provider.py +++ b/apps/models_provider/impl/aliyun_bai_lian_model_provider/aliyun_bai_lian_model_provider.py @@ -15,7 +15,7 @@ from models_provider.impl.aliyun_bai_lian_model_provider.credential.embedding im AliyunBaiLianEmbeddingCredential from models_provider.impl.aliyun_bai_lian_model_provider.credential.image import QwenVLModelCredential from models_provider.impl.aliyun_bai_lian_model_provider.credential.llm import BaiLianLLMModelCredential -from models_provider.impl.aliyun_bai_lian_model_provider.credential.omi_stt import AliyunBaiLianOmiSTTModelCredential +from models_provider.impl.aliyun_bai_lian_model_provider.credential.omni_stt import AliyunBaiLianOmiSTTModelCredential from models_provider.impl.aliyun_bai_lian_model_provider.credential.reranker import \ AliyunBaiLianRerankerCredential from models_provider.impl.aliyun_bai_lian_model_provider.credential.stt import AliyunBaiLianSTTModelCredential @@ -24,7 +24,7 @@ from models_provider.impl.aliyun_bai_lian_model_provider.credential.tts import A from models_provider.impl.aliyun_bai_lian_model_provider.model.embedding import AliyunBaiLianEmbedding from models_provider.impl.aliyun_bai_lian_model_provider.model.image import QwenVLChatModel from models_provider.impl.aliyun_bai_lian_model_provider.model.llm import BaiLianChatModel -from models_provider.impl.aliyun_bai_lian_model_provider.model.omi_stt import AliyunBaiLianOmiSpeechToText +from models_provider.impl.aliyun_bai_lian_model_provider.model.omni_stt import AliyunBaiLianOmiSpeechToText from models_provider.impl.aliyun_bai_lian_model_provider.model.reranker import AliyunBaiLianReranker from models_provider.impl.aliyun_bai_lian_model_provider.model.stt import AliyunBaiLianSpeechToText from models_provider.impl.aliyun_bai_lian_model_provider.model.tti import QwenTextToImageModel @@ -80,6 +80,9 @@ model_info_list = [ModelInfo('gte-rerank', ModelInfo('qwen-omni-turbo', _('The Qwen Omni series model supports inputting multiple modalities of data, including video, audio, images, and text, and outputting audio and text.'), ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential, AliyunBaiLianOmiSpeechToText), + ModelInfo('qwen2.5-omni-7b', + _('The Qwen Omni series model supports inputting multiple modalities of data, including video, audio, images, and text, and outputting audio and text.'), + ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential, AliyunBaiLianOmiSpeechToText), ] module_info_vl_list = [ diff --git a/apps/models_provider/impl/aliyun_bai_lian_model_provider/credential/omi_stt.py b/apps/models_provider/impl/aliyun_bai_lian_model_provider/credential/omni_stt.py similarity index 95% rename from apps/models_provider/impl/aliyun_bai_lian_model_provider/credential/omi_stt.py rename to apps/models_provider/impl/aliyun_bai_lian_model_provider/credential/omni_stt.py index 960b6e8ef..82dcb4b55 100644 --- a/apps/models_provider/impl/aliyun_bai_lian_model_provider/credential/omi_stt.py +++ b/apps/models_provider/impl/aliyun_bai_lian_model_provider/credential/omni_stt.py @@ -17,7 +17,8 @@ class AliyunBaiLianOmiSTTModelParams(BaseForm): class AliyunBaiLianOmiSTTModelCredential(BaseForm, BaseModelCredential): - api_key = PasswordInputField("API key", required=True) + api_url = forms.TextInputField(_('API URL'), required=True) + api_key = forms.PasswordInputField(_('API Key'), required=True) def is_valid(self, model_type: str, diff --git a/apps/models_provider/impl/aliyun_bai_lian_model_provider/model/omi_stt.py b/apps/models_provider/impl/aliyun_bai_lian_model_provider/model/omni_stt.py similarity index 89% rename from apps/models_provider/impl/aliyun_bai_lian_model_provider/model/omi_stt.py rename to apps/models_provider/impl/aliyun_bai_lian_model_provider/model/omni_stt.py index 4528f1cef..56e060f6b 100644 --- a/apps/models_provider/impl/aliyun_bai_lian_model_provider/model/omi_stt.py +++ b/apps/models_provider/impl/aliyun_bai_lian_model_provider/model/omni_stt.py @@ -12,6 +12,7 @@ from models_provider.impl.base_stt import BaseSpeechToText class AliyunBaiLianOmiSpeechToText(MaxKBBaseModel, BaseSpeechToText): api_key: str + api_url: str model: str params: dict @@ -20,6 +21,7 @@ class AliyunBaiLianOmiSpeechToText(MaxKBBaseModel, BaseSpeechToText): self.api_key = kwargs.get('api_key') self.model = kwargs.get('model') self.params = kwargs.get('params') + self.api_url = kwargs.get('api_url') @staticmethod def is_cache_model(): @@ -30,6 +32,7 @@ class AliyunBaiLianOmiSpeechToText(MaxKBBaseModel, BaseSpeechToText): return AliyunBaiLianOmiSpeechToText( model=model_name, api_key=model_credential.get('api_key'), + api_url=model_credential.get('api_url') , params= model_kwargs, **model_kwargs ) @@ -47,13 +50,13 @@ class AliyunBaiLianOmiSpeechToText(MaxKBBaseModel, BaseSpeechToText): client = OpenAI( # 若没有配置环境变量,请用阿里云百炼API Key将下行替换为:api_key="sk-xxx", api_key=self.api_key, - base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", + base_url=self.api_url, ) base64_audio = base64.b64encode(audio_file.read()).decode("utf-8") completion = client.chat.completions.create( - model="qwen-omni-turbo-0119", + model=self.model, messages=[ { "role": "user", @@ -71,16 +74,15 @@ class AliyunBaiLianOmiSpeechToText(MaxKBBaseModel, BaseSpeechToText): ], # 设置输出数据的模态,当前支持两种:["text","audio"]、["text"] modalities=["text"], - audio={"voice": "Cherry", "format": "mp3"}, # stream 必须设置为 True,否则会报错 stream=True, stream_options={"include_usage": True}, ) result = [] for chunk in completion: - if chunk.choices and hasattr(chunk.choices[0].delta, 'audio'): - transcript = chunk.choices[0].delta.audio.get('transcript') - result.append(transcript) + if chunk.choices and hasattr(chunk.choices[0].delta, 'content'): + content = chunk.choices[0].delta.content + result.append(content) return "".join(result) except Exception as err: diff --git a/apps/models_provider/impl/aliyun_bai_lian_model_provider/model/stt.py b/apps/models_provider/impl/aliyun_bai_lian_model_provider/model/stt.py index b5c9f240d..7017caf79 100644 --- a/apps/models_provider/impl/aliyun_bai_lian_model_provider/model/stt.py +++ b/apps/models_provider/impl/aliyun_bai_lian_model_provider/model/stt.py @@ -30,8 +30,6 @@ class AliyunBaiLianSpeechToText(MaxKBBaseModel, BaseSpeechToText): optional_params['max_tokens'] = model_kwargs['max_tokens'] if 'temperature' in model_kwargs and model_kwargs['temperature'] is not None: optional_params['temperature'] = model_kwargs['temperature'] - if model_name == 'qwen-omni-turbo': - optional_params['streaming'] = True return AliyunBaiLianSpeechToText( model=model_name, api_key=model_credential.get('api_key'),