Mirror of https://github.com/1Panel-dev/MaxKB.git
feat: Qwen2.5-omni-7b full modal speech recognition
parent a85c36f289
commit b32b06391f
@@ -15,7 +15,7 @@ from models_provider.impl.aliyun_bai_lian_model_provider.credential.embedding im
     AliyunBaiLianEmbeddingCredential
 from models_provider.impl.aliyun_bai_lian_model_provider.credential.image import QwenVLModelCredential
 from models_provider.impl.aliyun_bai_lian_model_provider.credential.llm import BaiLianLLMModelCredential
-from models_provider.impl.aliyun_bai_lian_model_provider.credential.omi_stt import AliyunBaiLianOmiSTTModelCredential
+from models_provider.impl.aliyun_bai_lian_model_provider.credential.omni_stt import AliyunBaiLianOmiSTTModelCredential
 from models_provider.impl.aliyun_bai_lian_model_provider.credential.reranker import \
     AliyunBaiLianRerankerCredential
 from models_provider.impl.aliyun_bai_lian_model_provider.credential.stt import AliyunBaiLianSTTModelCredential
@@ -24,7 +24,7 @@ from models_provider.impl.aliyun_bai_lian_model_provider.credential.tts import A
 from models_provider.impl.aliyun_bai_lian_model_provider.model.embedding import AliyunBaiLianEmbedding
 from models_provider.impl.aliyun_bai_lian_model_provider.model.image import QwenVLChatModel
 from models_provider.impl.aliyun_bai_lian_model_provider.model.llm import BaiLianChatModel
-from models_provider.impl.aliyun_bai_lian_model_provider.model.omi_stt import AliyunBaiLianOmiSpeechToText
+from models_provider.impl.aliyun_bai_lian_model_provider.model.omni_stt import AliyunBaiLianOmiSpeechToText
 from models_provider.impl.aliyun_bai_lian_model_provider.model.reranker import AliyunBaiLianReranker
 from models_provider.impl.aliyun_bai_lian_model_provider.model.stt import AliyunBaiLianSpeechToText
 from models_provider.impl.aliyun_bai_lian_model_provider.model.tti import QwenTextToImageModel
@@ -80,6 +80,9 @@ model_info_list = [ModelInfo('gte-rerank',
                   ModelInfo('qwen-omni-turbo',
                             _('The Qwen Omni series model supports inputting multiple modalities of data, including video, audio, images, and text, and outputting audio and text.'),
                             ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential, AliyunBaiLianOmiSpeechToText),
+                  ModelInfo('qwen2.5-omni-7b',
+                            _('The Qwen Omni series model supports inputting multiple modalities of data, including video, audio, images, and text, and outputting audio and text.'),
+                            ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential, AliyunBaiLianOmiSpeechToText),
                   ]
 
 module_info_vl_list = [
@@ -17,7 +17,8 @@ class AliyunBaiLianOmiSTTModelParams(BaseForm):
 
 
 class AliyunBaiLianOmiSTTModelCredential(BaseForm, BaseModelCredential):
-    api_key = PasswordInputField("API key", required=True)
+    api_url = forms.TextInputField(_('API URL'), required=True)
+    api_key = forms.PasswordInputField(_('API Key'), required=True)
 
     def is_valid(self,
                  model_type: str,
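Note: the new 'API URL' field is meant to hold the OpenAI-compatible base URL that the model class previously hardcoded (see the base_url change further down in this diff). A minimal sketch of the credential values the updated form collects, with illustrative values that are assumptions rather than part of the commit:

# Illustrative credential payload for the new form (values are placeholders, not from the commit)
credential = {
    'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',  # OpenAI-compatible endpoint, formerly hardcoded in the model
    'api_key': 'sk-xxx',                                             # Alibaba Cloud Bailian API key
}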
@@ -12,6 +12,7 @@ from models_provider.impl.base_stt import BaseSpeechToText
 
 class AliyunBaiLianOmiSpeechToText(MaxKBBaseModel, BaseSpeechToText):
     api_key: str
+    api_url: str
     model: str
     params: dict
 
@@ -20,6 +21,7 @@ class AliyunBaiLianOmiSpeechToText(MaxKBBaseModel, BaseSpeechToText):
         self.api_key = kwargs.get('api_key')
         self.model = kwargs.get('model')
         self.params = kwargs.get('params')
+        self.api_url = kwargs.get('api_url')
 
     @staticmethod
     def is_cache_model():
@@ -30,6 +32,7 @@ class AliyunBaiLianOmiSpeechToText(MaxKBBaseModel, BaseSpeechToText):
         return AliyunBaiLianOmiSpeechToText(
             model=model_name,
             api_key=model_credential.get('api_key'),
+            api_url=model_credential.get('api_url'),
             params=model_kwargs,
             **model_kwargs
         )
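For illustration, a hedged sketch of how those credential values reach the instance through the constructor changed above; the enclosing factory method's full signature is not visible in this hunk, so the direct construction below is only an assumption:

# Hypothetical construction mirroring the kwargs the class reads (model, api_key, api_url, params)
model = AliyunBaiLianOmiSpeechToText(
    model='qwen2.5-omni-7b',
    api_key=credential.get('api_key'),
    api_url=credential.get('api_url'),
    params={},
)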
@@ -47,13 +50,13 @@ class AliyunBaiLianOmiSpeechToText(MaxKBBaseModel, BaseSpeechToText):
             client = OpenAI(
                 # If the environment variable is not configured, replace the line below with your Alibaba Cloud Bailian API key: api_key="sk-xxx",
                 api_key=self.api_key,
-                base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
+                base_url=self.api_url,
             )
 
             base64_audio = base64.b64encode(audio_file.read()).decode("utf-8")
 
             completion = client.chat.completions.create(
-                model="qwen-omni-turbo-0119",
+                model=self.model,
                 messages=[
                     {
                         "role": "user",
@@ -71,16 +74,15 @@ class AliyunBaiLianOmiSpeechToText(MaxKBBaseModel, BaseSpeechToText):
                 ],
                 # Set the output modalities; two options are currently supported: ["text","audio"] or ["text"]
                 modalities=["text"],
-                audio={"voice": "Cherry", "format": "mp3"},
                 # stream must be set to True, otherwise an error is raised
                 stream=True,
                 stream_options={"include_usage": True},
             )
             result = []
             for chunk in completion:
-                if chunk.choices and hasattr(chunk.choices[0].delta, 'audio'):
-                    transcript = chunk.choices[0].delta.audio.get('transcript')
-                    result.append(transcript)
+                if chunk.choices and hasattr(chunk.choices[0].delta, 'content'):
+                    content = chunk.choices[0].delta.content
+                    result.append(content)
             return "".join(result)
 
         except Exception as err:
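Taken together, the last two hunks swap the hardcoded endpoint and model name for the configured api_url and model, and read the transcript from the streamed text deltas instead of the audio transcript field. Below is a self-contained sketch of the same flow; the user-message payload is elided in this diff, so the input_audio content part is assumed from DashScope's OpenAI-compatible examples rather than taken from the commit, and the file name and API key are placeholders:

# Standalone sketch (assumptions noted inline); requires the openai package and a valid Bailian API key
import base64

from openai import OpenAI

client = OpenAI(
    api_key='sk-xxx',  # placeholder Alibaba Cloud Bailian API key
    base_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
)

with open('sample.mp3', 'rb') as audio_file:
    base64_audio = base64.b64encode(audio_file.read()).decode('utf-8')

completion = client.chat.completions.create(
    model='qwen2.5-omni-7b',
    messages=[
        {
            'role': 'user',
            'content': [
                {
                    'type': 'input_audio',
                    # data-URI form taken from DashScope's examples; an assumption here
                    'input_audio': {'data': f'data:;base64,{base64_audio}', 'format': 'mp3'},
                },
                {'type': 'text', 'text': 'Transcribe this audio.'},
            ],
        },
    ],
    modalities=['text'],                     # text-only output, as in the updated code
    stream=True,                             # Qwen-Omni requires streaming on this endpoint
    stream_options={'include_usage': True},
)

result = []
for chunk in completion:
    # usage-only chunks carry an empty choices list, so guard before touching the delta
    if chunk.choices and getattr(chunk.choices[0].delta, 'content', None):
        result.append(chunk.choices[0].delta.content)
print(''.join(result))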
@@ -30,8 +30,6 @@ class AliyunBaiLianSpeechToText(MaxKBBaseModel, BaseSpeechToText):
             optional_params['max_tokens'] = model_kwargs['max_tokens']
         if 'temperature' in model_kwargs and model_kwargs['temperature'] is not None:
             optional_params['temperature'] = model_kwargs['temperature']
-        if model_name == 'qwen-omni-turbo':
-            optional_params['streaming'] = True
         return AliyunBaiLianSpeechToText(
             model=model_name,
             api_key=model_credential.get('api_key'),