diff --git a/apps/locales/en_US/LC_MESSAGES/django.po b/apps/locales/en_US/LC_MESSAGES/django.po
index 0ad37986d..d5180b0d7 100644
--- a/apps/locales/en_US/LC_MESSAGES/django.po
+++ b/apps/locales/en_US/LC_MESSAGES/django.po
@@ -8663,4 +8663,7 @@ msgid "resource authorization"
 msgstr ""
 
 msgid "The Qwen Audio based end-to-end speech recognition model supports audio recognition within 3 minutes. At present, it mainly supports Chinese and English recognition."
+msgstr ""
+
+msgid "If not passed, the default value is 'zh'"
 msgstr ""
\ No newline at end of file
diff --git a/apps/locales/zh_CN/LC_MESSAGES/django.po b/apps/locales/zh_CN/LC_MESSAGES/django.po
index 833af94a7..e040c6e9e 100644
--- a/apps/locales/zh_CN/LC_MESSAGES/django.po
+++ b/apps/locales/zh_CN/LC_MESSAGES/django.po
@@ -8789,4 +8789,7 @@ msgid "resource authorization"
 msgstr "资源授权"
 
 msgid "The Qwen Audio based end-to-end speech recognition model supports audio recognition within 3 minutes. At present, it mainly supports Chinese and English recognition."
-msgstr "基于Qwen-Audio的端到端语音识别大模型,支持3分钟以内的音频识别,目前主要支持中英文识别。"
\ No newline at end of file
+msgstr "基于Qwen-Audio的端到端语音识别大模型,支持3分钟以内的音频识别,目前主要支持中英文识别。"
+
+msgid "If not passed, the default value is 'zh'"
+msgstr "如果未传递,则默认值为'zh'"
\ No newline at end of file
diff --git a/apps/locales/zh_Hant/LC_MESSAGES/django.po b/apps/locales/zh_Hant/LC_MESSAGES/django.po
index 2ca5cbf59..f969052a0 100644
--- a/apps/locales/zh_Hant/LC_MESSAGES/django.po
+++ b/apps/locales/zh_Hant/LC_MESSAGES/django.po
@@ -8789,4 +8789,7 @@ msgid "resource authorization"
 msgstr "資源授權"
 
 msgid "The Qwen Audio based end-to-end speech recognition model supports audio recognition within 3 minutes. At present, it mainly supports Chinese and English recognition."
-msgstr "基於Qwen-Audio的端到端語音辨識大模型,支持3分鐘以內的音訊識別,現時主要支持中英文識別。"
\ No newline at end of file
+msgstr "基於Qwen-Audio的端到端語音辨識大模型,支持3分鐘以內的音訊識別,現時主要支持中英文識別。"
+
+msgid "If not passed, the default value is 'zh'"
+msgstr "如果未傳遞,則預設值為'zh'"
\ No newline at end of file
diff --git a/apps/models_provider/impl/vllm_model_provider/credential/whisper_stt.py b/apps/models_provider/impl/vllm_model_provider/credential/whisper_stt.py
new file mode 100644
index 000000000..f65b38eac
--- /dev/null
+++ b/apps/models_provider/impl/vllm_model_provider/credential/whisper_stt.py
@@ -0,0 +1,62 @@
+# coding=utf-8
+import traceback
+from typing import Dict
+
+from django.utils.translation import gettext_lazy as _, gettext
+from langchain_core.messages import HumanMessage
+
+from common import forms
+from common.exception.app_exception import AppApiException
+from common.forms import BaseForm, TooltipLabel
+from models_provider.base_model_provider import BaseModelCredential, ValidCode
+
+
+class VLLMWhisperModelParams(BaseForm):
+    Language = forms.TextInputField(
+        TooltipLabel(_('Language'),
+                     _("If not passed, the default value is 'zh'")),
+        required=True,
+        default_value='zh',
+    )
+
+
+class VLLMWhisperModelCredential(BaseForm, BaseModelCredential):
+    api_url = forms.TextInputField('API URL', required=True)
+    api_key = forms.PasswordInputField('API Key', required=True)
+
+    def is_valid(self,
+                 model_type: str,
+                 model_name,
+                 model_credential: Dict[str, object],
+                 model_params,
+                 provider,
+                 raise_exception=False):
+
+        model_type_list = provider.get_model_type_list()
+
+        if not any(list(filter(lambda mt: mt.get('value') == model_type, model_type_list))):
+            raise AppApiException(ValidCode.valid_error.value,
+                                  gettext('{model_type} Model type is not supported').format(model_type=model_type))
+        try:
+            model_list = provider.get_base_model_list(model_credential.get('api_url'), model_credential.get('api_key'))
+        except Exception as e:
+            raise AppApiException(ValidCode.valid_error.value, gettext('API domain name is invalid'))
+        exist = provider.get_model_info_by_name(model_list, model_name)
+        if len(exist) == 0:
+            raise AppApiException(ValidCode.valid_error.value,
+                                  gettext('The model does not exist, please download the model first'))
+        model = provider.get_model(model_type, model_name, model_credential, **model_params)
+        return True
+
+    def encryption_dict(self, model_info: Dict[str, object]):
+        return {**model_info, 'api_key': super().encryption(model_info.get('api_key', ''))}
+
+    def build_model(self, model_info: Dict[str, object]):
+        for key in ['api_key', 'model']:
+            if key not in model_info:
+                raise AppApiException(500, gettext('{key} is required').format(key=key))
+        self.api_key = model_info.get('api_key')
+        return self
+
+    def get_model_params_setting_form(self, model_name):
+        return VLLMWhisperModelParams()
\ No newline at end of file
diff --git a/apps/models_provider/impl/vllm_model_provider/model/iat_mp3_16k.mp3 b/apps/models_provider/impl/vllm_model_provider/model/iat_mp3_16k.mp3
new file mode 100644
index 000000000..75e744c8f
Binary files /dev/null and b/apps/models_provider/impl/vllm_model_provider/model/iat_mp3_16k.mp3 differ
diff --git a/apps/models_provider/impl/vllm_model_provider/model/whisper_sst.py b/apps/models_provider/impl/vllm_model_provider/model/whisper_sst.py
new file mode 100644
index 000000000..62075a611
--- /dev/null
+++ b/apps/models_provider/impl/vllm_model_provider/model/whisper_sst.py
@@ -0,0 +1,64 @@
+import base64
+import os
+import traceback
+from typing import Dict
+
+from openai import OpenAI
+
+from common.utils.logger import maxkb_logger
+from models_provider.base_model_provider import MaxKBBaseModel
+from models_provider.impl.base_stt import BaseSpeechToText
+
+
+
+class VllmWhisperSpeechToText(MaxKBBaseModel, BaseSpeechToText):
+    api_key: str
+    api_url: str
+    model: str
+    params: dict
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.api_key = kwargs.get('api_key')
+        self.model = kwargs.get('model')
+        self.params = kwargs.get('params')
+        self.api_url = kwargs.get('api_url')
+
+    @staticmethod
+    def is_cache_model():
+        return False
+
+    @staticmethod
+    def new_instance(model_type, model_name, model_credential: Dict[str, object], **model_kwargs):
+        return VllmWhisperSpeechToText(
+            model=model_name,
+            api_key=model_credential.get('api_key'),
+            api_url=model_credential.get('api_url'),
+            params=model_kwargs,
+            **model_kwargs
+        )
+
+    def check_auth(self):
+        cwd = os.path.dirname(os.path.abspath(__file__))
+        with open(f'{cwd}/iat_mp3_16k.mp3', 'rb') as audio_file:
+            self.speech_to_text(audio_file)
+
+    def speech_to_text(self, audio_file):
+        base_url = f"{self.api_url}/v1"
+        try:
+            client = OpenAI(
+                api_key=self.api_key,
+                base_url=base_url
+            )
+
+            result = client.audio.transcriptions.create(
+                file=audio_file,
+                model=self.model,
+                language=self.params.get('Language'),
+                response_format="json"
+            )
+
+            return result.text
+
+        except Exception as err:
+            maxkb_logger.error(f"Error: {str(err)}: {traceback.format_exc()}")
\ No newline at end of file
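For context, the transcription call above targets vLLM's OpenAI-compatible audio endpoint. A standalone sketch of the equivalent request, assuming a server at a placeholder address with a placeholder key (neither is part of this patch):

```python
# Minimal sketch of the request speech_to_text() issues; the server URL,
# API key, model name, and audio file below are illustrative placeholders.
from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")

with open("iat_mp3_16k.mp3", "rb") as audio_file:
    result = client.audio.transcriptions.create(
        file=audio_file,
        model="whisper-large-v3",
        language="zh",  # corresponds to the 'Language' form field in whisper_stt.py
        response_format="json",
    )
print(result.text)
```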
diff --git a/apps/models_provider/impl/vllm_model_provider/vllm_model_provider.py b/apps/models_provider/impl/vllm_model_provider/vllm_model_provider.py
index 6206817f4..12da7ce66 100644
--- a/apps/models_provider/impl/vllm_model_provider/vllm_model_provider.py
+++ b/apps/models_provider/impl/vllm_model_provider/vllm_model_provider.py
@@ -10,20 +10,27 @@ from models_provider.base_model_provider import IModelProvider, ModelProvideInfo
 from models_provider.impl.vllm_model_provider.credential.embedding import VllmEmbeddingCredential
 from models_provider.impl.vllm_model_provider.credential.image import VllmImageModelCredential
 from models_provider.impl.vllm_model_provider.credential.llm import VLLMModelCredential
+from models_provider.impl.vllm_model_provider.credential.whisper_stt import VLLMWhisperModelCredential
 from models_provider.impl.vllm_model_provider.model.embedding import VllmEmbeddingModel
 from models_provider.impl.vllm_model_provider.model.image import VllmImage
 from models_provider.impl.vllm_model_provider.model.llm import VllmChatModel
 from maxkb.conf import PROJECT_DIR
 from django.utils.translation import gettext as _
+from models_provider.impl.vllm_model_provider.model.whisper_sst import VllmWhisperSpeechToText
+
 
 v_llm_model_credential = VLLMModelCredential()
 image_model_credential = VllmImageModelCredential()
 embedding_model_credential = VllmEmbeddingCredential()
+whisper_model_credential = VLLMWhisperModelCredential()
 
 model_info_list = [
-    ModelInfo('facebook/opt-125m', _('Facebook’s 125M parameter model'), ModelTypeConst.LLM, v_llm_model_credential, VllmChatModel),
-    ModelInfo('BAAI/Aquila-7B', _('BAAI’s 7B parameter model'), ModelTypeConst.LLM, v_llm_model_credential, VllmChatModel),
-    ModelInfo('BAAI/AquilaChat-7B', _('BAAI’s 13B parameter mode'), ModelTypeConst.LLM, v_llm_model_credential, VllmChatModel),
+    ModelInfo('facebook/opt-125m', _('Facebook’s 125M parameter model'), ModelTypeConst.LLM, v_llm_model_credential,
+              VllmChatModel),
+    ModelInfo('BAAI/Aquila-7B', _('BAAI’s 7B parameter model'), ModelTypeConst.LLM, v_llm_model_credential,
+              VllmChatModel),
+    ModelInfo('BAAI/AquilaChat-7B', _('BAAI’s 13B parameter mode'), ModelTypeConst.LLM, v_llm_model_credential,
+              VllmChatModel),
 ]
@@ -32,7 +39,15 @@ image_model_info_list = [
 ]
 
 embedding_model_info_list = [
-    ModelInfo('HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5', '', ModelTypeConst.EMBEDDING, embedding_model_credential, VllmEmbeddingModel),
+    ModelInfo('HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5', '', ModelTypeConst.EMBEDDING,
+              embedding_model_credential, VllmEmbeddingModel),
+]
+
+whisper_model_info_list = [
+    ModelInfo('whisper-tiny', '', ModelTypeConst.STT, whisper_model_credential, VllmWhisperSpeechToText),
+    ModelInfo('whisper-large-v3-turbo', '', ModelTypeConst.STT, whisper_model_credential, VllmWhisperSpeechToText),
+    ModelInfo('whisper-small', '', ModelTypeConst.STT, whisper_model_credential, VllmWhisperSpeechToText),
+    ModelInfo('whisper-large-v3', '', ModelTypeConst.STT, whisper_model_credential, VllmWhisperSpeechToText),
 ]
 
@@ -45,6 +60,8 @@ model_info_manage = (
     .append_default_model_info(image_model_info_list[0])
     .append_model_info_list(embedding_model_info_list)
     .append_default_model_info(embedding_model_info_list[0])
+    .append_model_info_list(whisper_model_info_list)
+    .append_default_model_info(whisper_model_info_list[0])
     .build()
 )
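End to end, a usage sketch of the new provider entries (not part of the patch; the credentials and file path are hypothetical placeholders):

```python
# Hypothetical wiring: instantiate the STT model the way the provider would,
# then transcribe a local file. The api_url/api_key values are placeholders.
from models_provider.impl.vllm_model_provider.model.whisper_sst import VllmWhisperSpeechToText

stt = VllmWhisperSpeechToText.new_instance(
    'STT',
    'whisper-large-v3',
    {'api_url': 'http://localhost:8000', 'api_key': 'EMPTY'},
    Language='zh',  # stored in self.params and forwarded to transcriptions.create()
)

with open('sample.mp3', 'rb') as audio_file:
    text = stt.speech_to_text(audio_file)  # returns None if the request fails
    print(text)
```

Note that speech_to_text swallows exceptions and returns None after logging them, so callers should be prepared for an empty result.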