diff --git a/apps/setting/models_provider/impl/aliyun_bai_lian_model_provider/model/stt.py b/apps/setting/models_provider/impl/aliyun_bai_lian_model_provider/model/stt.py index 89ebd508e..0170cce15 100644 --- a/apps/setting/models_provider/impl/aliyun_bai_lian_model_provider/model/stt.py +++ b/apps/setting/models_provider/impl/aliyun_bai_lian_model_provider/model/stt.py @@ -4,6 +4,7 @@ from typing import Dict import dashscope from dashscope.audio.asr import (Recognition) +from pydub import AudioSegment from setting.models_provider.base_model_provider import MaxKBBaseModel from setting.models_provider.impl.base_stt import BaseSpeechToText @@ -40,7 +41,6 @@ class AliyunBaiLianSpeechToText(MaxKBBaseModel, BaseSpeechToText): dashscope.api_key = self.api_key recognition = Recognition(model=self.model, format='mp3', - sample_rate=16000, callback=None) with tempfile.NamedTemporaryFile(delete=False) as temp_file: # 将上传的文件保存到临时文件中 @@ -49,6 +49,13 @@ class AliyunBaiLianSpeechToText(MaxKBBaseModel, BaseSpeechToText): temp_file_path = temp_file.name try: + audio = AudioSegment.from_file(temp_file_path) + if audio.channels != 1: + audio = audio.set_channels(1) + audio = audio.set_frame_rate(16000) + + # 将转换后的音频文件保存到临时文件中 + audio.export(temp_file_path, format='mp3') # 识别临时文件 result = recognition.call(temp_file_path) text = ''