feat: Text-to-speech supports streaming playback (#2661)

shaohuzhang1 2025-03-24 14:21:29 +08:00 committed by GitHub
parent 0ce6dd0795
commit dcee1b6d55
4 changed files with 343 additions and 136 deletions

View File

@@ -1,5 +1,6 @@
import { type Dict } from '@/api/type/common'
import { type Ref } from 'vue'
import bus from '@/bus'
interface ApplicationFormType {
name?: string
desc?: string
@@ -144,8 +145,8 @@ export class ChatRecordManage {
})
}
}
this.chat.answer_text = this.chat.answer_text + chunk_answer
bus.emit('change:answer', { record_id: this.chat.record_id, is_end: false })
}
get_current_up_node(run_node: any) {
const index = this.node_list.findIndex((item) => item == run_node)
@@ -232,6 +233,7 @@ export class ChatRecordManage {
if (this.loading) {
this.loading.value = false
}
bus.emit('change:answer', { record_id: this.chat.record_id, is_end: true })
if (this.id) {
clearInterval(this.id)
}
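The hunks above wire the streaming hook into ChatRecordManage: every appended chunk emits change:answer with is_end: false, and stream completion emits it once with is_end: true. A minimal sketch of that event contract, assuming @/bus is a mitt-style emitter (the payload type name is hypothetical):

```ts
import mitt from 'mitt'

// hypothetical payload shape for the 'change:answer' event
type Events = {
  'change:answer': { record_id: string; is_end: boolean }
}

const bus = mitt<Events>()

bus.on('change:answer', ({ record_id, is_end }) => {
  // synthesize whatever new text has accumulated for record_id;
  // the trailing unterminated sentence is flushed only once is_end is true
})
```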

View File

@@ -8,20 +8,35 @@
<!-- Voice playback -->
<span v-if="tts">
<el-tooltip
v-if="audioManage?.isPlaying()"
effect="dark"
:content="$t('chat.operation.play')"
:content="$t('chat.operation.pause')"
placement="top"
v-if="!audioPlayerStatus"
>
<el-button text :disabled="!data?.write_ed" @click="playAnswerText(data?.answer_text)">
<AppIcon iconName="app-video-play"></AppIcon>
</el-button>
</el-tooltip>
<el-tooltip v-else effect="dark" :content="$t('chat.operation.pause')" placement="top">
<el-button type="primary" text :disabled="!data?.write_ed" @click="pausePlayAnswerText()">
<el-button
type="primary"
text
:disabled="!data?.write_ed"
@click="audioManage?.pause(true)"
>
<AppIcon iconName="app-video-pause"></AppIcon>
</el-button>
</el-tooltip>
<el-tooltip effect="dark" :content="$t('chat.operation.play')" placement="top" v-else>
<el-button
text
:disabled="!data?.write_ed"
@click="
() => {
bus.emit('play:pause', props.data.record_id)
audioManage?.play(props.data.answer_text, true)
}
"
>
<AppIcon iconName="app-video-play"></AppIcon>
</el-button>
</el-tooltip>
<el-divider direction="vertical" />
</span>
<span v-if="type == 'ai-chat' || type == 'log'">
@@ -82,6 +97,7 @@
</div>
<!-- Must be rendered first, otherwise playback cannot start -->
<audio ref="audioPlayer" v-for="item in audioList" :key="item" controls hidden="hidden"></audio>
<div ref="audioContainer"></div>
</div>
</template>
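Note the template change here: the per-part audio v-for list is superseded by a single empty container div; AudioManage (defined below) now creates and appends its own audio elements imperatively as chunks arrive.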
<script setup lang="ts">
@@ -91,8 +107,9 @@ import { copyClick } from '@/utils/clipboard'
import applicationApi from '@/api/application'
import { datetimeFormat } from '@/utils/time'
import { MsgError } from '@/utils/message'
import { t } from '@/locales'
import bus from '@/bus'
const route = useRoute()
const {
params: { id }
@@ -118,12 +135,12 @@ const props = withDefaults(
const emit = defineEmits(['update:data', 'regeneration'])
const audioPlayer = ref<HTMLAudioElement[] | null>([])
const audioContainer = ref<HTMLDivElement>()
const audioPlayerStatus = ref(false)
const buttonData = ref(props.data)
const loading = ref(false)
const utterance = ref<SpeechSynthesisUtterance | null>(null)
const audioList = ref<string[]>([])
const currentAudioIndex = ref(0)
function regeneration() {
emit('regeneration')
@@ -166,144 +183,331 @@ function markdownToPlainText(md: string) {
function removeFormRander(text: string) {
return text.replace(/<form_rander>[\s\S]*?<\/form_rander>/g, '').trim()
}
const playAnswerText = (text: string) => {
if (!text) {
text = t('chat.tip.answerMessage')
function getKey(keys: Array<number>, index: number) {
// walk the keys from largest to smallest and return the largest key <= index
for (let i = keys.length - 1; i >= 0; i--) {
if (keys[i] <= index) {
return keys[i]
}
}
// strip <form_rander> blocks before synthesis
text = removeFormRander(text)
// convert the markdown answer to plain text
text = markdownToPlainText(text)
audioPlayerStatus.value = true
// split out embedded <audio> tags so they can be played verbatim
audioList.value = text.split(/(<audio[^>]*><\/audio>)/).filter((item) => item.trim().length > 0)
nextTick(() => {
playAnswerTextPart()
})
return 0
}
function smartSplit(
str: string,
minLengthConfig: any = {
0: 10,
1: 25,
3: 50,
5: 100
},
is_end = false
) {
// split on sentence-ending punctuation (。 ？ newline) or on embedded <audio> tags
const regex = /([。?\n])|(<audio[^>]*><\/audio>)/g
// String.split with capture groups keeps the delimiters in the result
const parts = str.split(regex)
const result = []
const keys = Object.keys(minLengthConfig).map(Number)
let minLength = minLengthConfig[0]
let temp_str = ''
for (let i = 0; i < parts.length; i++) {
const content = parts[i]
// non-matching capture groups produce undefined entries; skip them
if (content == undefined) {
continue
}
if (/^<audio[^>]*><\/audio>$/.test(content)) {
if (temp_str.length > 0) {
result.push(temp_str)
temp_str = ''
}
result.push(content)
continue
}
temp_str += parts[i]
// flush once the buffer exceeds the current threshold and ends at a sentence
// boundary; the threshold grows with the index so playback starts fast
if (temp_str.length > minLength && /[。?\n]$/.test(temp_str)) {
minLength = minLengthConfig[getKey(keys, i)]
result.push(temp_str)
temp_str = ''
}
}
if (temp_str.length > 0 && is_end) {
result.push(temp_str)
}
return result
}
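To make the chunking behavior concrete, a usage sketch with the default thresholds (sample strings are hypothetical):

```ts
// The first flush happens as soon as the buffer exceeds 10 characters and
// ends in 。？ or a newline, so audio can start well before the stream ends.
smartSplit('这是第一句比较长的话。后面还有内容', undefined, false)
// -> ['这是第一句比较长的话。']  (the unterminated tail is held back)

// Embedded <audio> tags are always emitted as standalone chunks:
smartSplit('前文。<audio src="a.mp3"></audio>后文。', undefined, true)
// -> ['前文。', '<audio src="a.mp3"></audio>', '后文。']
```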
const playAnswerTextPart = () => {
if (currentAudioIndex.value === audioList.value.length) {
audioPlayerStatus.value = false
currentAudioIndex.value = 0
return
enum AudioStatus {
/**
 * Playback finished
 */
END = 'END',
/**
 * Currently playing
 */
PLAY_INT = 'PLAY_INT',
/**
 * Just mounted, not yet ready
 */
MOUNTED = 'MOUNTED',
/**
 * Ready to play
 */
READY = 'READY',
/**
 * Failed, eligible for retry
 */
ERROR = 'ERROR'
}
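As the methods below implement it, each chunk advances MOUNTED → READY → PLAY_INT → END; chunks whose TTS request failed sit at ERROR and are retried by reTryError() on the next play() call, and once every chunk reaches END the whole list resets to READY so the answer can be replayed from the start.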
class AudioManage {
textList: Array<string>
statusList: Array<AudioStatus>
audioList: Array<HTMLAudioElement | SpeechSynthesisUtterance>
ttsType: string
root: Element
constructor(ttsType: string, root: HTMLDivElement) {
this.textList = []
this.audioList = []
this.statusList = []
this.ttsType = ttsType
this.root = root
}
if (audioList.value[currentAudioIndex.value].includes('<audio')) {
if (audioPlayer.value) {
audioPlayer.value[currentAudioIndex.value].src =
audioList.value[currentAudioIndex.value].match(/src="([^"]*)"/)?.[1] || ''
audioPlayer.value[currentAudioIndex.value].play() // start playback
audioPlayer.value[currentAudioIndex.value].onended = () => {
currentAudioIndex.value += 1
playAnswerTextPart()
appendTextList(textList: Array<string>) {
const newTextList = textList.slice(this.textList.length)
// nothing new beyond what is already queued
if (newTextList.length <= 0) {
return
}
newTextList.forEach((text, index) => {
this.textList.push(text)
this.statusList.push(AudioStatus.MOUNTED)
// rebind index to the chunk's absolute position in the full list
index = this.textList.length - 1
if (this.ttsType === 'TTS') {
const audioElement: HTMLAudioElement = document.createElement('audio')
audioElement.controls = true
audioElement.hidden = true
/**
 * Playback-finished handler
 */
audioElement.onended = () => {
this.statusList[index] = AudioStatus.END
// once every chunk has ended, reset all statuses so the answer can be replayed
if (this.statusList.every((item) => item === AudioStatus.END)) {
this.statusList = this.statusList.map((item) => AudioStatus.READY)
} else {
// otherwise continue with the next ready chunk
this.play()
}
}
this.root.appendChild(audioElement)
if (/^<audio[^>]*><\/audio>$/.test(text)) {
audioElement.src = text.match(/src="([^"]*)"/)?.[1] || ''
this.statusList[index] = AudioStatus.READY
} else {
applicationApi
.postTextToSpeech(
(props.applicationId as string) || (id as string),
{ text: text },
loading
)
.then(async (res: any) => {
// a JSON body signals an error payload rather than audio bytes
if (res.type === 'application/json') {
const text = await res.text()
MsgError(text)
this.statusList[index] = AudioStatus.ERROR
this.play()
return
}
// wrap the returned binary audio in an MP3 Blob
const blob = new Blob([res], { type: 'audio/mp3' })
// create an object URL the <audio> element can play
const url = URL.createObjectURL(blob)
audioElement.src = url
this.statusList[index] = AudioStatus.READY
this.play()
})
.catch((err) => {
console.log('err: ', err)
this.statusList[index] = AudioStatus.ERROR
this.play()
})
}
this.audioList.push(audioElement)
} else {
const speechSynthesisUtterance: SpeechSynthesisUtterance = new SpeechSynthesisUtterance(
text
)
speechSynthesisUtterance.onend = () => {
this.statusList[index] = AudioStatus.END
// once every chunk has ended, reset all statuses so the answer can be replayed
if (this.statusList.every((item) => item === AudioStatus.END)) {
this.statusList = this.statusList.map((item) => AudioStatus.READY)
} else {
// otherwise continue with the next ready chunk
this.play()
}
}
speechSynthesisUtterance.onerror = () => {
// leave the chunk READY so a later play() can retry it
this.statusList[index] = AudioStatus.READY
}
this.statusList[index] = AudioStatus.READY
this.audioList.push(speechSynthesisUtterance)
this.play()
}
})
}
reTryError() {
this.statusList.forEach((status, index) => {
if (status === AudioStatus.ERROR) {
const audioElement = this.audioList[index]
if (audioElement instanceof HTMLAudioElement) {
const text = this.textList[index]
applicationApi
.postTextToSpeech(
(props.applicationId as string) || (id as string),
{ text: text },
loading
)
.then(async (res: any) => {
if (res.type === 'application/json') {
const text = await res.text()
MsgError(text)
this.statusList[index] = AudioStatus.ERROR
return
}
// wrap the returned binary audio in an MP3 Blob
const blob = new Blob([res], { type: 'audio/mp3' })
// create an object URL the <audio> element can play
const url = URL.createObjectURL(blob)
audioElement.src = url
this.statusList[index] = AudioStatus.READY
this.play()
})
.catch((err) => {
console.log('err: ', err)
this.statusList[index] = AudioStatus.ERROR
})
}
}
})
}
isPlaying() {
return this.statusList.some((item) => [AudioStatus.PLAY_INT].includes(item))
}
play(text?: string, is_end?: boolean) {
if (text) {
const textList = this.getTextList(text, is_end ? true : false)
this.appendTextList(textList)
}
// if a chunk is already playing, its onended handler will chain the next one
if (this.statusList.some((item) => [AudioStatus.PLAY_INT].includes(item))) {
return
}
this.reTryError()
// pick the first chunk that is ready to play
const index = this.statusList.findIndex((status) => [AudioStatus.READY].includes(status))
if (index < 0 || this.statusList[index] === AudioStatus.MOUNTED) {
return
}
const audioElement = this.audioList[index]
if (audioElement instanceof SpeechSynthesisUtterance) {
if (window.speechSynthesis.paused) {
window.speechSynthesis.resume()
} else {
if (window.speechSynthesis.pending) {
window.speechSynthesis.cancel()
}
speechSynthesis.speak(audioElement)
this.statusList[index] = AudioStatus.PLAY_INT
}
} else {
// HTMLAudioElement branch: mark ERROR if playback throws
try {
audioElement.play()
this.statusList[index] = AudioStatus.PLAY_INT
} catch (e) {
this.statusList[index] = AudioStatus.ERROR
}
}
} else if (props.tts_type === 'BROWSER') {
if (audioList.value[currentAudioIndex.value] !== utterance.value?.text) {
window.speechSynthesis.cancel()
}
if (
window.speechSynthesis.paused &&
audioList.value[currentAudioIndex.value] === utterance.value?.text
) {
window.speechSynthesis.resume()
}
pause(self?: boolean) {
const index = this.statusList.findIndex((status) => status === AudioStatus.PLAY_INT)
if (index < 0) {
return
}
// create a new SpeechSynthesisUtterance for the current part
utterance.value = new SpeechSynthesisUtterance(audioList.value[currentAudioIndex.value])
utterance.value.onend = () => {
utterance.value = null
currentAudioIndex.value += 1
playAnswerTextPart()
}
utterance.value.onerror = () => {
audioPlayerStatus.value = false
utterance.value = null
}
// hand the utterance to the browser's speech queue
window.speechSynthesis.speak(utterance.value)
} else if (props.tts_type === 'TTS') {
// reuse the already-fetched clip if this part has a src
if (audioPlayer.value && audioPlayer.value[currentAudioIndex.value]?.src) {
audioPlayer.value[currentAudioIndex.value].play()
return
}
applicationApi
.postTextToSpeech(
(props.applicationId as string) || (id as string),
{ text: audioList.value[currentAudioIndex.value] },
loading
)
.then(async (res: any) => {
if (res.type === 'application/json') {
const text = await res.text()
MsgError(text)
return
}
// wrap the returned binary audio in an MP3 Blob
const blob = new Blob([res], { type: 'audio/mp3' })
// create an object URL the <audio> element can play
const url = URL.createObjectURL(blob)
// assign it to the pre-rendered audioPlayer DOM element
if (audioPlayer.value) {
audioPlayer.value[currentAudioIndex.value].src = url
audioPlayer.value[currentAudioIndex.value].play() // start playback
audioPlayer.value[currentAudioIndex.value].onended = () => {
currentAudioIndex.value += 1
playAnswerTextPart()
const audioElement = this.audioList[index]
if (audioElement instanceof SpeechSynthesisUtterance) {
this.statusList[index] = AudioStatus.READY
if (self) {
window.speechSynthesis.pause()
nextTick(() => {
if (!window.speechSynthesis.paused) {
window.speechSynthesis.cancel()
}
} else {
console.error('audioPlayer.value is not an instance of HTMLAudioElement')
}
})
.catch((err) => {
console.log('err: ', err)
})
}
}
const pausePlayAnswerText = () => {
audioPlayerStatus.value = false
if (props.tts_type === 'TTS') {
if (audioPlayer.value) {
audioPlayer.value?.forEach((item) => {
item.pause()
})
})
} else {
window.speechSynthesis.cancel()
}
} else {
if (this.statusList[index] === AudioStatus.PLAY_INT) {
// mark READY before pausing so play() can resume this chunk
this.statusList[index] = AudioStatus.READY
audioElement.pause()
}
}
}
if (props.tts_type === 'BROWSER') {
window.speechSynthesis.pause()
getTextList(text: string, is_end: boolean) {
// strip <form_rander> blocks before synthesis
text = removeFormRander(text)
// convert the markdown answer to plain text
text = markdownToPlainText(text)
const split = smartSplit(
text,
{
0: 20,
1: 50,
5: 100
},
is_end
)
return split
}
}
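A minimal driving sketch for the class above (the container and the snapshot strings are hypothetical; each call passes the full answer accumulated so far, and the real class still relies on the surrounding component's props and API):

```ts
const container = document.createElement('div')
const manage = new AudioManage('TTS', container)

// Streamed snapshots: AudioManage diffs against what it already queued
// and appends only newly completed sentence chunks.
let snapshot = '这个回答的第一句话写得足够长所以可以先播放。'
manage.play(snapshot, false)
snapshot += '最后一小段'
manage.play(snapshot, true) // is_end === true flushes the short tail
```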
const audioManage = ref<AudioManage>()
onMounted(() => {
bus.on('pause-autoplay', () => {
pausePlayAnswerText()
})
bus.emit('pause-autoplay')
// autoplay only a freshly finished answer, not a historical record
if (
props.tts &&
props.tts_autoplay &&
buttonData.value.write_ed &&
!buttonData.value.update_time
) {
playAnswerText(buttonData.value.answer_text)
if (audioContainer.value) {
audioManage.value = new AudioManage(props.tts_type, audioContainer.value)
}
bus.on('play:pause', (record_id: string) => {
if (record_id !== props.data.record_id) {
if (audioManage.value) {
audioManage.value?.pause()
}
}
})
bus.on('change:answer', (data: any) => {
const record_id = data.record_id
bus.emit('play:pause', record_id)
if (props.data.record_id == record_id) {
if (props.tts && props.tts_autoplay) {
if (audioManage.value) {
audioManage.value.play(props.data.answer_text, data.is_end)
}
}
}
})
})
</script>
<style lang="scss" scoped>

View File

@@ -26,7 +26,7 @@
</div>
<ChatOperationButton
v-if="chatRecord.write_ed && 500 != chatRecord.status"
v-show="chatRecord.write_ed && 500 != chatRecord.status"
:tts="application.tts_model_enable"
:tts_type="application.tts_type"
:tts_autoplay="application.tts_autoplay"
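Switching v-if to v-show is what makes streaming autoplay work: the operation button stays mounted while the answer is still being written, so the change:answer and play:pause listeners registered in its onMounted receive events during the stream; with v-if the component would only mount after write_ed became true.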

View File

@@ -490,6 +490,7 @@ const handleScroll = () => {
}
onMounted(() => {
window.speechSynthesis.cancel()
window.sendMessage = sendMessage
})
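window.sendMessage = sendMessage will not type-check against the stock Window interface; a declaration sketch, assuming no such augmentation exists elsewhere (the signature is a guess based on the name):

```ts
// global.d.ts (hypothetical location)
export {}

declare global {
  interface Window {
    // exact parameters unknown; widen as little as the call sites allow
    sendMessage: (...args: unknown[]) => void
  }
}
```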