echogarden
Version:
An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.
148 lines (108 loc) • 4.22 kB
text/typescript
import * as SpeechSDK from 'microsoft-cognitiveservices-speech-sdk'
import * as FFMpegTranscoder from '../codecs/FFMpegTranscoder.js'
import { Logger } from '../utilities/Logger.js'
import { Timeline } from '../utilities/Timeline.js'
import { RawAudio, getRawAudioDuration } from '../audio/AudioUtilities.js'
import { concatUint8Arrays } from '../utilities/Utilities.js'
import { escapeHtml } from '../encodings/HtmlEscape.js'
/**
 * Synthesizes speech with the Azure Cognitive Services speech SDK and resolves
 * with the decoded audio plus a word-level timeline.
 *
 * The service is asked for 24 kHz, 16-bit, mono Ogg/Opus output, which is then
 * passed to `FFMpegTranscoder.decodeToChannels`. Word boundary events raised
 * during synthesis are collected and converted with `boundaryEventsToTimeline`.
 *
 * @param text Text to speak. When `ssmlEnabled` is `true` it is inserted
 *   verbatim inside the generated `<prosody>` element, so it may contain SSML
 *   markup. Otherwise it is treated as plain text.
 * @param subscriptionKey Azure speech subscription key.
 * @param serviceRegion Azure service region the subscription belongs to.
 * @param languageCode Value assigned to `speechSynthesisLanguage`.
 * @param voice Value assigned to `speechSynthesisVoiceName` and used as the
 *   SSML `<voice name>`.
 * @param ssmlEnabled Whether `text` already contains SSML markup.
 * @param ssmlPitchString Value of the SSML `prosody` `pitch` attribute.
 * @param ssmlRateString Value of the SSML `prosody` `rate` attribute.
 * @returns A promise resolving to `{ rawAudio, timeline }`. It rejects with the
 *   error details reported by the SDK, or with whatever error is thrown while
 *   decoding the audio or building the timeline.
 */
export async function synthesize(
	text: string,
	subscriptionKey: string,
	serviceRegion: string,
	languageCode = 'en-US',
	voice = 'Microsoft Server Speech Text to Speech Voice (en-US, AvaNeural)',
	ssmlEnabled = false,
	ssmlPitchString = '+0Hz',
	ssmlRateString = '+0%') {

	return new Promise<{ rawAudio: RawAudio, timeline: Timeline }>((resolve, reject) => {
		const logger = new Logger()
		logger.start('Request synthesis from Azure Cognitive Services')

		const speechConfig = SpeechSDK.SpeechConfig.fromSubscription(subscriptionKey, serviceRegion)
		speechConfig.speechSynthesisLanguage = languageCode
		speechConfig.speechSynthesisVoiceName = voice
		speechConfig.speechSynthesisOutputFormat = SpeechSDK.SpeechSynthesisOutputFormat.Ogg24Khz16BitMonoOpus

		const audioOutputStream = SpeechSDK.AudioOutputStream.createPullStream()
		const audioConfig = SpeechSDK.AudioConfig.fromStreamOutput(audioOutputStream)

		const synthesis = new SpeechSDK.SpeechSynthesizer(speechConfig, audioConfig)

		// Boundary events arrive while synthesis is in progress; keep them for the timeline.
		const events: SpeechSDK.SpeechSynthesisWordBoundaryEventArgs[] = []

		synthesis.wordBoundary = (sender, event) => {
			events.push(event)
		}

		// Releases the synthesizer once the request has completed. Best effort:
		// a failure to close must not mask the actual synthesis outcome.
		const releaseSynthesizer = () => {
			try {
				synthesis.close()
			} catch {
				// Intentionally ignored.
			}
		}

		const onResult = async (result: SpeechSDK.SpeechSynthesisResult) => {
			releaseSynthesizer()

			// This callback is async, so anything thrown here would otherwise become an
			// unhandled rejection and leave the outer promise pending forever.
			try {
				if (result.errorDetails != null) {
					reject(result.errorDetails)
					return
				}

				// The complete encoded audio is delivered with the result itself.
				const encodedAudio = new Uint8Array(result.audioData)

				logger.end()

				const rawAudio = await FFMpegTranscoder.decodeToChannels(encodedAudio, 24000, 1)

				logger.start('Convert boundary events to a timeline')
				const timeline = boundaryEventsToTimeline(events, getRawAudioDuration(rawAudio))
				logger.end()

				resolve({ rawAudio, timeline })
			} catch (error) {
				reject(error)
			}
		}

		const onError = (error: string) => {
			releaseSynthesizer()
			reject(error)
		}

		let useSsml = ssmlEnabled
		let content = text

		// Non-default pitch or rate can only be expressed through SSML. Plain text is
		// promoted to SSML in that case and must be escaped first; text the caller
		// already marked as SSML is left untouched.
		const hasCustomProsody = ssmlPitchString !== '+0Hz' || ssmlRateString !== '+0%'

		if (!useSsml && hasCustomProsody) {
			useSsml = true
			content = escapeHtml(text)
		}

		if (useSsml) {
			// NOTE(review): xml:lang is fixed to "en-US" regardless of languageCode — confirm this is intended.
			const ssml =
				`<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">` +
				`<voice name="${voice}">` +
				`<prosody pitch="${ssmlPitchString}" rate="${ssmlRateString}">` +
				content +
				`</prosody>` +
				`</voice>` +
				`</speak>`

			synthesis.speakSsmlAsync(ssml, onResult, onError)
		} else {
			synthesis.speakTextAsync(content, onResult, onError)
		}
	})
}
/**
 * Retrieves the voices available to the given Azure speech subscription.
 *
 * @param subscriptionKey Azure speech subscription key.
 * @param serviceRegion Azure service region the subscription belongs to.
 * @returns The `voices` property of the result returned by `getVoicesAsync`.
 */
export async function getVoiceList(subscriptionKey: string, serviceRegion: string) {
	const speechConfig = SpeechSDK.SpeechConfig.fromSubscription(subscriptionKey, serviceRegion)
	const synthesis = new SpeechSDK.SpeechSynthesizer(speechConfig, undefined)

	try {
		const result = await synthesis.getVoicesAsync()

		return result.voices
	} finally {
		// Release the synthesizer whether or not the request succeeded.
		synthesis.close()
	}
}
/**
 * Builds a word timeline from a list of synthesis boundary events.
 *
 * Two event shapes are accepted:
 * - flat fields: `boundaryType`, `text`, `audioOffset`, `duration`
 * - nested fields: `Type`, `Data.text.Text`, `Data.Offset`, `Data.Duration`
 *
 * A flat field takes precedence whenever it is neither `null` nor `undefined`.
 * Events whose boundary type is not `'WordBoundary'` are skipped.
 *
 * @param events Boundary events in either of the shapes described above.
 * @param totalDuration Not read by this function.
 * @returns One `'word'` entry per word boundary event, in input order, with
 *   `startTime` and `endTime` obtained by dividing the offset (and offset plus
 *   duration) by 10,000,000.
 */
export function boundaryEventsToTimeline(events: any[], totalDuration: number) {
	// Presumably offsets and durations are 100-nanosecond units, making this the
	// number of units per second — TODO confirm against the service documentation.
	const unitsPerSecond = 10000000

	const timeline: Timeline = []

	events.forEach((boundaryEvent) => {
		const kind = boundaryEvent.boundaryType ?? boundaryEvent.Type

		if (kind == 'WordBoundary') {
			const text: string = boundaryEvent.text ?? boundaryEvent.Data.text.Text
			const offset: number = boundaryEvent.audioOffset ?? boundaryEvent.Data.Offset
			const duration: number = boundaryEvent.duration ?? boundaryEvent.Data.Duration

			timeline.push({
				type: 'word',
				text,
				startTime: offset / unitsPerSecond,
				endTime: (offset + duration) / unitsPerSecond
			})
		}
	})

	return timeline
}