echogarden

Version:

An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.

github.com/echogarden-project/echogarden

echogarden-project/echogarden

148 lines (108 loc) • 4.22 kB

text/typescript

import * as SpeechSDK from 'microsoft-cognitiveservices-speech-sdk'

import * as FFMpegTranscoder from '../codecs/FFMpegTranscoder.js'
import { Logger } from '../utilities/Logger.js'
import { Timeline } from '../utilities/Timeline.js'
import { RawAudio, getRawAudioDuration } from '../audio/AudioUtilities.js'
import { concatUint8Arrays } from '../utilities/Utilities.js'
import { escapeHtml } from '../encodings/HtmlEscape.js'

/**
 * Synthesizes speech with the Azure Cognitive Services Speech SDK and returns
 * the decoded audio together with a word-level timeline.
 *
 * The service is asked for 24 kHz mono Ogg/Opus output, which is then decoded
 * via `FFMpegTranscoder.decodeToChannels`. Word boundary events raised during
 * synthesis are collected and converted to timeline entries.
 *
 * @param text - Text to speak. Treated as SSML content when `ssmlEnabled` is true.
 * @param subscriptionKey - Azure Speech subscription key.
 * @param serviceRegion - Azure service region passed to `SpeechConfig.fromSubscription`.
 * @param languageCode - Value assigned to `speechSynthesisLanguage`.
 * @param voice - Value assigned to `speechSynthesisVoiceName` and used in the SSML `<voice>` element.
 * @param ssmlEnabled - When true, `text` is inserted into the SSML document as-is (not escaped).
 * @param ssmlPitchString - Value of the `<prosody pitch>` attribute. Default `'+0Hz'`.
 * @param ssmlRateString - Value of the `<prosody rate>` attribute. Default `'+0%'`.
 * @returns A promise resolving to the decoded audio and its word timeline. It rejects with
 * the SDK's error details when synthesis fails, or with the thrown error when decoding or
 * timeline conversion fails.
 */
export async function synthesize(
	text: string,
	subscriptionKey: string,
	serviceRegion: string,
	languageCode = 'en-US',
	voice = 'Microsoft Server Speech Text to Speech Voice (en-US, AvaNeural)',
	ssmlEnabled = false,
	ssmlPitchString = '+0Hz',
	ssmlRateString = '+0%') {

	return new Promise<{ rawAudio: RawAudio, timeline: Timeline }>((resolve, reject) => {
		const logger = new Logger()
		logger.start('Request synthesis from Azure Cognitive Services')

		const speechConfig = SpeechSDK.SpeechConfig.fromSubscription(subscriptionKey, serviceRegion)

		speechConfig.speechSynthesisLanguage = languageCode
		speechConfig.speechSynthesisVoiceName = voice
		speechConfig.speechSynthesisOutputFormat = SpeechSDK.SpeechSynthesisOutputFormat.Ogg24Khz16BitMonoOpus

		const audioOutputStream = SpeechSDK.AudioOutputStream.createPullStream()
		const audioConfig = SpeechSDK.AudioConfig.fromStreamOutput(audioOutputStream)

		const synthesis = new SpeechSDK.SpeechSynthesizer(speechConfig, audioConfig)

		// Word boundary events are accumulated here while synthesis is running.
		const events: SpeechSDK.SpeechSynthesisWordBoundaryEventArgs[] = []

		synthesis.wordBoundary = (sender, event) => {
			events.push(event)
		}

		const onResult = async (result: SpeechSDK.SpeechSynthesisResult) => {
			// The SDK invokes this callback without awaiting it, so any exception
			// must be turned into a rejection here. Otherwise the outer promise
			// would stay pending forever.
			try {
				if (result.errorDetails != null) {
					reject(result.errorDetails)

					return
				}

				let encodedAudio: Uint8Array

				// Disabled alternative: read the encoded audio from the pull stream in
				// 64 KiB chunks. The branch is switched off unconditionally, so the
				// audio is always taken from `result.audioData` below.
				if (false) {
					const bufferSize = 2 ** 16
					const buffers: Uint8Array[] = []

					while (true) {
						const buffer = new Uint8Array(bufferSize)
						const amountRead = await audioOutputStream.read(buffer.buffer)

						if (amountRead == 0) {
							audioOutputStream.close()
							break
						}

						buffers.push(buffer.subarray(0, amountRead))
					}

					encodedAudio = concatUint8Arrays(buffers)
				} else {
					encodedAudio = new Uint8Array(result.audioData)
				}

				logger.end()

				// Decode to 24000 Hz, 1 channel, matching the requested output format.
				const rawAudio = await FFMpegTranscoder.decodeToChannels(encodedAudio, 24000, 1)

				logger.start('Convert boundary events to a timeline')

				const timeline = boundaryEventsToTimeline(events, getRawAudioDuration(rawAudio))

				logger.end()

				resolve({ rawAudio, timeline: timeline })
			} catch (error) {
				reject(error)
			} finally {
				// Release the synthesizer once the request has completed.
				synthesis.close()
			}
		}

		const onError = (error: string) => {
			synthesis.close()

			reject(error)
		}

		let useSsml = ssmlEnabled
		let content = text

		// Plain text only needs to be wrapped in SSML when a non-default pitch or
		// rate was requested. In that case it is escaped so it can be embedded in
		// the markup. Caller-supplied SSML (`ssmlEnabled == true`) is never escaped.
		if (!useSsml && (ssmlPitchString != '+0Hz' || ssmlRateString != '+0%')) {
			useSsml = true
			content = escapeHtml(content)
		}

		if (useSsml) {
			// NOTE(review): `xml:lang` is fixed to "en-US" regardless of `languageCode` —
			// confirm whether that is intentional.
			const ssml =
				`<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">` +
				`<voice name="${voice}">` +
				`<prosody pitch="${ssmlPitchString}" rate="${ssmlRateString}">` +
				content +
				`</prosody>` +
				`</voice>` +
				`</speak>`

			synthesis.speakSsmlAsync(ssml, onResult, onError)
		} else {
			synthesis.speakTextAsync(content, onResult, onError)
		}
	})
}

/**
 * Retrieves the voices available to the given subscription and region.
 *
 * @param subscriptionKey - Azure Speech subscription key.
 * @param serviceRegion - Azure service region passed to `SpeechConfig.fromSubscription`.
 * @returns The `voices` property of the result of `SpeechSynthesizer.getVoicesAsync()`.
 */
export async function getVoiceList(subscriptionKey: string, serviceRegion: string) {
	const speechConfig = SpeechSDK.SpeechConfig.fromSubscription(subscriptionKey, serviceRegion)

	const synthesis = new SpeechSDK.SpeechSynthesizer(speechConfig, undefined)

	try {
		const result = await synthesis.getVoicesAsync()

		return result.voices
	} finally {
		// Release the synthesizer whether or not the request succeeded.
		synthesis.close()
	}
}

/**
 * Converts word boundary events to a word timeline.
 *
 * Two event shapes are accepted: objects exposing `boundaryType`, `text`,
 * `audioOffset` and `duration` directly, and objects carrying the same data
 * under `Type`, `Data.text.Text`, `Data.Offset` and `Data.Duration`. Events
 * whose boundary type is not `'WordBoundary'` are skipped.
 *
 * @param events - Boundary events in either of the shapes described above.
 * @param totalDuration - Currently unused; kept for interface compatibility.
 * @returns One `{ type: 'word', text, startTime, endTime }` entry per word event,
 * in input order.
 */
export function boundaryEventsToTimeline(events: any[], totalDuration: number) {
	// Offsets and durations are divided by this value to obtain timeline times.
	// Presumably the SDK reports them in 100-nanosecond ticks — verify against the SDK docs.
	const ticksPerSecond = 10000000

	const timeline: Timeline = []

	for (const event of events) {
		const boundaryType = event.boundaryType ?? event.Type

		if (boundaryType != 'WordBoundary') {
			continue
		}

		const text: string = event.text ?? event.Data.text.Text
		const offset: number = event.audioOffset ?? event.Data.Offset
		const duration: number = event.duration ?? event.Data.Duration

		const startTime = offset / ticksPerSecond
		const endTime = (offset + duration) / ticksPerSecond

		timeline.push({
			type: 'word',
			text,
			startTime,
			endTime
		})
	}

	return timeline
}