echogarden

Version:

An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.

github.com/echogarden-project/echogarden

echogarden-project/echogarden

228 lines (164 loc) • 6.93 kB

text/typescript

import { SynthesisVoice } from '../api/API.js'
import { decodeToChannels, SampleFormat } from '../audio/AudioBufferConversion.js'
import { RawAudio } from '../audio/AudioUtilities.js'
import { getShortLanguageCode, lcidToIsoLanguageCode } from '../utilities/Locale.js'
import { Logger } from '../utilities/Logger.js'
import { Timeline, TimelineEntry } from '../utilities/Timeline.js'
import { logToStderr } from '../utilities/Utilities.js'

const log = logToStderr

/** Value produced by `synthesize`: the decoded audio plus a word/phone timeline. */
type SapiSynthesisResult = { rawAudio: RawAudio, timeline: Timeline }

/**
 * Synthesizes `text` to 22050 Hz, 16-bit mono audio using a SAPI voice, and builds a
 * word timeline (each word carrying its phones) from the events the voice raises.
 *
 * @param text Text to speak. Word events are mapped back to substrings of this text.
 * @param voiceName Description of the voice to select. If empty, or if no installed
 * voice has this exact description, the voice object's default voice is used.
 * @param rate Value assigned to the voice object's `Rate` property.
 * @param useSpeechPlatform When true, the voice and phone converter are created from the
 * `Speech.*` COM classes instead of `SAPI.*`. The output stream is always `SAPI.SpMemoryStream`.
 * @returns A promise resolved from the `EndStream` event handler. It is rejected if setup,
 * the `Speak` call, or decoding of the resulting audio throws.
 */
export async function synthesize(text: string, voiceName: string, rate = 0, useSpeechPlatform = false): Promise<SapiSynthesisResult> {
	const logger = new Logger()

	logger.start('Initialize winax module')
	const { default: WinAX } = await import('winax')
	const ActiveXObject = (global as any).ActiveXObject

	logger.start('Create SAPI COM object')
	const sapiVoice = new ActiveXObject(useSpeechPlatform ? 'Speech.SPVoice' : 'SAPI.SPVoice')

	// 33790 is SVEAllEvents in SAPI's SpeechVoiceEvents enumeration
	sapiVoice.EventInterests = 33790

	logger.start('Get SAPI voice list and select best match')

	if (voiceName) {
		const voiceObjects = sapiVoice.GetVoices()

		for (let i = 0; i < voiceObjects.Count; i++) {
			const voiceObject = voiceObjects.Item(i)
			const candidateVoiceName = voiceObject.GetDescription()

			if (candidateVoiceName == voiceName) {
				sapiVoice.Voice = voiceObject

				break
			}
		}
	}

	sapiVoice.Rate = rate

	// Create phone converter for the language of the selected voice
	const sapiPhoneConverter = new ActiveXObject(useSpeechPlatform ? 'Speech.SpPhoneConverter' : 'SAPI.SpPhoneConverter')
	const sapiLanguageCodeHex = sapiVoice.Voice.GetAttribute('Language')
	const sapiLanguageCode = parseInt(sapiLanguageCodeHex, 16)
	sapiPhoneConverter.LanguageId = sapiLanguageCode

	logger.start('Synthesize with SAPI')

	const sampleRate = 22050
	// 16-bit mono: two bytes per sample. Used to convert stream byte positions to seconds.
	const bytesPerSecond = sampleRate * 2

	const sapiOutputStream = new ActiveXObject('SAPI.SpMemoryStream')
	sapiOutputStream.Format.Type = 22 // format code for SAFT22kHz16BitMono
	sapiVoice.AudioOutputStream = sapiOutputStream

	return new Promise<SapiSynthesisResult>((resolve, reject) => {
		// Periodically dispatch pending messages so the event handlers below get invoked
		const dispatchMessagesInterval = setInterval(() => {
			WinAX.peekAndDispatchMessages()
		}, 50)

		try {
			const connectionPoints = WinAX.getConnectionPoints(sapiVoice)
			const connectionPoint = connectionPoints[0]

			// Flat list of word and phone events, in the order they were received
			const events: Timeline = []
			let lastWordCharPos = -1

			connectionPoint.advise({
				StartStream: () => {
				},

				Word: (streamId: number, streamPos: number, charPos: number, length: number) => {
					// Ignore a repeated event for the same character position
					if (lastWordCharPos == charPos) {
						return
					}

					const wordText = text.substring(charPos, charPos + length)
					const startTime = streamPos / bytesPerSecond

					// The end time is not known yet. -1 is a placeholder resolved in `eventsToTimeline`.
					const wordEvent: TimelineEntry = {
						type: 'word',
						text: wordText,
						startTime,
						endTime: -1,
						timeline: []
					}

					events.push(wordEvent)
					lastWordCharPos = charPos
				},

				Phoneme: (streamId: number, streamPos: number, duration: number, nextPhoneId: number, feature: number, currentPhoneId: number) => {
					// Phones that arrive before the first word have no word to attach to
					if (events.length == 0) {
						return
					}

					const phoneText = sapiPhoneConverter.IdToPhone(currentPhoneId)

					if (phoneText == ',' || phoneText == '_') {
						return
					}

					const startTime = streamPos / bytesPerSecond
					// `duration` is divided by 1000 to convert it to seconds
					const endTime = startTime + (duration / 1000)

					events.push({
						type: 'phone',
						text: phoneText,
						startTime,
						endTime
					})
				},

				EndStream: (streamId: number, streamPos: number) => {
					clearInterval(dispatchMessagesInterval)

					try {
						const audioData = new Uint8Array(sapiOutputStream.GetData())
						const audioChannels = decodeToChannels(audioData, 1, 16, SampleFormat.PCM)

						WinAX.release(sapiOutputStream)
						WinAX.release(sapiVoice)

						logger.end()

						resolve({
							rawAudio: { audioChannels, sampleRate },
							timeline: eventsToTimeline(events, audioChannels[0].length / sampleRate)
						})
					} catch (e) {
						// Surface failures to the caller instead of throwing inside the event handler
						reject(e)
					}
				}
			})

			sapiVoice.Speak(text)
		} catch (e) {
			// Stop the message pump and surface the failure, so the promise doesn't hang
			clearInterval(dispatchMessagesInterval)

			reject(e)
		}
	})
}

/**
 * Lists the installed voices exposed by the SAPI (or Speech Platform) voice object.
 *
 * @param useSpeechPlatform When true, voices are read from `Speech.SPVoice` instead of `SAPI.SPVoice`.
 * @returns One entry per voice, with its description as the name, its language codes
 * (each full code followed by its short form, without duplicates) and its lowercased
 * `Gender` attribute, or `'unknown'` when that attribute is missing or empty.
 * @throws Error if a voice's language identifier can't be translated to an ISO language code.
 */
export async function getVoiceList(useSpeechPlatform = false) {
	const { default: WinAX } = await import('winax')
	const ActiveXObject = (global as any).ActiveXObject

	const sapiVoice = new ActiveXObject(useSpeechPlatform ? 'Speech.SPVoice' : 'SAPI.SPVoice')

	try {
		const voiceObjects = sapiVoice.GetVoices()
		const voices: SynthesisVoice[] = []

		for (let i = 0; i < voiceObjects.Count; i++) {
			const voiceObject = voiceObjects.Item(i)

			const voiceName = voiceObject.GetDescription()
			const voiceGender = voiceObject.GetAttribute('Gender')?.toLowerCase()

			// The 'Language' attribute is parsed as a hexadecimal language identifier
			const sapiLanguageCodeHex = voiceObject.GetAttribute('Language')
			const sapiLanguageCode = parseInt(sapiLanguageCodeHex, 16)

			const languageCodes = await lcidToIsoLanguageCode(sapiLanguageCode)

			if (!languageCodes) {
				throw new Error(`Couldn't translate SAPI language code ${sapiLanguageCode} to ISO, for voice '${voiceName}'`)
			}

			// A Set keeps insertion order, so each code is still followed by its short form
			const resultLanguageCodes = new Set<string>()

			for (const languageCode of languageCodes) {
				resultLanguageCodes.add(languageCode)
				resultLanguageCodes.add(getShortLanguageCode(languageCode))
			}

			voices.push({
				name: voiceName,
				languages: Array.from(resultLanguageCodes),
				gender: voiceGender || 'unknown'
			})
		}

		return voices
	} finally {
		// Release the COM object even if enumerating the voices failed
		WinAX.release(sapiVoice)
	}
}

/**
 * Verifies that SAPI can be used in the current environment.
 *
 * Checks, in order: that the platform is Windows, that the `winax` package can be
 * imported, that a `SAPI.SPVoice` object can be created and, only when
 * `testForSpeechPlatform` is true, that a `Speech.SPVoice` object can be created as well.
 *
 * @param testForSpeechPlatform Also require the Microsoft Speech Platform runtime.
 * @throws Error describing the first check that failed.
 */
export async function AssertSAPIAvailable(testForSpeechPlatform = false) {
	if (process.platform != 'win32') {
		throw new Error(`SAPI is not available on your platform. SAPI is a Microsoft Windows technology that is only runs on a Windows OS.`)
	}

	try {
		await import('winax')
	} catch (e) {
		throw new Error(`winax package, which is required for SAPI support, was not found. You can install it by running 'npm install winax -g'.`)
	}

	const ActiveXObject = (global as any).ActiveXObject

	// Always checked: `synthesize` creates a `SAPI.SpMemoryStream` even when the Speech Platform is used
	try {
		new ActiveXObject('SAPI.SPVoice')
	} catch (e) {
		throw new Error(`Failed creating a SAPI instance: ${e}`)
	}

	if (testForSpeechPlatform) {
		try {
			new ActiveXObject('Speech.SPVoice')
		} catch (e) {
			throw new Error(`Failed creating an msspeech instance. Please ensure you installed the Microsoft Speech Platform runtime correctly.`)
		}
	}
}

/**
 * Converts a flat, ordered list of word and phone events to a timeline of word entries,
 * where each word entry holds the phone events that followed it.
 *
 * A word's end time is the end time of its last phone. A word that received no phones
 * ends where the next word starts, or at `totalDuration` if it is the last word.
 *
 * @param events Word and phone events, in the order they were received.
 * @param totalDuration Total duration of the synthesized audio, in seconds.
 * @returns The word timeline. The word entries of `events` are reused and mutated.
 * @throws Error if a phone event appears before any word event.
 */
function eventsToTimeline(events: Timeline, totalDuration: number): Timeline {
	const timeline: Timeline = []

	for (const event of events) {
		if (event.type == 'word') {
			timeline.push(event)
		} else if (event.type == 'phone') {
			const lastWordEntry = timeline[timeline.length - 1]

			if (!lastWordEntry) {
				throw new Error('Unexpected: phone event preceded a word event')
			}

			lastWordEntry.endTime = event.endTime

			if (!lastWordEntry.timeline) {
				lastWordEntry.timeline = []
			}

			lastWordEntry.timeline.push(event)
		}
	}

	// Resolve the -1 placeholder of every word that didn't receive any phone event
	for (let i = 0; i < timeline.length; i++) {
		const entry = timeline[i]

		if (entry.endTime == -1) {
			entry.endTime = i + 1 < timeline.length ? timeline[i + 1].startTime : totalDuration
		}
	}

	return timeline
}