echogarden
Version:
An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.
228 lines (164 loc) • 6.93 kB
text/typescript
import { SynthesisVoice } from '../api/API.js'
import { decodeToChannels, SampleFormat } from '../audio/AudioBufferConversion.js'
import { RawAudio } from '../audio/AudioUtilities.js'
import { getShortLanguageCode, lcidToIsoLanguageCode } from '../utilities/Locale.js'
import { Logger } from '../utilities/Logger.js'
import { Timeline, TimelineEntry } from '../utilities/Timeline.js'
import { logToStderr } from '../utilities/Utilities.js'
const log = logToStderr
export function synthesize(text: string, voiceName: string, rate = 0, useSpeechPlatform = false) {
return new Promise<{ rawAudio: RawAudio, timeline: Timeline }>(async (resolve, reject) => {
const logger = new Logger()
logger.start('Initialize winax module')
const { default: WinAX } = await import('winax')
const ActiveXObject = (global as any).ActiveXObject
logger.start('Create SAPI COM object')
const sapiVoice = new ActiveXObject(useSpeechPlatform ? 'Speech.SPVoice' : 'SAPI.SPVoice')
sapiVoice.EventInterests = 33790
logger.start('Get SAPI voice list and select best match')
if (voiceName) {
const voiceObjects = sapiVoice.GetVoices()
for (let i = 0; i < voiceObjects.Count; i++) {
const voiceObject = voiceObjects.Item(i)
const candidateVoiceName = voiceObject.GetDescription()
if (candidateVoiceName == voiceName) {
sapiVoice.Voice = voiceObject
}
}
}
sapiVoice.Rate = rate
// Create phone converter for language
const sapiPhoneConverter = new ActiveXObject(useSpeechPlatform ? 'Speech.SpPhoneConverter' : 'SAPI.SpPhoneConverter')
const sapiLanguageCodeHex = sapiVoice.Voice.GetAttribute('Language')
const sapiLanguageCode = parseInt(sapiLanguageCodeHex, 16)
sapiPhoneConverter.LanguageId = sapiLanguageCode
logger.start('Synthesize with SAPI')
const sampleRate = 22050
const bytesPerSecond = sampleRate * 2
const sapiOutputStream = new ActiveXObject('SAPI.SpMemoryStream')
sapiOutputStream.Format.Type = 22 // format code code for SAFT22kHz16BitMono
sapiVoice.AudioOutputStream = sapiOutputStream
const dispatchMessagesInterval = setInterval(() => {
WinAX.peekAndDispatchMessages()
}, 50)
const connectionPoints = WinAX.getConnectionPoints(sapiVoice)
const connectionPoint = connectionPoints[0]
const methods = connectionPoint.getMethods()
const events: Timeline = []
let lastWordEvent: TimelineEntry | null = null
let lastWordCharPos = -1
connectionPoint.advise({
StartStream: () => {
},
Word: (streamId: number, streamPos: number, charPos: number, length: number) => {
if (lastWordCharPos == charPos) {
return
}
const wordText = text.substring(charPos, charPos + length)
const startTime = streamPos / bytesPerSecond
const wordEvent = { type: 'word', text: wordText, startTime, endTime: -1, timeline: [] } as TimelineEntry
events.push(wordEvent)
lastWordEvent = wordEvent
lastWordCharPos = charPos
},
Phoneme: (streamId: number, streamPos: number, duration: number, nextPhoneId: number, feature: number, currentPhoneId: number) => {
if (events.length == 0) {
return
}
const phoneText = sapiPhoneConverter.IdToPhone(currentPhoneId)
if (phoneText == ',' || phoneText == '_') {
return
}
const startTime = streamPos / bytesPerSecond
const endTime = startTime + (duration / 1000)
events.push({ type: 'phone', text: phoneText, startTime, endTime })
},
EndStream: (streamId: number, streamPos: number) => {
clearInterval(dispatchMessagesInterval)
const audioData = new Uint8Array(sapiOutputStream.GetData())
const audioChannels = decodeToChannels(audioData, 1, 16, SampleFormat.PCM)
WinAX.release(sapiOutputStream)
WinAX.release(sapiVoice)
logger.end()
resolve({ rawAudio: { audioChannels, sampleRate }, timeline: eventsToTimeline(events, audioChannels[0].length / sampleRate) })
}
})
sapiVoice.Speak(text)
})
}
export async function getVoiceList(useSpeechPlatform = false) {
const { default: WinAX } = await import('winax')
const ActiveXObject = (global as any).ActiveXObject
const sapiVoice = new ActiveXObject(useSpeechPlatform ? 'Speech.SPVoice' : 'SAPI.SPVoice')
const voiceObjects = sapiVoice.GetVoices()
const voices: SynthesisVoice[] = []
for (let i = 0; i < voiceObjects.Count; i++) {
const voiceObject = voiceObjects.Item(i)
const voiceName = voiceObject.GetDescription()
const voiceGender = voiceObject.GetAttribute('Gender')?.toLowerCase()
const sapiLanguageCodeHex = voiceObject.GetAttribute('Language')
const sapiLanguageCode = parseInt(sapiLanguageCodeHex, 16)
const languageCodes = await lcidToIsoLanguageCode(sapiLanguageCode)
if (!languageCodes) {
throw new Error(`Couldn't translate SAPI language code ${sapiLanguageCode} to ISO, for voice '${voiceName}'`)
}
const resultLanguageCodes: string[] = []
for (const languageCode of languageCodes) {
if (!resultLanguageCodes.includes(languageCode)) {
resultLanguageCodes.push(languageCode)
}
const shortLanguageCode = getShortLanguageCode(languageCode)
if (!resultLanguageCodes.includes(shortLanguageCode)) {
resultLanguageCodes.push(shortLanguageCode)
}
}
voices.push({
name: voiceName,
languages: resultLanguageCodes,
gender: voiceGender || 'unknown'
})
}
WinAX.release(sapiVoice)
return voices
}
export async function AssertSAPIAvailable(testForSpeechPlatform = false) {
if (process.platform != 'win32') {
throw new Error(`SAPI is not available on your platform. SAPI is a Microsoft Windows technology that is only runs on a Windows OS.`)
}
try {
const { default: WinAX } = await import('winax')
} catch (e) {
throw new Error(`winax package, which is required for SAPI support, was not found. You can install it by running 'npm install winax -g'.`)
}
const ActiveXObject = (global as any).ActiveXObject
try {
const voice = new ActiveXObject('SAPI.SPVoice')
} catch (e) {
throw new Error(`Failed creating a SAPI instance: ${e}`)
}
try {
const voice = new ActiveXObject('Speech.SPVoice')
} catch(e) {
throw new Error(`Failed creating an msspeech instance. Please ensure you installed the Microsoft Speech Platform runtime correctly.`)
}
}
function eventsToTimeline(events: Timeline, totalDuration: number): Timeline {
const timeline: Timeline = []
for (const event of events) {
if (event.type == 'word') {
timeline.push(event)
} else if (event.type == 'phone') {
if (timeline.length == 0) {
throw new Error('Unexpected: phone event preceded a word event')
}
const lastWordEntry = timeline[timeline.length - 1]
lastWordEntry.endTime = event.endTime
const phoneTimeline = lastWordEntry.timeline!
phoneTimeline.push(event)
}
}
if (timeline.length > 0 && timeline[timeline.length - 1].endTime == -1) {
timeline[timeline.length - 1].endTime = totalDuration
}
return timeline
}