echogarden
An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.
1,954 lines (1,407 loc) • 50.2 kB
text/typescript
import { deepClone, extendDeep } from '../utilities/ObjectUtilities.js'
import * as FFMpegTranscoder from '../codecs/FFMpegTranscoder.js'
import { clip, sha256AsHex, stringifyAndFormatJson, logToStderr, yieldToEventLoop, runOperationWithRetries } from '../utilities/Utilities.js'
import { RawAudio, concatAudioSegments, downmixToMono, encodeRawAudioToWave, getSamplePeakDecibels, getEmptyRawAudio, getRawAudioDuration, trimAudioEnd, trimAudioStart, attenuateIfClippingInPlace, normalizeAudioLevelInPlace } from '../audio/AudioUtilities.js'
import { Logger } from '../utilities/Logger.js'
import { isWordOrSymbolWord, parseText, splitToParagraphs } from '../nlp/Segmentation.js'
import { type RubberbandOptions } from '../dsp/Rubberband.js'
import { loadLexiconsForLanguage } from '../nlp/Lexicon.js'
import * as API from './API.js'
import { Timeline, TimelineEntry, addTimeOffsetToTimeline, multiplyTimelineByFactor } from '../utilities/Timeline.js'
import { getAppDataDir, ensureDir, existsSync, isFileIsUpToDate, readAndParseJsonFile, writeFileSafe } from '../utilities/FileSystem.js'
import { formatLanguageCodeWithName, getShortLanguageCode, normalizeLanguageCode, defaultDialectForLanguageCode, parseLangIdentifier, normalizeIdentifierToLanguageCode } from '../utilities/Locale.js'
import { loadPackage } from '../utilities/PackageManager.js'
import { EngineMetadata, appName } from './Common.js'
import { shouldCancelCurrentTask } from '../server/Worker.js'
import chalk from 'chalk'
import { type SubtitlesConfig } from '../subtitles/Subtitles.js'
import { type EspeakOptions } from '../synthesis/EspeakTTS.js'
import { type OpenAICloudTTSOptions } from '../synthesis/OpenAICloudTTS.js'
import { type ElevenLabsTTSOptions } from '../synthesis/ElevenLabsTTS.js'
import { type DeepgramTTSOptions } from '../synthesis/DeepgramTTS.js'
import { OnnxExecutionProvider } from '../utilities/OnnxUtilities.js'
import { simplifyPunctuationCharacters } from '../nlp/TextNormalizer.js'
import { convertHtmlToText } from '../utilities/StringUtilities.js'
import { joinPath, resolvePath } from '../utilities/PathUtilities.js'
import { Timer } from '../utilities/Timer.js'
const log = logToStderr
/////////////////////////////////////////////////////////////////////////////////////////////
// Synthesis
/////////////////////////////////////////////////////////////////////////////////////////////
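// Top-level synthesis entry point. Accepts either a single string (plain text or SSML) or an
// array of pre-split segment strings, merges the given options over defaultSynthesisOptions,
// splits plain text into paragraphs when needed, and delegates to synthesizeSegments.
// The optional onSegment / onSentence callbacks fire as each segment or sentence completes.
//
// Minimal usage sketch (engine, language and result fields taken from this module's types):
//
//   const { audio, timeline } = await synthesize('Hello world!', { engine: 'espeak', language: 'en' })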
export async function synthesize(input: string | string[], options: SynthesisOptions, onSegment?: SynthesisSegmentEvent, onSentence?: SynthesisSegmentEvent): Promise<SynthesisResult> {
options = extendDeep(defaultSynthesisOptions, options)
let segments: string[]
if (Array.isArray(input)) {
segments = input
} else if (options.ssml) {
segments = [input]
} else {
const plainTextOptions = options.plainText!
segments = splitToParagraphs(input, plainTextOptions.paragraphBreaks!, plainTextOptions.whitespace!)
}
return synthesizeSegments(segments, options, onSegment, onSentence)
}
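// Synthesizes an array of text segments: auto-detects the language and selects an engine and
// voice when none are specified, synthesizes each segment sentence-by-sentence, builds a nested
// segment/sentence timeline, and concatenates and post-processes (normalizes or attenuates)
// the combined audio, optionally encoding it to the requested output codec.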
async function synthesizeSegments(segments: string[], options: SynthesisOptions, onSegment?: SynthesisSegmentEvent, onSentence?: SynthesisSegmentEvent): Promise<SynthesisResult> {
const logger = new Logger()
options = extendDeep(defaultSynthesisOptions, options)
const totalSynthesisTimeTimer = new Timer()
if (!options.language && !options.voice) {
logger.start('No language or voice specified. Detect language')
let segmentsPlainText = segments
if (options.ssml) {
segmentsPlainText = []
for (const segment of segments) {
segmentsPlainText.push(await convertHtmlToText(segment))
}
}
const { detectedLanguage } = await API.detectTextLanguage(segmentsPlainText.join('\n\n'), options.languageDetection || {})
options.language = detectedLanguage
logger.end()
logger.logTitledMessage('Language detected', formatLanguageCodeWithName(detectedLanguage))
}
if (!options.engine) {
if (options.voice) {
throw new Error(`Voice '${options.voice}' was specified but no engine was specified.`)
}
options.engine = await selectBestOfflineEngineForLanguage(options.language!)
logger.logTitledMessage('No engine specified. Auto-selected engine', options.engine)
}
logger.start(`Get voice list for ${options.engine}`)
const { bestMatchingVoice } = await requestVoiceList(options)
if (!bestMatchingVoice) {
throw new Error('No matching voice found')
}
options.voice = bestMatchingVoice.name
if (!options.language) {
options.language = bestMatchingVoice.languages[0]
}
logger.end()
logger.logTitledMessage('Selected voice', `'${options.voice}' (${formatLanguageCodeWithName(bestMatchingVoice.languages[0], 2)})`)
const segmentsRawAudio: RawAudio[] = []
const segmentsTimelines: Timeline[] = []
const timeline: Timeline = []
let peakDecibelsSoFar = -100
let timeOffset = 0
for (let segmentIndex = 0; segmentIndex < segments.length; segmentIndex++) {
const segmentText = segments[segmentIndex]
logger.log(`\n${chalk.magentaBright(`Synthesizing segment ${segmentIndex + 1}/${segments.length}`)}: '${segmentText.trim()}'`)
const segmentStartTime = timeOffset
const segmentEntry: TimelineEntry = {
type: 'segment',
text: segmentText,
startTime: timeOffset,
endTime: -1,
timeline: []
}
let sentences: string[]
if ((options.splitToSentences || options.engine === 'vits' || options.engine === 'kokoro') && !options.ssml) {
const parsedText = await parseText(segmentText, options.language!)
sentences = parsedText.sentences.map(sentenceEntry => sentenceEntry.text)
if (sentences.length == 0) {
sentences = ['']
}
} else {
sentences = [segmentText]
}
const sentencesRawAudio: RawAudio[] = []
const sentencesTimelines: Timeline[] = []
for (let sentenceIndex = 0; sentenceIndex < sentences.length; sentenceIndex++) {
await yieldToEventLoop()
if (shouldCancelCurrentTask()) {
//log('\n\n\n\n\nCANCELED\n\n\n\n')
throw new Error('Canceled')
}
const sentenceText = sentences[sentenceIndex].trim()
logger.log(`\n${chalk.magentaBright(`Synthesizing sentence ${sentenceIndex + 1}/${sentences.length}`)}: "${sentenceText.trim()}"`)
const sentenceStartTime = timeOffset
let sentenceSynthesisOptions: SynthesisOptions = { postProcessing: { normalizeAudio: false } }
sentenceSynthesisOptions = extendDeep(options, sentenceSynthesisOptions)
const { synthesizedAudio: sentenceRawAudio, timeline: sentenceTimeline } = await synthesizeSegment(sentenceText, sentenceSynthesisOptions)
const endPause = sentenceIndex == sentences.length - 1 ? options.segmentEndPause! : options.sentenceEndPause!
sentenceRawAudio.audioChannels[0] = trimAudioEnd(sentenceRawAudio.audioChannels[0], endPause * sentenceRawAudio.sampleRate)
sentencesRawAudio.push(sentenceRawAudio)
if (sentenceTimeline.length > 0) {
sentencesTimelines.push(sentenceTimeline)
}
const sentenceAudioLength = sentenceRawAudio.audioChannels[0].length / sentenceRawAudio.sampleRate
timeOffset += sentenceAudioLength
const sentenceTimelineWithOffset = addTimeOffsetToTimeline(sentenceTimeline, sentenceStartTime)
const sentenceEndTime = timeOffset - endPause
segmentEntry.timeline!.push({
type: 'sentence',
text: sentenceText,
startTime: sentenceStartTime,
endTime: sentenceEndTime,
timeline: sentenceTimelineWithOffset
})
peakDecibelsSoFar = Math.max(peakDecibelsSoFar, getSamplePeakDecibels(sentenceRawAudio.audioChannels))
const sentenceAudio = await convertToTargetCodecIfNeeded(sentenceRawAudio)
if (onSentence) {
await onSentence({
index: sentenceIndex,
total: sentences.length,
audio: sentenceAudio,
timeline: sentenceTimeline,
transcript: sentenceText,
language: options.language!,
peakDecibelsSoFar
})
}
}
segmentEntry.endTime = segmentEntry.timeline?.[segmentEntry.timeline.length - 1]?.endTime || timeOffset
logger.end()
logger.start(`Merge and postprocess sentences`)
let segmentRawAudio: RawAudio
if (sentencesRawAudio.length > 0) {
const joinedAudioBuffers = concatAudioSegments(sentencesRawAudio.map(part => part.audioChannels))
segmentRawAudio = { audioChannels: joinedAudioBuffers, sampleRate: sentencesRawAudio[0].sampleRate }
} else {
segmentRawAudio = getEmptyRawAudio(1, 24000)
}
segmentsRawAudio.push(segmentRawAudio)
timeline.push(segmentEntry)
const segmentTimelineWithoutOffset = addTimeOffsetToTimeline(segmentEntry.timeline!, -segmentStartTime)
segmentsTimelines.push(segmentTimelineWithoutOffset)
const segmentAudio = await convertToTargetCodecIfNeeded(segmentRawAudio)
logger.end()
if (onSegment) {
await onSegment({
index: segmentIndex,
total: segments.length,
audio: segmentAudio,
timeline: segmentTimelineWithoutOffset,
transcript: segmentText,
language: options.language!,
peakDecibelsSoFar
})
}
}
logger.start(`\nMerge and postprocess segments`)
let resultRawAudio: RawAudio
if (segmentsRawAudio.length > 0) {
const joinedAudioBuffers = concatAudioSegments(segmentsRawAudio.map(part => part.audioChannels))
resultRawAudio = { audioChannels: joinedAudioBuffers, sampleRate: segmentsRawAudio[0].sampleRate }
if (options.postProcessing!.normalizeAudio) {
normalizeAudioLevelInPlace(resultRawAudio, options.postProcessing!.targetPeak, options.postProcessing!.maxGainIncrease)
} else {
attenuateIfClippingInPlace(resultRawAudio)
}
} else {
resultRawAudio = getEmptyRawAudio(1, 24000)
}
async function convertToTargetCodecIfNeeded(rawAudio: RawAudio) {
const targetCodec = options.outputAudioFormat?.codec
let output: RawAudio | Uint8Array
if (targetCodec) {
logger.start(`Convert to ${targetCodec} codec`)
if (targetCodec == 'wav') {
output = encodeRawAudioToWave(rawAudio)
} else {
const ffmpegOptions = FFMpegTranscoder.getDefaultFFMpegOptionsForSpeech(targetCodec, options.outputAudioFormat?.bitrate)
output = await FFMpegTranscoder.encodeFromChannels(rawAudio, ffmpegOptions)
}
} else {
output = rawAudio
}
return output
}
const resultAudio = await convertToTargetCodecIfNeeded(resultRawAudio)
logger.end()
logger.logTitledMessage('Total synthesis time', `${totalSynthesisTimeTimer.elapsedTime.toFixed(1)}ms`, chalk.magentaBright)
return {
audio: resultAudio,
timeline,
language: options.language,
voice: options.voice
}
}
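// Result returned by synthesize(): the full audio (raw, or encoded when outputAudioFormat is
// set), a segment/sentence/word timeline, and the language and voice that were actually used.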
export interface SynthesisResult {
audio: RawAudio | Uint8Array
timeline: Timeline
language: string
voice: string
}
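// Synthesizes a single sentence (or SSML fragment) using the selected engine, then
// post-processes the result: downmixes to mono, normalizes or attenuates the level, trims
// leading silence, aligns the audio with the text to obtain a word timeline when the engine
// didn't provide one, and applies time stretching / pitch shifting when needed.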
async function synthesizeSegment(text: string, options: SynthesisOptions) {
const logger = new Logger()
const startTimestamp = logger.getTimestamp()
logger.start('Prepare text for synthesis')
const simplifiedText = simplifyPunctuationCharacters(text)
const engine = options.engine
logger.start(`Get voice list for ${engine}`)
const { bestMatchingVoice } = await requestVoiceList(options)
if (!bestMatchingVoice) {
throw new Error('No matching voice found')
}
const selectedVoice = bestMatchingVoice
let voicePackagePath: string | undefined
if (selectedVoice.packageName) {
logger.end()
voicePackagePath = await loadPackage(selectedVoice.packageName)
}
logger.start(`Initialize ${engine} module`)
const voice = selectedVoice.name
let language: string
if (options.language) {
language = await normalizeIdentifierToLanguageCode(options.language)
} else {
language = selectedVoice.languages[0]
}
const voiceGender = selectedVoice.gender
const speed = clip(options.speed!, 0.1, 10.0)
const pitch = clip(options.pitch!, 0.1, 10.0)
const inputIsSSML = options.ssml!
let synthesizedAudio: RawAudio
let timeline: Timeline | undefined
let shouldPostprocessSpeed = false
let shouldPostprocessPitch = false
switch (engine) {
case 'vits': {
if (inputIsSSML) {
throw new Error(`The VITS engine doesn't currently support SSML inputs`)
}
let vitsLanguage = language
if (vitsLanguage == 'en') {
vitsLanguage = 'en-us'
}
const vitsTTS = await import('../synthesis/VitsTTS.js')
const lengthScale = 1 / speed
const vitsOptions = options.vits!
const speakerId = vitsOptions.speakerId
if (speakerId != undefined) {
if (selectedVoice.speakerCount == undefined) {
if (speakerId != 0) {
throw new Error('Selected VITS model has only one speaker. Speaker ID must be 0 if specified.')
}
} else if (speakerId < 0 || speakerId >= selectedVoice.speakerCount) {
throw new Error(`Selected VITS model has ${selectedVoice.speakerCount} speaker IDs. Speaker ID should be in the range 0 to ${selectedVoice.speakerCount - 1}`)
}
}
const lexicons = await loadLexiconsForLanguage(language, options.customLexiconPaths)
const modelPath = voicePackagePath!
const onnxExecutionProviders: OnnxExecutionProvider[] = vitsOptions.provider ? [vitsOptions.provider] : []
logger.end()
const { rawAudio, timeline: outTimeline } = await vitsTTS.synthesizeSentence(
text,
voice,
modelPath,
lengthScale,
speakerId ?? 0,
lexicons,
onnxExecutionProviders)
synthesizedAudio = rawAudio
timeline = outTimeline
shouldPostprocessPitch = true
logger.end()
break
}
case 'kokoro': {
if (inputIsSSML) {
throw new Error(`The Kokoro engine doesn't currently support SSML inputs`)
}
const kokoroOptions = options.kokoro!
const kokoroTTS = await import('../synthesis/KokoroTTS.js')
const lexicons = await loadLexiconsForLanguage(language, options.customLexiconPaths)
const onnxExecutionProviders: OnnxExecutionProvider[] = kokoroOptions.provider ? [kokoroOptions.provider] : []
const modelName = kokoroOptions.model!
const modelPackageName = `kokoro-${modelName}`
const modelPath = await loadPackage(modelPackageName)
const voicesPath = await loadPackage('kokoro-82m-v1.0-voices')
logger.end()
logger.logTitledMessage(`Using model`, modelPackageName)
const { rawAudio, timeline: outTimeline } = await kokoroTTS.synthesizeSentence(
text,
selectedVoice,
speed,
lexicons,
modelPath,
voicesPath,
onnxExecutionProviders
)
synthesizedAudio = rawAudio
timeline = outTimeline
shouldPostprocessPitch = true
logger.end()
break
}
case 'pico': {
if (inputIsSSML) {
throw new Error(`The SVOX Pico engine doesn't currently support SSML inputs`)
}
const SvoxPicoTTS = await import('../synthesis/SvoxPicoTTS.js')
const picoSpeed = Math.round(speed * 1.0 * 100)
const picoPitch = Math.round(pitch * 1.0 * 100)
const picoVolume = 35.0
const preparedText = `<speed level='${picoSpeed}'><pitch level='${picoPitch}'><volume level='${picoVolume}'>${simplifiedText}</volume></pitch></speed>`
logger.end()
const { textAnalysisFilename, signalGenerationFilename } = SvoxPicoTTS.getResourceFilenamesForLanguage(language)
const resourceFilePath = resolvePath(voicePackagePath!, textAnalysisFilename)
const signalGenerationFilePath = resolvePath(voicePackagePath!, signalGenerationFilename)
const { rawAudio } = await SvoxPicoTTS.synthesize(preparedText, resourceFilePath, signalGenerationFilePath)
synthesizedAudio = rawAudio
break
}
case 'flite': {
if (inputIsSSML) {
throw new Error(`The Flite engine doesn't currently support SSML inputs`)
}
const FliteTTS = await import('../synthesis/FliteTTS.js')
logger.end()
const { rawAudio, events } = await FliteTTS.synthesize(simplifiedText, voice, voicePackagePath, speed)
synthesizedAudio = rawAudio
shouldPostprocessPitch = true
break
}
case 'gnuspeech': {
if (inputIsSSML) {
throw new Error(`The Gnuspeech engine doesn't currently support SSML inputs`)
}
const engineOptions = options.gnuspeech!
const GnuSpeech = await import('../synthesis/GnuSpeechTTS.js')
const { defaultGnuSpeechOptions } = await import('@echogarden/gnuspeech-wasm')
const gnuSpeechOptions = extendDeep(defaultGnuSpeechOptions, engineOptions)
if (!engineOptions.tempo) {
gnuSpeechOptions.tempo = speed
}
await logger.startAsync(`Synthesize with Gnuspeech`)
const { rawAudio } = await GnuSpeech.synthesize(simplifiedText, gnuSpeechOptions)
synthesizedAudio = rawAudio
shouldPostprocessPitch = true
logger.end()
break
}
case 'espeak': {
const EspeakTTS = await import('../synthesis/EspeakTTS.js')
const engineOptions = options.espeak!
const espeakVoice = voice
const espeakLanguage = selectedVoice.languages[0]
const espeakRate = engineOptions.rate || speed * 150
const espeakPitch = engineOptions.pitch || options.pitch! * 50
const espeakPitchRange = engineOptions.pitchRange || options.pitchVariation! * 50
const espeakUseKlatt = engineOptions.useKlatt || false
const espeakInsertSeparators = engineOptions.insertSeparators || false
const espeakOptions: EspeakOptions = {
voice: espeakVoice,
ssml: inputIsSSML,
rate: espeakRate,
pitch: espeakPitch,
pitchRange: espeakPitchRange,
useKlatt: espeakUseKlatt,
insertSeparators: espeakInsertSeparators,
}
if (inputIsSSML) {
logger.end()
const { rawAudio } = await EspeakTTS.synthesize(text, espeakOptions)
synthesizedAudio = rawAudio
} else {
const lexicons = await loadLexiconsForLanguage(language, options.customLexiconPaths)
logger.end()
const { referenceSynthesizedAudio, referenceTimeline } = await EspeakTTS.preprocessAndSynthesize(text, espeakLanguage, espeakOptions, lexicons)
synthesizedAudio = referenceSynthesizedAudio
timeline = referenceTimeline.flatMap(clause => clause.timeline!)
}
break
}
case 'sam': {
if (inputIsSSML) {
throw new Error(`The SAM engine doesn't support SSML inputs`)
}
const SamTTS = await import('../synthesis/SamTTS.js')
const engineOptions = options.sam!
const samPitch = clip(engineOptions.pitch || Math.round((1 / pitch) * 64), 0, 255)
const samSpeed = clip(engineOptions.speed || Math.round((1 / speed) * 72), 0, 255)
const samMouth = clip(engineOptions.mouth!, 0, 255)
const samThroat = clip(engineOptions.throat!, 0, 255)
logger.end()
const { rawAudio } = await SamTTS.synthesize(simplifiedText, samPitch, samSpeed, samMouth, samThroat)
synthesizedAudio = rawAudio
break
}
case 'sapi': {
if (inputIsSSML) {
throw new Error(`The SAPI engine doesn't currently support SSML inputs`)
}
const SapiTTS = await import('../synthesis/SapiTTS.js')
await SapiTTS.AssertSAPIAvailable(false)
const engineOptions = options.sapi!
const sapiRate = engineOptions.rate || 0
logger.end()
const { rawAudio, timeline: outTimeline } = await SapiTTS.synthesize(text, voice, sapiRate, false)
synthesizedAudio = rawAudio
timeline = outTimeline
shouldPostprocessSpeed = true
shouldPostprocessPitch = true
break
}
case 'msspeech': {
if (inputIsSSML) {
throw new Error(`The MSSpeech engine doesn't currently support SSML inputs`)
}
const SapiTTS = await import('../synthesis/SapiTTS.js')
await SapiTTS.AssertSAPIAvailable(true)
const engineOptions = options.msspeech!
const sapiRate = engineOptions.rate || 0
logger.end()
const { rawAudio, timeline: outTimeline } = await SapiTTS.synthesize(text, voice, sapiRate, true)
synthesizedAudio = rawAudio
timeline = outTimeline
shouldPostprocessSpeed = true
shouldPostprocessPitch = true
break
}
case 'coqui-server': {
if (inputIsSSML) {
throw new Error(`The Coqui Server engine doesn't support SSML inputs`)
}
const CoquiServerTTS = await import('../synthesis/CoquiServerTTS.js')
const engineOptions = options.coquiServer!
const speakerId = engineOptions.speakerId!
const serverUrl = engineOptions.serverUrl
if (!serverUrl) {
throw new Error(`'coqui-server' requires a server URL`)
}
logger.end()
const { rawAudio } = await CoquiServerTTS.synthesize(simplifiedText, speakerId, serverUrl)
synthesizedAudio = rawAudio
shouldPostprocessSpeed = true
shouldPostprocessPitch = true
break
}
case 'google-cloud': {
const GoogleCloudTTS = await import('../synthesis/GoogleCloudTTS.js')
const engineOptions = options.googleCloud!
const apiKey = engineOptions.apiKey
if (!apiKey) {
throw new Error(`No Google Cloud API key provided`)
}
let pitchDeltaSemitones: number
// 1 semitone up = multiply by 1.05946
// 1 semitone down = divide by 1.05946
if (engineOptions.pitchDeltaSemitones != undefined) {
pitchDeltaSemitones = engineOptions.pitchDeltaSemitones
} else if (pitch >= 1.0) {
pitchDeltaSemitones = Math.round(17.3132 * Math.log(pitch))
} else {
pitchDeltaSemitones = Math.round(-17.3132 * Math.log(1 / pitch))
}
logger.end()
const { rawAudio, timepoints } = await GoogleCloudTTS.synthesize(text, apiKey, language, voice, speed, pitchDeltaSemitones, 0, inputIsSSML)
synthesizedAudio = rawAudio
break
}
case 'microsoft-azure': {
const AzureCognitiveServicesTTS = await import('../synthesis/AzureCognitiveServicesTTS.js')
const engineOptions = options.microsoftAzure!
const subscriptionKey = engineOptions.subscriptionKey
if (!subscriptionKey) {
throw new Error(`No Microsoft Azure subscription key provided`)
}
const serviceRegion = engineOptions!.serviceRegion
if (!serviceRegion) {
throw new Error(`No Microsoft Azure service region provided`)
}
let ssmlPitch: string
if (engineOptions.pitchDeltaHz != undefined) {
if (engineOptions.pitchDeltaHz >= 0) {
ssmlPitch = `+${Math.abs(engineOptions.pitchDeltaHz)}Hz`
} else {
ssmlPitch = `-${Math.abs(engineOptions.pitchDeltaHz)}Hz`
}
} else {
ssmlPitch = convertPitchScaleToSSMLValueString(pitch, voiceGender)
}
const ssmlRate = convertSpeedScaleToSSMLValueString(speed)
logger.end()
const { rawAudio, timeline: outTimeline } = await AzureCognitiveServicesTTS.synthesize(text, subscriptionKey, serviceRegion, language, voice, inputIsSSML, ssmlPitch, ssmlRate)
synthesizedAudio = rawAudio
timeline = outTimeline
break
}
case 'amazon-polly': {
const AwsPollyTTS = await import('../synthesis/AwsPollyTTS.js')
const engineOptions = options.amazonPolly!
const region = engineOptions.region
if (!region) {
throw new Error(`No Amazon Polly region provided`)
}
const accessKeyId = engineOptions.accessKeyId
if (!accessKeyId) {
throw new Error(`No Amazon Polly access key id provided`)
}
const secretAccessKey = engineOptions.secretAccessKey
if (!secretAccessKey) {
throw new Error(`No Amazon Polly secret access key provided`)
}
const pollyEngine = engineOptions.pollyEngine
const lexiconNames = engineOptions.lexiconNames
logger.end()
const { rawAudio } = await AwsPollyTTS.synthesize(text, undefined, voice, region, accessKeyId, secretAccessKey, pollyEngine, inputIsSSML, lexiconNames)
synthesizedAudio = rawAudio
shouldPostprocessSpeed = true
shouldPostprocessPitch = true
break
}
case 'openai-cloud': {
const OpenAICloudTTS = await import('../synthesis/OpenAICloudTTS.js')
const openAICloudTTSOptions = options.openAICloud!
if (!openAICloudTTSOptions.apiKey) {
throw new Error(`No OpenAI Cloud API key provided`)
}
logger.end()
synthesizedAudio = await OpenAICloudTTS.synthesize(text, voice, speed, openAICloudTTSOptions)
shouldPostprocessSpeed = false
shouldPostprocessPitch = true
break
}
case 'elevenlabs': {
if (inputIsSSML) {
throw new Error(`The ElevenLabs engine doesn't support SSML inputs`)
}
const ElevenLabsTTS = await import('../synthesis/ElevenLabsTTS.js')
const engineOptions = options.elevenLabs!
if (!engineOptions.apiKey) {
throw new Error(`No ElevenLabs API key provided`)
}
const voiceId = (selectedVoice as any)['elevenLabsVoiceId']
logger.end()
const { rawAudio, timeline: outTimeline } = await ElevenLabsTTS.synthesize(text, voiceId, language, engineOptions)
synthesizedAudio = rawAudio
timeline = outTimeline
shouldPostprocessSpeed = true
shouldPostprocessPitch = true
break
}
case 'deepgram': {
if (inputIsSSML) {
throw new Error(`The Deepgram engine doesn't support SSML inputs`)
}
const DeepgramTTS = await import('../synthesis/DeepgramTTS.js')
const engineOptions = options.deepgram!
if (!engineOptions.apiKey) {
throw new Error(`No Deepgram API key provided`)
}
const modelId = selectedVoice.deepgramModelId
logger.end()
const { rawAudio } = await DeepgramTTS.synthesize(text, modelId, engineOptions)
synthesizedAudio = rawAudio
shouldPostprocessSpeed = true
shouldPostprocessPitch = true
break
}
case 'google-translate': {
if (inputIsSSML) {
throw new Error(`The Google Translate engine doesn't support SSML inputs`)
}
const GoogleTranslateTTS = await import('../synthesis/GoogleTranslateTTS.js')
logger.end()
const { rawAudio, timeline: segmentTimeline } =
await runOperationWithRetries(
() => GoogleTranslateTTS.synthesizeLongText(text, language, options.googleTranslate?.tld, options.sentenceEndPause, options.segmentEndPause),
logger)
synthesizedAudio = rawAudio
logger.start(`Generate word-level timestamps by individually aligning fragments`)
const alignmentOptions: API.AlignmentOptions = extendDeep(options.alignment, { language })
timeline = await API.alignSegments(synthesizedAudio, segmentTimeline, alignmentOptions)
shouldPostprocessSpeed = true
shouldPostprocessPitch = true
break
}
case 'microsoft-edge': {
if (inputIsSSML) {
throw new Error(`The Microsoft Edge engine doesn't support SSML inputs`)
}
const MicrosoftEdgeTTS = await import('../synthesis/MicrosoftEdgeTTS.js')
const engineOptions = options.microsoftEdge!
const trustedClientToken = engineOptions.trustedClientToken
if (!trustedClientToken) {
throw new Error('No Microsoft Edge trusted client token provided')
}
if (await sha256AsHex(trustedClientToken) != '558d7c6a7f7db444895946fe23a54ad172fd6d159f46cb34dd4db21bb27c07d7') {
throw new Error('Trusted client token is incorrect.')
}
let ssmlPitch: string
if (engineOptions.pitchDeltaHz != undefined) {
if (engineOptions.pitchDeltaHz >= 0) {
ssmlPitch = `+${Math.abs(engineOptions.pitchDeltaHz)}Hz`
} else {
ssmlPitch = `-${Math.abs(engineOptions.pitchDeltaHz)}Hz`
}
} else {
ssmlPitch = convertPitchScaleToSSMLValueString(pitch, voiceGender)
}
const ssmlRate = convertSpeedScaleToSSMLValueString(speed)
logger.end()
const { rawAudio, timeline: edgeTimeline } =
await runOperationWithRetries(
() => MicrosoftEdgeTTS.synthesize(text, trustedClientToken!, voice, ssmlPitch, ssmlRate),
logger)
synthesizedAudio = rawAudio
timeline = edgeTimeline
break
}
case 'streamlabs-polly': {
if (inputIsSSML) {
throw new Error(`The Streamlabs Polly engine doesn't support SSML inputs`)
}
const StreamlabsPollyTTS = await import('../synthesis/StreamlabsPollyTTS.js')
logger.end()
const { rawAudio, timeline: segmentTimeline } = await StreamlabsPollyTTS.synthesizeLongText(text, voice, language, options.sentenceEndPause, options.segmentEndPause)
synthesizedAudio = rawAudio
logger.start(`Generate word-level timestamps by individually aligning fragments`)
const alignmentOptions: API.AlignmentOptions = extendDeep(options.alignment, { language })
timeline = await API.alignSegments(synthesizedAudio, segmentTimeline, alignmentOptions)
shouldPostprocessSpeed = true
shouldPostprocessPitch = true
break
}
default: {
throw new Error(`Engine '${options.engine}' is not supported`)
}
}
logger.start('Postprocess synthesized audio')
synthesizedAudio = downmixToMono(synthesizedAudio)
if (options.postProcessing!.normalizeAudio) {
normalizeAudioLevelInPlace(synthesizedAudio, options.postProcessing!.targetPeak!, options.postProcessing!.maxGainIncrease!)
} else {
attenuateIfClippingInPlace(synthesizedAudio)
}
const preTrimSampleCount = synthesizedAudio.audioChannels[0].length
synthesizedAudio.audioChannels[0] = trimAudioStart(synthesizedAudio.audioChannels[0])
if (timeline) {
const oldDuration = preTrimSampleCount / synthesizedAudio.sampleRate
const newDuration = synthesizedAudio.audioChannels[0].length / synthesizedAudio.sampleRate
timeline = addTimeOffsetToTimeline(timeline, newDuration - oldDuration)
}
if (!timeline) {
logger.start('Align synthesized audio with text')
let plainText = text
if (inputIsSSML) {
plainText = await convertHtmlToText(text)
}
const alignmentOptions = options.alignment!
alignmentOptions.language = language
if (!alignmentOptions.customLexiconPaths) {
alignmentOptions.customLexiconPaths = options.customLexiconPaths
}
if (alignmentOptions.dtw!.windowDuration == null) {
alignmentOptions.dtw!.windowDuration = Math.max(5, Math.ceil(0.2 * getRawAudioDuration(synthesizedAudio)))
}
const { wordTimeline } = await API.align(synthesizedAudio, plainText, alignmentOptions)
timeline = wordTimeline
logger.end()
}
const postProcessingOptions = options.postProcessing!
let timeStretchFactor = postProcessingOptions.speed
if (shouldPostprocessSpeed && timeStretchFactor == undefined) {
timeStretchFactor = speed
}
let pitchShiftFactor = postProcessingOptions.pitch
if (shouldPostprocessPitch && pitchShiftFactor == undefined) {
pitchShiftFactor = pitch
}
if ((timeStretchFactor != undefined && timeStretchFactor != 1.0) || (pitchShiftFactor != undefined && pitchShiftFactor != 1.0)) {
logger.start('Apply time and pitch shifting')
timeStretchFactor = timeStretchFactor || 1.0
pitchShiftFactor = pitchShiftFactor || 1.0
const timePitchShiftingMethod = postProcessingOptions.timePitchShiftingMethod
if (timePitchShiftingMethod == 'sonic') {
const sonic = await import('../dsp/Sonic.js')
synthesizedAudio = await sonic.stretchTimePitch(synthesizedAudio, timeStretchFactor, pitchShiftFactor)
} else if (timePitchShiftingMethod == 'rubberband') {
const rubberband = await import('../dsp/Rubberband.js')
const rubberbandOptions: RubberbandOptions = extendDeep(rubberband.defaultRubberbandOptions, postProcessingOptions.rubberband || {})
synthesizedAudio = await rubberband.stretchTimePitch(synthesizedAudio, timeStretchFactor, pitchShiftFactor, rubberbandOptions)
} else {
throw new Error(`'${timePitchShiftingMethod}' is not a valid time and pitch shifting method`)
}
if (timeStretchFactor != 1.0 && timeline) {
timeline = multiplyTimelineByFactor(timeline, 1 / timeStretchFactor)
}
}
if (timeline) {
timeline = timeline.filter(entry => isWordOrSymbolWord(entry.text))
}
logger.end()
logger.logDuration('Part synthesis time', startTimestamp, chalk.magentaBright)
return { synthesizedAudio, timeline }
}
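// Converts a relative speed factor to an SSML prosody rate string (e.g. 1.25 -> '+25%').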
function convertSpeedScaleToSSMLValueString(rate: number) {
if (rate >= 1.0) {
const ratePercentage = Math.floor((rate - 1) * 100)
return `+${ratePercentage}%`
} else {
const ratePercentage = Math.floor(((1 / rate) - 1) * 100)
return `-${ratePercentage}%`
}
}
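// Converts a relative pitch factor to an absolute SSML pitch delta in Hz, based on a rough
// estimate of the voice's fundamental frequency for its gender.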
function convertPitchScaleToSSMLValueString(pitch: number, voiceGender: VoiceGender) {
let fundamentalFrequency
if (voiceGender == 'male') {
// Use an estimate of the average male voice fundamental frequency
fundamentalFrequency = 120
} else if (voiceGender == 'female') {
// Use an estimate of the average female voice fundamental frequency
fundamentalFrequency = 210
} else {
// (shouldn't occur since all voices should have a gender specified)
// Use the average of the male and female voice frequencies
fundamentalFrequency = 165
}
if (pitch >= 1.0) {
const pitchDeltaHertz = Math.floor(pitch * fundamentalFrequency) - fundamentalFrequency
return `+${pitchDeltaHertz}Hz`
} else {
const pitchDeltaHertz = fundamentalFrequency - Math.floor(pitch * fundamentalFrequency)
return `-${pitchDeltaHertz}Hz`
}
}
export type SynthesisEngine =
'vits' | 'kokoro' | 'pico' | 'flite' | 'gnuspeech' |
'espeak' | 'sam' | 'sapi' | 'msspeech' | 'coqui-server' |
'google-cloud' | 'microsoft-azure' | 'amazon-polly' |
'openai-cloud' | 'elevenlabs' | 'deepgram' |
'google-translate' | 'microsoft-edge' | 'streamlabs-polly'
export type TimePitchShiftingMethod = 'sonic' | 'rubberband'
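// Options accepted by synthesize(). Top-level fields apply to all engines; the nested
// per-engine objects (vits, kokoro, espeak, ...) carry engine-specific settings.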
export interface SynthesisOptions {
engine?: SynthesisEngine
language?: string
voice?: string
voiceGender?: VoiceGender
speed?: number
pitch?: number
pitchVariation?: number
splitToSentences?: boolean
ssml?: boolean
segmentEndPause?: number
sentenceEndPause?: number
customLexiconPaths?: string[]
plainText?: API.PlainTextOptions
alignment?: API.AlignmentOptions
postProcessing?: {
normalizeAudio?: boolean
targetPeak?: number
maxGainIncrease?: number
speed?: number
pitch?: number
timePitchShiftingMethod?: TimePitchShiftingMethod,
rubberband?: RubberbandOptions
}
outputAudioFormat?: {
codec?: 'wav' | 'mp3' | 'opus' | 'm4a' | 'ogg' | 'flac'
bitrate?: number
}
languageDetection?: API.TextLanguageDetectionOptions
subtitles?: SubtitlesConfig
vits?: {
speakerId?: number
provider?: OnnxExecutionProvider
}
kokoro?: {
provider?: OnnxExecutionProvider
model?: '82m-v1.0-fp32' | '82m-v1.0-quantized'
}
pico?: {
}
flite?: {
}
gnuspeech?: {
tempo?: number
controlRate?: number
debug?: boolean
}
espeak?: {
rate?: number
pitch?: number
pitchRange?: number
useKlatt?: boolean
insertSeparators?: boolean
}
sam?: {
pitch?: number
speed?: number
mouth?: number
throat?: number
}
sapi?: {
rate?: number
}
msspeech?: {
rate?: number
}
coquiServer?: {
serverUrl?: string
speakerId?: string | null
}
googleCloud?: {
apiKey?: string,
pitchDeltaSemitones?: number,
customVoice?: {
model?: string
reportedUsage?: string
}
}
microsoftAzure?: {
subscriptionKey?: string
serviceRegion?: string
pitchDeltaHz?: number
}
amazonPolly?: {
region?: string
accessKeyId?: string
secretAccessKey?: string
pollyEngine?: 'standard' | 'neural'
lexiconNames?: string[]
}
openAICloud?: OpenAICloudTTSOptions
elevenLabs?: ElevenLabsTTSOptions,
deepgram?: DeepgramTTSOptions
googleTranslate?: {
tld?: string
}
microsoftEdge?: {
trustedClientToken?: string
pitchDeltaHz?: number
}
streamlabsPolly?: {
},
}
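// Default options, deeply merged under any user-provided options.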
export const defaultSynthesisOptions: SynthesisOptions = {
engine: undefined,
language: undefined,
voice: undefined,
voiceGender: undefined,
speed: 1.0,
pitch: 1.0,
pitchVariation: 1.0,
ssml: false,
splitToSentences: true,
segmentEndPause: 1.0,
sentenceEndPause: 0.75,
customLexiconPaths: undefined,
plainText: {
paragraphBreaks: 'double',
whitespace: 'collapse'
},
alignment: {
engine: 'dtw',
dtw: {
granularity: 'high'
}
},
postProcessing: {
normalizeAudio: true,
targetPeak: -3,
maxGainIncrease: 30,
speed: undefined,
pitch: undefined,
timePitchShiftingMethod: 'sonic',
rubberband: {
}
},
outputAudioFormat: undefined,
languageDetection: undefined,
subtitles: {
},
vits: {
speakerId: undefined,
provider: undefined,
},
kokoro: {
model: '82m-v1.0-fp32'
},
pico: {
},
flite: {
},
gnuspeech: {
debug: false,
},
espeak: {
rate: undefined,
pitch: undefined,
pitchRange: undefined,
useKlatt: false,
},
sam: {
speed: undefined,
pitch: undefined,
mouth: 128,
throat: 128
},
sapi: {
rate: 0,
},
msspeech: {
rate: 0,
},
coquiServer: {
serverUrl: 'http://[::1]:5002',
speakerId: null
},
googleCloud: {
apiKey: undefined,
pitchDeltaSemitones: undefined,
customVoice: {
}
},
microsoftAzure: {
subscriptionKey: undefined,
serviceRegion: undefined,
pitchDeltaHz: undefined
},
amazonPolly: {
region: undefined,
accessKeyId: undefined,
secretAccessKey: undefined,
pollyEngine: undefined,
lexiconNames: undefined,
},
openAICloud: {
},
elevenLabs: {
},
deepgram: {
},
googleTranslate: {
tld: 'us'
},
microsoftEdge: {
trustedClientToken: undefined,
pitchDeltaHz: undefined
},
streamlabsPolly: {
},
}
/////////////////////////////////////////////////////////////////////////////////////////////
// Voice list request
/////////////////////////////////////////////////////////////////////////////////////////////
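// Retrieves the voice list for the selected engine (from a local JSON cache when it is still
// fresh, otherwise from the engine itself), filters it by language, gender and voice name
// pattern, and picks a best matching voice, preferring the default dialect for the language.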
export async function requestVoiceList(options: VoiceListRequestOptions): Promise<RequestVoiceListResult> {
options = extendDeep(defaultVoiceListRequestOptions, options)
const logger = new Logger()
const cacheOptions = options.cache!
let cacheDir = cacheOptions?.path
if (!cacheDir) {
const appDataDir = getAppDataDir(appName)
cacheDir = joinPath(appDataDir, 'voice-list-cache')
await ensureDir(cacheDir)
}
const cacheFilePath = joinPath(cacheDir, `${options.engine}.voices.json`)
async function loadVoiceList() {
let voiceList: SynthesisVoice[] = []
switch (options.engine) {
case 'espeak': {
const EspeakTTS = await import('../synthesis/EspeakTTS.js')
const voices = await EspeakTTS.listVoices()
voiceList = voices.map(voice => {
const languages = voice.languages.map(lang => normalizeLanguageCode(lang.name))
for (const language of languages) {
const shortLanguageCode = getShortLanguageCode(language)
if (!languages.includes(shortLanguageCode)) {
languages.push(shortLanguageCode)
}
}
return {
name: voice.identifier,
languages,
gender: 'male'
}
})
break
}
case 'flite': {
const FliteTTS = await import('../synthesis/FliteTTS.js')
voiceList = deepClone(FliteTTS.voiceList)
break
}
case 'pico': {
const SvoxPicoTTS = await import('../synthesis/SvoxPicoTTS.js')
voiceList = SvoxPicoTTS.voiceList
break
}
case 'gnuspeech': {
const GnuSpeech = await import('../synthesis/GnuSpeechTTS.js')
voiceList = GnuSpeech.voiceList
break
}
case 'sam': {
voiceList.push({
name: 'sam',
languages: ['en-US', 'en'],
gender: 'male'
})
break
}
case 'vits': {
const VitsTTS = await import('../synthesis/VitsTTS.js')
voiceList = VitsTTS.voiceList.map(entry => {
return { ...entry, packageName: `vits-${entry.name}` }
})
break
}
case 'kokoro': {
const KokoroTTS = await import('../synthesis/KokoroTTS.js')
voiceList = KokoroTTS.voiceList
break
}
case 'sapi': {
const SapiTTS = await import('../synthesis/SapiTTS.js')
await SapiTTS.AssertSAPIAvailable(false)
voiceList = await SapiTTS.getVoiceList(false)
break
}
case 'msspeech': {
const SapiTTS = await import('../synthesis/SapiTTS.js')
await SapiTTS.AssertSAPIAvailable(true)
voiceList = await SapiTTS.getVoiceList(true)
break
}
case 'coqui-server': {
voiceList = [{
name: 'coqui',
languages: ['en-US'],
gender: 'unknown'
}]
break
}
case 'google-cloud': {
const GoogleCloudTTS = await import('../synthesis/GoogleCloudTTS.js')
const apiKey = options.googleCloud!.apiKey
if (!apiKey) {
throw new Error(`No Google Cloud API key provided`)
}
const voices = await GoogleCloudTTS.getVoiceList(apiKey)
voiceList = voices.map(voice => ({
name: voice.name,
languages: [normalizeLanguageCode(voice.languageCodes[0]), getShortLanguageCode(voice.languageCodes[0])],
gender: voice.ssmlGender.toLowerCase() as ('male' | 'female'),
}))
break
}
case 'microsoft-azure': {
const AzureCognitiveServicesTTS = await import('../synthesis/AzureCognitiveServicesTTS.js')
const subscriptionKey = options.microsoftAzure!.subscriptionKey
if (!subscriptionKey) {
throw new Error(`No Microsoft Azure subscription key provided`)
}
const serviceRegion = options.microsoftAzure!.serviceRegion
if (!serviceRegion) {
throw new Error(`No Microsoft Azure service region provided`)
}
const voices = await AzureCognitiveServicesTTS.getVoiceList(subscriptionKey, serviceRegion)
for (const voice of voices) {
voiceList.push({
name: voice.name,
languages: [normalizeLanguageCode(voice.locale), getShortLanguageCode(voice.locale)],
gender: voice.gender == 1 ? 'female' : 'male'
})
}
break
}
case 'amazon-polly': {
const AwsPollyTTS = await import('../synthesis/AwsPollyTTS.js')
const region = options.amazonPolly!.region
if (!region) {
throw new Error(`No Amazon Polly region provided`)
}
const accessKeyId = options.amazonPolly!.accessKeyId
if (!accessKeyId) {
throw new Error(`No Amazon Polly access key id provided`)
}
const secretAccessKey = options.amazonPolly!.secretAccessKey
if (!secretAccessKey) {
throw new Error(`No Amazon Polly secret access key provided`)
}
const voices = await AwsPollyTTS.getVoiceList(region, accessKeyId, secretAccessKey)
for (const voice of voices) {
const languageCode = normalizeLanguageCode(voice.LanguageCode!)
const languageCodes = [languageCode, getShortLanguageCode(languageCode)]
if (voice.AdditionalLanguageCodes) {
for (const additionalLanguageCode of voice.AdditionalLanguageCodes) {
languageCodes.push(
normalizeLanguageCode(additionalLanguageCode),
getShortLanguageCode(additionalLanguageCode)
)
}
}
voiceList.push({
name: voice.Id!,
languages: languageCodes,
gender: voice.Gender!.toLowerCase() as ('male' | 'female')
})
}
break
}
case 'openai-cloud': {
const OpenAICloudTTS = await import('../synthesis/OpenAICloudTTS.js')
voiceList = OpenAICloudTTS.voiceList
break
}
case 'elevenlabs': {
const ElevenLabsTTS = await import('../synthesis/ElevenLabsTTS.js')
const engineOptions = options.elevenLabs!
const apiKey = engineOptions.apiKey
if (!apiKey) {
throw new Error(`No ElevenLabs API key provided`)
}
voiceList = await ElevenLabsTTS.getVoiceList(apiKey)
break
}
case 'deepgram': {
const DeepgramTTS = await import('../synthesis/DeepgramTTS.js')
voiceList = DeepgramTTS.voiceList
break
}
case 'google-translate': {
const GoogleTranslateTTS = await import('../synthesis/GoogleTranslateTTS.js')
const langLookup = GoogleTranslateTTS.supportedLanguageLookup
for (const langCode in langLookup) {
voiceList.push({
name: langLookup[langCode],
languages: langCode.includes('-') ? [normalizeLanguageCode(langCode), getShortLanguageCode(langCode)] : [normalizeLanguageCode(langCode)],
gender: 'unknown'
})
}
break
}
case 'microsoft-edge': {
const MicrosoftEdgeTTS = await import('../synthesis/MicrosoftEdgeTTS.js')
const trustedClientToken = options.microsoftEdge?.trustedClientToken
if (!trustedClientToken) {
throw new Error('No Microsoft Edge trusted client token provided')
}
const voices =
await runOperationWithRetries(
() => MicrosoftEdgeTTS.getVoiceList(trustedClientToken),
logger)
voiceList = voices.map((voice: any) => ({
name: voice.Name,
languages: [normalizeLanguageCode(voice.Locale), getShortLanguageCode(voice.Locale)],
gender: voice.Gender == 'Male' ? 'male' : 'female',
}))
break
}
case 'streamlabs-polly': {
const StreamlabsPollyTTS = await import('../synthesis/StreamlabsPollyTTS.js')
voiceList = StreamlabsPollyTTS.voiceList
break
}
}
if (cacheFilePath) {
await writeFileSafe(cacheFilePath, await stringifyAndFormatJson(voiceList))
}
return voiceList
}
let voiceList: SynthesisVoice[]
if (cacheFilePath && existsSync(cacheFilePath) && await isFileIsUpToDate(cacheFilePath, options.cache!.duration!)) {
voiceList = await readAndParseJsonFile(cacheFilePath)
} else {
voiceList = await loadVoiceList()
}
const languageCode = await normalizeIdentifierToLanguageCode(options.language || '')
if (languageCode) {
let filteredVoiceList = voiceList.filter(voice => voice.languages.includes(languageCode))
if (filteredVoiceList.length == 0 && languageCode.includes('-')) {
const shortLanguageCode = getShortLanguageCode(languageCode)
filteredVoiceList = voiceList.filter(voice => voice.languages.includes(shortLanguageCode))
}
voiceList = filteredVoiceList
}
if (options.voiceGender) {
const genderLowercase = options.voiceGender.toLowerCase()
voiceList = voiceList.filter(voice => voice.gender == genderLowercase || voice.gender == 'unknown')
}
if (options.voice) {
const namePatternLowerCase = options.voice.toLocaleLowerCase()
const namePatternParts = namePatternLowerCase.split(/\b/g)
if (namePatternParts.length > 1) {
voiceList = voiceList.filter(voice => voice.name.toLocaleLowerCase().includes(namePatternLowerCase))
} else {
voiceList = voiceList.filter(voice => {
const name = voice.name.toLocaleLowerCase()
const nameParts = name.split(/\b/g)
for (const namePart of nameParts) {
if (namePart.startsWith(namePatternLowerCase)) {
return true
}
}
return false
})
}
}
let bestMatchingVoice = voiceList[0]
if (bestMatchingVoice && voiceList.length > 1 && defaultDialectForLanguageCode[languageCode]) {
const expandedLanguageCode = defaultDialectForLanguageCode[languageCode]
for (const voice of voiceList) {
if (voice.languages.includes(expandedLanguageCode)) {
bestMatchingVoice = voice
break
}
}
}
return { voiceList, bestMatchingVoice }
}
export interface RequestVoiceListResult {
voiceList: API.SynthesisVoice[]
bestMatchingVoice: API.SynthesisVoice
}
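// Picks an offline engine for the given language: VITS if a VITS voice supports it,
// otherwise eSpeak NG as a universal fallback.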
export async function selectBestOfflineEngineForLanguage(language: string): Promise<SynthesisEngine> {
language = await normalizeIdentifierToLanguageCode(language)
const VitsTTS = await import('../synthesis/VitsTTS.js')
const vitsLanguages = getAllLangCodesFromVoiceList(VitsTTS.voiceList)
if (vitsLanguages.includes(language)) {
return 'vits'
}
return 'espeak'
}
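// Collects the unique language codes supported by a voice list, preserving first-seen order.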
export function getAllLangCodesFromVoiceList(voiceList: SynthesisVoice[]) {
const languageCodes = new Set<string>()
const langList: string[] = []
for (const voice of voiceList) {
for (const langCode of voice.languages) {
if (languageCodes.has(langCode)) {
continue
}
langList.push(langCode)
languageCodes.add(langCode)
}
}
return langList
}
export interface VoiceListRequestOptions extends SynthesisOptions {
cache?: {
path?: string
duration?: number
}
}
export const defaultVoiceListRequestOptions: VoiceListRequestOptions = {
...defaultSynthesisOptions,
cache: {
path: undefined,
duration: 60 * 1
},
}
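// Payload passed to the onSegment and onSentence callbacks.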
export interface SynthesisSegmentEventData {
index: number
total: number
audio: RawAudio | Uint8Array
timeline: Timeline
transcript: string
language: string
peakDecibelsSoFar: number
}
export type SynthesisSegmentEvent = (data: SynthesisSegmentEventData) => Promise<void>
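// A single entry in an engine's voice list. Engine-specific properties (e.g. elevenLabsVoiceId,
// deepgramModelId) can be attached via the index signature.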
export interface SynthesisVoice {
name: string
languages: string[]
gender: VoiceGender
speakerCount?: number
packageName?: string
[key: string]: any
}
export type VoiceGender = 'male' | 'female' | 'unknown'
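// Human-readable metadata for each supported synthesis engine.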
export const synthesisEngines: EngineMetadata[] = [
{
id: 'vits',
name: 'VITS',
description: 'A high-quality end-to-end neural speech synthesis architecture.',
type: 'local'
},
{
id: 'kokoro',
name: 'Kokoro',
description: 'A high-quality neural speech synthesis model based on the StyleTTS 2 architecture.',
type: 'local'
},
{
id: 'pico',
name: 'SVOX Pico',
description: 'A legacy diphone-based speech synthesizer.',
type: 'local'
},
{
id: 'flite',
name: 'Flite',
description: 'A legacy diphone-based speech synthesizer.',
type: 'local'
},
{
id: 'gnuspeech',
name: 'Gnuspeech',
description: 'A legacy articulatory speech synthesizer.',
type: 'local'
},
{
id: 'espeak',
name: 'eSpeak NG',
description: `A lightweight, highly multilingual, 'robot'-like formant-based speech synthesizer.`,
type: 'local'
},
{
id: 'sam',
name: 'SAM (Software Automatic Mouth)',
description: `A classic 'robot'-like speech synthesizer from 1982.`,
type: 'local'
},
{
id: 'sapi',
name: 'SAPI',
description: 'Microsoft Speech API (Windows only).',
type: 'local'
},
{
id: 'msspeech',
name: 'Microsoft Speech Platform',
description: 'Microsoft Server Speech API (Windows only).',
type: 'local'
},
{
id: 'coqui-server',
name: 'Coqui TTS',
description: 'A deep learning toolkit for Text-to-Speech.',
type: 'server'
},
{
id: 'google-cloud',
name: 'Google Cloud',
description: 'Google Cloud text-to-speech service.',
type: 'cloud'
},
{
id: 'microsoft-azure',
name: 'Azure Cognitive Services',
description: 'Microsoft Azure cloud text-to-speech service.',
type: 'cloud'
},
{
id: 'amazon-polly',
name: 'Amazon Polly',
description: 'Amazon Polly (also: AWS Polly) cloud text-to-speech.',
type: 'cloud'
},
{
id: 'openai-cloud',
name: 'OpenAI Cloud',
description: 'OpenAI cloud text-to-speech.',
type: 'cloud'
},
{
id: 'elevenlabs',
name: 'ElevenLabs',
description: 'A generative AI text-to-speech cloud service.',
type: 'cloud'
},
{
id: 'deepgram',
name: 'Deepgram',
description: 'A generative AI text-to-speech cloud service.',
type: 'cloud'
},
{
id: 'google-translate',
name: 'Google Translate',
description: 'Unofficial text-to-speech API used by the Google Translate web interface.',
type: 'cloud'
},
{
id: 'microsoft-edge',
name: 'Microsoft Edge',
descripti