echogarden
An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.
import { deepClone, extendDeep } from '../utilities/ObjectUtilities.js';
import * as FFMpegTranscoder from '../codecs/FFMpegTranscoder.js';
import { clip, sha256AsHex, stringifyAndFormatJson, logToStderr, yieldToEventLoop, runOperationWithRetries } from '../utilities/Utilities.js';
import { concatAudioSegments, downmixToMono, encodeRawAudioToWave, getSamplePeakDecibels, getEmptyRawAudio, getRawAudioDuration, trimAudioEnd, trimAudioStart, attenuateIfClippingInPlace, normalizeAudioLevelInPlace } from '../audio/AudioUtilities.js';
import { Logger } from '../utilities/Logger.js';
import { isWordOrSymbolWord, parseText, splitToParagraphs } from '../nlp/Segmentation.js';
import { loadLexiconsForLanguage } from '../nlp/Lexicon.js';
import * as API from './API.js';
import { addTimeOffsetToTimeline, multiplyTimelineByFactor } from '../utilities/Timeline.js';
import { getAppDataDir, ensureDir, existsSync, isFileIsUpToDate, readAndParseJsonFile, writeFileSafe } from '../utilities/FileSystem.js';
import { formatLanguageCodeWithName, getShortLanguageCode, normalizeLanguageCode, defaultDialectForLanguageCode, normalizeIdentifierToLanguageCode } from '../utilities/Locale.js';
import { loadPackage } from '../utilities/PackageManager.js';
import { appName } from './Common.js';
import { shouldCancelCurrentTask } from '../server/Worker.js';
import chalk from 'chalk';
import { simplifyPunctuationCharacters } from '../nlp/TextNormalizer.js';
import { convertHtmlToText } from '../utilities/StringUtilities.js';
import { joinPath, resolvePath } from '../utilities/PathUtilities.js';
import { Timer } from '../utilities/Timer.js';
const log = logToStderr;
/////////////////////////////////////////////////////////////////////////////////////////////
// Synthesis
/////////////////////////////////////////////////////////////////////////////////////////////
export async function synthesize(input, options, onSegment, onSentence) {
options = extendDeep(defaultSynthesisOptions, options);
let segments;
if (Array.isArray(input)) {
segments = input;
}
else if (options.ssml) {
segments = [input];
}
else {
const plainTextOptions = options.plainText;
segments = splitToParagraphs(input, plainTextOptions.paragraphBreaks, plainTextOptions.whitespace);
}
return synthesizeSegments(segments, options, onSegment, onSentence);
}
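// Illustrative usage sketch (not part of the original module): synthesizing
// plain text with an explicit engine. The result's `audio` field holds raw
// audio by default, or encoded audio when `outputAudioFormat.codec` is set,
// as below.
//
//   const { audio, timeline, language, voice } = await synthesize(
//       'Hello world!\n\nHow are you?',
//       { engine: 'espeak', outputAudioFormat: { codec: 'wav' } },
//   );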
async function synthesizeSegments(segments, options, onSegment, onSentence) {
const logger = new Logger();
options = extendDeep(defaultSynthesisOptions, options);
const totalSynthesisTimeTimer = new Timer();
if (!options.language && !options.voice) {
logger.start('No language or voice specified. Detect language');
let segmentsPlainText = segments;
if (options.ssml) {
segmentsPlainText = [];
for (const segment of segments) {
segmentsPlainText.push(await convertHtmlToText(segment));
}
}
const { detectedLanguage } = await API.detectTextLanguage(segmentsPlainText.join('\n\n'), options.languageDetection || {});
options.language = detectedLanguage;
logger.end();
logger.logTitledMessage('Language detected', formatLanguageCodeWithName(detectedLanguage));
}
if (!options.engine) {
if (options.voice) {
throw new Error(`Voice '${options.voice}' was specified, but no engine was given.`);
}
options.engine = await selectBestOfflineEngineForLanguage(options.language);
logger.logTitledMessage('No engine specified. Auto-selected engine', options.engine);
}
logger.start(`Get voice list for ${options.engine}`);
const { bestMatchingVoice } = await requestVoiceList(options);
if (!bestMatchingVoice) {
throw new Error('No matching voice found');
}
options.voice = bestMatchingVoice.name;
if (!options.language) {
options.language = bestMatchingVoice.languages[0];
}
logger.end();
logger.logTitledMessage('Selected voice', `'${options.voice}' (${formatLanguageCodeWithName(bestMatchingVoice.languages[0], 2)})`);
const segmentsRawAudio = [];
const segmentsTimelines = [];
const timeline = [];
let peakDecibelsSoFar = -100;
let timeOffset = 0;
for (let segmentIndex = 0; segmentIndex < segments.length; segmentIndex++) {
const segmentText = segments[segmentIndex];
logger.log(`\n${chalk.magentaBright(`Synthesizing segment ${segmentIndex + 1}/${segments.length}`)}: '${segmentText.trim()}'`);
const segmentStartTime = timeOffset;
const segmentEntry = {
type: 'segment',
text: segmentText,
startTime: timeOffset,
endTime: -1,
timeline: []
};
let sentences;
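// Sentence splitting is forced for the VITS and Kokoro engines, whose
// synthesizers operate on individual sentences (SSML input bypasses the
// split, but those engines reject SSML anyway).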
if ((options.splitToSentences || options.engine === 'vits' || options.engine === 'kokoro') && !options.ssml) {
const parsedText = await parseText(segmentText, options.language);
sentences = parsedText.sentences.map(sentenceEntry => sentenceEntry.text);
if (sentences.length == 0) {
sentences = [''];
}
}
else {
sentences = [segmentText];
}
const sentencesRawAudio = [];
const sentencesTimelines = [];
for (let sentenceIndex = 0; sentenceIndex < sentences.length; sentenceIndex++) {
await yieldToEventLoop();
if (shouldCancelCurrentTask()) {
throw new Error('Canceled');
}
const sentenceText = sentences[sentenceIndex].trim();
logger.log(`\n${chalk.magentaBright(`Synthesizing sentence ${sentenceIndex + 1}/${sentences.length}`)}: "${sentenceText}"`);
const sentenceStartTime = timeOffset;
let sentenceSynthesisOptions = { postProcessing: { normalizeAudio: false } };
sentenceSynthesisOptions = extendDeep(options, sentenceSynthesisOptions);
const { synthesizedAudio: sentenceRawAudio, timeline: sentenceTimeline } = await synthesizeSegment(sentenceText, sentenceSynthesisOptions);
const endPause = sentenceIndex == sentences.length - 1 ? options.segmentEndPause : options.sentenceEndPause;
sentenceRawAudio.audioChannels[0] = trimAudioEnd(sentenceRawAudio.audioChannels[0], endPause * sentenceRawAudio.sampleRate);
sentencesRawAudio.push(sentenceRawAudio);
if (sentenceTimeline.length > 0) {
sentencesTimelines.push(sentenceTimeline);
}
const sentenceAudioLength = sentenceRawAudio.audioChannels[0].length / sentenceRawAudio.sampleRate;
timeOffset += sentenceAudioLength;
const sentenceTimelineWithOffset = addTimeOffsetToTimeline(sentenceTimeline, sentenceStartTime);
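// Exclude the trailing inter-sentence (or segment-end) pause from the sentence's reported end time.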
const sentenceEndTime = timeOffset - endPause;
segmentEntry.timeline.push({
type: 'sentence',
text: sentenceText,
startTime: sentenceStartTime,
endTime: sentenceEndTime,
timeline: sentenceTimelineWithOffset
});
peakDecibelsSoFar = Math.max(peakDecibelsSoFar, getSamplePeakDecibels(sentenceRawAudio.audioChannels));
const sentenceAudio = await convertToTargetCodecIfNeeded(sentenceRawAudio);
if (onSentence) {
await onSentence({
index: sentenceIndex,
total: sentences.length,
audio: sentenceAudio,
timeline: sentenceTimeline,
transcript: sentenceText,
language: options.language,
peakDecibelsSoFar
});
}
}
segmentEntry.endTime = segmentEntry.timeline?.[segmentEntry.timeline.length - 1]?.endTime || timeOffset;
logger.end();
logger.start(`Merge and postprocess sentences`);
let segmentRawAudio;
if (sentencesRawAudio.length > 0) {
const joinedAudioBuffers = concatAudioSegments(sentencesRawAudio.map(part => part.audioChannels));
segmentRawAudio = { audioChannels: joinedAudioBuffers, sampleRate: sentencesRawAudio[0].sampleRate };
}
else {
segmentRawAudio = getEmptyRawAudio(1, 24000);
}
segmentsRawAudio.push(segmentRawAudio);
timeline.push(segmentEntry);
const segmentTimelineWithoutOffset = addTimeOffsetToTimeline(segmentEntry.timeline, -segmentStartTime);
segmentsTimelines.push(segmentTimelineWithoutOffset);
const segmentAudio = await convertToTargetCodecIfNeeded(segmentRawAudio);
logger.end();
if (onSegment) {
await onSegment({
index: segmentIndex,
total: segments.length,
audio: segmentAudio,
timeline: segmentTimelineWithoutOffset,
transcript: segmentText,
language: options.language,
peakDecibelsSoFar
});
}
}
logger.start(`\nMerge and postprocess segments`);
let resultRawAudio;
if (segmentsRawAudio.length > 0) {
const joinedAudioBuffers = concatAudioSegments(segmentsRawAudio.map(part => part.audioChannels));
resultRawAudio = { audioChannels: joinedAudioBuffers, sampleRate: segmentsRawAudio[0].sampleRate };
if (options.postProcessing.normalizeAudio) {
normalizeAudioLevelInPlace(resultRawAudio, options.postProcessing.targetPeak, options.postProcessing.maxGainIncrease);
}
else {
attenuateIfClippingInPlace(resultRawAudio);
}
}
else {
resultRawAudio = getEmptyRawAudio(1, 24000);
}
async function convertToTargetCodecIfNeeded(rawAudio) {
const targetCodec = options.outputAudioFormat?.codec;
let output;
if (targetCodec) {
logger.start(`Convert to ${targetCodec} codec`);
if (targetCodec == 'wav') {
output = encodeRawAudioToWave(rawAudio);
}
else {
const ffmpegOptions = FFMpegTranscoder.getDefaultFFMpegOptionsForSpeech(targetCodec, options.outputAudioFormat?.bitrate);
output = await FFMpegTranscoder.encodeFromChannels(rawAudio, ffmpegOptions);
}
}
else {
output = rawAudio;
}
return output;
}
const resultAudio = await convertToTargetCodecIfNeeded(resultRawAudio);
logger.end();
logger.logTitledMessage('Total synthesis time', `${totalSynthesisTimeTimer.elapsedTime.toFixed(1)}ms`, chalk.magentaBright);
return {
audio: resultAudio,
timeline,
language: options.language,
voice: options.voice
};
}
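// Illustrative sketch (not part of the original module): receiving
// per-sentence results as they are produced, via the optional `onSentence`
// callback of `synthesize`:
//
//   await synthesize(text, { engine: 'espeak' }, undefined, async (event) => {
//       // `event` carries: index, total, audio, timeline, transcript,
//       // language, and peakDecibelsSoFar.
//       console.log(`Sentence ${event.index + 1}/${event.total}: '${event.transcript}'`);
//   });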
async function synthesizeSegment(text, options) {
const logger = new Logger();
const startTimestamp = logger.getTimestamp();
logger.start('Prepare text for synthesis');
const simplifiedText = simplifyPunctuationCharacters(text);
const engine = options.engine;
logger.start(`Get voice list for ${engine}`);
const { bestMatchingVoice } = await requestVoiceList(options);
if (!bestMatchingVoice) {
throw new Error('No matching voice found');
}
const selectedVoice = bestMatchingVoice;
let voicePackagePath;
if (selectedVoice.packageName) {
logger.end();
voicePackagePath = await loadPackage(selectedVoice.packageName);
}
logger.start(`Initialize ${engine} module`);
const voice = selectedVoice.name;
let language;
if (options.language) {
language = await normalizeIdentifierToLanguageCode(options.language);
}
else {
language = selectedVoice.languages[0];
}
const voiceGender = selectedVoice.gender;
const speed = clip(options.speed, 0.1, 10.0);
const pitch = clip(options.pitch, 0.1, 10.0);
const inputIsSSML = options.ssml;
let synthesizedAudio;
let timeline;
let shouldPostprocessSpeed = false;
let shouldPostprocessPitch = false;
switch (engine) {
case 'vits': {
if (inputIsSSML) {
throw new Error(`The VITS engine doesn't currently support SSML inputs`);
}
let vitsLanguage = language;
if (vitsLanguage == 'en') {
vitsLanguage = 'en-us';
}
const vitsTTS = await import('../synthesis/VitsTTS.js');
const lengthScale = 1 / speed;
const vitsOptions = options.vits;
const speakerId = vitsOptions.speakerId;
if (speakerId != undefined) {
if (selectedVoice.speakerCount == undefined) {
if (speakerId != 0) {
throw new Error('Selected VITS model has only one speaker. Speaker ID must be 0 if specified.');
}
}
else if (speakerId < 0 || speakerId >= selectedVoice.speakerCount) {
throw new Error(`Selected VITS model has ${selectedVoice.speakerCount} speaker IDs. Speaker ID should be in the range 0 to ${selectedVoice.speakerCount - 1}.`);
}
}
const lexicons = await loadLexiconsForLanguage(language, options.customLexiconPaths);
const modelPath = voicePackagePath;
const onnxExecutionProviders = vitsOptions.provider ? [vitsOptions.provider] : [];
logger.end();
const { rawAudio, timeline: outTimeline } = await vitsTTS.synthesizeSentence(text, voice, modelPath, lengthScale, speakerId ?? 0, lexicons, onnxExecutionProviders);
synthesizedAudio = rawAudio;
timeline = outTimeline;
shouldPostprocessPitch = true;
logger.end();
break;
}
case 'kokoro': {
if (inputIsSSML) {
throw new Error(`The Kokoro engine doesn't currently support SSML inputs`);
}
const kokoroOptions = options.kokoro;
const kokoroTTS = await import('../synthesis/KokoroTTS.js');
const lexicons = await loadLexiconsForLanguage(language, options.customLexiconPaths);
const onnxExecutionProviders = kokoroOptions.provider ? [kokoroOptions.provider] : [];
const modelName = kokoroOptions.model;
const modelPackageName = `kokoro-${modelName}`;
const modelPath = await loadPackage(modelPackageName);
const voicesPath = await loadPackage('kokoro-82m-v1.0-voices');
logger.end();
logger.logTitledMessage(`Using model`, modelPackageName);
const { rawAudio, timeline: outTimeline } = await kokoroTTS.synthesizeSentence(text, selectedVoice, speed, lexicons, modelPath, voicesPath, onnxExecutionProviders);
synthesizedAudio = rawAudio;
timeline = outTimeline;
shouldPostprocessPitch = true;
logger.end();
break;
}
case 'pico': {
if (inputIsSSML) {
throw new Error(`The SVOX Pico engine doesn't currently support SSML inputs`);
}
const SvoxPicoTTS = await import('../synthesis/SvoxPicoTTS.js');
const picoSpeed = Math.round(speed * 100);
const picoPitch = Math.round(pitch * 100);
const picoVolume = 35.0;
const preparedText = `<speed level='${picoSpeed}'><pitch level='${picoPitch}'><volume level='${picoVolume}'>${simplifiedText}</volume></pitch></speed>`;
logger.end();
const { textAnalysisFilename, signalGenerationFilename } = SvoxPicoTTS.getResourceFilenamesForLanguage(language);
const resourceFilePath = resolvePath(voicePackagePath, textAnalysisFilename);
const signalGenerationFilePath = resolvePath(voicePackagePath, signalGenerationFilename);
const { rawAudio } = await SvoxPicoTTS.synthesize(preparedText, resourceFilePath, signalGenerationFilePath);
synthesizedAudio = rawAudio;
break;
}
case 'flite': {
if (inputIsSSML) {
throw new Error(`The Flite engine doesn't currently support SSML inputs`);
}
const FliteTTS = await import('../synthesis/FliteTTS.js');
logger.end();
const { rawAudio, events } = await FliteTTS.synthesize(simplifiedText, voice, voicePackagePath, speed);
synthesizedAudio = rawAudio;
shouldPostprocessPitch = true;
break;
}
case 'gnuspeech': {
if (inputIsSSML) {
throw new Error(`The Gnuspeech engine doesn't currently support SSML inputs`);
}
const engineOptions = options.gnuspeech;
const GnuSpeech = await import('../synthesis/GnuSpeechTTS.js');
const { defaultGnuSpeechOptions } = await import('@echogarden/gnuspeech-wasm');
const gnuSpeechOptions = extendDeep(defaultGnuSpeechOptions, engineOptions);
if (!engineOptions.tempo) {
gnuSpeechOptions.tempo = speed;
}
await logger.startAsync(`Synthesize with Gnuspeech`);
const { rawAudio } = await GnuSpeech.synthesize(simplifiedText, gnuSpeechOptions);
synthesizedAudio = rawAudio;
shouldPostprocessPitch = true;
logger.end();
break;
}
case 'espeak': {
const EspeakTTS = await import('../synthesis/EspeakTTS.js');
const engineOptions = options.espeak;
const espeakVoice = voice;
const espeakLanguage = selectedVoice.languages[0];
const espeakRate = engineOptions.rate || speed * 150;
const espeakPitch = engineOptions.pitch || options.pitch * 50;
const espeakPitchRange = engineOptions.pitchRange || options.pitchVariation * 50;
const espeakUseKlatt = engineOptions.useKlatt || false;
const espeakInsertSeparators = engineOptions.insertSeparators || false;
const espeakOptions = {
voice: espeakVoice,
ssml: inputIsSSML,
rate: espeakRate,
pitch: espeakPitch,
pitchRange: espeakPitchRange,
useKlatt: espeakUseKlatt,
insertSeparators: espeakInsertSeparators,
};
if (inputIsSSML) {
logger.end();
const { rawAudio } = await EspeakTTS.synthesize(text, espeakOptions);
synthesizedAudio = rawAudio;
}
else {
const lexicons = await loadLexiconsForLanguage(language, options.customLexiconPaths);
logger.end();
const { referenceSynthesizedAudio, referenceTimeline } = await EspeakTTS.preprocessAndSynthesize(text, espeakLanguage, espeakOptions, lexicons);
synthesizedAudio = referenceSynthesizedAudio;
timeline = referenceTimeline.flatMap(clause => clause.timeline);
}
break;
}
case 'sam': {
if (inputIsSSML) {
throw new Error(`The SAM engine doesn't support SSML inputs`);
}
const SamTTS = await import('../synthesis/SamTTS.js');
const engineOptions = options.sam;
const samPitch = clip(engineOptions.pitch || Math.round((1 / pitch) * 64), 0, 255);
const samSpeed = clip(engineOptions.speed || Math.round((1 / speed) * 72), 0, 255);
const samMouth = clip(engineOptions.mouth, 0, 255);
const samThroat = clip(engineOptions.throat, 0, 255);
logger.end();
const { rawAudio } = await SamTTS.synthesize(simplifiedText, samPitch, samSpeed, samMouth, samThroat);
synthesizedAudio = rawAudio;
break;
}
case 'sapi': {
if (inputIsSSML) {
throw new Error(`The SAPI engine doesn't currently support SSML inputs`);
}
const SapiTTS = await import('../synthesis/SapiTTS.js');
await SapiTTS.AssertSAPIAvailable(false);
const engineOptions = options.sapi;
const sapiRate = engineOptions.rate || 0;
logger.end();
const { rawAudio, timeline: outTimeline } = await SapiTTS.synthesize(text, voice, sapiRate, false);
synthesizedAudio = rawAudio;
timeline = outTimeline;
shouldPostprocessSpeed = true;
shouldPostprocessPitch = true;
break;
}
case 'msspeech': {
if (inputIsSSML) {
throw new Error(`The MSSpeech engine doesn't currently support SSML inputs`);
}
const SapiTTS = await import('../synthesis/SapiTTS.js');
await SapiTTS.AssertSAPIAvailable(true);
const engineOptions = options.msspeech;
const sapiRate = engineOptions.rate || 0;
logger.end();
const { rawAudio, timeline: outTimeline } = await SapiTTS.synthesize(text, voice, sapiRate, true);
synthesizedAudio = rawAudio;
timeline = outTimeline;
shouldPostprocessSpeed = true;
shouldPostprocessPitch = true;
break;
}
case 'coqui-server': {
if (inputIsSSML) {
throw new Error(`The Coqui Server engine doesn't support SSML inputs`);
}
const CoquiServerTTS = await import('../synthesis/CoquiServerTTS.js');
const engineOptions = options.coquiServer;
const speakerId = engineOptions.speakerId;
const serverUrl = engineOptions.serverUrl;
if (!serverUrl) {
throw new Error(`'coqui-server' requires a server URL`);
}
logger.end();
const { rawAudio } = await CoquiServerTTS.synthesize(simplifiedText, speakerId, serverUrl);
synthesizedAudio = rawAudio;
shouldPostprocessSpeed = true;
shouldPostprocessPitch = true;
break;
}
case 'google-cloud': {
const GoogleCloudTTS = await import('../synthesis/GoogleCloudTTS.js');
const engineOptions = options.googleCloud;
const apiKey = engineOptions.apiKey;
if (!apiKey) {
throw new Error(`No Google Cloud API key provided`);
}
let pitchDeltaSemitones;
// 1 semitone up = multiply by 1.05946
// 1 semitone down = divide by 1.05946
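// 17.3132 ≈ 12 / ln(2): multiplying the natural log of a frequency ratio by it gives the ratio in semitones.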
if (engineOptions.pitchDeltaSemitones != undefined) {
pitchDeltaSemitones = engineOptions.pitchDeltaSemitones;
}
else if (pitch >= 1.0) {
pitchDeltaSemitones = Math.round(17.3132 * Math.log(pitch));
}
else {
pitchDeltaSemitones = Math.round(-17.3132 * Math.log(1 / pitch));
}
logger.end();
const { rawAudio, timepoints } = await GoogleCloudTTS.synthesize(text, apiKey, language, voice, speed, pitchDeltaSemitones, 0, inputIsSSML);
synthesizedAudio = rawAudio;
break;
}
case 'microsoft-azure': {
const AzureCognitiveServicesTTS = await import('../synthesis/AzureCognitiveServicesTTS.js');
const engineOptions = options.microsoftAzure;
const subscriptionKey = engineOptions.subscriptionKey;
if (!subscriptionKey) {
throw new Error(`No Microsoft Azure subscription key provided`);
}
const serviceRegion = engineOptions.serviceRegion;
if (!serviceRegion) {
throw new Error(`No Microsoft Azure service region provided`);
}
let ssmlPitch;
if (engineOptions.pitchDeltaHz != undefined) {
if (engineOptions.pitchDeltaHz >= 0) {
ssmlPitch = `+${Math.abs(engineOptions.pitchDeltaHz)}Hz`;
}
else {
ssmlPitch = `-${Math.abs(engineOptions.pitchDeltaHz)}Hz`;
}
}
else {
ssmlPitch = convertPitchScaleToSSMLValueString(pitch, voiceGender);
}
const ssmlRate = convertSpeedScaleToSSMLValueString(speed);
logger.end();
const { rawAudio, timeline: outTimeline } = await AzureCognitiveServicesTTS.synthesize(text, subscriptionKey, serviceRegion, language, voice, inputIsSSML, ssmlPitch, ssmlRate);
synthesizedAudio = rawAudio;
timeline = outTimeline;
break;
}
case 'amazon-polly': {
const AwsPollyTTS = await import('../synthesis/AwsPollyTTS.js');
const engineOptions = options.amazonPolly;
const region = engineOptions.region;
if (!region) {
throw new Error(`No Amazon Polly region provided`);
}
const accessKeyId = engineOptions.accessKeyId;
if (!accessKeyId) {
throw new Error(`No Amazon Polly access key id provided`);
}
const secretAccessKey = engineOptions.secretAccessKey;
if (!secretAccessKey) {
throw new Error(`No Amazon Polly secret access key provided`);
}
const pollyEngine = engineOptions.pollyEngine;
const lexiconNames = engineOptions.lexiconNames;
logger.end();
const { rawAudio } = await AwsPollyTTS.synthesize(text, undefined, voice, region, accessKeyId, secretAccessKey, pollyEngine, inputIsSSML, lexiconNames);
synthesizedAudio = rawAudio;
shouldPostprocessSpeed = true;
shouldPostprocessPitch = true;
break;
}
case 'openai-cloud': {
const OpenAICloudTTS = await import('../synthesis/OpenAICloudTTS.js');
const openAICloudTTSOptions = options.openAICloud;
if (!openAICloudTTSOptions.apiKey) {
throw new Error(`No API key given`);
}
logger.end();
synthesizedAudio = await OpenAICloudTTS.synthesize(text, voice, speed, openAICloudTTSOptions);
shouldPostprocessSpeed = false;
shouldPostprocessPitch = true;
break;
}
case 'elevenlabs': {
if (inputIsSSML) {
throw new Error(`The ElevenLabs engine doesn't support SSML inputs`);
}
const ElevenLabsTTS = await import('../synthesis/ElevenLabsTTS.js');
const engineOptions = options.elevenLabs;
if (!engineOptions.apiKey) {
throw new Error(`No ElevenLabs API key provided`);
}
const voiceId = selectedVoice['elevenLabsVoiceId'];
logger.end();
const { rawAudio, timeline: outTimeline } = await ElevenLabsTTS.synthesize(text, voiceId, language, engineOptions);
synthesizedAudio = rawAudio;
timeline = outTimeline;
shouldPostprocessSpeed = true;
shouldPostprocessPitch = true;
break;
}
case 'deepgram': {
if (inputIsSSML) {
throw new Error(`The Deepgram engine doesn't support SSML inputs`);
}
const DeepgramTTS = await import('../synthesis/DeepgramTTS.js');
const engineOptions = options.deepgram;
if (!engineOptions.apiKey) {
throw new Error(`No Deepgram API key provided`);
}
const modelId = selectedVoice.deepgramModelId;
logger.end();
const { rawAudio } = await DeepgramTTS.synthesize(text, modelId, engineOptions);
synthesizedAudio = rawAudio;
shouldPostprocessSpeed = true;
shouldPostprocessPitch = true;
break;
}
case 'google-translate': {
if (inputIsSSML) {
throw new Error(`The Google Translate engine doesn't support SSML inputs`);
}
const GoogleTranslateTTS = await import('../synthesis/GoogleTranslateTTS.js');
logger.end();
const { rawAudio, timeline: segmentTimeline } = await runOperationWithRetries(() => GoogleTranslateTTS.synthesizeLongText(text, language, options.googleTranslate?.tld, options.sentenceEndPause, options.segmentEndPause), logger);
synthesizedAudio = rawAudio;
logger.start(`Generate word-level timestamps by individually aligning fragments`);
const alignmentOptions = extendDeep(options.alignment, { language });
timeline = await API.alignSegments(synthesizedAudio, segmentTimeline, alignmentOptions);
shouldPostprocessSpeed = true;
shouldPostprocessPitch = true;
break;
}
case 'microsoft-edge': {
if (inputIsSSML) {
throw new Error(`The Microsoft Edge engine doesn't support SSML inputs`);
}
const MicrosoftEdgeTTS = await import('../synthesis/MicrosoftEdgeTTS.js');
const engineOptions = options.microsoftEdge;
const trustedClientToken = engineOptions.trustedClientToken;
if (!trustedClientToken) {
throw new Error('No Microsoft Edge trusted client token provided');
}
if (await sha256AsHex(trustedClientToken) != '558d7c6a7f7db444895946fe23a54ad172fd6d159f46cb34dd4db21bb27c07d7') {
throw new Error('Trusted client token is incorrect.');
}
let ssmlPitch;
if (engineOptions.pitchDeltaHz != undefined) {
if (engineOptions.pitchDeltaHz >= 0) {
ssmlPitch = `+${Math.abs(engineOptions.pitchDeltaHz)}Hz`;
}
else {
ssmlPitch = `-${Math.abs(engineOptions.pitchDeltaHz)}Hz`;
}
}
else {
ssmlPitch = convertPitchScaleToSSMLValueString(pitch, voiceGender);
}
const ssmlRate = convertSpeedScaleToSSMLValueString(speed);
logger.end();
const { rawAudio, timeline: edgeTimeline } = await runOperationWithRetries(() => MicrosoftEdgeTTS.synthesize(text, trustedClientToken, voice, ssmlPitch, ssmlRate), logger);
synthesizedAudio = rawAudio;
timeline = edgeTimeline;
break;
}
case 'streamlabs-polly': {
if (inputIsSSML) {
throw new Error(`The Streamlabs Polly engine doesn't support SSML inputs`);
}
const StreamlabsPollyTTS = await import('../synthesis/StreamlabsPollyTTS.js');
logger.end();
const { rawAudio, timeline: segmentTimeline } = await StreamlabsPollyTTS.synthesizeLongText(text, voice, language, options.sentenceEndPause, options.segmentEndPause);
synthesizedAudio = rawAudio;
logger.start(`Generate word-level timestamps by individually aligning fragments`);
const alignmentOptions = extendDeep(options.alignment, { language });
timeline = await API.alignSegments(synthesizedAudio, segmentTimeline, alignmentOptions);
shouldPostprocessSpeed = true;
shouldPostprocessPitch = true;
break;
}
default: {
throw new Error(`Engine '${options.engine}' is not supported`);
}
}
logger.start('Postprocess synthesized audio');
synthesizedAudio = downmixToMono(synthesizedAudio);
if (options.postProcessing.normalizeAudio) {
normalizeAudioLevelInPlace(synthesizedAudio, options.postProcessing.targetPeak, options.postProcessing.maxGainIncrease);
}
else {
attenuateIfClippingInPlace(synthesizedAudio);
}
const preTrimSampleCount = synthesizedAudio.audioChannels[0].length;
synthesizedAudio.audioChannels[0] = trimAudioStart(synthesizedAudio.audioChannels[0]);
if (timeline) {
const oldDuration = preTrimSampleCount / synthesizedAudio.sampleRate;
const newDuration = synthesizedAudio.audioChannels[0].length / synthesizedAudio.sampleRate;
timeline = addTimeOffsetToTimeline(timeline, newDuration - oldDuration);
}
if (!timeline) {
logger.start('Align synthesized audio with text');
let plainText = text;
if (inputIsSSML) {
plainText = await convertHtmlToText(text);
}
const alignmentOptions = options.alignment;
alignmentOptions.language = language;
if (!alignmentOptions.customLexiconPaths) {
alignmentOptions.customLexiconPaths = options.customLexiconPaths;
}
if (alignmentOptions.dtw.windowDuration == null) {
alignmentOptions.dtw.windowDuration = Math.max(5, Math.ceil(0.2 * getRawAudioDuration(synthesizedAudio)));
}
const { wordTimeline } = await API.align(synthesizedAudio, plainText, alignmentOptions);
timeline = wordTimeline;
logger.end();
}
const postProcessingOptions = options.postProcessing;
let timeStretchFactor = postProcessingOptions.speed;
if (shouldPostprocessSpeed && timeStretchFactor == undefined) {
timeStretchFactor = speed;
}
let pitchShiftFactor = postProcessingOptions.pitch;
if (shouldPostprocessPitch && pitchShiftFactor == undefined) {
pitchShiftFactor = pitch;
}
if ((timeStretchFactor != undefined && timeStretchFactor != 1.0) || (pitchShiftFactor != undefined && pitchShiftFactor != 1.0)) {
logger.start('Apply time and pitch shifting');
timeStretchFactor = timeStretchFactor || 1.0;
pitchShiftFactor = pitchShiftFactor || 1.0;
const timePitchShiftingMethod = postProcessingOptions.timePitchShiftingMethod;
if (timePitchShiftingMethod == 'sonic') {
const sonic = await import('../dsp/Sonic.js');
synthesizedAudio = await sonic.stretchTimePitch(synthesizedAudio, timeStretchFactor, pitchShiftFactor);
}
else if (timePitchShiftingMethod == 'rubberband') {
const rubberband = await import('../dsp/Rubberband.js');
const rubberbandOptions = extendDeep(rubberband.defaultRubberbandOptions, postProcessingOptions.rubberband || {});
synthesizedAudio = await rubberband.stretchTimePitch(synthesizedAudio, timeStretchFactor, pitchShiftFactor, rubberbandOptions);
}
else {
throw new Error(`'${timePitchShiftingMethod}' is not a valid time and pitch shifting method`);
}
if (timeStretchFactor != 1.0 && timeline) {
timeline = multiplyTimelineByFactor(timeline, 1 / timeStretchFactor);
}
}
if (timeline) {
timeline = timeline.filter(entry => isWordOrSymbolWord(entry.text));
}
logger.end();
logger.logDuration('Part synthesis time', startTimestamp, chalk.magentaBright);
return { synthesizedAudio, timeline };
}
function convertSpeedScaleToSSMLValueString(rate) {
if (rate >= 1.0) {
const ratePercentage = Math.floor((rate - 1) * 100);
return `+${ratePercentage}%`;
}
else {
const ratePercentage = Math.floor(((1 / rate) - 1) * 100);
return `-${ratePercentage}%`;
}
}
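// For example: a speed scale of 1.5 maps to '+50%', and 0.8 maps to '-25%'
// (since (1 / 0.8 - 1) * 100 = 25).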
function convertPitchScaleToSSMLValueString(pitch, voiceGender) {
let fundamentalFrequency;
if (voiceGender == 'male') {
// Use an estimate of the average male voice fundamental frequency
fundamentalFrequency = 120;
}
else if (voiceGender == 'female') {
// Use an estimate of the average female voice fundamental frequency
fundamentalFrequency = 210;
}
else {
// (shouldn't occur since all voices should have a gender specified)
// Use the average of the male and female estimates
fundamentalFrequency = 165;
}
if (pitch >= 1.0) {
const pitchDeltaHertz = Math.floor(pitch * fundamentalFrequency) - fundamentalFrequency;
return `+${pitchDeltaHertz}Hz`;
}
else {
const pitchDeltaHertz = fundamentalFrequency - Math.floor(pitch * fundamentalFrequency);
return `-${pitchDeltaHertz}Hz`;
}
}
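// For example: for a male voice (120 Hz base), a pitch scale of 1.1 maps to
// '+12Hz' (floor(1.1 * 120) - 120 = 12); for a female voice (210 Hz base),
// a scale of 0.9 maps to '-21Hz' (210 - floor(0.9 * 210) = 21).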
export const defaultSynthesisOptions = {
engine: undefined,
language: undefined,
voice: undefined,
voiceGender: undefined,
speed: 1.0,
pitch: 1.0,
pitchVariation: 1.0,
ssml: false,
splitToSentences: true,
segmentEndPause: 1.0,
sentenceEndPause: 0.75,
customLexiconPaths: undefined,
plainText: {
paragraphBreaks: 'double',
whitespace: 'collapse'
},
alignment: {
engine: 'dtw',
dtw: {
granularity: 'high'
}
},
postProcessing: {
normalizeAudio: true,
targetPeak: -3,
maxGainIncrease: 30,
speed: undefined,
pitch: undefined,
timePitchShiftingMethod: 'sonic',
rubberband: {}
},
outputAudioFormat: undefined,
languageDetection: undefined,
subtitles: {},
vits: {
speakerId: undefined,
provider: undefined,
},
kokoro: {
model: '82m-v1.0-fp32'
},
pico: {},
flite: {},
gnuspeech: {
debug: false,
},
espeak: {
rate: undefined,
pitch: undefined,
pitchRange: undefined,
useKlatt: false,
},
sam: {
speed: undefined,
pitch: undefined,
mouth: 128,
throat: 128
},
sapi: {
rate: 0,
},
msspeech: {
rate: 0,
},
coquiServer: {
serverUrl: 'http://[::1]:5002',
speakerId: null
},
googleCloud: {
apiKey: undefined,
pitchDeltaSemitones: undefined,
customVoice: {}
},
microsoftAzure: {
subscriptionKey: undefined,
serviceRegion: undefined,
pitchDeltaHz: undefined
},
amazonPolly: {
region: undefined,
accessKeyId: undefined,
secretAccessKey: undefined,
pollyEngine: undefined,
lexiconNames: undefined,
},
openAICloud: {},
elevenLabs: {},
deepgram: {},
googleTranslate: {
tld: 'us'
},
microsoftEdge: {
trustedClientToken: undefined,
pitchDeltaHz: undefined
},
streamlabsPolly: {},
};
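// Illustrative sketch (not part of the original module): user options are
// deep-merged over these defaults via `extendDeep`, so a partial object only
// overrides the leaves it names:
//
//   const merged = extendDeep(defaultSynthesisOptions, {
//       engine: 'kokoro',
//       postProcessing: { targetPeak: -6 },
//   });
//
//   // merged.postProcessing.normalizeAudio remains true, and
//   // merged.postProcessing.maxGainIncrease remains 30.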
/////////////////////////////////////////////////////////////////////////////////////////////
// Voice list request
/////////////////////////////////////////////////////////////////////////////////////////////
export async function requestVoiceList(options) {
options = extendDeep(defaultVoiceListRequestOptions, options);
const logger = new Logger();
const cacheOptions = options.cache;
let cacheDir = cacheOptions?.path;
if (!cacheDir) {
const appDataDir = getAppDataDir(appName);
cacheDir = joinPath(appDataDir, 'voice-list-cache');
await ensureDir(cacheDir);
}
const cacheFilePath = joinPath(cacheDir, `${options.engine}.voices.json`);
async function loadVoiceList() {
let voiceList = [];
switch (options.engine) {
case 'espeak': {
const EspeakTTS = await import('../synthesis/EspeakTTS.js');
const voices = await EspeakTTS.listVoices();
voiceList = voices.map(voice => {
const languages = voice.languages.map(lang => normalizeLanguageCode(lang.name));
for (const language of languages) {
const shortLanguageCode = getShortLanguageCode(language);
if (!languages.includes(shortLanguageCode)) {
languages.push(shortLanguageCode);
}
}
return {
name: voice.identifier,
languages,
gender: 'male'
};
});
break;
}
case 'flite': {
const FliteTTS = await import('../synthesis/FliteTTS.js');
voiceList = deepClone(FliteTTS.voiceList);
break;
}
case 'pico': {
const SvoxPicoTTS = await import('../synthesis/SvoxPicoTTS.js');
voiceList = SvoxPicoTTS.voiceList;
break;
}
case 'gnuspeech': {
const GnuSpeech = await import('../synthesis/GnuSpeechTTS.js');
voiceList = GnuSpeech.voiceList;
break;
}
case 'sam': {
voiceList.push({
name: 'sam',
languages: ['en-US', 'en'],
gender: 'male'
});
break;
}
case 'vits': {
const VitsTTS = await import('../synthesis/VitsTTS.js');
voiceList = VitsTTS.voiceList.map(entry => {
return { ...entry, packageName: `vits-${entry.name}` };
});
break;
}
case 'kokoro': {
const KokoroTTS = await import('../synthesis/KokoroTTS.js');
voiceList = KokoroTTS.voiceList;
break;
}
case 'sapi': {
const SapiTTS = await import('../synthesis/SapiTTS.js');
await SapiTTS.AssertSAPIAvailable(false);
voiceList = await SapiTTS.getVoiceList(false);
break;
}
case 'msspeech': {
const SapiTTS = await import('../synthesis/SapiTTS.js');
await SapiTTS.AssertSAPIAvailable(true);
voiceList = await SapiTTS.getVoiceList(true);
break;
}
case 'coqui-server': {
voiceList = [{
name: 'coqui',
languages: ['en-US'],
gender: 'unknown'
}];
break;
}
case 'google-cloud': {
const GoogleCloudTTS = await import('../synthesis/GoogleCloudTTS.js');
const apiKey = options.googleCloud.apiKey;
if (!apiKey) {
throw new Error(`No Google Cloud API key provided`);
}
const voices = await GoogleCloudTTS.getVoiceList(apiKey);
voiceList = voices.map(voice => ({
name: voice.name,
languages: [normalizeLanguageCode(voice.languageCodes[0]), getShortLanguageCode(voice.languageCodes[0])],
gender: voice.ssmlGender.toLowerCase(),
}));
break;
}
case 'microsoft-azure': {
const AzureCognitiveServicesTTS = await import('../synthesis/AzureCognitiveServicesTTS.js');
const subscriptionKey = options.microsoftAzure.subscriptionKey;
if (!subscriptionKey) {
throw new Error(`No Microsoft Azure subscription key provided`);
}
const serviceRegion = options.microsoftAzure.serviceRegion;
if (!serviceRegion) {
throw new Error(`No Microsoft Azure service region provided`);
}
const voices = await AzureCognitiveServicesTTS.getVoiceList(subscriptionKey, serviceRegion);
for (const voice of voices) {
voiceList.push({
name: voice.name,
languages: [normalizeLanguageCode(voice.locale), getShortLanguageCode(voice.locale)],
gender: voice.gender == 1 ? 'female' : 'male'
});
}
break;
}
case 'amazon-polly': {
const AwsPollyTTS = await import('../synthesis/AwsPollyTTS.js');
const region = options.amazonPolly.region;
if (!region) {
throw new Error(`No Amazon Polly region provided`);
}
const accessKeyId = options.amazonPolly.accessKeyId;
if (!accessKeyId) {
throw new Error(`No Amazon Polly access key id provided`);
}
const secretAccessKey = options.amazonPolly.secretAccessKey;
if (!secretAccessKey) {
throw new Error(`No Amazon Polly secret access key provided`);
}
const voices = await AwsPollyTTS.getVoiceList(region, accessKeyId, secretAccessKey);
for (const voice of voices) {
const languageCode = normalizeLanguageCode(voice.LanguageCode);
const languageCodes = [languageCode, getShortLanguageCode(languageCode)];
if (voice.AdditionalLanguageCodes) {
for (const additionalLanguageCode of voice.AdditionalLanguageCodes) {
languageCodes.push(normalizeLanguageCode(additionalLanguageCode), getShortLanguageCode(additionalLanguageCode));
}
}
voiceList.push({
name: voice.Id,
languages: languageCodes,
gender: voice.Gender.toLowerCase()
});
}
break;
}
case 'openai-cloud': {
const OpenAICloudTTS = await import('../synthesis/OpenAICloudTTS.js');
voiceList = OpenAICloudTTS.voiceList;
break;
}
case 'elevenlabs': {
const ElevenLabsTTS = await import('../synthesis/ElevenLabsTTS.js');
const engineOptions = options.elevenLabs;
const apiKey = engineOptions.apiKey;
if (!apiKey) {
throw new Error(`No ElevenLabs API key provided`);
}
voiceList = await ElevenLabsTTS.getVoiceList(apiKey);
break;
}
case 'deepgram': {
const DeepgramTTS = await import('../synthesis/DeepgramTTS.js');
voiceList = DeepgramTTS.voiceList;
break;
}
case 'google-translate': {
const GoogleTranslateTTS = await import('../synthesis/GoogleTranslateTTS.js');
const langLookup = GoogleTranslateTTS.supportedLanguageLookup;
for (const langCode in langLookup) {
voiceList.push({
name: langLookup[langCode],
languages: langCode.includes('-') ? [normalizeLanguageCode(langCode), getShortLanguageCode(langCode)] : [normalizeLanguageCode(langCode)],
gender: 'unknown'
});
}
break;
}
case 'microsoft-edge': {
const MicrosoftEdgeTTS = await import('../synthesis/MicrosoftEdgeTTS.js');
const trustedClientToken = options.microsoftEdge?.trustedClientToken;
if (!trustedClientToken) {
throw new Error('No Microsoft Edge trusted client token provided');
}
const voices = await runOperationWithRetries(() => MicrosoftEdgeTTS.getVoiceList(trustedClientToken), logger);
voiceList = voices.map((voice) => ({
name: voice.Name,
languages: [normalizeLanguageCode(voice.Locale), getShortLanguageCode(voice.Locale)],
gender: voice.Gender == 'Male' ? 'male' : 'female',
}));
break;
}
case 'streamlabs-polly': {
const StreamlabsPollyTTS = await import('../synthesis/StreamlabsPollyTTS.js');
voiceList = StreamlabsPollyTTS.voiceList;
break;
}
}
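// Cache the freshly retrieved voice list so subsequent requests can reuse it until it expires.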
if (cacheFilePath) {
await writeFileSafe(cacheFilePath, await stringifyAndFormatJson(voiceList));
}
return voiceList;
}
let voiceList;
if (cacheFilePath && existsSync(cacheFilePath) && await isFileIsUpToDate(cacheFilePath, options.cache.duration)) {
voiceList = await readAndParseJsonFile(cacheFilePath);
}
else {
voiceList = await loadVoiceList();
}
const languageCode = await normalizeIdentifierToLanguageCode(options.language || '');
if (languageCode) {
let filteredVoiceList = voiceList.filter(voice => voice.languages.includes(languageCode));
if (filteredVoiceList.length == 0 && languageCode.includes('-')) {
const shortLanguageCode = getShortLanguageCode(languageCode);
filteredVoiceList = voiceList.filter(voice => voice.languages.includes(shortLanguageCode));
}
voiceList = filteredVoiceList;
}
if (options.voiceGender) {
const genderLowercase = options.voiceGender.toLowerCase();
voiceList = voiceList.filter(voice => voice.gender == genderLowercase || voice.gender == 'unknown');
}
if (options.voice) {
const namePatternLowerCase = options.voice.toLocaleLowerCase();
const namePatternParts = namePatternLowerCase.split(/\b/g);
if (namePatternParts.length > 1) {
voiceList = voiceList.filter(voice => voice.name.toLocaleLowerCase().includes(namePatternLowerCase));
}
else {
voiceList = voiceList.filter(voice => {
const name = voice.name.toLocaleLowerCase();