echogarden
Version:
An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.
788 lines • 21.9 kB
JavaScript
import { Logger } from '../utilities/Logger.js';
import { getEmptyRawAudio, getRawAudioDuration } from '../audio/AudioUtilities.js';
import { readAndParseJsonFile, readdir } from '../utilities/FileSystem.js';
import { defaultEspeakOptions } from '../synthesis/EspeakTTS.js';
import { getOnnxSessionOptions } from '../utilities/OnnxUtilities.js';
import { joinPath } from '../utilities/PathUtilities.js';
/**
 * Lookup of loaded `VitsTTS` instances. It is cleared before every insertion, so it never
 * holds more than one entry.
 */
const cachedInstanceLookup = new Map();

/**
 * Synthesizes one sentence with the VITS model found in `modelPath`, reusing the cached
 * `VitsTTS` instance when the model path and execution providers match the previous call.
 *
 * @param {string} text - Sentence text to synthesize.
 * @param {string} voiceName - Voice name, passed to the `VitsTTS` constructor.
 * @param {string} modelPath - Directory containing the ONNX model.
 * @param {number} lengthScale - Length scale forwarded to `VitsTTS.synthesizeSentence`.
 * @param {number} speakerId - Speaker identifier forwarded to `VitsTTS.synthesizeSentence`.
 * @param {Array} lexicons - Lexicons forwarded to `VitsTTS.synthesizeSentence`.
 * @param {Array} executionProviders - ONNX execution providers, passed to the `VitsTTS` constructor.
 * @returns {Promise<object>} Whatever `VitsTTS.synthesizeSentence` resolves to.
 */
export async function synthesizeSentence(text, voiceName, modelPath, lengthScale, speakerId, lexicons, executionProviders) {
    // The instance is constructed with a specific provider list, so the providers must be part
    // of the key. Keying on the model path alone would hand back an instance that was built for
    // different providers.
    const cacheLookupKey = JSON.stringify([modelPath, executionProviders ?? null]);

    let vitsTTS = cachedInstanceLookup.get(cacheLookupKey);

    if (!vitsTTS) {
        vitsTTS = new VitsTTS(voiceName, modelPath, executionProviders);

        // Keep a single instance only: drop whatever was cached before storing the new one
        cachedInstanceLookup.clear();
        cachedInstanceLookup.set(cacheLookupKey, vitsTTS);
    }

    return await vitsTTS.synthesizeSentence(text, lengthScale, speakerId, lexicons);
}
/**
 * Speech synthesizer backed by a VITS ONNX model located in `modelPath`.
 *
 * The ONNX inference session, the model's JSON metadata and the phoneme-to-id map are
 * loaded lazily, on the first call to `synthesizeSentence`.
 */
export class VitsTTS {
    voiceName;
    modelPath;
    executionProviders;
    session;
    metadata;
    phonemeMap;
    // Pending model load shared by overlapping callers; reset once the load settles
    initializationPromise;

    /**
     * @param {string} voiceName - Name of the voice (used in error messages).
     * @param {string} modelPath - Directory that contains the `.onnx` model file.
     * @param {Array} executionProviders - Passed to `getOnnxSessionOptions` when the session is created.
     */
    constructor(voiceName, modelPath, executionProviders) {
        this.voiceName = voiceName;
        this.modelPath = modelPath;
        this.executionProviders = executionProviders;
    }

    /**
     * Synthesizes a single sentence and aligns the result with an eSpeak reference synthesis.
     *
     * @param {string} sentence - Sentence text.
     * @param {number} lengthScale - Multiplied by the model's own `inference.length_scale`.
     * @param {number} [speakerId=0] - Value of the model's `sid` input.
     * @param {Array} [lexicons] - Forwarded to `Espeak.preprocessAndSynthesize`.
     * @returns {Promise<{rawAudio: object, timeline: Array, referenceSynthesizedAudio: object, referenceTimeline: Array}>}
     *   The synthesized audio, its DTW-mapped timeline, and the reference audio and timeline.
     *   All four are empty when the sentence phonemizes to nothing.
     * @throws {Error} If the model cannot be loaded or a required control symbol has no phoneme id.
     */
    async synthesizeSentence(sentence, lengthScale, speakerId = 0, lexicons) {
        const logger = new Logger();

        await this.initializeIfNeeded();

        await logger.startAsync('Prepare for VITS synthesis');

        const metadata = this.metadata;
        const espeakVoice = metadata.espeak.voice;
        const languageCode = espeakVoice;
        const outputSampleRate = metadata.audio.sample_rate;

        // `||` is intentional: a missing or zero base scale falls back to 1.0
        const baseLengthScale = metadata.inference.length_scale || 1.0;
        const effectiveLengthScale = lengthScale * baseLengthScale;

        // Parentheses and em dashes are replaced by commas before phonemization
        const preparedSentence = sentence
            .replaceAll('(', ', ')
            .replaceAll(')', ', ')
            .replaceAll('—', ', ');

        const Espeak = await import('../synthesis/EspeakTTS.js');

        logger.end();

        const espeakOptions = {
            ...defaultEspeakOptions,
            voice: espeakVoice,
            useKlatt: false
        };

        const { referenceSynthesizedAudio, referenceTimeline, phonemizedSentence } =
            await Espeak.preprocessAndSynthesize(preparedSentence, languageCode, espeakOptions, lexicons);

        if (phonemizedSentence.length === 0) {
            logger.end();

            return {
                rawAudio: getEmptyRawAudio(1, outputSampleRate),
                timeline: [],
                referenceSynthesizedAudio: getEmptyRawAudio(1, outputSampleRate),
                referenceTimeline: []
            };
        }

        await logger.startAsync('Encode phonemes to identifiers');

        const ids = this.encodePhonemeIds(phonemizedSentence, preparedSentence);

        // The model inputs are int64 tensors, so the ids are converted to BigInt
        const bigIntIds = new BigInt64Array(ids.map(id => BigInt(id)));
        const idLengths = new BigInt64Array([BigInt(bigIntIds.length)]);

        await logger.startAsync('Generate audio using synthesis model');

        const Onnx = await import('onnxruntime-node');

        const modelInputs = {
            input: new Onnx.Tensor('int64', bigIntIds, [1, bigIntIds.length]),
            input_lengths: new Onnx.Tensor('int64', idLengths, [1]),
            scales: new Onnx.Tensor('float32', [metadata.inference.noise_scale, effectiveLengthScale, metadata.inference.noise_w], [3]),
            sid: new Onnx.Tensor('int64', new BigInt64Array([BigInt(speakerId)]), [1])
        };

        const modelResults = await this.session.run(modelInputs);
        const modelOutputAudioSamples = modelResults['output']['data'];

        const synthesizedAudio = { audioChannels: [modelOutputAudioSamples], sampleRate: outputSampleRate };

        await logger.startAsync('Align with reference synthesized audio');

        const { alignUsingDtw } = await import('../alignment/SpeechAlignment.js');

        const referenceWordTimeline = referenceTimeline.flatMap(phrase => phrase.timeline);

        // Window is 20% of the synthesized audio duration, rounded up, and never below 5
        const dtwWindowDuration = Math.max(5, Math.ceil(0.2 * getRawAudioDuration(synthesizedAudio)));

        const mappedTimeline = await alignUsingDtw(synthesizedAudio, referenceSynthesizedAudio, referenceWordTimeline, ['high'], [dtwWindowDuration]);

        logger.end();

        return { rawAudio: synthesizedAudio, timeline: mappedTimeline, referenceSynthesizedAudio, referenceTimeline };
    }

    /**
     * Converts a phonemized sentence to the flat list of phoneme ids fed to the model.
     *
     * Layout: start symbol, then every phoneme character, then the sentence end breaker and
     * the end symbol. Each emitted symbol except the end symbol is followed by the `_`
     * separator. Phoneme characters that are missing from the map are skipped.
     *
     * @param {string[][][]} phonemizedSentence - Phrases, each a list of words, each a list of phoneme strings.
     * @param {string} sentence - Sentence text; only its ending is inspected, to choose the end breaker.
     * @returns {number[]} Phoneme ids, in model input order.
     * @throws {Error} If a control symbol that has to be emitted has no entry in the phoneme map.
     */
    encodePhonemeIds(phonemizedSentence, sentence) {
        const phonemeMap = this.phonemeMap;

        // Control symbols are mandatory: fail with a clear message rather than spreading `undefined`
        const requireIds = (symbol) => {
            const symbolIds = phonemeMap.get(symbol);

            if (symbolIds == null) {
                throw new Error(`The phoneme id map of voice '${this.voiceName}' has no entry for the control symbol '${symbol}'`);
            }

            return symbolIds;
        };

        const phraseEndBreaker = ',';
        let sentenceEndBreaker = '.';

        if (sentence.endsWith('?') || sentence.endsWith(`?"`)) {
            sentenceEndBreaker = '?';
        }
        else if (sentence.endsWith('!') || sentence.endsWith(`!"`)) {
            sentenceEndBreaker = '!';
        }

        const separatorIds = requireIds('_');
        const startIds = requireIds('^');
        const endIds = requireIds('$');
        const sentenceEndBreakerIds = requireIds(sentenceEndBreaker);

        const ids = [...startIds, ...separatorIds];
        const lastPhraseIndex = phonemizedSentence.length - 1;

        for (const [phraseIndex, phrase] of phonemizedSentence.entries()) {
            const isLastPhrase = phraseIndex === lastPhraseIndex;

            for (const [wordIndex, word] of phrase.entries()) {
                for (const phoneme of word) {
                    for (const phonemeCharacter of phoneme) {
                        const characterIds = phonemeMap.get(phonemeCharacter);

                        if (characterIds == null) {
                            continue;
                        }

                        ids.push(...characterIds, ...separatorIds);
                    }
                }

                // A word separator follows every word except the final word of the sentence,
                // which is followed directly by the sentence end breaker. This includes the
                // words of the last (or only) phrase, which must not be run together.
                const isFinalWordOfSentence = isLastPhrase && wordIndex === phrase.length - 1;

                if (!isFinalWordOfSentence) {
                    ids.push(...requireIds(' '), ...separatorIds);
                }
            }

            if (!isLastPhrase) {
                ids.push(...requireIds(phraseEndBreaker), ...separatorIds);
            }
        }

        ids.push(...sentenceEndBreakerIds, ...separatorIds, ...endIds);

        return ids;
    }

    /**
     * Loads the model on first use. Calls that overlap with a pending load wait for that same
     * load instead of starting another one or observing a partially initialized instance.
     *
     * @returns {Promise<void>}
     * @throws {Error} If no `.onnx` file is found in `modelPath`, or loading fails.
     */
    async initializeIfNeeded() {
        if (this.session) {
            return;
        }

        if (!this.initializationPromise) {
            this.initializationPromise = this.loadModel().finally(() => {
                // Allow a retry after a failed load; after success `session` short-circuits above
                this.initializationPromise = undefined;
            });
        }

        await this.initializationPromise;
    }

    /**
     * Reads the model metadata, builds the phoneme map and creates the ONNX session.
     * Instance fields are assigned only after every step has succeeded.
     *
     * @returns {Promise<void>}
     */
    async loadModel() {
        const logger = new Logger();
        await logger.startAsync('Initialize VITS ONNX synthesis model');

        const Onnx = await import('onnxruntime-node');

        const filesInModelPath = await readdir(this.modelPath);
        const onnxModelFilename = filesInModelPath.find(filename => filename.endsWith('.onnx'));

        if (!onnxModelFilename) {
            throw new Error(`Couldn't find any ONNX model file in ${this.modelPath}`);
        }

        const onnxModelFilepath = joinPath(this.modelPath, onnxModelFilename);

        // Metadata is read before the session is created, so an unreadable metadata file
        // does not leave a created session behind
        const metadata = await readAndParseJsonFile(`${onnxModelFilepath}.json`);
        const phonemeMap = new Map(Object.entries(metadata.phoneme_id_map ?? {}));

        const onnxSessionOptions = getOnnxSessionOptions({ executionProviders: this.executionProviders });
        const session = await Onnx.InferenceSession.create(onnxModelFilepath, onnxSessionOptions);

        // `session` is what `initializeIfNeeded` checks, so it is published last
        this.metadata = metadata;
        this.phonemeMap = phonemeMap;
        this.session = session;

        logger.end();
    }
}
/**
 * Voices available to the VITS engine.
 *
 * Each entry has:
 * - `name`: voice identifier, formed as `<locale>-<voice>-<quality>`.
 * - `languages`: the hyphenated locale tag (for example `en-US`) followed by the bare language code.
 * - `gender`: `'male'`, `'female'` or `'unknown'`.
 * - `speakerCount` (optional): presumably the number of speaker ids the model accepts — confirm against the model metadata.
 *
 * Locale tags always use a hyphen, even though the locale part of `name` uses an underscore.
 */
export const voiceList = [
    { name: 'ar_JO-kareem-low', languages: ['ar-JO', 'ar'], gender: 'male' },
    { name: 'ar_JO-kareem-medium', languages: ['ar-JO', 'ar'], gender: 'male' },

    { name: 'ca_ES-upc_ona-x_low', languages: ['ca-ES', 'ca'], gender: 'female' },
    { name: 'ca_ES-upc_ona-medium', languages: ['ca-ES', 'ca'], gender: 'female' },
    { name: 'ca_ES-upc_pau-x_low', languages: ['ca-ES', 'ca'], gender: 'male' },

    { name: 'cs_CZ-jirka-low', languages: ['cs-CZ', 'cs'], gender: 'male' },
    { name: 'cs_CZ-jirka-medium', languages: ['cs-CZ', 'cs'], gender: 'male' },

    { name: 'cy_GB-gwryw_gogleddol-medium', languages: ['cy-GB', 'cy'], gender: 'male' },

    { name: 'da_DK-nst_talesyntese-medium', languages: ['da-DK', 'da'], gender: 'male' },

    { name: 'de_DE-thorsten-low', languages: ['de-DE', 'de'], gender: 'male' },
    { name: 'de_DE-thorsten-medium', languages: ['de-DE', 'de'], gender: 'male' },
    { name: 'de_DE-thorsten_emotional-medium', languages: ['de-DE', 'de'], gender: 'male', speakerCount: 8 },
    { name: 'de_DE-thorsten-high', languages: ['de-DE', 'de'], gender: 'male' },
    { name: 'de_DE-eva_k-x_low', languages: ['de-DE', 'de'], gender: 'female' },
    { name: 'de_DE-ramona-low', languages: ['de-DE', 'de'], gender: 'female' },
    { name: 'de_DE-pavoque-low', languages: ['de-DE', 'de'], gender: 'male' },
    { name: 'de_DE-kerstin-low', languages: ['de-DE', 'de'], gender: 'female' },
    { name: 'de_DE-karlsson-low', languages: ['de-DE', 'de'], gender: 'male' },
    { name: 'de_DE-mls-medium', languages: ['de-DE', 'de'], gender: 'unknown', speakerCount: 236 },

    { name: 'el_GR-rapunzelina-low', languages: ['el-GR', 'el'], gender: 'female' },

    { name: 'en_GB-alan-low', languages: ['en-GB', 'en'], gender: 'male' },
    { name: 'en_GB-alan-medium', languages: ['en-GB', 'en'], gender: 'male' },
    { name: 'en_GB-semaine-medium', languages: ['en-GB', 'en'], gender: 'unknown', speakerCount: 4 },
    { name: 'en_GB-danny-low', languages: ['en-GB', 'en'], gender: 'male' },
    { name: 'en_GB-alba-medium', languages: ['en-GB', 'en'], gender: 'female' },
    { name: 'en_GB-aru-medium', languages: ['en-GB', 'en'], gender: 'unknown', speakerCount: 12 },
    { name: 'en_GB-southern_english_female-low', languages: ['en-GB', 'en'], gender: 'female' },
    { name: 'en_GB-northern_english_male-medium', languages: ['en-GB', 'en'], gender: 'male' },
    { name: 'en_GB-vctk-medium', languages: ['en-GB', 'en'], gender: 'unknown', speakerCount: 109 },
    { name: 'en_GB-jenny_dioco-medium', languages: ['en-GB', 'en'], gender: 'female' },
    { name: 'en_GB-cori-high', languages: ['en-GB', 'en'], gender: 'female' },
    { name: 'en_GB-cori-medium', languages: ['en-GB', 'en'], gender: 'female' },

    { name: 'en_US-amy-low', languages: ['en-US', 'en'], gender: 'female' },
    { name: 'en_US-amy-medium', languages: ['en-US', 'en'], gender: 'female' },
    { name: 'en_US-kathleen-low', languages: ['en-US', 'en'], gender: 'female' },
    { name: 'en_US-lessac-low', languages: ['en-US', 'en'], gender: 'female' },
    { name: 'en_US-lessac-medium', languages: ['en-US', 'en'], gender: 'female' },
    { name: 'en_US-lessac-high', languages: ['en-US', 'en'], gender: 'female' },
    { name: 'en_US-libritts-high', languages: ['en-US', 'en'], gender: 'unknown', speakerCount: 904 },
    { name: 'en_US-libritts_r-medium', languages: ['en-US', 'en'], gender: 'unknown', speakerCount: 904 },
    { name: 'en_US-ryan-low', languages: ['en-US', 'en'], gender: 'male' },
    { name: 'en_US-ryan-medium', languages: ['en-US', 'en'], gender: 'male' },
    { name: 'en_US-ryan-high', languages: ['en-US', 'en'], gender: 'male' },
    { name: 'en_US-joe-medium', languages: ['en-US', 'en'], gender: 'male' },
    { name: 'en_US-kusal-medium', languages: ['en-US', 'en'], gender: 'male' },
    { name: 'en_US-arctic-medium', languages: ['en-US', 'en'], gender: 'unknown', speakerCount: 18 },
    { name: 'en_US-l2arctic-medium', languages: ['en-US', 'en'], gender: 'unknown', speakerCount: 24 },
    { name: 'en_US-hfc_male-medium', languages: ['en-US', 'en'], gender: 'male' },
    { name: 'en_US-hfc_female-medium', languages: ['en-US', 'en'], gender: 'female' },
    { name: 'en_US-kristin-medium', languages: ['en-US', 'en'], gender: 'female' },
    { name: 'en_US-ljspeech-high', languages: ['en-US', 'en'], gender: 'female' },
    { name: 'en_US-ljspeech-medium', languages: ['en-US', 'en'], gender: 'female' },
    { name: 'en_US-norman-medium', languages: ['en-US', 'en'], gender: 'male' },
    { name: 'en_US-john-medium', languages: ['en-US', 'en'], gender: 'male' },
    { name: 'en_US-bryce-medium', languages: ['en-US', 'en'], gender: 'male' },

    { name: 'es_ES-carlfm-x_low', languages: ['es-ES', 'es'], gender: 'male' },
    { name: 'es_ES-sharvard-medium', languages: ['es-ES', 'es'], gender: 'unknown', speakerCount: 2 },
    { name: 'es_ES-davefx-medium', languages: ['es-ES', 'es'], gender: 'male' },
    { name: 'es_ES-mls_9972-low', languages: ['es-ES', 'es'], gender: 'male' },
    { name: 'es_ES-mls_10246-low', languages: ['es-ES', 'es'], gender: 'male' },

    { name: 'es_MX-ald-medium', languages: ['es-MX', 'es'], gender: 'male' },
    { name: 'es_MX-claude-high', languages: ['es-MX', 'es'], gender: 'female' },

    // Locale tags are hyphenated here like every other entry (previously 'fa_IR')
    { name: 'fa_IR-amir-medium', languages: ['fa-IR', 'fa'], gender: 'male' },
    { name: 'fa_IR-gyro-medium', languages: ['fa-IR', 'fa'], gender: 'male' },

    { name: 'fi_FI-harri-low', languages: ['fi-FI', 'fi'], gender: 'female' },
    { name: 'fi_FI-harri-medium', languages: ['fi-FI', 'fi'], gender: 'female' },

    { name: 'fr_FR-siwis-low', languages: ['fr-FR', 'fr'], gender: 'female' },
    { name: 'fr_FR-siwis-medium', languages: ['fr-FR', 'fr'], gender: 'female' },
    { name: 'fr_FR-mls_1840-low', languages: ['fr-FR', 'fr'], gender: 'male' },
    { name: 'fr_FR-gilles-low', languages: ['fr-FR', 'fr'], gender: 'male' },
    { name: 'fr_FR-upmc-medium', languages: ['fr-FR', 'fr'], gender: 'unknown', speakerCount: 2 },
    { name: 'fr_FR-mls-medium', languages: ['fr-FR', 'fr'], gender: 'unknown', speakerCount: 125 },
    { name: 'fr_FR-tom-medium', languages: ['fr-FR', 'fr'], gender: 'male' },

    { name: 'hu_HU-anna-medium', languages: ['hu-HU', 'hu'], gender: 'female' },
    { name: 'hu_HU-imre-medium', languages: ['hu-HU', 'hu'], gender: 'male' },

    { name: 'is_IS-ugla-medium', languages: ['is-IS', 'is'], gender: 'female' },
    { name: 'is_IS-steinn-medium', languages: ['is-IS', 'is'], gender: 'male' },
    { name: 'is_IS-salka-medium', languages: ['is-IS', 'is'], gender: 'female' },
    { name: 'is_IS-bui-medium', languages: ['is-IS', 'is'], gender: 'male' },

    { name: 'it_IT-riccardo-x_low', languages: ['it-IT', 'it'], gender: 'male' },
    { name: 'it_IT-paola-medium', languages: ['it-IT', 'it'], gender: 'female' },

    { name: 'ka_GE-natia-medium', languages: ['ka-GE', 'ka'], gender: 'female' },

    { name: 'kk_KZ-iseke-x_low', languages: ['kk-KZ', 'kk'], gender: 'male' },
    { name: 'kk_KZ-raya-x_low', languages: ['kk-KZ', 'kk'], gender: 'male' },
    { name: 'kk_KZ-issai-high', languages: ['kk-KZ', 'kk'], gender: 'unknown', speakerCount: 6 },

    { name: 'lb_LU-marylux-medium', languages: ['lb-LU', 'lb'], gender: 'female' },

    { name: 'ne_NP-google-medium', languages: ['ne-NP', 'ne'], gender: 'female', speakerCount: 18 },
    { name: 'ne_NP-google-x_low', languages: ['ne-NP', 'ne'], gender: 'female', speakerCount: 18 },

    { name: 'nl_NL-mls_5809-low', languages: ['nl-NL', 'nl'], gender: 'female' },
    { name: 'nl_NL-mls_7432-low', languages: ['nl-NL', 'nl'], gender: 'female' },
    { name: 'nl_NL-mls-medium', languages: ['nl-NL', 'nl'], gender: 'unknown', speakerCount: 52 },

    { name: 'nl_BE-nathalie-x_low', languages: ['nl-BE', 'nl'], gender: 'female' },
    { name: 'nl_BE-nathalie-medium', languages: ['nl-BE', 'nl'], gender: 'female' },
    { name: 'nl_BE-rdh-medium', languages: ['nl-BE', 'nl'], gender: 'male' },
    { name: 'nl_BE-rdh-x_low', languages: ['nl-BE', 'nl'], gender: 'male' },

    { name: 'no_NO-talesyntese-medium', languages: ['no-NO', 'no'], gender: 'male' },

    { name: 'pl_PL-mls_6892-low', languages: ['pl-PL', 'pl'], gender: 'male' },
    { name: 'pl_PL-darkman-medium', languages: ['pl-PL', 'pl'], gender: 'male' },
    { name: 'pl_PL-gosia-medium', languages: ['pl-PL', 'pl'], gender: 'female' },
    { name: 'pl_PL-mc_speech-medium', languages: ['pl-PL', 'pl'], gender: 'male' },

    { name: 'pt_BR-edresson-low', languages: ['pt-BR', 'pt'], gender: 'male' },
    { name: 'pt_BR-faber-medium', languages: ['pt-BR', 'pt'], gender: 'male' },

    { name: 'pt_PT-tugao-medium', languages: ['pt-PT', 'pt'], gender: 'male' },

    { name: 'ro_RO-mihai-medium', languages: ['ro-RO', 'ro'], gender: 'male' },

    { name: 'ru_RU-ruslan-medium', languages: ['ru-RU', 'ru'], gender: 'male' },
    { name: 'ru_RU-irinia-medium', languages: ['ru-RU', 'ru'], gender: 'female' },
    { name: 'ru_RU-denis-medium', languages: ['ru-RU', 'ru'], gender: 'male' },
    { name: 'ru_RU-dmitri-medium', languages: ['ru-RU', 'ru'], gender: 'male' },

    { name: 'sk_SK-lili-medium', languages: ['sk-SK', 'sk'], gender: 'female' },

    // Locale tag is hyphenated here like every other entry (previously 'sl_SI')
    { name: 'sl_SI-artur-medium', languages: ['sl-SI', 'sl'], gender: 'male' },

    { name: 'sr_RS-serbski_institut-medium', languages: ['sr-RS', 'sr'], gender: 'male', speakerCount: 2 },

    { name: 'sv_SE-nst-medium', languages: ['sv-SE', 'sv'], gender: 'male' },

    { name: 'sw_CD-lanfrica-medium', languages: ['sw-CD', 'sw'], gender: 'male' },

    { name: 'tr_TR-dfki-medium', languages: ['tr-TR', 'tr'], gender: 'male' },
    { name: 'tr_TR-fahrettin-medium', languages: ['tr-TR', 'tr'], gender: 'male' },
    { name: 'tr_TR-fettah-medium', languages: ['tr-TR', 'tr'], gender: 'male' },

    { name: 'uk_UA-lada-x_low', languages: ['uk-UA', 'uk'], gender: 'female' },
    { name: 'uk_UA-ukrainian_tts-medium', languages: ['uk-UA', 'uk'], gender: 'unknown', speakerCount: 3 },

    { name: 'vi_VN-vivos-x_low', languages: ['vi-VN', 'vi'], gender: 'unknown', speakerCount: 65 },
    { name: 'vi_VN-25hours-single-low', languages: ['vi-VN', 'vi'], gender: 'female' },
    { name: 'vi_VN-vais1000-medium', languages: ['vi-VN', 'vi'], gender: 'female' },

    { name: 'zh_CN-huayan-x_low', languages: ['zh-CN', 'zh'], gender: 'female' },
    { name: 'zh_CN-huayan-medium', languages: ['zh-CN', 'zh'], gender: 'female' },
];
//# sourceMappingURL=VitsTTS.js.map