UNPKG

echogarden

Version:

An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.

788 lines 21.9 kB
import { Logger } from '../utilities/Logger.js';
import { getEmptyRawAudio, getRawAudioDuration } from '../audio/AudioUtilities.js';
import { readAndParseJsonFile, readdir } from '../utilities/FileSystem.js';
import { defaultEspeakOptions } from '../synthesis/EspeakTTS.js';
import { getOnnxSessionOptions } from '../utilities/OnnxUtilities.js';
import { joinPath } from '../utilities/PathUtilities.js';

// Holds at most one VitsTTS instance, keyed by model path. The map is cleared
// before a new instance is stored, so a previously cached instance is dropped
// whenever a different model path is requested.
const cachedInstanceLookup = new Map();

/**
 * Synthesizes a single sentence, reusing the cached `VitsTTS` instance when
 * the requested model path matches the cached one.
 *
 * @param {string} text - Sentence text to synthesize.
 * @param {string} voiceName - Voice name stored on a newly created instance.
 * @param {string} modelPath - Directory containing the ONNX model; also used as the cache key.
 * @param {number} lengthScale - Multiplier applied to the model's own length scale.
 * @param {number} speakerId - Speaker identifier passed to the model.
 * @param {*} lexicons - Forwarded unchanged to the eSpeak preprocessing step.
 * @param {*} executionProviders - Forwarded to the ONNX session options of a newly created instance.
 * @returns {Promise<object>} The result of `VitsTTS.synthesizeSentence`.
 */
export async function synthesizeSentence(text, voiceName, modelPath, lengthScale, speakerId, lexicons, executionProviders) {
	const cacheLookupKey = modelPath;

	let vitsTTS = cachedInstanceLookup.get(cacheLookupKey);

	if (!vitsTTS) {
		vitsTTS = new VitsTTS(voiceName, modelPath, executionProviders);

		cachedInstanceLookup.clear();
		cachedInstanceLookup.set(cacheLookupKey, vitsTTS);
	}

	return vitsTTS.synthesizeSentence(text, lengthScale, speakerId, lexicons);
}

/**
 * Looks up the identifier list for a symbol the encoder cannot proceed without.
 *
 * Throws a descriptive error instead of letting a later spread of `undefined`
 * fail with "undefined is not iterable".
 */
function getRequiredPhonemeIds(phonemeMap, symbol) {
	const symbolIds = phonemeMap.get(symbol);

	if (symbolIds == null) {
		throw new Error(`The VITS model's phoneme id map has no entry for the required symbol '${symbol}'`);
	}

	return symbolIds;
}

/**
 * Wraps a VITS ONNX synthesis model found in a model directory.
 *
 * The ONNX session, the model metadata and the phoneme map are loaded lazily
 * on the first call to `synthesizeSentence`.
 */
export class VitsTTS {
	voiceName;
	modelPath;
	executionProviders;
	session;
	metadata;
	phonemeMap;

	constructor(voiceName, modelPath, executionProviders) {
		this.voiceName = voiceName;
		this.modelPath = modelPath;
		this.executionProviders = executionProviders;
	}

	/**
	 * Synthesizes one sentence and aligns the result against a reference
	 * synthesis produced by the eSpeak module.
	 *
	 * @param {string} sentence - Sentence text to synthesize.
	 * @param {number} [lengthScale=1] - Multiplier applied to the metadata's `inference.length_scale`.
	 * @param {number} [speakerId=0] - Speaker identifier passed to the model as `sid`.
	 * @param {*} [lexicons] - Forwarded unchanged to `Espeak.preprocessAndSynthesize`.
	 * @returns {Promise<{rawAudio: object, timeline: Array, referenceSynthesizedAudio: object, referenceTimeline: Array}>}
	 *   The synthesized audio, its aligned timeline, and the reference audio and timeline.
	 *   When phonemization yields nothing, both audio fields are empty and both timelines are `[]`.
	 * @throws {Error} If the phoneme map lacks a symbol that is needed to encode the sentence.
	 */
	async synthesizeSentence(sentence, lengthScale = 1, speakerId = 0, lexicons) {
		const logger = new Logger();

		await this.initializeIfNeeded();

		await logger.startAsync('Prepare for VITS synthesis');

		const metadata = this.metadata;
		const phonemeMap = this.phonemeMap;

		const espeakVoice = metadata.espeak.voice;
		const languageCode = espeakVoice;
		const outputSampleRate = metadata.audio.sample_rate;

		// `||` is deliberate: a missing or zero base scale falls back to 1.0.
		const baseLengthScale = metadata.inference.length_scale || 1.0;
		const effectiveLengthScale = lengthScale * baseLengthScale;

		// Parentheses and em dashes are turned into comma breaks before phonemization.
		const normalizedSentence = sentence
			.replaceAll('(', ', ')
			.replaceAll(')', ', ')
			.replaceAll('—', ', ');

		const Espeak = await import('../synthesis/EspeakTTS.js');

		logger.end();

		const espeakOptions = {
			...defaultEspeakOptions,
			voice: espeakVoice,
			useKlatt: false
		};

		const {
			referenceSynthesizedAudio,
			referenceTimeline,
			phonemizedSentence
		} = await Espeak.preprocessAndSynthesize(normalizedSentence, languageCode, espeakOptions, lexicons);

		// Nothing to synthesize: return empty audio and empty timelines.
		if (phonemizedSentence.length == 0) {
			logger.end();

			return {
				rawAudio: getEmptyRawAudio(1, outputSampleRate),
				timeline: [],
				referenceSynthesizedAudio: getEmptyRawAudio(1, outputSampleRate),
				referenceTimeline: []
			};
		}

		await logger.startAsync('Encode phonemes to identifiers');

		const phraseEndBreaker = ',';
		let sentenceEndBreaker = '.';

		if (normalizedSentence.endsWith('?') || normalizedSentence.endsWith(`?"`)) {
			sentenceEndBreaker = '?';
		} else if (normalizedSentence.endsWith('!') || normalizedSentence.endsWith(`!"`)) {
			sentenceEndBreaker = '!';
		}

		// These symbols are emitted for every sentence, so they are resolved up front.
		const phonemeCharacterSeparatorId = getRequiredPhonemeIds(phonemeMap, '_');
		const startId = getRequiredPhonemeIds(phonemeMap, '^');
		const endId = getRequiredPhonemeIds(phonemeMap, '$');
		const sentenceEndBreakerId = getRequiredPhonemeIds(phonemeMap, sentenceEndBreaker);

		const ids = [...startId, ...phonemeCharacterSeparatorId];

		const lastPhraseIndex = phonemizedSentence.length - 1;

		for (let phraseIndex = 0; phraseIndex < phonemizedSentence.length; phraseIndex++) {
			const phrase = phonemizedSentence[phraseIndex];
			const isLastPhrase = phraseIndex >= lastPhraseIndex;

			for (const word of phrase) {
				for (const phoneme of word) {
					for (const phonemeCharacter of phoneme) {
						const id = phonemeMap.get(phonemeCharacter);

						// Characters the model has no identifier for are skipped.
						if (id == null) {
							continue;
						}

						ids.push(...id, ...phonemeCharacterSeparatorId);
					}
				}

				// NOTE(review): the word separator is only emitted for words in
				// non-final phrases (the condition tests the phrase index, not the
				// word index). Kept as-is because changing it alters the model
				// input - confirm whether this is intended.
				if (!isLastPhrase) {
					ids.push(...getRequiredPhonemeIds(phonemeMap, ' '), ...phonemeCharacterSeparatorId);
				}
			}

			if (!isLastPhrase) {
				ids.push(...getRequiredPhonemeIds(phonemeMap, phraseEndBreaker), ...phonemeCharacterSeparatorId);
			}
		}

		ids.push(...sentenceEndBreakerId, ...phonemeCharacterSeparatorId, ...endId);

		const bigIntIds = new BigInt64Array(ids.map(id => BigInt(id)));
		const idLengths = new BigInt64Array([BigInt(bigIntIds.length)]);

		await logger.startAsync('Generate audio using synthesis model');

		const Onnx = await import('onnxruntime-node');

		const inputTensor = new Onnx.Tensor('int64', bigIntIds, [1, bigIntIds.length]);
		const inputLengthsTensor = new Onnx.Tensor('int64', idLengths, [1]);
		const scalesTensor = new Onnx.Tensor('float32', [metadata.inference.noise_scale, effectiveLengthScale, metadata.inference.noise_w], [3]);
		const speakerIdTensor = new Onnx.Tensor('int64', new BigInt64Array([BigInt(speakerId)]), [1]);

		const modelInputs = {
			input: inputTensor,
			input_lengths: inputLengthsTensor,
			scales: scalesTensor,
			sid: speakerIdTensor
		};

		const modelResults = await this.session.run(modelInputs);
		const modelOutput = modelResults['output'];
		const modelOutputAudioSamples = modelOutput['data'];

		const synthesizedAudio = {
			audioChannels: [modelOutputAudioSamples],
			sampleRate: outputSampleRate
		};

		await logger.startAsync('Align with reference synthesized audio');

		const { alignUsingDtw } = await import('../alignment/SpeechAlignment.js');

		const referenceWordTimeline = referenceTimeline.flatMap(phrase => phrase.timeline);

		// Window is 20% of the synthesized duration (rounded up), but never below 5.
		const dtwWindowDuration = Math.max(5, Math.ceil(0.2 * getRawAudioDuration(synthesizedAudio)));
		const mappedTimeline = await alignUsingDtw(synthesizedAudio, referenceSynthesizedAudio, referenceWordTimeline, ['high'], [dtwWindowDuration]);

		logger.end();

		return {
			rawAudio: synthesizedAudio,
			timeline: mappedTimeline,
			referenceSynthesizedAudio,
			referenceTimeline
		};
	}

	/**
	 * Loads the ONNX session, the metadata JSON (`<model file>.json`) and the
	 * phoneme map, unless a session already exists.
	 *
	 * @throws {Error} If no `.onnx` file is found in the model directory.
	 */
	async initializeIfNeeded() {
		if (this.session) {
			return;
		}

		const logger = new Logger();
		await logger.startAsync('Initialize VITS ONNX synthesis model');

		const Onnx = await import('onnxruntime-node');

		const filesInModelPath = await readdir(this.modelPath);
		const onnxModelFilename = filesInModelPath.find(filename => filename.endsWith('.onnx'));

		if (!onnxModelFilename) {
			throw new Error(`Couldn't find any ONNX model file in ${this.modelPath}`);
		}

		const onnxModelFilepath = joinPath(this.modelPath, onnxModelFilename);

		const onnxSessionOptions = getOnnxSessionOptions({ executionProviders: this.executionProviders });

		const session = await Onnx.InferenceSession.create(onnxModelFilepath, onnxSessionOptions);
		const metadata = await readAndParseJsonFile(`${onnxModelFilepath}.json`);
		const phonemeMap = new Map(Object.entries(metadata.phoneme_id_map ?? {}));

		// Publish state only after everything has loaded. `session` is the
		// "already initialized" marker, so it is assigned last: a failure above
		// leaves the instance uninitialized and the next call retries.
		this.metadata = metadata;
		this.phonemeMap = phonemeMap;
		this.session = session;

		logger.end();
	}
}

// Known voices. Each entry has a `name`, a `languages` list (region-specific
// tag first, then the bare language tag), a `gender`, and, for multi-speaker
// voices, a `speakerCount`.
export const voiceList = [
	{ name: 'ar_JO-kareem-low', languages: ['ar-JO', 'ar'], gender: 'male' },
	{ name: 'ar_JO-kareem-medium', languages: ['ar-JO', 'ar'], gender: 'male' },
	{ name: 'ca_ES-upc_ona-x_low', languages: ['ca-ES', 'ca'], gender: 'female' },
	{ name: 'ca_ES-upc_ona-medium', languages: ['ca-ES', 'ca'], gender: 'female' },
	{ name: 'ca_ES-upc_pau-x_low', languages: ['ca-ES', 'ca'], gender: 'male' },
	{ name: 'cs_CZ-jirka-low', languages: ['cs-CZ', 'cs'], gender: 'male' },
	{ name: 'cs_CZ-jirka-medium', languages: ['cs-CZ', 'cs'], gender: 'male' },
	{ name: 'cy_GB-gwryw_gogleddol-medium', languages: ['cy-GB', 'cy'], gender: 'male' },
	{ name: 'da_DK-nst_talesyntese-medium', languages: ['da-DK', 'da'], gender: 'male' },
	{ name: 'de_DE-thorsten-low', languages: ['de-DE', 'de'], gender: 'male' },
	{ name: 'de_DE-thorsten-medium', languages: ['de-DE', 'de'], gender: 'male' },
	{ name: 'de_DE-thorsten_emotional-medium', languages: ['de-DE', 'de'], gender: 'male', speakerCount: 8 },
	{ name: 'de_DE-thorsten-high', languages: ['de-DE', 'de'], gender: 'male' },
	{ name: 'de_DE-eva_k-x_low', languages: ['de-DE', 'de'], gender: 'female' },
	{ name: 'de_DE-ramona-low', languages: ['de-DE', 'de'], gender: 'female' },
	{ name: 'de_DE-pavoque-low', languages: ['de-DE', 'de'], gender: 'male' },
	{ name: 'de_DE-kerstin-low', languages: ['de-DE', 'de'], gender: 'female' },
	{ name: 'de_DE-karlsson-low', languages: ['de-DE', 'de'], gender: 'male' },
	{ name: 'de_DE-mls-medium', languages: ['de-DE', 'de'], gender: 'unknown', speakerCount: 236 },
	{ name: 'el_GR-rapunzelina-low', languages: ['el-GR', 'el'], gender: 'female' },
	{ name: 'en_GB-alan-low', languages: ['en-GB', 'en'], gender: 'male' },
	{ name: 'en_GB-alan-medium', languages: ['en-GB', 'en'], gender: 'male' },
	{ name: 'en_GB-semaine-medium', languages: ['en-GB', 'en'], gender: 'unknown', speakerCount: 4 },
	{ name: 'en_GB-danny-low', languages: ['en-GB', 'en'], gender: 'male' },
	{ name: 'en_GB-alba-medium', languages: ['en-GB', 'en'], gender: 'female' },
	{ name: 'en_GB-aru-medium', languages: ['en-GB', 'en'], gender: 'unknown', speakerCount: 12 },
	{ name: 'en_GB-southern_english_female-low', languages: ['en-GB', 'en'], gender: 'female' },
	{ name: 'en_GB-northern_english_male-medium', languages: ['en-GB', 'en'], gender: 'male' },
	{ name: 'en_GB-vctk-medium', languages: ['en-GB', 'en'], gender: 'unknown', speakerCount: 109 },
	{ name: 'en_GB-jenny_dioco-medium', languages: ['en-GB', 'en'], gender: 'female' },
	{ name: 'en_GB-cori-high', languages: ['en-GB', 'en'], gender: 'female' },
	{ name: 'en_GB-cori-medium', languages: ['en-GB', 'en'], gender: 'female' },
	{ name: 'en_US-amy-low', languages: ['en-US', 'en'], gender: 'female' },
	{ name: 'en_US-amy-medium', languages: ['en-US', 'en'], gender: 'female' },
	{ name: 'en_US-kathleen-low', languages: ['en-US', 'en'], gender: 'female' },
	{ name: 'en_US-lessac-low', languages: ['en-US', 'en'], gender: 'female' },
	{ name: 'en_US-lessac-medium', languages: ['en-US', 'en'], gender: 'female' },
	{ name: 'en_US-lessac-high', languages: ['en-US', 'en'], gender: 'female' },
	{ name: 'en_US-libritts-high', languages: ['en-US', 'en'], gender: 'unknown', speakerCount: 904 },
	{ name: 'en_US-libritts_r-medium', languages: ['en-US', 'en'], gender: 'unknown', speakerCount: 904 },
	{ name: 'en_US-ryan-low', languages: ['en-US', 'en'], gender: 'male' },
	{ name: 'en_US-ryan-medium', languages: ['en-US', 'en'], gender: 'male' },
	{ name: 'en_US-ryan-high', languages: ['en-US', 'en'], gender: 'male' },
	{ name: 'en_US-joe-medium', languages: ['en-US', 'en'], gender: 'male' },
	{ name: 'en_US-kusal-medium', languages: ['en-US', 'en'], gender: 'male' },
	{ name: 'en_US-arctic-medium', languages: ['en-US', 'en'], gender: 'unknown', speakerCount: 18 },
	{ name: 'en_US-l2arctic-medium', languages: ['en-US', 'en'], gender: 'unknown', speakerCount: 24 },
	{ name: 'en_US-hfc_male-medium', languages: ['en-US', 'en'], gender: 'male' },
	{ name: 'en_US-hfc_female-medium', languages: ['en-US', 'en'], gender: 'female' },
	{ name: 'en_US-kristin-medium', languages: ['en-US', 'en'], gender: 'female' },
	{ name: 'en_US-ljspeech-high', languages: ['en-US', 'en'], gender: 'female' },
	{ name: 'en_US-ljspeech-medium', languages: ['en-US', 'en'], gender: 'female' },
	{ name: 'en_US-norman-medium', languages: ['en-US', 'en'], gender: 'male' },
	{ name: 'en_US-john-medium', languages: ['en-US', 'en'], gender: 'male' },
	{ name: 'en_US-bryce-medium', languages: ['en-US', 'en'], gender: 'male' },
	{ name: 'es_ES-carlfm-x_low', languages: ['es-ES', 'es'], gender: 'male' },
	{ name: 'es_ES-sharvard-medium', languages: ['es-ES', 'es'], gender: 'unknown', speakerCount: 2 },
	{ name: 'es_ES-davefx-medium', languages: ['es-ES', 'es'], gender: 'male' },
	{ name: 'es_ES-mls_9972-low', languages: ['es-ES', 'es'], gender: 'male' },
	{ name: 'es_ES-mls_10246-low', languages: ['es-ES', 'es'], gender: 'male' },
	{ name: 'es_MX-ald-medium', languages: ['es-MX', 'es'], gender: 'male' },
	{ name: 'es_MX-claude-high', languages: ['es-MX', 'es'], gender: 'female' },
	// Language tags use a hyphen (not the underscore of the voice name), like every other entry.
	{ name: 'fa_IR-amir-medium', languages: ['fa-IR', 'fa'], gender: 'male' },
	{ name: 'fa_IR-gyro-medium', languages: ['fa-IR', 'fa'], gender: 'male' },
	{ name: 'fi_FI-harri-low', languages: ['fi-FI', 'fi'], gender: 'female' },
	{ name: 'fi_FI-harri-medium', languages: ['fi-FI', 'fi'], gender: 'female' },
	{ name: 'fr_FR-siwis-low', languages: ['fr-FR', 'fr'], gender: 'female' },
	{ name: 'fr_FR-siwis-medium', languages: ['fr-FR', 'fr'], gender: 'female' },
	{ name: 'fr_FR-mls_1840-low', languages: ['fr-FR', 'fr'], gender: 'male' },
	{ name: 'fr_FR-gilles-low', languages: ['fr-FR', 'fr'], gender: 'male' },
	{ name: 'fr_FR-upmc-medium', languages: ['fr-FR', 'fr'], gender: 'unknown', speakerCount: 2 },
	{ name: 'fr_FR-mls-medium', languages: ['fr-FR', 'fr'], gender: 'unknown', speakerCount: 125 },
	{ name: 'fr_FR-tom-medium', languages: ['fr-FR', 'fr'], gender: 'male' },
	{ name: 'hu_HU-anna-medium', languages: ['hu-HU', 'hu'], gender: 'female' },
	{ name: 'hu_HU-imre-medium', languages: ['hu-HU', 'hu'], gender: 'male' },
	{ name: 'is_IS-ugla-medium', languages: ['is-IS', 'is'], gender: 'female' },
	{ name: 'is_IS-steinn-medium', languages: ['is-IS', 'is'], gender: 'male' },
	{ name: 'is_IS-salka-medium', languages: ['is-IS', 'is'], gender: 'female' },
	{ name: 'is_IS-bui-medium', languages: ['is-IS', 'is'], gender: 'male' },
	{ name: 'it_IT-riccardo-x_low', languages: ['it-IT', 'it'], gender: 'male' },
	{ name: 'it_IT-paola-medium', languages: ['it-IT', 'it'], gender: 'female' },
	{ name: 'ka_GE-natia-medium', languages: ['ka-GE', 'ka'], gender: 'female' },
	{ name: 'kk_KZ-iseke-x_low', languages: ['kk-KZ', 'kk'], gender: 'male' },
	{ name: 'kk_KZ-raya-x_low', languages: ['kk-KZ', 'kk'], gender: 'male' },
	{ name: 'kk_KZ-issai-high', languages: ['kk-KZ', 'kk'], gender: 'unknown', speakerCount: 6 },
	{ name: 'lb_LU-marylux-medium', languages: ['lb-LU', 'lb'], gender: 'female' },
	{ name: 'ne_NP-google-medium', languages: ['ne-NP', 'ne'], gender: 'female', speakerCount: 18 },
	{ name: 'ne_NP-google-x_low', languages: ['ne-NP', 'ne'], gender: 'female', speakerCount: 18 },
	{ name: 'nl_NL-mls_5809-low', languages: ['nl-NL', 'nl'], gender: 'female' },
	{ name: 'nl_NL-mls_7432-low', languages: ['nl-NL', 'nl'], gender: 'female' },
	{ name: 'nl_NL-mls-medium', languages: ['nl-NL', 'nl'], gender: 'unknown', speakerCount: 52 },
	{ name: 'nl_BE-nathalie-x_low', languages: ['nl-BE', 'nl'], gender: 'female' },
	{ name: 'nl_BE-nathalie-medium', languages: ['nl-BE', 'nl'], gender: 'female' },
	{ name: 'nl_BE-rdh-medium', languages: ['nl-BE', 'nl'], gender: 'male' },
	{ name: 'nl_BE-rdh-x_low', languages: ['nl-BE', 'nl'], gender: 'male' },
	{ name: 'no_NO-talesyntese-medium', languages: ['no-NO', 'no'], gender: 'male' },
	{ name: 'pl_PL-mls_6892-low', languages: ['pl-PL', 'pl'], gender: 'male' },
	{ name: 'pl_PL-darkman-medium', languages: ['pl-PL', 'pl'], gender: 'male' },
	{ name: 'pl_PL-gosia-medium', languages: ['pl-PL', 'pl'], gender: 'female' },
	{ name: 'pl_PL-mc_speech-medium', languages: ['pl-PL', 'pl'], gender: 'male' },
	{ name: 'pt_BR-edresson-low', languages: ['pt-BR', 'pt'], gender: 'male' },
	{ name: 'pt_BR-faber-medium', languages: ['pt-BR', 'pt'], gender: 'male' },
	{ name: 'pt_PT-tugao-medium', languages: ['pt-PT', 'pt'], gender: 'male' },
	{ name: 'ro_RO-mihai-medium', languages: ['ro-RO', 'ro'], gender: 'male' },
	{ name: 'ru_RU-ruslan-medium', languages: ['ru-RU', 'ru'], gender: 'male' },
	{ name: 'ru_RU-irinia-medium', languages: ['ru-RU', 'ru'], gender: 'female' },
	{ name: 'ru_RU-denis-medium', languages: ['ru-RU', 'ru'], gender: 'male' },
	{ name: 'ru_RU-dmitri-medium', languages: ['ru-RU', 'ru'], gender: 'male' },
	{ name: 'sk_SK-lili-medium', languages: ['sk-SK', 'sk'], gender: 'female' },
	// Language tag uses a hyphen (not the underscore of the voice name), like every other entry.
	{ name: 'sl_SI-artur-medium', languages: ['sl-SI', 'sl'], gender: 'male' },
	{ name: 'sr_RS-serbski_institut-medium', languages: ['sr-RS', 'sr'], gender: 'male', speakerCount: 2 },
	{ name: 'sv_SE-nst-medium', languages: ['sv-SE', 'sv'], gender: 'male' },
	{ name: 'sw_CD-lanfrica-medium', languages: ['sw-CD', 'sw'], gender: 'male' },
	{ name: 'tr_TR-dfki-medium', languages: ['tr-TR', 'tr'], gender: 'male' },
	{ name: 'tr_TR-fahrettin-medium', languages: ['tr-TR', 'tr'], gender: 'male' },
	{ name: 'tr_TR-fettah-medium', languages: ['tr-TR', 'tr'], gender: 'male' },
	{ name: 'uk_UA-lada-x_low', languages: ['uk-UA', 'uk'], gender: 'female' },
	{ name: 'uk_UA-ukrainian_tts-medium', languages: ['uk-UA', 'uk'], gender: 'unknown', speakerCount: 3 },
	{ name: 'vi_VN-vivos-x_low', languages: ['vi-VN', 'vi'], gender: 'unknown', speakerCount: 65 },
	{ name: 'vi_VN-25hours-single-low', languages: ['vi-VN', 'vi'], gender: 'female' },
	{ name: 'vi_VN-vais1000-medium', languages: ['vi-VN', 'vi'], gender: 'female' },
	{ name: 'zh_CN-huayan-x_low', languages: ['zh-CN', 'zh'], gender: 'female' },
	{ name: 'zh_CN-huayan-medium', languages: ['zh-CN', 'zh'], gender: 'female' },
];
//# sourceMappingURL=VitsTTS.js.map