echogarden
Version:
An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.
659 lines • 19.8 kB
JavaScript
import { Logger } from '../utilities/Logger.js';
import { readdir, readFileAsBinary } from '../utilities/FileSystem.js';
import { joinPath } from '../utilities/PathUtilities.js';
import { getOnnxSessionOptions } from '../utilities/OnnxUtilities.js';
import { concatAudioSegments, getRawAudioDuration } from '../audio/AudioUtilities.js';
import { defaultEspeakOptions } from '../synthesis/EspeakTTS.js';
import { indexOfLastMatchingNumberInRange } from '../utilities/Utilities.js';
import { simplifyPunctuationCharacters } from '../nlp/TextNormalizer.js';
import { getShortLanguageCode } from '../utilities/Locale.js';
import { substituteStringUsingLookup } from '../utilities/StringUtilities.js';
// Single-entry cache of KokoroTTS instances: it is cleared before a new
// instance is stored, so at most one instance (and its ONNX session) is
// retained at a time.
const cachedInstanceLookup = new Map();

/**
 * Synthesizes one sentence with a cached KokoroTTS instance.
 *
 * An instance is reused only when `modelPath`, `voicesPath` and
 * `executionProviders` all match the previous call; otherwise a new instance
 * replaces the cached one.
 *
 * @param text Sentence text to synthesize.
 * @param voice Voice descriptor, forwarded to KokoroTTS.synthesizeSentence.
 * @param speed Speed value, forwarded to KokoroTTS.synthesizeSentence.
 * @param lexicons Lexicons, forwarded to KokoroTTS.synthesizeSentence.
 * @param modelPath Model directory passed to the KokoroTTS constructor.
 * @param voicesPath Voices directory passed to the KokoroTTS constructor.
 * @param executionProviders Execution providers passed to the KokoroTTS constructor.
 * @returns Whatever KokoroTTS.synthesizeSentence resolves to.
 */
export async function synthesizeSentence(text, voice, speed, lexicons, modelPath, voicesPath, executionProviders) {
    // JSON-encode the key so that (a) paths containing spaces cannot collide
    // and (b) a change of execution providers is not silently ignored by
    // reusing an instance that was created for different providers.
    // Assumes executionProviders is JSON-serializable (e.g. an array of
    // strings, or undefined) - TODO confirm against callers.
    const cacheLookupKey = JSON.stringify([modelPath, voicesPath, executionProviders]);

    let kokoroTTS = cachedInstanceLookup.get(cacheLookupKey);

    if (!kokoroTTS) {
        kokoroTTS = new KokoroTTS(modelPath, voicesPath, executionProviders);

        cachedInstanceLookup.clear();
        cachedInstanceLookup.set(cacheLookupKey, kokoroTTS);
    }

    return kokoroTTS.synthesizeSentence(text, voice, speed, lexicons);
}
/**
 * Kokoro speech synthesizer running on an ONNX model.
 *
 * The ONNX session is created lazily on first use and kept on the instance.
 * Text is phonemized with eSpeak, converted to Kokoro token IDs, synthesized
 * in one or more parts, and finally aligned against the eSpeak reference
 * audio to obtain a timeline.
 */
export class KokoroTTS {
    modelPath;
    voicesPath;
    executionProviders;
    // ONNX inference session; undefined until initializeSessionIfNeeded() succeeds.
    session;

    /**
     * @param modelPath Directory that is searched for a `.onnx` model file.
     * @param voicesPath Directory containing the per-voice `.bin` files.
     * @param executionProviders Forwarded to getOnnxSessionOptions().
     */
    constructor(modelPath, voicesPath, executionProviders) {
        this.modelPath = modelPath;
        this.voicesPath = voicesPath;
        this.executionProviders = executionProviders;
    }

    /**
     * Synthesizes a single sentence.
     *
     * @param sentenceText Text of the sentence.
     * @param voice Object with `name` and `languages`; matched against `voiceList`
     *   by name and primary language (`languages[0]`).
     * @param speed Number passed to the model's `speed` input.
     * @param lexicons Forwarded to Espeak.preprocessAndSynthesize().
     * @returns `{ rawAudio, timeline }`, where `rawAudio` is
     *   `{ audioChannels, sampleRate: 24000 }` and `timeline` is the result of
     *   alignUsingDtw().
     * @throws Error when the voice is unknown, or its language or gender is unsupported.
     */
    async synthesizeSentence(sentenceText, voice, speed, lexicons) {
        await this.initializeSessionIfNeeded();

        const logger = new Logger();
        const Onnx = await import('onnxruntime-node');

        const voiceEntry = voiceList.find(entry => entry.name === voice.name && entry.languages[0] === voice.languages[0]);

        if (!voiceEntry) {
            throw new Error(`Kokoro voice '${voice.name}' was not found.`);
        }

        const voicePrimaryLanguage = voice.languages[0];
        const voicePrimaryLanguageShort = getShortLanguageCode(voicePrimaryLanguage);

        // Used only to detect the sentence-ending punctuation. It is derived from
        // the text BEFORE parentheses are replaced below: previously it was derived
        // afterwards, so a sentence ending in `?)` or `!)` had already become `?,`
        // / `!,` and the `?)` / `!)` checks could never match.
        const simplifiedSentenceText = simplifyPunctuationCharacters(sentenceText.trim());

        // Parentheses and em-dashes are turned into comma pauses for the phonemizer.
        const phonemizerInputText = sentenceText
            .replaceAll('(', ', ')
            .replaceAll(')', ', ')
            .replaceAll('—', ', ');

        const voiceLanguage = voiceEntry.languages[0];
        const voiceGender = voiceEntry.gender;
        const voiceId = voiceEntry.name.toLocaleLowerCase();

        // Voice files are named `<language letter><m|f>_<voice id>.bin`
        let filenamePrefix = filenameLanguagePrefixLookup[voiceLanguage];

        if (filenamePrefix === undefined) {
            throw new Error(`Unsupported voice language '${voiceLanguage}'`);
        }

        if (voiceGender === 'male') {
            filenamePrefix += 'm';
        }
        else if (voiceGender === 'female') {
            filenamePrefix += 'f';
        }
        else {
            throw new Error(`Unsupported voice gender '${voiceGender}'`);
        }

        const voicePath = joinPath(this.voicesPath, `${filenamePrefix}_${voiceId}.bin`);

        // The voice file is read as a flat Float32 array, sliced below into
        // style vectors of `styleLength` floats.
        // NOTE(review): assumes the returned binary view starts at offset 0 of
        // its underlying buffer - confirm against readFileAsBinary.
        const voiceFile = new Float32Array((await readFileAsBinary(voicePath)).buffer);

        const Espeak = await import('../synthesis/EspeakTTS.js');

        const espeakOptions = {
            ...defaultEspeakOptions,
            voice: languageToEspeakVoice[voiceLanguage],
            useKlatt: false
        };

        const { referenceSynthesizedAudio, referenceTimeline, phonemizedSentence } = await Espeak.preprocessAndSynthesize(phonemizerInputText, voiceLanguage, espeakOptions, lexicons);

        // No logger.startAsync() precedes this call on this logger instance.
        // Kept to preserve existing behavior - TODO confirm it is a no-op.
        logger.end();

        const phraseBreakTokenId = charToTokenIDLookup[','];
        const wordBreakTokenId = charToTokenIDLookup[' '];

        // The token sequence is always terminated by '?', '!' or '.'
        let sentenceEndChar;

        if (simplifiedSentenceText.endsWith('?') || simplifiedSentenceText.endsWith(`?"`) || simplifiedSentenceText.endsWith(`?)`)) {
            sentenceEndChar = '?';
        }
        else if (simplifiedSentenceText.endsWith('!') || simplifiedSentenceText.endsWith(`!"`) || simplifiedSentenceText.endsWith(`!)`)) {
            sentenceEndChar = '!';
        }
        else {
            sentenceEndChar = '.';
        }

        const sentenceEndTokenId = charToTokenIDLookup[sentenceEndChar];

        // Convert phonemes (phrases -> words -> phonemes) to a flat list of token IDs
        const allTokenIds = [];

        for (let phraseIndex = 0; phraseIndex < phonemizedSentence.length; phraseIndex++) {
            const phrase = phonemizedSentence[phraseIndex];

            for (let wordIndex = 0; wordIndex < phrase.length; wordIndex++) {
                const word = phrase[wordIndex];

                for (const phoneme of word) {
                    let processedPhoneme = phoneme;

                    if (voicePrimaryLanguageShort === 'en') {
                        // Detach a leading stress mark so it doesn't interfere with the substitutions
                        let stressMark;

                        if (phoneme[0] === 'ˈ' || phoneme[0] === 'ˌ') {
                            stressMark = phoneme[0];
                            processedPhoneme = phoneme.substring(1);
                        }

                        // Apply English dialect specific substitutions
                        if (voicePrimaryLanguage === 'en-GB') {
                            processedPhoneme = substituteStringUsingLookup(processedPhoneme, britishEnglishESpeakToMisakiSubstitutions);
                        }
                        else {
                            processedPhoneme = substituteStringUsingLookup(processedPhoneme, americanEnglishESpeakToMisakiSubstitutions);
                            // Length marks are dropped for non-British English
                            processedPhoneme = processedPhoneme.replaceAll('ː', '');
                        }

                        // Apply substitutions common to all English dialects
                        processedPhoneme = substituteStringUsingLookup(processedPhoneme, englishESpeakToMisakiSubstitutions);

                        // Reattach the stress mark
                        if (stressMark !== undefined) {
                            processedPhoneme = stressMark + processedPhoneme;
                        }
                    }

                    // Tokenize per character; characters without a token ID are skipped
                    for (const phonemeCharacter of processedPhoneme) {
                        const id = charToTokenIDLookup[phonemeCharacter];

                        if (id !== undefined) {
                            allTokenIds.push(id);
                        }
                    }
                }

                if (wordIndex < phrase.length - 1) {
                    // Between words of the same phrase
                    allTokenIds.push(wordBreakTokenId);
                }
                else if (phraseIndex < phonemizedSentence.length - 1) {
                    // Between phrases: a comma followed by a space
                    allTokenIds.push(phraseBreakTokenId);
                    allTokenIds.push(wordBreakTokenId);
                }
            }
        }

        allTokenIds.push(sentenceEndTokenId);

        // Split the tokens into parts of at most `maxPartLength` tokens, preferring
        // to cut after a phrase break, then after a word break.
        // NOTE(review): 509 is presumably the model's input limit minus the two
        // padding tokens added to each part - confirm.
        const maxPartLength = 509;
        const breakTokenIds = [wordBreakTokenId, phraseBreakTokenId, sentenceEndTokenId];
        const parts = [];

        {
            let startIndex = 0;
            let endIndex = 0;

            while (endIndex < allTokenIds.length) {
                endIndex = startIndex + maxPartLength;

                if (endIndex >= allTokenIds.length) {
                    endIndex = allTokenIds.length;
                }
                else {
                    const indexOfLastPhraseBreak = indexOfLastMatchingNumberInRange(allTokenIds, phraseBreakTokenId, startIndex, endIndex);

                    if (indexOfLastPhraseBreak !== -1) {
                        endIndex = indexOfLastPhraseBreak + 1;
                    }
                    else {
                        const indexOfLastWordBreak = indexOfLastMatchingNumberInRange(allTokenIds, wordBreakTokenId, startIndex, endIndex);

                        if (indexOfLastWordBreak !== -1) {
                            endIndex = indexOfLastWordBreak + 1;
                        }
                    }
                }

                const partTokenIds = allTokenIds.slice(startIndex, endIndex);

                // Don't end a part on a dangling comma
                if (partTokenIds[partTokenIds.length - 1] === phraseBreakTokenId) {
                    partTokenIds.pop();
                }

                // Skip parts made only of breaks / end punctuation. Each kept part
                // is wrapped in padding tokens (ID 0).
                if (partTokenIds.some(tokenId => !breakTokenIds.includes(tokenId))) {
                    parts.push([0, ...partTokenIds, 0]);
                }

                startIndex = endIndex;
            }
        }

        const styleLength = 256;
        const sampleRate = 24000;

        const audioParts = [];

        for (let partIndex = 0; partIndex < parts.length; partIndex++) {
            if (parts.length === 1) {
                await logger.startAsync(`Synthesize sentence with ONNX model`);
            }
            else {
                await logger.startAsync(`Synthesize sentence fragment ${partIndex + 1}/${parts.length} with ONNX model`);
            }

            const part = parts[partIndex];

            // The style vector is selected by the length of the part. Clamp the index
            // to the vectors actually present in the voice file: a long part could
            // otherwise slice past the end and produce a short (or empty) style
            // array that doesn't match the declared [1, styleLength] shape.
            // NOTE(review): the index uses the padded length (part.length); confirm
            // whether the voice files are meant to be indexed by the unpadded count.
            const styleVectorCount = Math.floor(voiceFile.length / styleLength);

            if (styleVectorCount === 0) {
                throw new Error(`Voice file '${voicePath}' doesn't contain a complete style vector`);
            }

            const styleVectorIndex = Math.min(part.length, styleVectorCount - 1);
            const styleDataStartOffset = styleVectorIndex * styleLength;
            const styleData = voiceFile.slice(styleDataStartOffset, styleDataStartOffset + styleLength);

            const modelInputs = {
                input_ids: new Onnx.Tensor('int64', BigInt64Array.from(part.map(x => BigInt(x))), [1, part.length]),
                style: new Onnx.Tensor('float32', styleData, [1, styleLength]),
                speed: new Onnx.Tensor('float32', [speed], [1]),
            };

            const result = await this.session.run(modelInputs);
            const outSamples = result.waveform.data;

            // Each part is a single-channel audio segment
            audioParts.push([outSamples]);
        }

        logger.end();

        const concatenatedAudioParts = audioParts.length > 0 ? concatAudioSegments(audioParts) : [new Float32Array(0)];
        const synthesizedAudio = { audioChannels: concatenatedAudioParts, sampleRate };

        // Map the eSpeak reference word timeline onto the Kokoro audio
        await logger.startAsync('Align with reference synthesized audio');

        const { alignUsingDtw } = await import('../alignment/SpeechAlignment.js');

        const referenceWordTimeline = referenceTimeline.flatMap(phrase => phrase.timeline);

        // Window is 20% of the synthesized duration (rounded up), but at least 5
        const dtwWindowDuration = Math.max(5, Math.ceil(0.2 * getRawAudioDuration(synthesizedAudio)));

        const mappedTimeline = await alignUsingDtw(synthesizedAudio, referenceSynthesizedAudio, referenceWordTimeline, ['high'], [dtwWindowDuration]);

        logger.end();

        return { rawAudio: synthesizedAudio, timeline: mappedTimeline };
    }

    /**
     * Creates the ONNX inference session from the first `.onnx` file found in
     * `modelPath`, unless a session already exists.
     *
     * @throws Error when `modelPath` contains no `.onnx` file.
     */
    async initializeSessionIfNeeded() {
        if (this.session) {
            return;
        }

        const logger = new Logger();
        await logger.startAsync('Initialize Kokoro ONNX synthesis model');

        const filesInModelPath = await readdir(this.modelPath);
        const onnxModelFilename = filesInModelPath.find(filename => filename.endsWith('.onnx'));

        if (!onnxModelFilename) {
            throw new Error(`Couldn't find any ONNX model file in ${this.modelPath}`);
        }

        const onnxModelFilepath = joinPath(this.modelPath, onnxModelFilename);
        const onnxSessionOptions = getOnnxSessionOptions({ executionProviders: this.executionProviders });

        const Onnx = await import('onnxruntime-node');
        this.session = await Onnx.InferenceSession.create(onnxModelFilepath, onnxSessionOptions);

        logger.end();
    }
}
// Maps a single character (punctuation, Latin letter, IPA symbol or
// modifier) to a numeric token ID.
//
// KokoroTTS.synthesizeSentence looks up every character of each processed
// phoneme here and silently skips characters that have no entry. The IDs of
// ',' and ' ' are also used as the phrase-break and word-break tokens, and
// the ID of '.', '!' or '?' terminates the token sequence.
//
// The IDs are not contiguous, and ID 0 is not assigned to any character
// (synthesizeSentence uses 0 as the padding token around each part).
// NOTE(review): the values presumably mirror the Kokoro model's vocabulary -
// confirm against the model's tokenizer configuration before editing.
const charToTokenIDLookup = {
';': 1,
':': 2,
',': 3,
'.': 4,
'!': 5,
'?': 6,
'—': 9,
'…': 10,
'\'': 11,
'(': 12,
')': 13,
'“': 14,
'”': 15,
' ': 16,
'\u0303': 17,
'ʣ': 18,
'ʥ': 19,
'ʦ': 20,
'ʨ': 21,
'ᵝ': 22,
'\uAB67': 23,
'A': 24,
'I': 25,
'O': 31,
'Q': 33,
'S': 35,
'T': 36,
'W': 39,
'Y': 41,
'ᵊ': 42,
'a': 43,
'b': 44,
'c': 45,
'd': 46,
'e': 47,
'f': 48,
'h': 50,
'i': 51,
'j': 52,
'k': 53,
'l': 54,
'm': 55,
'n': 56,
'o': 57,
'p': 58,
'q': 59,
'r': 60,
's': 61,
't': 62,
'u': 63,
'v': 64,
'w': 65,
'x': 66,
'y': 67,
'z': 68,
'ɑ': 69,
'ɐ': 70,
'ɒ': 71,
'æ': 72,
'β': 75,
'ɔ': 76,
'ɕ': 77,
'ç': 78,
'ɖ': 80,
'ð': 81,
'ʤ': 82,
'ə': 83,
'ɚ': 85,
'ɛ': 86,
'ɜ': 87,
'ɟ': 90,
'ɡ': 92,
'ɥ': 99,
'ɨ': 101,
'ɪ': 102,
'ʝ': 103,
'ɯ': 110,
'ɰ': 111,
'ŋ': 112,
'ɳ': 113,
'ɲ': 114,
'ɴ': 115,
'ø': 116,
'ɸ': 118,
'θ': 119,
'œ': 120,
'ɹ': 123,
'ɾ': 125,
'ɻ': 126,
'ʁ': 128,
'ɽ': 129,
'ʂ': 130,
'ʃ': 131,
'ʈ': 132,
'ʧ': 133,
'ʊ': 135,
'ʋ': 136,
'ʌ': 138,
'ɣ': 139,
'ɤ': 140,
'χ': 142,
'ʎ': 143,
'ʒ': 147,
'ʔ': 148,
'ˈ': 156,
'ˌ': 157,
'ː': 158,
'ʰ': 162,
'ʲ': 164,
'↓': 169,
'→': 171,
'↗': 172,
'↘': 173,
'ᵻ': 177
};
// Phoneme substitutions applied to every phoneme of an English voice
// (both dialects), via substituteStringUsingLookup.
//
// In KokoroTTS.synthesizeSentence this table is applied AFTER the
// dialect-specific table, and with any leading stress mark temporarily
// removed from the phoneme. Keys are source sequences and values are their
// replacements; an empty value removes the sequence.
// NOTE(review): several keys overlap (e.g. 'ʔ' / 'ʔn', 'ʲ' / 'ʲO'). How
// overlaps are resolved depends on substituteStringUsingLookup, which is not
// visible here - keep the key order unchanged unless that is confirmed.
const englishESpeakToMisakiSubstitutions = {
//'aɪ': 'I', // Disable since model doesn't seem to work well with this mapping in some cases, like "I am"
'aɪə': 'Iə',
'aɪɚ': 'Iəɹ',
'aʊ': 'W',
'dʒ': 'ʤ',
'e': 'A',
'eɪ': 'A',
'r': 'ɹ',
'tʃ': 'ʧ',
'x': 'k',
'ç': 'k',
'ɐ': 'ə',
'ɔɪ': 'Y',
'əl': 'ᵊl',
'ɚ': 'əɹ',
'ɬ': 'l',
'ʔ': 't',
'ʔn': 'tᵊn',
'ʔˌn\u0329': 'tᵊn',
'ʲ': '',
'ʲO': 'jO',
'ʲQ': 'jQ',
// Make these substitutions regardless of dialect:
'əʊ': 'Q',
'oʊ': 'O',
};
// Phoneme substitutions applied only when the voice's primary language is
// exactly 'en-GB'. KokoroTTS.synthesizeSentence applies this table before
// the substitutions shared by all English voices.
const britishEnglishESpeakToMisakiSubstitutions = {
'eə': 'ɛː',
'iə': 'ɪə',
// Not applied here: the shared English table already maps 'əʊ' to 'Q' for every dialect.
//'əʊ': 'Q',
};
// Phoneme substitutions applied to English voices whose primary language is
// anything other than 'en-GB'. KokoroTTS.synthesizeSentence applies this
// table first, then strips all remaining 'ː' length marks, and only then
// applies the substitutions shared by all English voices.
const americanEnglishESpeakToMisakiSubstitutions = {
// Not applied here: the shared English table already maps 'oʊ' to 'O' for every dialect.
//'oʊ': 'O',
'ɜːɹ': 'ɜɹ',
'ɜː': 'ɜɹ',
'ɪə': 'iə',
};
// First letter of a voice's `.bin` filename, keyed by the voice's primary
// language. A gender letter ('m' or 'f') is appended to it when the voice
// file path is built.
const filenameLanguagePrefixLookup = Object.fromEntries([
    ['en-US', 'a'],
    ['en-GB', 'b'],
    ['es-ES', 'e'],
    ['fr-FR', 'f'],
    ['hi-IN', 'h'],
    ['it-IT', 'i'],
    ['ja-JP', 'j'],
    ['pt-BR', 'p'],
    ['zh-CN', 'z'],
]);
// eSpeak voice identifier used for phonemization and reference synthesis,
// keyed by the Kokoro voice's primary language.
const languageToEspeakVoice = Object.fromEntries([
    ['en-US', 'en-us'],
    ['en-GB', 'en-gb-x-rp'],
    ['es-ES', 'es-419'],
    ['fr-FR', 'fr'],
    ['hi-IN', 'hi'],
    ['it-IT', 'it'],
    ['ja-JP', 'ja'],
    ['pt-BR', 'pt-br'],
    ['zh-CN', 'cmn'],
]);
// All Kokoro voices known to this module. Every entry has the shape
// `{ name, languages: [locale, shortLanguageCode], gender }`.
//
// The list is expanded from compact [locale, gender, names] groups; group
// order and name order determine the order of the resulting entries.
// Voice names are unique only together with the locale (for example 'Santa'
// exists for en-US, es-ES and pt-BR).
export const voiceList = [
    // US English voices
    ['en-US', 'female', ['Heart', 'Bella', 'Nicole', 'Aoede', 'Kore', 'Sarah', 'Nova', 'Sky', 'Alloy', 'Jessica', 'River']],
    ['en-US', 'male', ['Michael', 'Fenrir', 'Puck', 'Echo', 'Eric', 'Liam', 'Onyx', 'Santa', 'Adam']],

    // UK English voices
    ['en-GB', 'female', ['Emma', 'Isabella', 'Alice', 'Lily']],
    ['en-GB', 'male', ['George', 'Fable', 'Lewis', 'Daniel']],

    // Spanish (Spain) voices
    ['es-ES', 'female', ['Dora']],
    ['es-ES', 'male', ['Alex', 'Santa']],

    // French (France) voices
    ['fr-FR', 'female', ['Siwis']],

    // Hindi (India) voices
    ['hi-IN', 'female', ['Alpha', 'Beta']],
    ['hi-IN', 'male', ['Omega', 'Psi']],

    // Italian (Italy) voices
    ['it-IT', 'female', ['Sara']],
    ['it-IT', 'male', ['Nicola']],

    // Portuguese (Brazil) voices
    ['pt-BR', 'female', ['Dora']],
    ['pt-BR', 'male', ['Alex', 'Santa']],

    // Chinese (China) voices
    ['zh-CN', 'female', ['Xiaobei', 'Xiaoni', 'Xiaoxiao', 'Xiaoyi']],
    ['zh-CN', 'male', ['Yunjian', 'Yunxi', 'Yunxia', 'Yunyang']],
].flatMap(([locale, gender, names]) => names.map(name => ({
    name,
    // Full locale first, then its language part (e.g. 'en-US' -> 'en')
    languages: [locale, locale.split('-')[0]],
    gender,
})));
//# sourceMappingURL=KokoroTTS.js.map