echogarden
Version:
An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.
441 lines • 19.6 kB
JavaScript
import { concatFloat32Arrays, logToStderr, formatObjectToString } from '../utilities/Utilities.js';
import { int16PcmToFloat32 } from '../audio/AudioBufferConversion.js';
import { Logger } from '../utilities/Logger.js';
import { WasmMemoryManager } from '../utilities/WasmMemoryManager.js';
import { getEmptyRawAudio } from '../audio/AudioUtilities.js';
import { getNormalizedFragmentsForSpeech, simplifyPunctuationCharacters } from '../nlp/TextNormalizer.js';
import { ipaPhoneToKirshenbaum } from '../nlp/PhoneConversion.js';
import { splitToWords, wordCharacterPattern } from '../nlp/Segmentation.js';
import { tryGetFirstLexiconSubstitution } from '../nlp/Lexicon.js';
import { phonemizeSentence } from '../nlp/EspeakPhonemizer.js';
import { extendDeep } from '../utilities/ObjectUtilities.js';
import { escapeHtml } from '../encodings/HtmlEscape.js';
// Shorthand for the stderr logger; used by testKirshenbaumPhonemization() to report mismatches.
const log = logToStderr;
// eSpeak worker object, created on first use by getEspeakInstance() and reused afterwards.
let espeakInstance;
// The initialized eSpeak module object, stored by getEspeakInstance() together with the worker.
// textToPhonemes() passes it to WasmMemoryManager to read a native string result.
let espeakModule;
/**
 * Tokenizes and normalizes the given text, applies lexicon substitutions,
 * synthesizes the resulting fragments with eSpeak and collects the phonemes
 * reported for each word.
 *
 * @param {string} text - Text to process.
 * @param {string} language - Language code; passed to the word splitter and normalizer.
 * @param {object} espeakOptions - Overrides merged over `defaultEspeakOptions`.
 * @param {Array} [lexicons=[]] - Lexicons searched for phoneme substitutions.
 * @returns {Promise<object>} An object with `referenceSynthesizedAudio`, `referenceTimeline`,
 *   `fragments`, `preprocessedFragments`, `phonemizedFragmentsSubstitutions` (a Map of
 *   fragment index to substituted phonemes) and `phonemizedSentence`.
 */
export async function preprocessAndSynthesize(text, language, espeakOptions, lexicons = []) {
    const logger = new Logger();
    espeakOptions = extendDeep(defaultEspeakOptions, espeakOptions);

    await logger.startAsync('Tokenize and analyze text');

    // The lexicon lookup uses 'en-gb-x-rp' in place of plain 'en-gb'
    const loweredLanguage = language.toLowerCase();
    const lowerCaseLanguageCode = loweredLanguage === 'en-gb' ? 'en-gb-x-rp' : loweredLanguage;

    const phonemizedFragmentsSubstitutions = new Map();

    const rawWords = await splitToWords(text, language);

    // Merge repeating non-words to a single word to work around eSpeak bug
    const mergedWords = [];
    let previousWord;
    for (let index = 0; index < rawWords.length; index++) {
        const word = rawWords[index];
        // Squared brackets are never merged: works around an eSpeak-NG marker bug with repeating brackets.
        // The pattern test stays last so it only runs when all earlier conditions hold.
        const isMergeableRepeat = index > 0 &&
            word === previousWord &&
            word !== '[' &&
            word !== ']' &&
            !wordCharacterPattern.test(word);

        if (isMergeableRepeat) {
            mergedWords[mergedWords.length - 1] += word;
        } else {
            mergedWords.push(word);
        }
        previousWord = word;
    }

    // Trim words and remove words containing only whitespace
    const words = mergedWords
        .map(word => word.trim())
        .filter(word => word !== '');

    const { normalizedFragments, referenceFragments } = getNormalizedFragmentsForSpeech(words, language);
    const simplifiedFragments = normalizedFragments.map(
        fragment => simplifyPunctuationCharacters(fragment).toLocaleLowerCase()
    );

    // A leading apostrophe fragment is replaced by the `()` placeholder
    if (simplifiedFragments[0] === `'`) {
        normalizedFragments[0] = `()`;
    }

    // Replace fragments that have a lexicon entry with an inline Kirshenbaum phoneme block
    for (let fragmentIndex = 0; fragmentIndex < normalizedFragments.length; fragmentIndex++) {
        const fragment = normalizedFragments[fragmentIndex];
        const substitutionPhonemes = tryGetFirstLexiconSubstitution(simplifiedFragments, fragmentIndex, lexicons, lowerCaseLanguageCode);

        if (substitutionPhonemes) {
            phonemizedFragmentsSubstitutions.set(fragmentIndex, substitutionPhonemes);

            // The reference phonemizations are only used for the log message below
            const ipaFromEspeak = await textToPhonemes(fragment, espeakOptions.voice, true);
            const referenceIPA = ipaFromEspeak.replaceAll('_', ' ');
            const kirshenbaumFromEspeak = await textToPhonemes(fragment, espeakOptions.voice, false);
            const referenceKirshenbaum = kirshenbaumFromEspeak.replaceAll('_', '');

            const kirshenbaumPhonemes = substitutionPhonemes.map(phone => ipaPhoneToKirshenbaum(phone)).join('');

            logger.logTitledMessage(`\nLexicon substitution for '${fragment}'`, `IPA: ${substitutionPhonemes.join(' ')} (original: ${referenceIPA}), Kirshenbaum: ${kirshenbaumPhonemes} (reference: ${referenceKirshenbaum})`);

            normalizedFragments[fragmentIndex] = ` [[${kirshenbaumPhonemes}]] `;
        }
    }

    const fragments = referenceFragments;
    const preprocessedFragments = normalizedFragments;

    logger.start('Synthesize preprocessed fragments with eSpeak');
    const synthesisResult = await synthesizeFragments(preprocessedFragments, espeakOptions);
    const referenceSynthesizedAudio = synthesisResult.rawAudio;
    const referenceTimeline = synthesisResult.timeline;

    await logger.startAsync('Build phonemized tokens');

    // One entry per clause; each clause holds one phoneme list per token (or per substituted word)
    const phonemizedSentence = [];
    let wordIndex = 0;

    for (const phraseEntry of referenceTimeline) {
        const phrase = [];

        for (const wordEntry of phraseEntry.timeline) {
            // Restore the reference (pre-normalization) text on the timeline entry
            wordEntry.text = fragments[wordIndex];

            if (phonemizedFragmentsSubstitutions.has(wordIndex)) {
                phrase.push(phonemizedFragmentsSubstitutions.get(wordIndex));
            } else {
                for (const tokenEntry of wordEntry.timeline) {
                    const tokenPhonemes = tokenEntry.timeline
                        .map(phoneEntry => phoneEntry.text)
                        .filter(phoneText => phoneText);

                    if (tokenPhonemes.length > 0) {
                        phrase.push(tokenPhonemes);
                    }
                }
            }

            wordIndex += 1;
        }

        if (phrase.length > 0) {
            phonemizedSentence.push(phrase);
        }
    }

    logger.log(phonemizedSentence.map(phrase => phrase.map(word => word.join(' ')).join(' | ')).join(' || '));
    logger.end();

    return { referenceSynthesizedAudio, referenceTimeline, fragments, preprocessedFragments, phonemizedFragmentsSubstitutions, phonemizedSentence };
}
/**
 * Synthesizes a list of text fragments with eSpeak as a single SSML utterance and
 * builds a nested clause > word > token > phone timeline from the emitted events.
 *
 * Each fragment is wrapped in `<mark name="s-N"/>` / `<mark name="e-N"/>` markers so
 * that the phoneme events reported between them can be attributed to fragment N.
 *
 * @param {string[]} fragments - Text fragments (words) to synthesize, in order.
 * @param {object} espeakOptions - Overrides merged over `defaultEspeakOptions`.
 * @returns {Promise<{rawAudio: object, timeline: object[], events: object[]}>}
 *   The synthesized audio, the clause timeline and the eSpeak events. For an empty
 *   fragment list, empty audio and empty arrays are returned without calling eSpeak.
 * @throws {Error} If a start/end marker index does not match the word currently being
 *   processed, if phones were recorded for a word before its start marker, or if a
 *   word entry unexpectedly has no token timeline.
 */
export async function synthesizeFragments(fragments, espeakOptions) {
    espeakOptions = extendDeep(defaultEspeakOptions, espeakOptions);
    const voice = espeakOptions.voice;
    const sampleRate = await getSampleRate();

    if (fragments.length === 0) {
        return {
            rawAudio: getEmptyRawAudio(1, sampleRate),
            timeline: [],
            events: []
        };
    }

    // The ` | ` separator is never inserted for these voices
    const canInsertSeparators = !['roa/an', 'art/eo', 'trk/ky', 'zlw/pl', 'zle/uk'].includes(voice);

    let textWithMarkers;
    if (canInsertSeparators) {
        textWithMarkers = `() | `;
    }
    else {
        textWithMarkers = `() `;
    }

    for (let i = 0; i < fragments.length; i++) {
        let fragment = fragments[i];
        fragment = simplifyPunctuationCharacters(fragment);

        // The text is synthesized as SSML below, so literal angle brackets are replaced
        fragment = fragment.replaceAll('<', '_').replaceAll('>', '_');

        // A fragment consisting only of colons (this also matches an empty fragment) becomes a comma
        if (fragment.split('').every(c => c === ':')) {
            fragment = ',';
        }

        if (espeakOptions.insertSeparators && canInsertSeparators) {
            const separator = ` | `;
            textWithMarkers += `<mark name="s-${i}"/>${separator}${fragment}${separator}<mark name="e-${i}"/>`;
        }
        else {
            if (fragment.endsWith('.')) {
                fragment += ' ()';
            }
            textWithMarkers += `<mark name="s-${i}"/>${fragment}<mark name="e-${i}"/> `;
        }
    }

    const { rawAudio, events } = await synthesize(textWithMarkers, { ...espeakOptions, ssml: true });

    // Add first marker if missing
    // (fragments is known to be non-empty here because of the early return above)
    const firstMarkerEvent = events.find(event => event.type === 'mark');
    if (firstMarkerEvent && firstMarkerEvent.id === 'e-0') {
        events.unshift({
            type: 'mark',
            text_position: 0,
            word_length: 0,
            audio_position: 0,
            id: 's-0',
        });
    }

    // Build word timeline from events
    // Every word starts with one empty token; -1 marks a time that has not been set yet
    const wordTimeline = fragments.map(word => ({
        type: 'word',
        text: word,
        startTime: -1,
        endTime: -1,
        timeline: [{
                type: 'token',
                text: '',
                startTime: -1,
                endTime: -1,
                timeline: []
            }]
    }));

    let wordIndex = 0;
    const clauseEndIndexes = [];

    for (const event of events) {
        // audio_position is divided by 1000 to get the time value stored in the timeline
        const eventTime = event.audio_position / 1000;

        const currentWordEntry = wordTimeline[wordIndex];
        const currentTokenTimeline = currentWordEntry.timeline;
        const currentTokenEntry = currentTokenTimeline[currentTokenTimeline.length - 1];
        const currentPhoneTimeline = currentTokenEntry.timeline;
        const lastPhoneEntry = currentPhoneTimeline[currentPhoneTimeline.length - 1];

        // Any event closes the phone that is still open
        if (lastPhoneEntry && lastPhoneEntry.endTime === -1) {
            lastPhoneEntry.endTime = eventTime;
        }

        if (event.type === 'word') {
            // Only start a new token when the current one has already received phones
            if (!event.id || currentPhoneTimeline.length === 0) {
                continue;
            }

            if (currentTokenEntry.endTime === -1) {
                currentTokenEntry.endTime = eventTime;
            }

            currentTokenTimeline.push({
                type: 'token',
                text: '',
                startTime: eventTime,
                endTime: -1,
                timeline: []
            });
        }
        else if (event.type === 'phoneme') {
            const phoneText = event.id;

            // Skip empty phoneme ids and ids starting with '('
            if (!phoneText || phoneText.startsWith('(')) {
                continue;
            }

            currentPhoneTimeline.push({
                type: 'phone',
                text: phoneText,
                startTime: eventTime,
                endTime: -1
            });

            currentTokenEntry.text += phoneText;
            currentTokenEntry.startTime = currentPhoneTimeline[0].startTime;
        }
        else if (event.type === 'mark') {
            const markerName = event.id;

            if (markerName.startsWith('s-')) {
                // Marker names are generated above as decimal indexes, so parse with an explicit radix
                const markerIndex = Number.parseInt(markerName.substring(2), 10);

                if (markerIndex !== wordIndex) {
                    throw new Error(`Word start marker for index ${wordIndex} is not consistent with word index. The words were: ${formatObjectToString(fragments)}`);
                }

                if (currentPhoneTimeline.length > 0) {
                    throw new Error(`Word entry ${wordIndex} already has phones before its start marker was seen. The words were: ${formatObjectToString(fragments)}`);
                }

                currentWordEntry.startTime = eventTime;
                currentTokenEntry.startTime = eventTime;
            }
            else if (markerName.startsWith('e-')) {
                const markerIndex = Number.parseInt(markerName.substring(2), 10);

                if (markerIndex !== wordIndex) {
                    throw new Error(`Word end marker for index ${wordIndex} is not consistent with word index. The words were: ${formatObjectToString(fragments)}`);
                }

                currentWordEntry.startTime = currentTokenTimeline[0].startTime;
                currentWordEntry.endTime = eventTime;
                currentTokenEntry.endTime = eventTime;

                wordIndex += 1;

                // All words are complete; remaining events are ignored
                if (wordIndex === wordTimeline.length) {
                    break;
                }
            }
            else {
                continue;
            }
        }
        else if (event.type === 'end') {
            clauseEndIndexes.push(wordIndex);
        }
    }

    clauseEndIndexes.push(wordTimeline.length);

    // Split compound tokens
    // A word that received no phones may have had its phones attributed to the previous word.
    // If the tail of the previous word matches this word's own phonemization, move it over.
    for (const [index, wordEntry] of wordTimeline.entries()) {
        const tokenTimeline = wordEntry.timeline;

        if (index === 0) {
            continue;
        }

        if (!tokenTimeline || tokenTimeline.length === 0) {
            throw new Error('Unexpected: token timeline should exist and have at least one token');
        }

        // Only words with a single, still-empty token are candidates
        if (tokenTimeline.length !== 1 || tokenTimeline[0].text !== '') {
            continue;
        }

        const wordReferencePhonemes = (await textToPhonemes(wordEntry.text, espeakOptions.voice, true)).split('_');
        const wordReferenceIPA = wordReferencePhonemes.join(' ');

        if (wordReferenceIPA.trim().length === 0) {
            continue;
        }

        const wordReferenceIPAWithoutStress = wordReferenceIPA.replaceAll('ˈ', '').replaceAll('ˌ', '');

        const previousWordEntry = wordTimeline[index - 1];

        if (!previousWordEntry.timeline) {
            continue;
        }

        const previousWordTokenEntry = previousWordEntry.timeline[previousWordEntry.timeline.length - 1];

        if (!previousWordTokenEntry.timeline) {
            continue;
        }

        const previousWordTokenIPAWithoutStress = previousWordTokenEntry.timeline.map(phoneEntry => phoneEntry.text.replaceAll('ˈ', '').replaceAll('ˌ', '')).join(' ');

        // Case 1: the previous word's entire last token matches this word, so move the whole token
        if (previousWordEntry.timeline.length > 1 && previousWordTokenIPAWithoutStress === wordReferenceIPAWithoutStress) {
            tokenTimeline.pop();

            const tokenEntryToInsert = previousWordEntry.timeline.pop();
            tokenTimeline.push(tokenEntryToInsert);

            previousWordEntry.endTime = previousWordEntry.timeline[previousWordEntry.timeline.length - 1].endTime;

            wordEntry.startTime = tokenEntryToInsert.startTime;
            wordEntry.endTime = tokenEntryToInsert.endTime;

            continue;
        }

        // Case 2: only the trailing phones of the previous word's last token match; the previous
        // token must keep at least one phone of its own
        if (previousWordTokenEntry.timeline.length <= wordReferencePhonemes.length) {
            continue;
        }

        if (!previousWordTokenIPAWithoutStress.endsWith(wordReferenceIPAWithoutStress)) {
            continue;
        }

        const tokenEntry = tokenTimeline[0];

        // Move the trailing phones into this word's token and recompute both tokens' text and times
        tokenEntry.timeline = previousWordTokenEntry.timeline.splice(previousWordTokenEntry.timeline.length - wordReferencePhonemes.length);
        tokenEntry.text = tokenEntry.timeline.map(phoneEntry => phoneEntry.text).join('');
        tokenEntry.startTime = tokenEntry.timeline[0].startTime;
        tokenEntry.endTime = tokenEntry.timeline[tokenEntry.timeline.length - 1].endTime;

        wordEntry.startTime = tokenEntry.startTime;
        wordEntry.endTime = tokenEntry.endTime;

        previousWordTokenEntry.text = previousWordTokenEntry.timeline.map(phoneEntry => phoneEntry.text).join('');
        previousWordTokenEntry.endTime = previousWordTokenEntry.timeline[previousWordTokenEntry.timeline.length - 1].endTime;
        previousWordEntry.endTime = previousWordTokenEntry.endTime;
    }

    // Build clause timeline
    const clauseTimeline = [];
    let clauseStartIndex = 0;

    for (const clauseEndIndex of clauseEndIndexes) {
        const newClause = {
            type: 'clause',
            text: '',
            startTime: -1,
            endTime: -1,
            timeline: []
        };

        // The recorded end index is treated as inclusive, bounded by the number of words
        for (let entryIndex = clauseStartIndex; entryIndex <= clauseEndIndex && entryIndex < wordTimeline.length; entryIndex++) {
            const wordEntry = wordTimeline[entryIndex];

            if (newClause.startTime === -1) {
                newClause.startTime = wordEntry.startTime;
            }

            newClause.endTime = wordEntry.endTime;
            newClause.text += `${wordEntry.text} `;
            newClause.timeline.push(wordEntry);
        }

        // Empty clauses (e.g. repeated end indexes) are dropped and do not advance the start index
        if (newClause.timeline.length > 0) {
            clauseTimeline.push(newClause);
            clauseStartIndex = clauseEndIndex + 1;
        }
    }

    return { rawAudio, timeline: clauseTimeline, events };
}
/**
 * Synthesizes text with eSpeak and returns the audio together with all events
 * reported during synthesis.
 *
 * @param {string} text - Text to synthesize. Unless `espeakOptions.ssml` is set, it is
 *   passed through `escapeHtml` first.
 * @param {object} espeakOptions - Overrides merged over `defaultEspeakOptions`
 *   (`voice`, `ssml`, `rate`, `pitch`, `pitchRange`, `useKlatt`).
 * @returns {Promise<{rawAudio: {audioChannels: Float32Array[], sampleRate: number}, events: object[]}>}
 *   Single-channel audio and the collected events. Events of type `'word'` get an added
 *   `text` property holding the matching substring of the synthesized text.
 */
export async function synthesize(text, espeakOptions) {
    const logger = new Logger();
    espeakOptions = extendDeep(defaultEspeakOptions, espeakOptions);

    logger.start('Get eSpeak Emscripten instance');

    if (!espeakOptions.ssml) {
        text = escapeHtml(text);
    }

    const { instance } = await getEspeakInstance();

    // Use the module's single source of truth for the sample rate instead of repeating the constant
    const sampleRate = await getSampleRate();

    const sampleChunks = [];
    const allEvents = [];

    logger.start('Synthesize with eSpeak');

    if (espeakOptions.useKlatt) {
        await setVoice(`${espeakOptions.voice}+klatt6`);
    }
    else {
        await setVoice(espeakOptions.voice);
    }

    await setRate(espeakOptions.rate);
    await setPitch(espeakOptions.pitch);
    await setPitchRange(espeakOptions.pitchRange);

    instance.synthesize(text, (samples, events) => {
        if (samples && samples.length > 0) {
            sampleChunks.push(int16PcmToFloat32(samples));
        }

        for (const event of events) {
            if (event.type === 'word') {
                // NOTE(review): text_position looks 1-based here (1 is subtracted) - confirm against eSpeak
                const textPosition = event.text_position - 1;
                event.text = text.substring(textPosition, textPosition + event.word_length);
            }

            // Push one by one: spreading a large batch into push() can exceed the
            // engine's argument-count limit and throw a RangeError
            allEvents.push(event);
        }
    });

    const concatenatedSamples = concatFloat32Arrays(sampleChunks);
    const rawAudio = { audioChannels: [concatenatedSamples], sampleRate };

    logger.end();

    return { rawAudio, events: allEvents };
}
/**
 * Converts text to IPA using the given eSpeak voice.
 *
 * @param {string} text - Text to convert.
 * @param {string} voice - eSpeak voice identifier to select before converting.
 * @returns {Promise<string>} The trimmed `ipa` string reported by eSpeak.
 */
export async function textToIPA(text, voice) {
    await setVoice(voice);

    const espeak = (await getEspeakInstance()).instance;
    const { ipa } = espeak.synthesize_ipa(text);

    return ipa.trim();
}
/**
 * Converts text to a phoneme string using the given eSpeak voice.
 *
 * @param {string} text - Text to convert.
 * @param {string} voice - eSpeak voice identifier to select before converting.
 * @param {boolean} [useIPA=true] - Forwarded to eSpeak's `convert_to_phonemes`; callers in
 *   this file pass `true` for IPA output and `false` for Kirshenbaum output.
 * @returns {Promise<string>} The phoneme string read from the native result.
 */
export async function textToPhonemes(text, voice, useIPA = true) {
    await setVoice(voice);

    const { instance, module } = await getEspeakInstance();
    const textPtr = instance.convert_to_phonemes(text, useIPA);

    const wasmMemory = new WasmMemoryManager(module);

    try {
        const resultRef = wasmMemory.wrapNullTerminatedUtf8String(textPtr.ptr);

        return resultRef.getValue();
    }
    finally {
        // Always release what the memory manager tracks, even if wrapping or decoding throws
        wasmMemory.freeAll();
    }
}
// Identifier of the voice most recently applied through setVoice()
let lastVoiceId;

/**
 * Selects the eSpeak voice, skipping the call when that voice is already the
 * one most recently set through this function.
 *
 * @param {string} voiceId - eSpeak voice identifier.
 * @returns {Promise<void>}
 */
export async function setVoice(voiceId) {
    const espeak = (await getEspeakInstance()).instance;

    if (voiceId === lastVoiceId) {
        return;
    }

    espeak.set_voice(voiceId);
    lastVoiceId = voiceId;
}
/**
 * Sets the synthesis volume on the shared eSpeak instance.
 *
 * @param {number} volume - Value forwarded to the instance's `setVolume` method.
 * @returns {Promise<*>} Whatever the underlying call returns.
 */
export async function setVolume(volume) {
    const espeak = (await getEspeakInstance()).instance;

    return espeak.setVolume(volume);
}
/**
 * Sets the speech rate on the shared eSpeak instance.
 *
 * @param {number} rate - Value forwarded to the instance's `set_rate` method.
 * @returns {Promise<*>} Whatever the underlying call returns.
 */
export async function setRate(rate) {
    const espeak = (await getEspeakInstance()).instance;

    return espeak.set_rate(rate);
}
/**
 * Sets the pitch on the shared eSpeak instance.
 *
 * @param {number} pitch - Value forwarded to the instance's `set_pitch` method.
 * @returns {Promise<*>} Whatever the underlying call returns.
 */
export async function setPitch(pitch) {
    const espeak = (await getEspeakInstance()).instance;

    return espeak.set_pitch(pitch);
}
/**
 * Sets the pitch range on the shared eSpeak instance.
 *
 * @param {number} pitchRange - Value forwarded to the instance's `set_range` method.
 * @returns {Promise<*>} Whatever the underlying call returns.
 */
export async function setPitchRange(pitchRange) {
    const espeak = (await getEspeakInstance()).instance;

    return espeak.set_range(pitchRange);
}
/**
 * Returns the sample rate used for audio produced by this module.
 *
 * @returns {Promise<number>} Always 22050.
 */
export async function getSampleRate() {
    const outputSampleRate = 22050;

    return outputSampleRate;
}
/**
 * Lists the voices reported by the shared eSpeak instance.
 *
 * @returns {Promise<*>} The result of the instance's `list_voices` method.
 */
export async function listVoices() {
    const espeak = (await getEspeakInstance()).instance;

    return espeak.list_voices();
}
/**
 * Returns the shared eSpeak worker and its module, creating them on first use.
 *
 * The package is imported dynamically, so it is only loaded when first needed.
 *
 * @returns {Promise<{instance: object, module: object}>} The cached worker and module.
 */
async function getEspeakInstance() {
    if (espeakInstance) {
        return { instance: espeakInstance, module: espeakModule };
    }

    const espeakPackage = await import('@echogarden/espeak-ng-emscripten');
    const EspeakInitializer = espeakPackage.default;
    const m = await EspeakInitializer();

    espeakInstance = await (new m.eSpeakNGWorker());
    espeakModule = m;

    return { instance: espeakInstance, module: espeakModule };
}
/**
 * Default eSpeak options. Callers' options are merged over these with `extendDeep`.
 */
export const defaultEspeakOptions = {
    // Voice identifier passed to setVoice()
    voice: 'en-us',

    // When false, synthesize() escapes the input text with escapeHtml() before synthesis
    ssml: false,

    // Forwarded to setRate(), setPitch() and setPitchRange() respectively
    rate: 1.0,
    pitch: 1.0,
    pitchRange: 1.0,

    // When true, synthesize() selects the voice with a '+klatt6' suffix
    useKlatt: false,

    // When true, synthesizeFragments() surrounds each fragment with ' | ' separators
    // (for voices where that is allowed)
    insertSeparators: false,
};
/**
 * Diagnostic helper: phonemizes the text twice (default mode, and with the fourth
 * argument of `phonemizeSentence` set to false), converts the first result to
 * Kirshenbaum with `ipaPhoneToKirshenbaum`, and logs every word where the converted
 * form differs from the second result.
 *
 * @param {string} text - Text to phonemize.
 * @param {string} [language='en-us'] - Language passed to the phonemizer.
 * @returns {Promise<void>}
 */
export async function testKirshenbaumPhonemization(text, language = 'en-us') {
    // Flatten clauses so both results are flat lists of words (each a list of phonemes)
    const ipaWords = (await phonemizeSentence(text, language)).flat();
    const kirshenbaumWords = (await phonemizeSentence(text, language, undefined, false)).flat();

    const ipaFragments = ipaWords.map(phonemes => phonemes.join(''));
    const kirshenbaumFragments = kirshenbaumWords.map(phonemes => phonemes.join(''));

    // Convert every word up front, before any comparison or logging takes place
    const convertedFragments = ipaWords.map(
        phonemes => phonemes.map(phoneme => ipaPhoneToKirshenbaum(phoneme)).join('')
    );

    convertedFragments.forEach((converted, index) => {
        if (converted !== kirshenbaumFragments[index]) {
            log(`IPA: ${ipaFragments[index]} | converted: ${converted} | ground truth: ${kirshenbaumFragments[index]}`);
        }
    });
}
//# sourceMappingURL=EspeakTTS.js.map