@met4citizen/headtts
Version:
HeadTTS: A free JavaScript text-to-speech with timestamps and visemes.
470 lines (405 loc) • 13.5 kB
JavaScript
import * as utils from "./utils.mjs";
/**
 * Base class for language modules.
 *
 * Holds the shared character tables (whitespaces, allowed letters,
 * punctuations, phoneme-to-viseme map) and the generic pipeline that turns
 * input into phonemes and TalkingHead metadata. Language modules extend
 * this class and must implement `phonemizeWord`; they may also override
 * `partSetText` for language specific conversions.
 */
class LanguageBase {

  /**
   * @constructor
   * @param {Object} [settings=null] Settings, merged over the defaults.
   * @param {boolean} [settings.trace=false] If true, emit trace messages.
   */
  constructor( settings = null ) {
    this.settings = Object.assign({
      trace: false
    }, settings || {});

    // Whitespace characters
    this.whitespaces = {
      " ": " ", "\n": "\n", "\t": "\t", "\r": "\r", "\v": "\v", "\f": "\f",
      "\u00A0": "\u00A0", "\u1680": "\u1680", "\u2000": "\u2000",
      "\u2001": "\u2001", "\u2002": "\u2002", "\u2003": "\u2003",
      "\u2004": "\u2004", "\u2005": "\u2005", "\u2006": "\u2006",
      "\u2007": "\u2007", "\u2008": "\u2008",
      "\u2009": "\u2009", "\u200A": "\u200A", "\u2028": "\u2028",
      "\u2029": "\u2029", "\u202F": "\u202F", "\u205F": "\u205F"
    };

    // Misaki tokens to Oculus visemes
    this.misakiToOculusViseme = {
      "$": null, ";": null, ":": null, ",": null, ".": null, "!": null,
      "?": null, "—": null, "…": null, "\"": null, "(": null, ")": null,
      "“": null, "”": null, " ": null, "\u0303": null, "ʣ": "DD", "ʥ": "CH",
      "ʦ": "CH", "ʨ": "CH", "ᵝ": null, "ꭧ": null, "A": "E", "I": "I",
      "O": "O", "Q": "O", "S": "SS", "T": "DD", "W": "U", "Y": "I",
      "ᵊ": null, "a": "aa", "b": "PP", "c": "kk", "d": "DD", "e": "E",
      "f": "FF", "h": null, "i": "I", "j": "I", "k": "kk", "l": "RR",
      "m": "PP", "n": "nn", "o": "O", "p": "PP", "q": "kk", "r": "RR",
      "s": "SS", "t": "DD", "u": "U", "v": "FF", "w": "U", "x": "SS",
      "y": "I", "z": "SS", "ɑ": "aa", "ɐ": "aa", "ɒ": "aa", "æ": "aa",
      "β": "FF", "ɔ": "O", "ɕ": "SS", "ç": "SS", "ɖ": "DD", "ð": "TH", "ʤ":
      "CH", "ə": "E", "ɚ": "RR", "ɛ": "E", "ɜ": "E", "ɟ": "DD", "ɡ": "kk",
      "ɥ": "U", "ɨ": "I", "ɪ": "I", "ʝ": "I", "ɯ": "U", "ɰ": "U", "ŋ": "nn",
      "ɳ": "nn", "ɲ": "nn", "ɴ": "nn", "ø": "O", "ɸ": "FF", "θ": "TH",
      "œ": "E", "ɹ": "RR", "ɾ": "DD", "ɻ": "RR", "ʁ": "RR", "ɽ": "RR",
      "ʂ": "SS", "ʃ": "SS", "ʈ": "DD", "ʧ": "CH", "ʊ": "U", "ʋ": "FF",
      "ʌ": "aa", "ɣ": null, "ɤ": "O", "χ": null, "ʎ": "RR", "ʒ": "SS",
      "ʔ": null, "ˈ": null, "ˌ": null, "ː": null, "ʰ": null, "ʲ": null,
      "↓": null, "→": null, "↗": null, "↘": null, "ᵻ": "I"
    };

    // Allowed letters in upper case
    // NOTE: Diacritics will be removed unless added to this object.
    this.normalizedLettersUpper = {
      'A': 'A', 'B': 'B', 'C': 'C', 'D': 'D', 'E': 'E', 'F': 'F', 'G': 'G',
      'H': 'H', 'I': 'I', 'J': 'J', 'K': 'K', 'L': 'L', 'M': 'M', 'N': 'N',
      'O': 'O', 'P': 'P', 'Q': 'Q', 'R': 'R', 'S': 'S', 'T': 'T', 'U': 'U',
      'V': 'V', 'W': 'W', 'X': 'X', 'Y': 'Y', 'Z': 'Z', 'ß': 'SS', 'Ø': 'O',
      'Æ': 'AE', 'Œ': 'OE', 'Ð': 'D', 'Þ': 'TH', 'Ł': 'L'
    };

    // Allowed punctuations (key = input character, value = normalized form)
    this.punctuations = {
      ";": ";", ":": ":", ",": ",", ".": ".", "!": "!", "?": "?", "¡": "!",
      "¿": "?", "—": "—", '"': '"', "…": "…", "«": '"', "»": '"', "“": '"',
      "”": '"', "(": "(", ")": ")", "{": "(", "}": ")", "[": "(", "]": ")",
      " ": " ", "-": "-", "'": "'"
    };

    // Allowed punctuations in mid-word
    this.punctuationsMidWord = {
      "-": "-", "'": "'", ".": ".", ",": ","
    };

    if ( this.settings.trace ) {
      utils.trace( 'Language base module initiated.' );
    }
  }

  /**
   * Convert graphemes to phonemes.
   * NOTE: Abstract in the base class; language modules must override it.
   *
   * @param {string} s Word
   * @return {string[]} Array of phonemes
   * @throws {Error} Always, in the base class.
   */
  phonemizeWord(s) {
    throw new Error("The method phonemizeWord not implemented.");
  }

  /**
   * Add one dictionary line.
   * A line has at least two tab-separated fields: the word and its
   * phonemes. Lines starting with `;;;` are comments, and lines with
   * fewer than two fields are ignored.
   * NOTE: `this.dictionary` must already be an object, which is the
   * case once `loadDictionary` has started loading.
   *
   * @param {string} s Line
   */
  addToDictionary(s) {
    if ( s.startsWith(";;;") ) return; // Comment

    const fields = s.split("\t");
    if ( fields.length < 2 ) return;

    // Split by code point so that a surrogate pair is never cut in half.
    const phonemes = [...fields[1]];

    // Define the entry instead of assigning it: a plain assignment with the
    // key "__proto__" would replace the prototype of the dictionary object
    // instead of storing a word.
    Object.defineProperty( this.dictionary, fields[0], {
      value: phonemes,
      writable: true,
      enumerable: true,
      configurable: true
    });
  }

  /**
   * Load pronouncing dictionary.
   * In Node the file is read line by line from a stream; otherwise
   * the dictionary is fetched and decoded chunk by chunk.
   *
   * @param {string} [dictionary=null] Dictionary path/url. If null, do not use dictionaries
   * @param {boolean} [force=false] If true, re-load even if already loaded.
   * @throws {Error} If the fetch response is not OK (non-Node only).
   */
  async loadDictionary( dictionary = null, force = false ) {
    if ( this.dictionary && !force ) return;
    this.dictionary = null;

    if ( dictionary ) {
      if ( utils.isNode() ) {

        // Use stream to read the file
        const fs = await import('node:fs');
        const readline = await import('node:readline');
        const stream = fs.createReadStream(dictionary, { encoding: 'utf8' });
        const rl = readline.createInterface({
          input: stream,
          crlfDelay: Infinity
        });
        this.dictionary = {};
        for await (const line of rl) {
          this.addToDictionary(line);
        }

      } else {

        const response = await fetch(dictionary);
        if ( !response.ok ) {
          // Without this check an HTTP error page would be parsed as
          // dictionary data.
          throw new Error('Failed to load dictionary "' + dictionary + '", status ' + response.status + '.');
        }
        const reader = response.body.getReader();
        const decoder = new TextDecoder(); // Defaults to utf-8
        let buffer = "";
        this.dictionary = {};
        for (;;) {
          const { value, done } = await reader.read();
          if ( done ) break;
          buffer += decoder.decode(value, { stream: true });
          const lines = buffer.split(/\r?\n/);
          buffer = lines.pop(); // Save the incomplete line
          for (const line of lines) {
            this.addToDictionary(line);
          }
        }
        // Flush any bytes still held by the streaming decoder, then
        // handle the last line, which has no trailing line break.
        buffer += decoder.decode();
        this.addToDictionary(buffer);

      }
    }

    if ( this.settings.trace ) {
      utils.trace( 'Language dictionary "' + dictionary + '" loaded.' );
    }
  }

  /**
   * Split text string in parts that contain one word.
   * Each part is split from the first letter of the word, except
   * the first part, which starts from the beginning including
   * leading spaces, if any. Whitespace following a word stays in
   * the same part as that word.
   *
   * @param {string} s Text string
   * @return {string[]} Array of parts
   */
  splitText(s) {
    const parts = [];
    let part = "";
    let foundWord = false; // Current part already contains non-whitespace
    let lastType = 0; // 0=unknown, 1=whitespace, 2=other

    for ( const c of s ) {
      const type = this.whitespaces.hasOwnProperty(c) ? 1 : 2;
      if ( foundWord && type === 2 && lastType === 1 ) {
        // First character of the next word, start a new part
        parts.push(part);
        part = c;
      } else {
        part += c;
        foundWord = foundWord || type === 2;
      }
      lastType = type;
    }

    // Flush the trailing part only after the loop, so that a final word of
    // just one character still gets a part of its own.
    if ( part ) {
      parts.push(part);
    }

    return parts;
  }

  /**
   * Normalize text and set it to uppercase.
   * Allowed letters and punctuations are mapped through their tables.
   * Any other character is stripped of its combining diacritics and kept
   * only if the result is an allowed letter; otherwise it is dropped.
   *
   * @param {string} s Text string
   * @return {string[]} Normalized array of characters in upper case.
   */
  normalizeUpper(s) {
    const norm = [];
    for ( const ch of s.toUpperCase() ) {
      if ( this.normalizedLettersUpper.hasOwnProperty(ch) ) {
        norm.push( this.normalizedLettersUpper[ch] );
      } else if ( this.punctuations.hasOwnProperty(ch) ) {
        norm.push( this.punctuations[ch] );
      } else {
        const base = ch.normalize("NFD").replace(/[\u0300-\u036f]/g, "").normalize("NFC");
        if ( this.normalizedLettersUpper.hasOwnProperty(base) ) {
          norm.push( this.normalizedLettersUpper[base] );
        }
      }
    }
    return norm;
  }

  /**
   * Set the `text` to be spoken by analysing the part content.
   * NOTE: The language module should override this
   * method and implement language specific conversions.
   *
   * @param {Object} part Current part
   * @param {number} i Index
   * @param {Object[]} arr All the parts.
   */
  partSetText(part,i,arr) {
    switch( part.type ) {
      case "text":
        part.text = part.value;
        part.subtitles = part.value;
        break;

      case "speech":
        part.text = part.value;
        break;

      case "phonetic":
        part.phonemes = part.value;
        break;

      case "break": {
        // TODO: Figure out better way to add breaks
        // One "." per 500 units of `value`, limited to 2..10. The count
        // must be an integer, because `Array(n)` throws a RangeError for
        // a fractional or NaN length.
        const clamped = Math.max(2, Math.min( part.value / 500, 10 ));
        const cnt = Number.isNaN(clamped) ? 2 : Math.round(clamped);
        part.phonemes = ".".repeat(cnt) + " ";
        break;
      }

      // Leave other types to language specific implementation
    }

    part.subtitles = part.subtitles || "";
  }

  /**
   * Divide the given part into punctuations and words.
   * Fills `part.speak` with `{ type, word }` items, where type 1 is a run
   * of punctuation and type 2 is a word. A mid-word punctuation
   * character counts as part of a word when it is not the last character
   * and the character after it is not a punctuation.
   * NOTE: Only `text` type currently supported.
   * TODO: Add support for other types.
   *
   * @param {Object} part Part object
   */
  partSetSpeak(part) {
    part.speak = [];
    if ( !part.text || typeof part.text !== "string" ) return;

    const chars = this.normalizeUpper(part.text);
    const lastIndex = chars.length - 1;
    let word = "";
    let lastType = 0; // 0=Unknown, 1=punctuation, 2=word

    for ( let i = 0; i <= lastIndex; i++ ) {
      const c = chars[i];
      const isPunctuation = this.punctuations.hasOwnProperty(c);
      const isPunctuationMid = i !== lastIndex &&
        this.punctuationsMidWord.hasOwnProperty(c) &&
        !this.punctuations.hasOwnProperty(chars[i+1]);
      const type = (isPunctuation && !isPunctuationMid) ? 1 : 2;

      if ( type === lastType ) {
        word += c;
      } else {
        // Type changed, close the previous run and start a new one
        if ( word ) {
          part.speak.push({ type: lastType, word: word });
        }
        word = c;
        lastType = type;
      }
    }

    // Close the final run
    if ( word ) {
      part.speak.push({ type: lastType, word: word });
    }
  }

  /**
   * Construct `phonemes` from speakable words and punctuations.
   * Punctuation runs are copied character by character; words are
   * converted with `phonemizeWord`.
   *
   * @param {Object} part Part object
   */
  partSetPhonemes(part) {
    part.phonemes = [];
    if ( !part.speak || !Array.isArray(part.speak) ) return;

    for ( const item of part.speak ) {
      if ( item.type === 1 ) {
        part.phonemes.push(...item.word);
      } else if ( item.type === 2 ) {
        part.phonemes.push(...this.phonemizeWord(item.word));
      }
    }
  }

  /**
   * Construct `visemes` from `phonemes`.
   * The result has one entry per phoneme; a phoneme without a viseme,
   * including one missing from the table, gets `null`.
   *
   * @param {Object} part Part object
   */
  partSetVisemes(part) {
    part.visemes = [];
    if ( !part.phonemes || !Array.isArray(part.phonemes) ) return;

    for ( const ph of part.phonemes ) {
      const known = this.misakiToOculusViseme.hasOwnProperty(ph);
      part.visemes.push( known ? this.misakiToOculusViseme[ph] : null );
    }
  }

  /**
   * Generate phonemes and TalkingHead metadata from input.
   * String inputs are split into `text` parts with `splitText`. Any other
   * input element is used as a part object as-is, and this method adds
   * properties to it.
   * NOTE: Starting times and durations in `metadata` refer to phoneme
   * start and end indices respectively.
   *
   * @param {string|Object[]} input Input element
   * @return {Object} `phonemes` array and TalkingHead `metadata` template
   */
  generate(input) {

    // Output data
    const phonemes = [];
    const metadata = {
      words: [], wtimes: [], wdurations: [],
      visemes: [], vtimes: [], vdurations: []
    };

    if ( this.settings.trace ) {
      utils.trace( 'Generate start, input=', input );
    }

    // Break input into parts
    const parts = [];
    const inputs = Array.isArray(input) ? input : [input];
    for ( const x of inputs ) {
      if ( typeof x === "string" ) {
        for ( const value of this.splitText(x) ) {
          parts.push({ type: "text", value: value });
        }
      } else {
        parts.push(x);
      }
    }

    // Set text to be spoken
    parts.forEach( (part, i, arr) => this.partSetText(part, i, arr) );

    // Populate output
    for ( const part of parts ) {

      // Phonemize, unless the phonemes were already given
      if ( Object.prototype.hasOwnProperty.call(part, "phonemes") ) {
        if ( typeof part.phonemes === "string" ) {
          part.phonemes = [...part.phonemes];
        } else if ( !Array.isArray(part.phonemes) ) {
          // E.g. a `phonetic` part without a value
          part.phonemes = [];
        }
      } else {
        this.partSetSpeak(part);
        this.partSetPhonemes(part);
      }

      // Set visemes
      this.partSetVisemes(part);

      if ( this.settings.trace ) {
        utils.trace( 'Generate, part=', part );
      }

      // Set output
      metadata.words.push( part.subtitles );
      metadata.wtimes.push( phonemes.length );

      let start = phonemes.length; // Phoneme index where the next viseme starts
      const len = part.phonemes.length;
      for ( let j = 0; j < len; j++ ) {

        // Phonemes
        const ph = part.phonemes[j];
        phonemes.push( ph );

        // Visemes
        const viseme = part.visemes[j];
        if ( viseme ) {
          metadata.visemes.push(viseme);
          metadata.vtimes.push(start);
          start = phonemes.length;
          metadata.vdurations.push(start);
        } else if ( ph === "ˈ" || ph === "ˌ" ) {
          // Stress mark: do not update the starting time, so the
          // next viseme starts from the mark.
        } else if ( ph === "ː" ) {
          // Length mark: extend the previous viseme over it.
          start = phonemes.length;
          const count = metadata.visemes.length;
          if ( count ) {
            metadata.vdurations[count-1] = start;
          }
        } else {
          start = phonemes.length;
        }
      }

      metadata.wdurations.push( phonemes.length );
    }

    if ( this.settings.trace ) {
      utils.trace( 'Generate end, output=', { phonemes, metadata } );
    }

    // Output
    return { phonemes, metadata };
  }

}
export { LanguageBase };