UNPKG

@met4citizen/talkinghead

Version:

Talking Head (3D): A JavaScript class for real-time lip-sync using Ready Player Me full-body 3D avatars.

github.com/met4citizen/TalkingHead

met4citizen/TalkingHead

391 lines (336 loc) • 13.4 kB

JavaScript

/**
 * @class English lip-sync processor
 *
 * Converts English text to Oculus viseme sequences with relative timings
 * using letter-to-sound rules, and pre-processes text (symbols, numbers,
 * diacritics) into speakable words.
 *
 * @author Mika Suominen
 */
class LipsyncEn {

  /**
   * Builds the rule tables, duration tables and number/symbol vocabularies.
   *
   * After construction `this.rules[LETTER]` is an ordered array of
   * `{ regex: RegExp, move: number, visemes: string[] }`; the first rule
   * whose regex matches wins.
   *
   * @constructor
   */
  constructor() {

    // English words to Oculus visemes, algorithmic rules adapted from:
    //   NRL Report 7948, "Automatic Translation of English Text to Phonetics by Means of Letter-to-Sound Rules" (1976)
    //   by HONEY SUE ELOVITZ, RODNEY W. JOHNSON, ASTRID McHUGH, AND JOHN E. SHORE
    //   Available at: https://apps.dtic.mil/sti/pdfs/ADA021929.pdf
    //
    // Rule format: LEFT_CONTEXT[LETTERS]RIGHT_CONTEXT=VISEMES
    this.rules = {
      'A': [
        "[A] =aa", " [ARE] =aa RR", " [AR]O=aa RR", "[AR]#=E RR",
        " ^[AS]#=E SS", "[A]WA=aa", "[AW]=aa", " :[ANY]=E nn I",
        "[A]^+#=E", "#:[ALLY]=aa nn I", " [AL]#=aa nn", "[AGAIN]=aa kk E nn",
        "#:[AG]E=I kk", "[A]^+:#=aa", ":[A]^+ =E", "[A]^%=E",
        " [ARR]=aa RR", "[ARR]=aa RR", " :[AR] =aa RR", "[AR] =E",
        "[AR]=aa RR", "[AIR]=E RR", "[AI]=E", "[AY]=E", "[AU]=aa",
        "#:[AL] =aa nn", "#:[ALS] =aa nn SS", "[ALK]=aa kk", "[AL]^=aa nn",
        " :[ABLE]=E PP aa nn", "[ABLE]=aa PP aa nn", "[ANG]+=E nn kk", "[A]=aa"
      ],

      'B': [
        " [BE]^#=PP I", "[BEING]=PP I I nn", " [BOTH] =PP O TH",
        " [BUS]#=PP I SS", "[BUIL]=PP I nn", "[B]=PP"
      ],

      'C': [
        " [CH]^=kk", "^E[CH]=kk", "[CH]=CH", " S[CI]#=SS aa",
        "[CI]A=SS", "[CI]O=SS", "[CI]EN=SS", "[C]+=SS",
        "[CK]=kk", "[COM]%=kk aa PP", "[C]=kk"
      ],

      'D': [
        "#:[DED] =DD I DD", ".E[D] =DD", "#^:E[D] =DD", " [DE]^#=DD I",
        " [DO] =DD U", " [DOES]=DD aa SS", " [DOING]=DD U I nn",
        " [DOW]=DD aa", "[DU]A=kk U", "[D]=DD"
      ],

      'E': [
        "#:[E] =", "'^:[E] =", " :[E] =I", "#[ED] =DD", "#:[E]D =",
        "[EV]ER=E FF", "[E]^%=I", "[ERI]#=I RR I", "[ERI]=E RR I",
        "#:[ER]#=E", "[ER]#=E RR", "[ER]=E", " [EVEN]=I FF E nn",
        "#:[E]W=", "@[EW]=U", "[EW]=I U", "[E]O=I", "#:&[ES] =I SS",
        "#:[E]S =", "#:[ELY] =nn I", "#:[EMENT]=PP E nn DD", "[EFUL]=FF U nn",
        "[EE]=I", "[EARN]=E nn", " [EAR]^=E", "[EAD]=E DD", "#:[EA] =I aa",
        "[EA]SU=E", "[EA]=I", "[EIGH]=E", "[EI]=I", " [EYE]=aa", "[EY]=I",
        "[EU]=I U", "[E]=E"
      ],

      'F': [
        "[FUL]=FF U nn", "[F]=FF"
      ],

      'G': [
        "[GIV]=kk I FF", " [G]I^=kk", "[GE]T=kk E", "SU[GGES]=kk kk E SS",
        "[GG]=kk", " B#[G]=kk", "[G]+=kk", "[GREAT]=kk RR E DD",
        "#[GH]=", "[G]=kk"
      ],

      'H': [
        " [HAV]=I aa FF", " [HERE]=I I RR", " [HOUR]=aa EE", "[HOW]=I aa",
        "[H]#=I", "[H]="
      ],

      'I': [
        " [IN]=I nn", " [I] =aa", "[IN]D=aa nn", "[IER]=I E",
        "#:R[IED] =I DD", "[IED] =aa DD", "[IEN]=I E nn", "[IE]T=aa E",
        " :[I]%=aa", "[I]%=I", "[IE]=I", "[I]^+:#=I", "[IR]#=aa RR",
        "[IZ]%=aa SS", "[IS]%=aa SS", "[I]D%=aa", "+^[I]^+=I",
        "[I]T%=aa", "#^:[I]^+=I", "[I]^+=aa", "[IR]=E", "[IGH]=aa",
        "[ILD]=aa nn DD", "[IGN] =aa nn", "[IGN]^=aa nn", "[IGN]%=aa nn",
        "[IQUE]=I kk", "[I]=I"
      ],

      'J': [
        "[J]=kk"
      ],

      'K': [
        " [K]N=", "[K]=kk"
      ],

      'L': [
        "[LO]C#=nn O", "L[L]=", "#^:[L]%=aa nn", "[LEAD]=nn I DD", "[L]=nn"
      ],

      'M': [
        "[MOV]=PP U FF", "[M]=PP"
      ],

      'N': [
        "E[NG]+=nn kk", "[NG]R=nn kk", "[NG]#=nn kk", "[NGL]%=nn kk aa nn",
        "[NG]=nn", "[NK]=nn kk", " [NOW] =nn aa", "[N]=nn"
      ],

      'O': [
        "[OF] =aa FF", "[OROUGH]=E O", "#:[OR] =E", "#:[ORS] =E SS",
        "[OR]=aa RR", " [ONE]=FF aa nn", "[OW]=O", " [OVER]=O FF E",
        "[OV]=aa FF", "[O]^%=O", "[O]^EN=O", "[O]^I#=O", "[OL]D=O nn",
        "[OUGHT]=aa DD", "[OUGH]=aa FF", " [OU]=aa", "H[OU]S#=aa",
        "[OUS]=aa SS", "[OUR]=aa RR", "[OULD]=U DD", "^[OU]^L=aa",
        "[OUP]=U OO", "[OU]=aa", "[OY]=O", "[OING]=O I nn", "[OI]=O",
        "[OOR]=aa RR", "[OOK]=U kk", "[OOD]=U DD", "[OO]=U", "[O]E=O",
        "[O] =O", "[OA]=O", " [ONLY]=O nn nn I", " [ONCE]=FF aa nn SS",
        "[ON'T]=O nn DD", "C[O]N=aa", "[O]NG=aa", " ^:[O]N=aa",
        "I[ON]=aa nn", "#:[ON] =aa nn", "#^[ON]=aa nn", "[O]ST =O",
        "[OF]^=aa FF", "[OTHER]=aa TH E", "[OSS] =aa SS", "#^:[OM]=aa PP",
        "[O]=aa"
      ],

      'P': [
        "[PH]=FF", "[PEOP]=PP I PP", "[POW]=PP aa", "[PUT] =PP U DD",
        "[P]=PP"
      ],

      'Q': [
        "[QUAR]=kk FF aa RR", "[QU]=kk FF", "[Q]=kk"
      ],

      'R': [
        " [RE]^#=RR I", "[R]=RR"
      ],

      'S': [
        "[SH]=SS", "#[SION]=SS aa nn", "[SOME]=SS aa PP", "#[SUR]#=SS E",
        "[SUR]#=SS E", "#[SU]#=SS U", "#[SSU]#=SS U", "#[SED] =SS DD",
        "#[S]#=SS", "[SAID]=SS E DD", "^[SION]=SS aa nn", "[S]S=",
        ".[S] =SS", "#:.E[S] =SS", "#^:##[S] =SS", "#^:#[S] =SS",
        "U[S] =SS", " :#[S] =SS", " [SCH]=SS kk", "[S]C+=",
        "#[SM]=SS PP", "#[SN]'=SS aa nn", "[S]=SS"
      ],

      'T': [
        " [THE] =TH aa", "[TO] =DD U", "[THAT] =TH aa DD", " [THIS] =TH I SS",
        " [THEY]=TH E", " [THERE]=TH E RR", "[THER]=TH E", "[THEIR]=TH E RR",
        " [THAN] =TH aa nn", " [THEM] =TH E PP", "[THESE] =TH I SS",
        " [THEN]=TH E nn", "[THROUGH]=TH RR U", "[THOSE]=TH O SS",
        "[THOUGH] =TH O", " [THUS]=TH aa SS", "[TH]=TH", "#:[TED] =DD I DD",
        "S[TI]#N=CH", "[TI]O=SS", "[TI]A=SS", "[TIEN]=SS aa nn",
        "[TUR]#=CH E", "[TU]A=CH U", " [TWO]=DD U", "[T]=DD"
      ],

      'U': [
        " [UN]I=I U nn", " [UN]=aa nn", " [UPON]=aa PP aa nn",
        "@[UR]#=U RR", "[UR]#=I U RR", "[UR]=E", "[U]^ =aa",
        "[U]^^=aa", "[UY]=aa", " G[U]#=", "G[U]%=", "G[U]#=FF",
        "#N[U]=I U", "@[U]=I", "[U]=I U"
      ],

      'V': [
        "[VIEW]=FF I U", "[V]=FF"
      ],

      'W': [
        " [WERE]=FF E", "[WA]S=FF aa", "[WA]T=FF aa", "[WHERE]=FF E RR",
        "[WHAT]=FF aa DD", "[WHOL]=I O nn", "[WHO]=I U", "[WH]=FF",
        "[WAR]=FF aa RR", "[WOR]^=FF E", "[WR]=RR", "[W]=FF"
      ],

      'X': [
        " [X]=SS", "[X]=kk SS"
      ],

      'Y': [
        "[YOUNG]=I aa nn", " [YOU]=I U", " [YES]=I E SS", " [Y]=I",
        "#^:[Y] =I", "#^:[Y]I=I", " :[Y] =aa", " :[Y]#=aa",
        " :[Y]^+:#=I", " :[Y]^#=I", "[Y]=I"
      ],

      'Z': [
        "[Z]=SS"
      ]
    };

    // Context operators used in the rules, mapped to regex fragments
    const ops = {
      '#': '[AEIOUY]+', // One or more vowels AEIOUY
      // This one is not used: "'": '[BCDFGHJKLMNPQRSTVWXZ]+', // One or more consonants BCDFGHJKLMNPQRSTVWXZ
      '.': '[BDVGJLMNRWZ]', // One voiced consonant BDVGJLMNRWZ
      // This one is not used: '$': '[BDVGJLMNRWZ][EI]', // One consonant followed by E or I
      '%': '(?:ER|E|ES|ED|ING|ELY)', // One of ER, E, ES, ED, ING, ELY
      '&': '(?:[SCGZXJ]|CH|SH)', // One of S, C, G, Z, X, J, CH, SH
      '@': '(?:[TSRDLZNJ]|TH|CH|SH)', // One of T, S, R, D, L, Z, N, J, TH, CH, SH
      '^': '[BCDFGHJKLMNPQRSTVWXZ]', // One consonant BCDFGHJKLMNPQRSTVWXZ
      '+': '[EIY]', // One of E, I, Y
      ':': '[BCDFGHJKLMNPQRSTVWXZ]*', // Zero or more consonants BCDFGHJKLMNPQRSTVWXZ
      ' ': '\\b' // Start/end of the word
    };

    // Translate a context string (left or right of the brackets) to regex source
    const contextToRegex = (context) => [...context].map( (x) => ops[x] || x ).join('');

    // Convert one textual rule into { regex, move, visemes }
    const compileRule = (rule) => {
      const posL = rule.indexOf('[');
      const posR = rule.indexOf(']');
      const posE = rule.indexOf('=');
      const strLeft = rule.substring(0, posL);
      const strLetters = rule.substring(posL + 1, posR);
      const strRight = rule.substring(posR + 1, posE);
      const strVisemes = rule.substring(posE + 1);

      // The first bracketed letter is lowercased: wordsToVisemes() lowercases
      // only the letter at the current position of an otherwise uppercase
      // string, which pins the (case-sensitive) match to that position.
      const letters = [...strLetters];
      letters[0] = letters[0].toLowerCase();

      return {
        regex: new RegExp( contextToRegex(strLeft) + letters.join('') + contextToRegex(strRight) ),
        move: letters.length, // Number of letters consumed by this rule
        visemes: strVisemes.length ? strVisemes.split(' ') : []
      };
    };

    // Convert rules to regex
    Object.keys(this.rules).forEach( (key) => {
      this.rules[key] = this.rules[key].map(compileRule);
    });

    // Viseme durations in relative unit (1=average)
    // Visemes without an entry here fall back to 1 in wordsToVisemes().
    // TODO: Check for statistics for English
    this.visemeDurations = {
      'aa': 0.95, 'E': 0.90, 'I': 0.92, 'O': 0.96, 'U': 0.95, 'PP': 1.08,
      'SS': 1.23, 'TH': 1, 'DD': 1.05, 'FF': 1.00, 'kk': 1.21, 'nn': 0.88,
      'RR': 0.88, 'sil': 1
    };

    // Pauses in relative units (1=average)
    this.specialDurations = { ' ': 1, ',': 3, '-': 0.5, "'": 0.5 };

    // English number words
    this.digits = ['oh', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine'];
    this.ones = ['', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine'];
    this.tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety'];
    this.teens = ['ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen'];

    // Symbols to English
    this.symbols = {
      '%': 'percent', '€': 'euros', '&': 'and', '+': 'plus',
      '$': 'dollars'
    };
    this.symbolsReg = /[%€&\+\$]/g;
  }

  /**
   * Spell a number one digit at a time, e.g. "007" -> "oh oh seven".
   * @param {string|number} num Digits
   * @return {string} Digit words separated by single spaces.
   */
  convert_digit_by_digit(num) {
    return String(num).split('').map( (d) => this.digits[d] ).join(' ');
  }

  /**
   * Read a four-digit number as two two-digit numbers,
   * e.g. "1905" -> "nineteen oh five".
   * @param {string|number} num Four-digit number
   * @return {string} Number in words.
   */
  convert_sets_of_two(num) {
    const str = String(num);
    const firstNumHalf = str.substring(0, 2);
    const secondNumHalf = str.substring(2, 4);
    return this.convert_tens(firstNumHalf) + ' ' + this.convert_tens(secondNumHalf);
  }

  /**
   * Convert a number to words, handling the millions part.
   * @param {string|number} num Number
   * @return {string} Number in words.
   */
  convert_millions(num) {
    const n = Number(num);
    if ( n >= 1000000 ) {
      return this.convert_millions(Math.floor(n / 1000000)) + ' million ' + this.convert_thousands(n % 1000000);
    }
    return this.convert_thousands(num);
  }

  /**
   * Convert a number below one million to words.
   * @param {string|number} num Number
   * @return {string} Number in words.
   */
  convert_thousands(num) {
    const n = Number(num);
    if ( n >= 1000 ) {
      return this.convert_hundreds(Math.floor(n / 1000)) + ' thousand ' + this.convert_hundreds(n % 1000);
    }
    return this.convert_hundreds(num);
  }

  /**
   * Convert a number below one thousand to words.
   * @param {string|number} num Number
   * @return {string} Number in words.
   */
  convert_hundreds(num) {
    const n = Number(num);
    if ( n > 99 ) {
      return this.ones[Math.floor(n / 100)] + ' hundred ' + this.convert_tens(n % 100);
    }
    return this.convert_tens(num);
  }

  /**
   * Convert a number below one hundred to words. A non-zero value written
   * with a leading zero (e.g. "05") is read as "oh five".
   * @param {string|number} num Number; pass a string to keep a leading zero
   * @return {string} Number in words.
   */
  convert_tens(num) {
    const n = Number(num);
    if ( n < 10 ) {
      const hasLeadingZero = n !== 0 && String(num).startsWith('0');
      return (hasLeadingZero ? 'oh ' : '') + this.ones[n];
    }
    if ( n < 20 ) {
      return this.teens[n - 10];
    }
    return this.tens[Math.floor(n / 10)] + ' ' + this.ones[n % 10];
  }

  /**
   * Convert a run of digits to English words.
   * - "0" is "zero"
   * - numbers with a leading zero, 100-999 and 10001-999999 are read
   *   digit by digit (area and zip codes)
   * - 1001-1999 and 2010-2999 are read as years
   * - everything else is read as a regular cardinal number
   * @param {string|number} num Number, typically a digit string
   * @return {string} Number in words.
   */
  convertNumberToWords(num) {
    // Accept numbers as well as digit strings
    const str = String(num);
    const n = Number(str);

    if ( str === '0' ) {
      return 'zero';
    }
    if ( str.startsWith('0') ) {
      return this.convert_digit_by_digit(str);
    }
    if ( (n < 1000 && n > 99) || (n > 10000 && n < 1000000) ) {
      // Read area and zip codes digit by digit
      return this.convert_digit_by_digit(str);
    }
    if ( (n > 1000 && n < 2000) || (n > 2009 && n < 3000) ) {
      // Read years as two sets of two digits
      return n % 100 !== 0
        ? this.convert_sets_of_two(str)
        : this.convert_tens(str.substring(0, 2)) + ' hundred';
    }
    return this.convert_millions(str);
  }

  /**
   * Preprocess text:
   * - convert symbols to words
   * - convert numbers to words
   * - filter out characters that should be left unspoken
   * @param {string} s Text
   * @return {string} Pre-processed text.
   */
  preProcessText(s) {
    return s
      .replace(/[#_*":;]/g, '') // Characters left unspoken (must be a regex, not a string)
      .replace( this.symbolsReg, (symbol) => ' ' + this.symbols[symbol] + ' ' )
      .replace(/(\d)\.(\d)/g, '$1 point $2') // Number separator
      .replace(/\d+/g, (digits) => this.convertNumberToWords(digits)) // Numbers to words
      .replace(/(\D)\1\1+/g, '$1$1') // max 2 repeating chars
      .replace(/ {2,}/g, ' ') // Only one repeating space
      .normalize('NFD').replace(/[\u0300-\u036f]/g, '').normalize('NFC') // Remove non-English diacritics
      .trim();
  }

  /**
   * Convert word to Oculus LipSync Visemes and durations
   * @param {string} w Text
   * @return {Object} Oculus LipSync Visemes and durations:
   *   `words` (uppercased input), `visemes`, `times` (start times),
   *   `durations` and `i` (number of characters consumed).
   */
  wordsToVisemes(w) {
    const o = { words: w.toUpperCase(), visemes: [], times: [], durations: [], i: 0 };
    let t = 0;

    // Work on code points throughout so that characters outside the BMP
    // (e.g. emoji) cannot shift the rule context.
    const chars = [...o.words];

    while ( o.i < chars.length ) {
      const c = chars[o.i];
      const ruleset = this.rules[c];

      if ( !ruleset ) {
        // Not a letter: advance and add a pause if one is defined
        o.i++;
        t += this.specialDurations[c] || 0;
        continue;
      }

      // Lowercase only the current letter; rule regexes start their bracketed
      // part with a lowercase letter, so they can match at this position only.
      const probe = chars.slice(0, o.i).join('') + c.toLowerCase() + chars.slice(o.i + 1).join('');
      const rule = ruleset.find( (r) => r.regex.test(probe) );

      if ( !rule ) {
        // No rule applies: skip the letter instead of looping forever
        o.i++;
        continue;
      }

      rule.visemes.forEach( (viseme) => {
        if ( o.visemes.length && o.visemes[ o.visemes.length - 1 ] === viseme ) {
          // Same viseme as the previous one: extend it rather than repeat it
          const d = 0.7 * (this.visemeDurations[viseme] || 1);
          o.durations[ o.durations.length - 1 ] += d;
          t += d;
        } else {
          const d = this.visemeDurations[viseme] || 1;
          o.visemes.push( viseme );
          o.times.push( t );
          o.durations.push( d );
          t += d;
        }
      });
      o.i += rule.move;
    }

    return o;
  }

}

export { LipsyncEn };