echogarden

Version:

An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.

github.com/echogarden-project/echogarden

echogarden-project/echogarden

478 lines (410 loc) • 7.9 kB

text/typescript

export function ipaPhoneToKirshenbaum(ipaPhone: string) { let result = '' if (ipaPhone[0] === 'ˈ' || ipaPhone[0] === 'ˌ') { result += ipaToKirshenbaum[ipaPhone[0]] ipaPhone = ipaPhone.substring(1) } const convertedFullPhone = ipaToKirshenbaum[ipaPhone] if (convertedFullPhone) { result += convertedFullPhone } else { for (const char of ipaPhone) { const convertedChar = ipaToKirshenbaum[char] if (convertedChar === undefined) { throw new Error(`Couldn't convert IPA character '${char}' (part of phone '${ipaPhone}') to the Kirshenbaum notation`) } result += convertedChar ?? '_' } } return result } export function ipaWordToTimitTokens(ipaWord: string, subphoneCount = 0) { let result: string[] = [] for (const ipaPhone of ipaWord) { const convertedPhoneSequence = ipaPhoneToTimit(ipaPhone, subphoneCount) if (convertedPhoneSequence == undefined) { throw new Error(`Could not find a TIMIT equivalent for ipa phone '${ipaPhone}'`) } result = [...result, ...convertedPhoneSequence] } return result } export function ipaPhoneToTimit(ipaPhone: string, subphoneCount = 0) { let tokens = ipaToTimit[ipaPhone] if (!tokens) { return undefined } if (subphoneCount > 0) { tokens = splitTokensToSubphones(tokens, subphoneCount) } return tokens } export function arpabetPhoneToIpa(arpabetPhone: string) { return arpabetToIPA[arpabetPhone.toUpperCase()] } export function timitPhoneToIpa(arpabetPhone: string) { return timitToIPA[arpabetPhone.toUpperCase()] } export function splitTokensToSubphones(tokens: string[], subphoneCount = 3) { const result: string[] = [] for (const token of tokens) { if (token == 'pause') { result.push(token) continue } for (let i = 1; i <= subphoneCount; i++) { result.push(`${token}/${i}`) } } return result } // ARPABET was invented for English. // The standard dictionary written in ARPABET is the CMU dictionary. // TIMIT is written in a variant of ARPABET that includes a couple // of non-standard allophones, and most significantly, includes // separate symbols for the closure and release portions of each stop. const arpabetToIPA: { [p: string]: string | undefined } = { 'AA': 'ɑ', 'AE': 'æ', 'AH': 'ʌ', 'AH0': 'ə', 'AO': 'ɔ', 'AW': 'aʊ', 'AY': 'aɪ', 'EH': 'ɛ', 'ER': 'ɝ', 'ER0': 'ɚ', 'EY': 'eɪ', 'IH': 'ɪ', 'IH0': 'ɨ', 'IY': 'i', 'OW': 'oʊ', 'OY': 'ɔɪ', 'UH': 'ʊ', 'UW': 'u', 'B': 'b', 'CH': 'tʃ', 'D': 'd', 'DH': 'ð', 'EL': 'l̩ ', 'EM': 'm̩', 'EN': 'n̩', 'F': 'f', 'G': 'ɡ', 'HH': 'h', 'JH': 'dʒ', 'K': 'k', 'L': 'l', 'M': 'm', 'N': 'n', 'NG': 'ŋ', 'P': 'p', 'Q': 'ʔ', 'R': 'ɹ', 'S': 's', 'SH': 'ʃ', 'T': 't', 'TH': 'θ', 'V': 'v', 'W': 'w', 'WH': 'ʍ', 'Y': 'j', 'Z': 'z', 'ZH': 'ʒ' } const timitToIPA: { [p: string]: string | undefined } = { ...arpabetToIPA, 'AX': 'ə', 'AX-H': 'ə̥', 'AXR': 'ɚ', 'B': '', 'BCL': 'b', 'D': '', 'DCL': 'd', 'DX': 'ɾ', 'ENG': 'ŋ̍', 'EPI': '', 'G': '', 'GCL': 'g', 'HV': 'ɦ', 'H//': '', 'IX': 'ɨ', 'KCL': 'k', 'K': '', 'NX': 'ɾ̃', 'P': '', 'PAU': '', 'PCL': 'p', 'T': '', 'TCL': 't', 'UX': 'ʉ', } const ipaToTimit: { [p: string]: string[] | undefined } = { // Vowels 'ɑ': ['aa'], 'ɑː': ['aa'], 'ɑːɹ': ['aa', 'r'], // ? 'a': ['aa'], // ? UK English 'aʊ': ['aw'], 'aɪ': ['ay'], 'aɪɚ': ['ay', 'r'], // ? 'aɪə': ['ay', 'ax'], // ? 'æ': ['ae'], 'æʊ': ['ae', 'uh'], // ? 'ʌ': ['ah'], 'ɐ': ['ah'], // ? 'ə': ['ax'], 'ə̥': ['ax-h'], 'əʊ': ['ax', 'uh'], // ? UK English 'ɚ': ['axr'], 'ᵻ': ['ax'], // ? 'ɔ': ['ao'], 'ɔː': ['ao'], 'ɔːɹ': ['ao', 'r'], // ? 'ɔɪ': ['oy'], 'ɒ': ['ao'], // ? UK English 'ɛ': ['eh'], 'ɜː': ['eh'], // ? 'ɝ': ['er'], 'ɛɹ': ['er'], // ? 'eɪ': ['ey'], 'eə': ['ey', 'ax'], // ? UK English 'eː': ['ey'], // ? UK English 'ɪ': ['ih'], 'ɪɹ': ['ih', 'r'], // ? 'ɨ': ['ix'], 'i': ['iy'], 'iː': ['iy'], // ? 'iə': ['iy', 'ax'], // ? 'oʊ': ['ow'], 'oː': ['ow'], // ? 'oːɹ': ['ow', 'r'], // ? 'ʊ': ['uh'], 'ʊɹ': ['uh', 'r'], // ? 'ʊə': ['uh', 'ax'], // ? UK English 'u': ['uw'], 'uː': ['uw'], // ? // Consonants 'b': ['bcl', 'b'], 'tʃ': ['tcl', 'ch'], 'd': ['dcl', 'd'], 'ð': ['dh'], 'l̩': ['el'], 'əl': ['el'], // ? 'm̩': ['em'], 'n̩': ['en'], 'f': ['f'], 'ɡ': ['gcl', 'g'], 'h': ['hh'], 'dʒ': ['dcl', 'jh'], 'k': ['kcl', 'k'], 'l': ['l'], 'm': ['m'], 'n': ['n'], 'ŋ': ['ng'], 'p': ['pcl', 'p'], 'ʔ': ['q'], 'ɹ': ['r'], 's': ['s'], 'ʃ': ['sh'], 't': ['tcl', 't'], 'θ': ['th'], 'v': ['v'], 'w': ['w'], 'ʍ': ['wh'], 'j': ['y'], 'z': ['z'], 'ʒ': ['zh'], 'ɾ̃': ['nx'], 'ʉ': ['ux'], 'ɦ': ['hv'], 'ŋ̍': ['eng'], 'ɾ': ['dx'], } // Kirshenbaum is an ASCII encoding of IPA // The variant used by eSpeak is used here, and has many differences to standard Kirshenbaum export const ipaToKirshenbaum: Record<string, string> = { //// Vowels // Close 'i': 'i', 'y': 'y', 'ɨ': 'i"', 'ʉ': 'u"', 'ɯ': 'u-', 'u': 'u', // Near-close 'ɪ': 'I', 'ɩ': 'I', //'ᵻ': 'I', 'ᵻ': 'I#', // eSpeak 'ʏ': 'I.', 'ʊ': 'U', 'ɷ': 'U', // Close-mid 'e': 'e', 'ø': 'Y', //'ɘ': '@<umd>', // Reference 'ɘ': '@', // Simplified 'ɵ': '@.', //'ɵ': '8', // eSpeak 'ɤ': 'o-', 'o': 'o', // Mid: 'ə': '@', // Open-mid: 'ɛ': 'E', 'œ': 'W', 'ɜ': '3', // eSpeak 'ɜː': '3:', //'ɜ': 'V"', 'ɞ': 'O"', 'ʌ': 'V', 'ɔ': 'O', // Near-open //'æ': '&', // Reference. note: passing '&' to eSpeak in SSML (XML) mode will cause an error due to escaping conflict 'æ': 'a', // eSpeak 'ɐ': 'a#', // Open 'a': 'a', //'ɶ': '&.', // Reference. note: passing '&' to eSpeak in SSML (XML) mode will cause an error due to escaping conflict 'ɶ': 'W#', // eSpeak 'ä': 'a"', 'ɒ̈': 'A".', 'ɑ': 'A', 'ɑː': '0', // eSpeak //'ɒ': 'A.', // Reference 'ɒ': '0', // eSpeak // Rhotic //'ɚ': 'R', 'ɚ': '3', // eSpeak //'ɝ': 'R<umd>', // Reference 'ɝ': 'R', // Simplified // More vowels 'ᵊ': '', //// Consonants // Nasals 'm': 'm', 'ɱ': 'M', 'n̪': 'n[', 'n': 'n', 'ɳ': 'n.', 'ṇ': 'n.', 'ɲ': 'n^', 'ŋ': 'N', 'ɴ': 'n"', //'n͡g': 'n<lbv>', // Stops 'p': 'p', 'b': 'b', 't̪': 't[', 'd̪': 'd[', 't': 't', 'd': 'd', 'ʈ': 't.', 'ṭ': 't.', 'ɖ': 'd.', 'ḍ': 'd.', 'c': 'c', 'ɟ': 'J', 'k': 'k', 'ɡ': 'g', 'q': 'q', 'ɢ': 'G', //'k͡p': 't<lbv>', //'ɡ͡b': 'd<lbv>', 'ʔ': '?', // Fricatives 'φ': 'P', 'Φ': 'P', 'β': 'B', 'f': 'f', 'v': 'v', 'θ': 'T', 'ð': 'D', 's': 's', 'z': 'z', 'ʂ': 's.', 'ṣ': 's.', 'ʐ': 'z.', 'ẓ': 'z.', 'ʃ': 'S', 'ʒ': 'Z', 'ç': 'C', 'ʝ': 'C<vcd>', 'x': 'x', 'ɣ': 'Q', 'χ': 'X', 'ʁ': 'g"', // Reference //'ʁ': 'r', // eSpeak 'ʍ': 'w<vls>', 'w': 'w', 'ħ': 'H', 'ʕ': 'H<vcd>', 'h': 'h', 'ɦ': 'h<?>', // Approximants 'ʋ': 'r<lbd>', 'ɹ̪': 'r[', 'ɹ': 'r', 'ɻ': 'r.', 'ɻ̣': 'r.', 'j': 'j', 'ɥ': 'j<rnd>', 'ɰ': 'j<vel>', // Laterals 'l̪': 'l[', 'l': 'l', 'ɭ': 'l.', 'ḷ': 'l.', 'ʎ': 'l^', 'ʟ': 'L', // Trills 'ʙ': 'b<trl>', //'r': 'r<trl>', // Reference alv trl 'r': 'R', // eSpeak 'ʀ': 'r"', // Reference uvl trl // Flaps //'ɾ': '*', 'ɾ': 't#', // eSpeak 'ɽ': '*.', 'ṛ': '*.', // Ejectives 'pʼ': 'p`', 'tʼ': 't`', 'cʼ': 'c`', 'kʼ': 'k`', 'qʼ': 'q`', 'ʠ': 'q`', // Implosives 'ɓ': 'b`', 'ɗ': 'd`', 'ʄ': 'J`', 'ɠ': 'g`', 'ʛ': 'G`', // Clicks 'ʘ': 'p!', 'ǀ': 't!', 'ʇ': 't!', 'ǃ': 'c!', 'ǂ': 'c!', 'ʗ': 'c!', 'ʞ': 'k!', 'ǁ': 'l!', 'ʖ': 'l!', // Lateral fricative 'ɬ': 'L', 'ɮ': 'z<lat>', 'ɫ': 'L', // Lateral flap 'ɺ': '*<lat>', // Other consonants 'ʰ': '#', 'ʲ': ';', '̪': '[', '̃': '~', // Stress marks and other misc characters 'ˈ': '\'', 'ˌ': ',', 'ː': ':', '-': '-', }