UNPKG

phonemize

Version:

Fast phonemizer with rule-based G2P prediction. Pure JavaScript implementation.

239 lines (238 loc) 8.78 kB
"use strict";
/**
 * Utility functions for phoneme format conversion:
 * IPA <-> ARPABET, Chinese tone marks <-> arrow symbols, and pinyin -> Zhuyin.
 *
 * All mapping tables come from ./consts and are only ever read here.
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.ipaToArpabet = ipaToArpabet;
exports.arpabetToIpa = arpabetToIpa;
exports.convertChineseTonesToArrows = convertChineseTonesToArrows;
exports.pinyinToZhuyin = pinyinToZhuyin;
exports.convertChineseTonesToUnicode = convertChineseTonesToUnicode;
const consts_1 = require("./consts");

/** Characters that have a special meaning inside a regular expression. */
const REGEXP_SPECIAL_CHARS = /[.*+?^${}()|[\]\\]/g;

/** Two-letter pinyin initials; they must be tried before the single letters. */
const TWO_LETTER_INITIALS = ['zh', 'ch', 'sh'];

/** Single-letter pinyin initials, in the order they are tried. */
const SINGLE_LETTER_INITIALS = [
    'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h',
    'j', 'q', 'x', 'r', 'z', 'c', 's', 'y', 'w',
];

/**
 * Read an entry from a mapping table, ignoring inherited properties.
 *
 * A bare `table[key]` also resolves names found on the prototype chain
 * (e.g. "constructor", "toString" when the table is a plain object), which
 * would leak a function into the converted output for such inputs.
 *
 * @param table - Mapping object
 * @param key - Key to look up
 * @returns The table's own value for `key`, or undefined when absent
 */
function lookup(table, key) {
    return Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined;
}

/**
 * Replace every occurrence of a literal substring.
 *
 * The search text is regex-escaped, and the replacement is supplied through a
 * function so that "$" sequences in it are inserted verbatim.
 *
 * @param text - String to search in
 * @param search - Literal text to find
 * @param replacement - Literal text to insert
 * @returns `text` with all occurrences replaced
 */
function replaceAllLiteral(text, search, replacement) {
    const pattern = new RegExp(search.replace(REGEXP_SPECIAL_CHARS, '\\$&'), 'g');
    return text.replace(pattern, () => replacement);
}

/**
 * Convert IPA phonetic notation to ARPABET format
 * @param ipa - IPA phonetic string
 * @returns ARPABET symbols separated by single spaces; "" for empty,
 *          whitespace-only or non-string input
 */
function ipaToArpabet(ipa) {
    if (!ipa || typeof ipa !== 'string' || !ipa.trim()) {
        return "";
    }
    const result = [];
    let i = 0;
    while (i < ipa.length) {
        const char = ipa[i];
        // A stress marker is attached as a suffix to the phoneme that follows it.
        const stress = lookup(consts_1.IPA_TO_STRESS, char);
        if (stress) {
            i++;
            const stressed = getNextPhoneme(ipa, i);
            if (stressed) {
                result.push(stressed.arpabet + stress);
                i += stressed.length;
            }
            // When no known phoneme follows, the marker is dropped and the
            // following character is handled by the next iteration.
            continue;
        }
        // Regular phoneme: two-character symbols win over single characters.
        const phoneme = getNextPhoneme(ipa, i);
        if (phoneme) {
            result.push(phoneme.arpabet);
            i += phoneme.length;
            continue;
        }
        if (char === ' ') {
            if (result.length > 0 && result[result.length - 1] !== ' ') {
                result.push(' ');
            }
        }
        else if (char.trim()) {
            // Unknown non-whitespace symbol: emitted as the literal token
            // 'undefined' (existing behaviour, kept for compatibility).
            result.push('undefined');
        }
        i++;
    }
    // Joining with ' ' and collapsing whitespace runs leaves exactly one
    // space between tokens.
    return result.join(' ').replace(/\s+/g, ' ').trim();
}

/**
 * Convert ARPABET phonetic notation to IPA format
 * @param arpabet - ARPABET phonetic string (whitespace-separated symbols,
 *                  each optionally suffixed with a stress digit 0, 1 or 2)
 * @returns IPA formatted string; "" for empty, whitespace-only or non-string input
 */
function arpabetToIpa(arpabet) {
    if (!arpabet || typeof arpabet !== 'string' || !arpabet.trim()) {
        return "";
    }
    const phonemes = arpabet.split(/\s+/).filter(p => p.trim());
    const result = [];
    let primaryStressFound = false;
    let secondaryStressFound = false;
    for (const phoneme of phonemes) {
        const stressMatch = phoneme.match(/([012])$/);
        const stress = stressMatch ? stressMatch[0] : "";
        const basePhoneme = phoneme.replace(/[012]$/, "");
        const ipaPhoneme = lookup(consts_1.ARPABET_TO_IPA, basePhoneme);
        if (ipaPhoneme) {
            result.push(ipaPhoneme);
            // Only stress on recognised phonemes is tracked.
            if (stress === "1") {
                primaryStressFound = true;
            }
            else if (stress === "2") {
                secondaryStressFound = true;
            }
        }
        else {
            // Preserve unknown phonemes as-is (including any stress digit)
            result.push(phoneme);
        }
    }
    // A single stress mark is placed at the very start of the output, not in
    // front of the stressed phoneme; primary stress wins over secondary.
    let finalResult = result.join("");
    if (primaryStressFound) {
        finalResult = "ˈ" + finalResult;
    }
    else if (secondaryStressFound) {
        finalResult = "ˌ" + finalResult;
    }
    return finalResult;
}

/**
 * Helper function to extract the next phoneme from IPA string
 * @param ipa - IPA string
 * @param startIndex - Starting index
 * @returns Object with the ARPABET equivalent and the number of characters
 *          consumed, or null when no known symbol starts at `startIndex`
 */
function getNextPhoneme(ipa, startIndex) {
    // Try two-character symbols first
    const twoChar = ipa.substring(startIndex, startIndex + 2);
    const twoCharArpabet = lookup(consts_1.IPA_TO_ARPABET, twoChar);
    if (twoCharArpabet) {
        return { arpabet: twoCharArpabet, length: 2 };
    }
    // Try single character
    const oneCharArpabet = lookup(consts_1.IPA_TO_ARPABET, ipa[startIndex]);
    if (oneCharArpabet) {
        return { arpabet: oneCharArpabet, length: 1 };
    }
    return null;
}

/**
 * Convert Chinese IPA tone marks to arrow format
 * @param ipa - IPA string with Chinese tone marks
 * @returns IPA string with arrow tone symbols; falsy or non-string input is
 *          returned unchanged
 */
function convertChineseTonesToArrows(ipa) {
    if (!ipa || typeof ipa !== 'string') {
        return ipa;
    }
    // Sort by length (longest first) to avoid partial replacements
    const toneKeys = Object.keys(consts_1.CHINESE_TONE_TO_ARROW).sort((a, b) => b.length - a.length);
    let result = ipa;
    for (const tonePattern of toneKeys) {
        result = replaceAllLiteral(result, tonePattern, consts_1.CHINESE_TONE_TO_ARROW[tonePattern]);
    }
    return result;
}

/**
 * Convert pinyin syllable to Zhuyin (Bopomofo) notation
 * @param pinyin - Pinyin syllable with tone number (e.g., "zhong1", "wen2")
 * @returns Zhuyin notation with tone number (e.g., "ㄓㄨㄥ1", "ㄨㄣ2");
 *          empty, whitespace-only or non-string input is returned unchanged
 */
function pinyinToZhuyin(pinyin) {
    // Non-strings are handed back untouched instead of throwing on .trim(),
    // matching the other exported converters.
    if (typeof pinyin !== 'string' || !pinyin.trim()) {
        return pinyin;
    }
    // Extract tone number from the end
    const toneMatch = pinyin.match(/([1-5])$/);
    const toneNumber = toneMatch ? toneMatch[1] : '';
    const syllableWithoutTone = pinyin.replace(/[1-5]$/, '');
    // Handle special complete syllables first.
    // NOTE(review): this path appends the tone only when one was given, whereas
    // the decomposed path below defaults to '5' — confirm that is intended.
    const wholeSyllable = lookup(consts_1.PINYIN_FINALS_TO_ZHUYIN, syllableWithoutTone);
    if (wholeSyllable) {
        return wholeSyllable + toneNumber;
    }
    // Decompose pinyin into initial and final
    const { initial, final } = decomposePinyinSyllable(syllableWithoutTone);
    const zhuyinInitial = initial ? lookup(consts_1.PINYIN_INITIALS_TO_ZHUYIN, initial) : undefined;
    const zhuyinFinal = final ? lookup(consts_1.PINYIN_FINALS_TO_ZHUYIN, final) : undefined;
    let zhuyin = '';
    if (zhuyinInitial) {
        zhuyin += zhuyinInitial;
    }
    if (zhuyinFinal) {
        zhuyin += zhuyinFinal;
    }
    else if (final) {
        // If the final is not recognized, the syllable is invalid. Revert to the original.
        zhuyin = syllableWithoutTone;
        console.warn(`Could not find a Zhuyin mapping for pinyin final: ${final}`);
    }
    else if (initial) {
        // If there is only an initial but it's not a special syllable, it's invalid.
        zhuyin = syllableWithoutTone;
    }
    // Append the tone number. Default to 5 (neutral tone) if not present.
    return zhuyin + (toneNumber || '5');
}

/**
 * Decompose pinyin syllable into initial and final parts
 * @param syllable - Pinyin syllable without tone
 * @returns Object with initial and final parts; both are '' for empty,
 *          whitespace-only or non-string input
 */
function decomposePinyinSyllable(syllable) {
    if (typeof syllable !== 'string' || !syllable.trim()) {
        return { initial: '', final: '' };
    }
    // The two-letter initials are checked first so that "zh" is not split as "z" + "h...".
    for (const initial of TWO_LETTER_INITIALS) {
        if (syllable.startsWith(initial)) {
            return { initial, final: syllable.slice(initial.length) };
        }
    }
    for (const initial of SINGLE_LETTER_INITIALS) {
        if (syllable.startsWith(initial)) {
            return { initial, final: syllable.slice(initial.length) };
        }
    }
    // No initial found, entire syllable is final
    return { initial: '', final: syllable };
}

/**
 * Convert Chinese IPA arrow format back to Unicode tone marks
 * @param ipa - IPA string with arrow tone symbols
 * @returns IPA string with Unicode tone marks; falsy or non-string input is
 *          returned unchanged
 */
function convertChineseTonesToUnicode(ipa) {
    if (!ipa || typeof ipa !== 'string') {
        return ipa;
    }
    // Reverse mapping from arrows to Unicode. When several tone marks share
    // one arrow symbol, the last entry of the table wins.
    const arrowToUnicode = {};
    for (const [unicode, arrow] of Object.entries(consts_1.CHINESE_TONE_TO_ARROW)) {
        arrowToUnicode[arrow] = unicode;
    }
    // Sort by length (longest first) so multi-character arrows are replaced
    // before their single-character prefixes.
    const arrowKeys = Object.keys(arrowToUnicode).sort((a, b) => b.length - a.length);
    let result = ipa;
    for (const arrowSymbol of arrowKeys) {
        result = replaceAllLiteral(result, arrowSymbol, arrowToUnicode[arrowSymbol]);
    }
    return result;
}