phonemize
Version:
Fast phonemizer with rule-based G2P prediction. Pure JavaScript implementation.
239 lines (238 loc) • 8.78 kB
JavaScript
;
/**
* Utility functions for phoneme format conversion
*/
// Flag the exports object with the `__esModule` interop marker.
Object.defineProperty(exports, "__esModule", { value: true });
// Public API of this module. These assignments run before the function
// bodies appear in the file, which works because function declarations
// are hoisted.
exports.ipaToArpabet = ipaToArpabet;
exports.arpabetToIpa = arpabetToIpa;
exports.convertChineseTonesToArrows = convertChineseTonesToArrows;
exports.pinyinToZhuyin = pinyinToZhuyin;
exports.convertChineseTonesToUnicode = convertChineseTonesToUnicode;
// Lookup tables read by the converters below: IPA_TO_STRESS, IPA_TO_ARPABET,
// ARPABET_TO_IPA, CHINESE_TONE_TO_ARROW, PINYIN_INITIALS_TO_ZHUYIN and
// PINYIN_FINALS_TO_ZHUYIN.
const consts_1 = require("./consts");
/**
 * ARPABET vowel symbols. Only these may carry a stress digit.
 */
const ARPABET_VOWELS = new Set([
    'AA', 'AE', 'AH', 'AO', 'AW', 'AX', 'AXR', 'AY', 'EH', 'ER',
    'EY', 'IH', 'IX', 'IY', 'OW', 'OY', 'UH', 'UW', 'UX',
]);
/**
 * Convert IPA phonetic notation to ARPABET format.
 *
 * Phonemes are matched greedily (two-character symbols before single
 * characters) against `IPA_TO_ARPABET` and joined with single spaces.
 *
 * An IPA stress mark precedes the whole stressed syllable, whereas ARPABET
 * puts the stress digit on the vowel. The digit from `IPA_TO_STRESS` is
 * therefore carried forward to the next vowel of the same word
 * (e.g. "ˈkæt" -> "K AE1 T"). If the word ends, or another stress mark
 * appears, before any vowel is seen, the digit is attached to the first
 * phoneme that followed the mark instead.
 *
 * Whitespace only separates words and never appears as a token. Any other
 * unmapped character is emitted as the literal token "undefined".
 *
 * @param ipa - IPA phonetic string
 * @returns ARPABET formatted string; "" for empty, blank or non-string input
 */
function ipaToArpabet(ipa) {
    if (!ipa || typeof ipa !== 'string' || !ipa.trim()) {
        return "";
    }
    const result = [];
    // Stress digit waiting for a vowel, and the index in `result` of the first
    // phoneme emitted after the stress mark (used when no vowel follows).
    let pendingStress = "";
    let fallbackIndex = -1;
    const settlePendingStress = () => {
        if (pendingStress && fallbackIndex !== -1) {
            result[fallbackIndex] += pendingStress;
        }
        pendingStress = "";
        fallbackIndex = -1;
    };
    let i = 0;
    while (i < ipa.length) {
        const char = ipa[i];
        const stress = consts_1.IPA_TO_STRESS[char];
        if (stress) {
            // A new mark closes any stress that is still waiting for a vowel.
            settlePendingStress();
            pendingStress = stress;
            i++;
            continue;
        }
        const phoneme = getNextPhoneme(ipa, i);
        if (phoneme) {
            if (pendingStress && ARPABET_VOWELS.has(phoneme.arpabet)) {
                result.push(phoneme.arpabet + pendingStress);
                pendingStress = "";
                fallbackIndex = -1;
            }
            else {
                if (pendingStress && fallbackIndex === -1) {
                    fallbackIndex = result.length;
                }
                result.push(phoneme.arpabet);
            }
            i += phoneme.length;
            continue;
        }
        if (!char.trim()) {
            // Whitespace ends the current word, so stress cannot cross it.
            settlePendingStress();
        }
        else {
            // Unknown non-space character - push as undefined
            result.push('undefined');
        }
        i++;
    }
    settlePendingStress();
    return result.join(' ').replace(/\s+/g, ' ').trim();
}
/**
 * Convert ARPABET phonetic notation to IPA format.
 *
 * The input is split on whitespace. A trailing stress digit (0, 1 or 2) is
 * stripped from each token before it is looked up in `ARPABET_TO_IPA`.
 * Tokens with no mapping are copied to the output unchanged, digit included.
 * The converted phonemes are concatenated without separators.
 *
 * Stress is reported once for the whole string: "ˈ" is prefixed if any
 * mapped phoneme carried primary stress (1), otherwise "ˌ" if any carried
 * secondary stress (2). The marker is always placed at the very beginning.
 *
 * @param arpabet - ARPABET phonetic string
 * @returns IPA formatted string; "" for empty, blank or non-string input
 */
function arpabetToIpa(arpabet) {
    if (!arpabet || typeof arpabet !== 'string' || !arpabet.trim()) {
        return "";
    }
    const table = consts_1.ARPABET_TO_IPA;
    // Own-property lookup only: a plain `table[token]` would resolve tokens
    // such as "constructor" or "toString" to inherited Object.prototype
    // members and splice their source text into the output.
    const hasOwn = Object.prototype.hasOwnProperty;
    const converted = [];
    let primaryStressFound = false;
    let secondaryStressFound = false;
    for (const phoneme of arpabet.split(/\s+/).filter(Boolean)) {
        const stress = /[012]$/.test(phoneme) ? phoneme.slice(-1) : "";
        const basePhoneme = stress ? phoneme.slice(0, -1) : phoneme;
        const ipaPhoneme = hasOwn.call(table, basePhoneme) ? table[basePhoneme] : undefined;
        if (!ipaPhoneme) {
            // Preserve unknown phonemes as-is
            converted.push(phoneme);
            continue;
        }
        converted.push(ipaPhoneme);
        // Only stress on recognised phonemes counts.
        if (stress === "1") {
            primaryStressFound = true;
        }
        else if (stress === "2") {
            secondaryStressFound = true;
        }
    }
    const joined = converted.join("");
    if (primaryStressFound) {
        return "ˈ" + joined;
    }
    if (secondaryStressFound) {
        return "ˌ" + joined;
    }
    return joined;
}
/**
 * Look up the IPA symbol that starts at a given position.
 *
 * The two-character slice is tried before the single character, so digraphs
 * win over their first letter. The reported length is the candidate width
 * that was tried (2 or 1).
 *
 * @param ipa - IPA string
 * @param startIndex - Position at which the symbol should start
 * @returns `{ arpabet, length }` for the first candidate with a mapping in
 *          `IPA_TO_ARPABET`, or `null` when neither candidate is mapped
 */
function getNextPhoneme(ipa, startIndex) {
    const table = consts_1.IPA_TO_ARPABET;
    // Candidates in priority order: [symbol, width].
    const candidates = [
        [ipa.substring(startIndex, startIndex + 2), 2],
        [ipa[startIndex], 1],
    ];
    for (const [symbol, length] of candidates) {
        const arpabet = table[symbol];
        if (arpabet) {
            return { arpabet, length };
        }
    }
    return null;
}
/**
 * Convert Chinese IPA tone marks to arrow format.
 *
 * Every key of `CHINESE_TONE_TO_ARROW` found in the input is replaced by its
 * arrow symbol. Keys are applied longest first so that a multi-character
 * contour is replaced before any shorter key it contains.
 *
 * @param ipa - IPA string with Chinese tone marks
 * @returns IPA string with arrow tone symbols; falsy or non-string input is
 *          returned unchanged
 */
function convertChineseTonesToArrows(ipa) {
    if (!ipa || typeof ipa !== 'string') {
        return ipa;
    }
    const toneToArrow = consts_1.CHINESE_TONE_TO_ARROW;
    // Sort by length (longest first) to avoid partial replacements
    const toneKeys = Object.keys(toneToArrow).sort((a, b) => b.length - a.length);
    let result = ipa;
    for (const tonePattern of toneKeys) {
        const arrowSymbol = toneToArrow[tonePattern];
        // Keys are literal text, so regex metacharacters in them are escaped
        // (as convertChineseTonesToUnicode already does). The function
        // replacer keeps "$" sequences in the arrow text from being expanded.
        const literalPattern = tonePattern.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
        result = result.replace(new RegExp(literalPattern, 'g'), () => arrowSymbol);
    }
    return result;
}
/**
 * Convert pinyin syllable to Zhuyin (Bopomofo) notation.
 *
 * A trailing tone digit (1-5) is split off first. If the remaining syllable
 * is itself a key of `PINYIN_FINALS_TO_ZHUYIN`, that mapping is returned with
 * the tone digit appended; no default tone is added on this path. Otherwise
 * the syllable is split into initial and final, each part is mapped
 * separately, and the tone digit is appended, defaulting to "5" when the
 * input had none.
 *
 * When the final has no mapping (a warning is logged), or there is an
 * initial but no final, the romanized syllable is kept in place of Zhuyin.
 *
 * @param pinyin - Pinyin syllable with tone number (e.g., "zhong1", "wen2")
 * @returns Zhuyin notation with tone number (e.g., "ㄓㄨㄥ1", "ㄨㄣ2");
 *          blank or non-string input is returned unchanged
 */
function pinyinToZhuyin(pinyin) {
    // Non-strings are passed through like in the other converters, not
    // allowed to throw on `.trim`.
    if (typeof pinyin !== 'string' || !pinyin.trim()) {
        return pinyin;
    }
    // Own-property lookup only, so syllables such as "constructor" are not
    // resolved to inherited Object.prototype members.
    const hasOwn = Object.prototype.hasOwnProperty;
    const lookup = (table, key) => (hasOwn.call(table, key) ? table[key] : undefined);
    // Extract tone number from the end
    const toneMatch = pinyin.match(/([1-5])$/);
    const toneNumber = toneMatch ? toneMatch[1] : '';
    const syllableWithoutTone = pinyin.replace(/[1-5]$/, '');
    // Handle special complete syllables first
    const wholeSyllable = lookup(consts_1.PINYIN_FINALS_TO_ZHUYIN, syllableWithoutTone);
    if (wholeSyllable) {
        return wholeSyllable + toneNumber;
    }
    // Decompose pinyin into initial and final
    const { initial, final } = decomposePinyinSyllable(syllableWithoutTone);
    const zhuyinInitial = initial ? lookup(consts_1.PINYIN_INITIALS_TO_ZHUYIN, initial) : undefined;
    const zhuyinFinal = final ? lookup(consts_1.PINYIN_FINALS_TO_ZHUYIN, final) : undefined;
    let zhuyin = zhuyinInitial || '';
    if (zhuyinFinal) {
        zhuyin += zhuyinFinal;
    }
    else if (final) {
        // If the final is not recognized, the syllable is invalid. Revert to the original.
        zhuyin = syllableWithoutTone;
        console.warn(`Could not find a Zhuyin mapping for pinyin final: ${final}`);
    }
    else if (initial) {
        // If there is only an initial but it's not a special syllable, it's invalid.
        zhuyin = syllableWithoutTone;
    }
    // Append the tone number. Default to 5 (neutral tone) if not present.
    return zhuyin + (toneNumber || '5');
}
/**
 * Split a toneless pinyin syllable into its initial and final.
 *
 * The syllable is checked against a fixed, ordered list of initials; the
 * two-letter retroflex initials come first so that "zh", "ch" and "sh" are
 * not mistaken for "z", "c" and "s". Whatever follows the matched initial is
 * the final. With no matching initial the whole syllable is the final.
 *
 * @param syllable - Pinyin syllable without tone
 * @returns `{ initial, final }`; both are empty strings for null, undefined,
 *          empty or whitespace-only input
 */
function decomposePinyinSyllable(syllable) {
    const isBlank = syllable == null || !syllable.trim();
    if (isBlank) {
        return { initial: '', final: '' };
    }
    // Order matters: retroflex digraphs first, then the single letters.
    const orderedInitials = [
        'zh', 'ch', 'sh',
        'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h',
        'j', 'q', 'x', 'r', 'z', 'c', 's', 'y', 'w',
    ];
    const matched = orderedInitials.find((candidate) => syllable.startsWith(candidate));
    if (matched === undefined) {
        // No initial found, entire syllable is final
        return { initial: '', final: syllable };
    }
    return { initial: matched, final: syllable.slice(matched.length) };
}
/**
 * Convert Chinese IPA arrow format back to Unicode tone marks.
 *
 * `CHINESE_TONE_TO_ARROW` is inverted (if several tone marks share an arrow,
 * the last one listed wins) and every arrow found in the input is replaced
 * by its tone mark, longest arrow sequence first.
 *
 * @param ipa - IPA string with arrow tone symbols
 * @returns IPA string with Unicode tone marks; falsy or non-string input is
 *          returned unchanged
 */
function convertChineseTonesToUnicode(ipa) {
    if (typeof ipa !== 'string' || !ipa) {
        return ipa;
    }
    // Invert the tone -> arrow table into arrow -> tone.
    const toneByArrow = Object.entries(consts_1.CHINESE_TONE_TO_ARROW).reduce((inverse, [tone, arrow]) => {
        inverse[arrow] = tone;
        return inverse;
    }, {});
    // Longest arrows first so that ↓↗ is handled before ↓
    const arrowsLongestFirst = Object.keys(toneByArrow).sort((left, right) => right.length - left.length);
    return arrowsLongestFirst.reduce((text, arrow) => {
        const literalArrow = arrow.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
        return text.replace(new RegExp(literalArrow, 'g'), toneByArrow[arrow]);
    }, ipa);
}