phonemize
Version:
Fast phonemizer with rule-based G2P prediction. Pure JavaScript implementation.
321 lines (320 loc) • 11.5 kB
JavaScript
"use strict";
/**
* Chinese Grapheme-to-Phoneme (G2P) conversion system
* Converts Chinese text to International Phonetic Alphabet (IPA) notation
*/
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.chineseG2P = exports.ChineseG2P = void 0;
const pinyin_pro_1 = require("pinyin-pro");
const dict_json_1 = __importDefault(require("../data/zh/dict.json"));
const utils_1 = require("./utils");
(0, pinyin_pro_1.addDict)(dict_json_1.default, 'phonemize-zh');
/**
* Comprehensive pinyin to IPA phoneme mapping
* Organized by phoneme type for better maintainability
*/
const PINYIN_TO_IPA = {
// === INITIALS (聲母) ===
// Stops
'b': 'p', // 不送氣清雙唇塞音
'p': 'pʰ', // 送氣清雙唇塞音
'd': 't', // 不送氣清齒齦塞音
't': 'tʰ', // 送氣清齒齦塞音
'g': 'k', // 不送氣清軟顎塞音
'k': 'kʰ', // 送氣清軟顎塞音
// Affricates
'j': 'tɕ', // 不送氣清齦顎塞擦音
'q': 'tɕʰ', // 送氣清齦顎塞擦音
'zh': 'ʈʂ', // 不送氣清捲舌塞擦音
'ch': 'ʈʂʰ', // 送氣清捲舌塞擦音
'z': 'ts', // 不送氣清齒塞擦音
'c': 'tsʰ', // 送氣清齒塞擦音
// Fricatives
'f': 'f', // 清唇齒擦音
'x': 'ɕ', // 清齦顎擦音
'sh': 'ʂ', // 清捲舌擦音
'r': 'ʐ', // 濁捲舌擦音
's': 's', // 清齒擦音
'h': 'x', // 清軟顎擦音
// Nasals & Liquids
'm': 'm', // 雙唇鼻音
'n': 'n', // 齒齦鼻音
'l': 'l', // 齒齦邊音
// Glides
'w': 'w', // 圓唇軟顎近音
'y': 'j', // 硬顎近音
// === FINALS (韻母) ===
// Simple vowels
'a': 'a', // 低央不圓唇元音
'o': 'o', // 中後圓唇元音
'e': 'ə', // 中央元音(schwa)
'i': 'i', // 高前不圓唇元音
'u': 'u', // 高後圓唇元音
'ü': 'y', // 高前圓唇元音
'v': 'y', // ü的替代拼寫
// Diphthongs
'ai': 'aɪ', // 央低到高前雙元音
'ei': 'eɪ', // 中前到高前雙元音
'ao': 'ɑʊ', // 央低到高後雙元音
'ou': 'oʊ', // 中後到高後雙元音
// Nasal finals
'an': 'an', // 央低元音+齒齦鼻音
'en': 'ən', // 中央元音+齒齦鼻音
'ang': 'ɑŋ', // 央低元音+軟顎鼻音
'eng': 'əŋ', // 中央元音+軟顎鼻音
'ong': 'ʊŋ', // 高後近圓唇元音+軟顎鼻音
'er': 'ɚ', // 中央r化元音
// Complex finals with medials
'ia': 'ia', // i+a
'ie': 'iɛ', // i+e(實際音值接近ɛ)
'iao': 'iɑʊ', // i+ao
'iu': 'ioʊ', // i+ou(簡化拼寫)
'iou': 'ioʊ', // i+ou(完整拼寫)
'ian': 'iɛn', // i+an(實際音值)
'in': 'in', // i+n
'iang': 'iɑŋ', // i+ang
'ing': 'iŋ', // i+ng
'iong': 'iʊŋ', // i+ong
'ua': 'ua', // u+a
'uo': 'uɔ', // u+o
'uai': 'uaɪ', // u+ai
'ui': 'ueɪ', // u+ei(簡化拼寫)
'uei': 'ueɪ', // u+ei(完整拼寫)
'uan': 'uan', // u+an
'un': 'uən', // u+en(簡化拼寫)
'uen': 'uən', // u+en(完整拼寫)
'uang': 'uɑŋ', // u+ang
'ueng': 'uəŋ', // u+eng
'üe': 'yɛ', // ü+e
've': 'yɛ', // üe的替代拼寫
'üan': 'yɛn', // ü+an
'van': 'yɛn', // üan的替代拼寫
'ün': 'yn', // ü+n
'vn': 'yn', // ün的替代拼寫
// === SPECIAL SYLLABLES (特殊音節) ===
// Retroflex vowels
'zhi': 'ʈʂɨ', // zh+空韻
'chi': 'ʈʂʰɨ', // ch+空韻
'shi': 'ʂɨ', // sh+空韻
'ri': 'ʐɨ', // r+空韻
'zi': 'tsɨ', // z+空韻
'ci': 'tsʰɨ', // c+空韻
'si': 'sɨ', // s+空韻
// === COMMON COMPLETE SYLLABLES ===
'zhong': 'ʈʂʊŋ', // 中
'wen': 'wən', // 文
'hao': 'xɑʊ', // 好
'de': 'tə', // 的
'de0': 'tə', // 的 (輕聲)
'wo': 'wɔ', // 我
'ta': 'tʰa', // 他/她/它
'zhe': 'ʈʂə', // 這
'ge': 'kə', // 個
'le': 'lə', // 了
'yi': 'i', // 一/抑
'san': 'san', // 三
'wu': 'wu', // 五
'liu': 'lioʊ', // 六
'qi': 'tɕi', // 七
'ba': 'pa', // 八
'jiu': 'tɕioʊ', // 九
};
/**
* Chinese tone marks in IPA notation
*/
const TONE_MARKS = {
1: '˥˥', // 第一聲:高平調 (55)
2: '˧˥', // 第二聲:中升調 (35)
3: '˧˩˧', // 第三聲:低降升調 (214)
4: '˥˩', // 第四聲:高降調 (51)
5: '˧', // 輕聲:中平調 (3)
0: '', // 無聲調標記
};
// Cached values for performance
const INITIAL_PATTERNS = ['zh', 'ch', 'sh']; // Three-letter initials
const SINGLE_INITIALS = ['b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j', 'q', 'x', 'r', 'z', 'c', 's', 'y', 'w'];
/**
* Main Chinese G2P processor class
*/
class ChineseG2P {
/**
* Convert Chinese text to IPA phonetic notation
* @param text - Chinese text to convert
* @returns Space-separated IPA string
*/
textToIPA(text) {
if (!(text === null || text === void 0 ? void 0 : text.trim()))
return '';
const results = this.textToPinyinResults(text);
return results.map(result => result.ipa).join(' ');
}
/**
* Convert Chinese text to Zhuyin (Bopomofo) notation
* @param text - Chinese text to convert
* @returns Space-separated Zhuyin string with tone numbers
*/
textToZhuyin(text) {
if (!(text === null || text === void 0 ? void 0 : text.trim()))
return '';
const results = this.textToPinyinResults(text);
return results.map(result => {
if (this.isChinese(result.word)) {
// Convert Chinese characters to Zhuyin
// result.pinyin already includes tone number, so use it directly
return (0, utils_1.pinyinToZhuyin)(result.pinyin);
}
else {
// Keep non-Chinese characters as-is
return result.word;
}
}).join(' ');
}
/**
* Convert Chinese text to detailed pinyin analysis results
* @param text - Chinese text to convert
* @returns Array of detailed conversion results
*/
textToPinyinResults(text) {
if (!(text === null || text === void 0 ? void 0 : text.trim()))
return [];
const results = [];
try {
// Use pinyin-pro for pinyin conversion with tone numbers
const pinyinResults = (0, pinyin_pro_1.pinyin)(text, {
toneType: 'num',
type: 'array',
v: true, // Use 'v' for 'ü'
nonZh: 'removed' // Remove non-Chinese characters from pinyin result
});
for (let i = 0; i < text.length; i++) {
const char = text[i];
if (this.isChinese(char)) {
const pinyinResult = pinyinResults[Math.min(i, pinyinResults.length - 1)];
const { syllable, tone } = this.parsePinyinWithTone(pinyinResult || char);
const ipa = this.pinyinToIPA(syllable, tone);
results.push({
pinyin: pinyinResult || char,
tone: tone,
ipa: ipa,
word: char
});
}
else {
// Non-Chinese characters preserved as-is
results.push({
pinyin: char,
tone: 0,
ipa: char,
word: char
});
}
}
}
catch (error) {
// Fallback: return characters as-is if pinyin conversion fails
console.warn('Chinese G2P conversion failed:', error);
return Array.from(text).map(char => ({
pinyin: char,
tone: 0,
ipa: char,
word: char
}));
}
return results;
}
/**
* Convert pinyin syllable to IPA with tone
* @param pinyin - Pinyin syllable (without tone)
* @param tone - Tone number (1-5)
* @returns IPA string with tone marks
*/
pinyinToIPA(pinyin, tone) {
// Direct mapping lookup (most efficient)
const directMapping = PINYIN_TO_IPA[pinyin];
if (directMapping) {
return directMapping + TONE_MARKS[tone];
}
// Decompose and reconstruct
const { initial, final } = this.decomposePinyin(pinyin);
const initialIPA = PINYIN_TO_IPA[initial] || '';
const finalIPA = PINYIN_TO_IPA[final] || final;
return initialIPA + finalIPA + TONE_MARKS[tone];
}
/**
* Decompose pinyin into initial and final components
* @param pinyin - Complete pinyin syllable
* @returns Object with initial and final parts
*/
decomposePinyin(pinyin) {
// Check three-letter initials first (more specific)
for (const initial of INITIAL_PATTERNS) {
if (pinyin.startsWith(initial)) {
return {
initial: initial,
final: pinyin.slice(initial.length)
};
}
}
// Check single-letter initials
for (const initial of SINGLE_INITIALS) {
if (pinyin.startsWith(initial)) {
return {
initial: initial,
final: pinyin.slice(1)
};
}
}
// No initial found - treat entire string as final
return {
initial: '',
final: pinyin
};
}
/**
* Parse pinyin string with tone number
* @param pinyinWithTone - Pinyin with tone number suffix
* @returns Object with syllable and tone
*/
parsePinyinWithTone(pinyinWithTone) {
const match = pinyinWithTone.match(/^(.+?)([1-5]?)$/);
if (match) {
const syllable = match[1];
const toneStr = match[2];
const tone = toneStr ? parseInt(toneStr, 10) : 5; // Default to neutral tone
return { syllable, tone };
}
// Fallback
return { syllable: pinyinWithTone, tone: 5 };
}
/**
* Check if a character is Chinese
* @param char - Character to check
* @returns True if character is Chinese
*/
isChinese(char) {
const code = char.charCodeAt(0);
return (code >= 0x4e00 && code <= 0x9fff) || // CJK Unified Ideographs
(code >= 0x3400 && code <= 0x4dbf) || // CJK Extension A
(code >= 0x20000 && code <= 0x2a6df) || // CJK Extension B
(code >= 0x2a700 && code <= 0x2b73f) || // CJK Extension C
(code >= 0x2b740 && code <= 0x2b81f) || // CJK Extension D
(code >= 0x2b820 && code <= 0x2ceaf) || // CJK Extension E
(code >= 0x2ceb0 && code <= 0x2ebef); // CJK Extension F
}
/**
* Check if text contains Chinese characters
* @param text - Text to check
* @returns True if text contains Chinese characters
*/
isChineseText(text) {
return Array.from(text).some(char => this.isChinese(char));
}
}
exports.ChineseG2P = ChineseG2P;
/**
* Global Chinese G2P instance for convenient usage
*/
exports.chineseG2P = new ChineseG2P();