phonemize
Version:
Fast phonemizer with rule-based G2P prediction. Pure JavaScript implementation.
150 lines (149 loc) • 4.82 kB
JavaScript
;
/**
* Phonemize Library - Main API
*
* A comprehensive text-to-phoneme conversion library supporting:
* - IPA (International Phonetic Alphabet) output
* - ARPABET phonetic notation
* - Multilingual text processing (Chinese, Japanese, Korean, Thai, Arabic, Russian)
* - Rule-based G2P conversion with dictionary lookup
* - Compound word decomposition
* - Number and abbreviation expansion
*/
// CommonJS export wiring for the public API.
// Set the `__esModule` marker on the exports object.
Object.defineProperty(exports, "__esModule", { value: true });
// Placeholder value; the Tokenizer export is redefined as a getter once
// ./tokenizer has been required below.
exports.Tokenizer = void 0;
// Function declarations are hoisted, so they can be attached to `exports`
// here even though their definitions appear later in the file.
exports.phonemize = phonemize;
exports.toIPA = toIPA;
exports.toARPABET = toARPABET;
exports.toZhuyin = toZhuyin;
exports.addPronunciation = addPronunciation;
exports.createTokenizer = createTokenizer;
// g2p_1 supplies `g2pModel` (used by addPronunciation); tokenizer_1 supplies
// the `Tokenizer` class used by every conversion function.
const g2p_1 = require("./g2p");
const tokenizer_1 = require("./tokenizer");
// Re-export Tokenizer through an enumerable getter that reads
// tokenizer_1.Tokenizer on each access.
Object.defineProperty(exports, "Tokenizer", { enumerable: true, get: function () { return tokenizer_1.Tokenizer; } });
/**
 * Convert text to phonemes using a Tokenizer built from the given options.
 *
 * @param text - Input text to convert
 * @param options - Tokenizer configuration options. For backward
 *   compatibility a boolean is also accepted and is treated as
 *   `{ returnArray: <that boolean> }`. `null` is treated like an omitted
 *   argument, matching how toIPA / toARPABET / toZhuyin tolerate `null`.
 * @returns The result of `tokenizeToTokens(text)` when `returnArray` is
 *   truthy, otherwise the result of `tokenizeToString(text)`
 *
 * @example
 * ```typescript
 * phonemize("hello")                        // string output
 * phonemize("hello", { returnArray: true }) // token output
 * phonemize("hello", true)                  // legacy form of the line above
 * ```
 */
function phonemize(text, options = {}) {
    // Normalize into a local rather than reassigning the parameter. The
    // default value above only covers `undefined`, so `null` and the legacy
    // boolean form have to be handled explicitly; otherwise `null` would
    // throw on the `returnArray` lookup and a raw `false` would be handed to
    // the Tokenizer constructor.
    let settings = options;
    if (typeof options === "boolean") {
        settings = { returnArray: options };
    }
    else if (options === null) {
        settings = {};
    }
    const tokenizer = new tokenizer_1.Tokenizer(settings);
    return settings.returnArray
        ? tokenizer.tokenizeToTokens(text)
        : tokenizer.tokenizeToString(text);
}
/**
 * Render text as International Phonetic Alphabet (IPA) output.
 *
 * The caller's options are copied into a fresh object and `format` is then
 * forced to "ipa", so a caller-supplied `format` is ignored and the caller's
 * own object is never modified.
 *
 * @param text - Text to convert
 * @param options - Optional tokenizer settings; `format` is always replaced by "ipa"
 * @returns The string produced by the tokenizer's `tokenizeToString`
 *
 * @example
 * ```typescript
 * toIPA("hello world") // "həloʊ wɝld"
 * toIPA("中文", { anyAscii: false }) // "ʈʂʊŋ˥˥ wən˧˥"
 * ```
 */
function toIPA(text, options) {
    // Object.assign applies its sources left to right, so the trailing
    // `format` entry wins over anything present in `options`.
    const settings = Object.assign({}, options, { format: "ipa" });
    return new tokenizer_1.Tokenizer(settings).tokenizeToString(text);
}
/**
 * Render text as ARPABET phonetic notation.
 *
 * The caller's options are copied into a fresh object and `format` is then
 * forced to "arpabet", so a caller-supplied `format` is ignored and the
 * caller's own object is never modified.
 *
 * @param text - Text to convert
 * @param options - Optional tokenizer settings; `format` is always replaced by "arpabet"
 * @returns The string produced by the tokenizer's `tokenizeToString`
 *
 * @example
 * ```typescript
 * toARPABET("hello world") // "HH AH L OW W ER L D"
 * toARPABET("testing", { stripStress: true }) // "T EH S T IH NG"
 * ```
 */
function toARPABET(text, options) {
    // Object.assign applies its sources left to right, so the trailing
    // `format` entry wins over anything present in `options`.
    const settings = Object.assign({}, options, { format: "arpabet" });
    return new tokenizer_1.Tokenizer(settings).tokenizeToString(text);
}
/**
 * Render text as Zhuyin (Bopomofo) notation.
 *
 * This wrapper only forces `format` to "zhuyin" on a copy of the caller's
 * options and delegates to the Tokenizer. Per the library's documented
 * contract, Chinese characters come back as Zhuyin with tone numbers and
 * non-Chinese text falls back to IPA; that behavior is implemented by the
 * Tokenizer rather than here.
 *
 * @param text - Text to convert
 * @param options - Optional tokenizer settings; `format` is always replaced by "zhuyin"
 * @returns The string produced by the tokenizer's `tokenizeToString`
 *
 * @example
 * ```typescript
 * toZhuyin("中文") // "ㄓㄨㄥ1 ㄨㄣ2"
 * toZhuyin("中文 hello") // "ㄓㄨㄥ1 ㄨㄣ2 həˈloʊ"
 * toZhuyin("測試", { stripStress: true }) // "ㄘㄜ4 ㄕ4"
 * ```
 */
function toZhuyin(text, options) {
    // Object.assign applies its sources left to right, so the trailing
    // `format` entry wins over anything present in `options`.
    const settings = Object.assign({}, options, { format: "zhuyin" });
    return new tokenizer_1.Tokenizer(settings).tokenizeToString(text);
}
/**
 * Register a custom pronunciation with the shared G2P model.
 *
 * The word is trimmed and lower-cased before it is handed to
 * `g2pModel.addPronunciation`; the pronunciation is passed through unchanged.
 *
 * @param word - Word to add a pronunciation for; must be a string containing
 *   at least one non-whitespace character
 * @param pronunciation - IPA pronunciation string; must be a string
 *   containing at least one non-whitespace character
 * @throws {Error} When either argument is not a string, or is empty or
 *   whitespace-only
 *
 * @example
 * ```typescript
 * addPronunciation("github", "ɡɪthʌb");
 * toIPA("github") // "ɡɪthʌb"
 * ```
 */
function addPronunciation(word, pronunciation) {
    // The explicit typeof check makes non-string input (numbers, objects, ...)
    // fail with the documented validation error instead of an opaque
    // "trim is not a function" TypeError.
    const isNonEmptyString = (value) => typeof value === "string" && value.trim() !== "";
    if (!isNonEmptyString(word) || !isNonEmptyString(pronunciation)) {
        throw new Error("Both word and pronunciation must be non-empty strings");
    }
    // Validation ignores surrounding whitespace, so the stored key does too;
    // otherwise "  github " would be registered under a padded key.
    // NOTE(review): assumes dictionary lookups use whitespace-free tokens -
    // confirm against the g2p model.
    g2p_1.g2pModel.addPronunciation(word.trim().toLowerCase(), pronunciation);
}
/**
 * Build a standalone Tokenizer with caller-chosen settings.
 *
 * The options object is handed to the Tokenizer constructor as-is; when the
 * argument is omitted an empty object is passed instead.
 *
 * @param options - Tokenizer configuration options (defaults to `{}`)
 * @returns A new Tokenizer instance constructed with `options`
 *
 * @example
 * ```typescript
 * const tokenizer = createTokenizer({
 *   format: "ipa",
 *   stripStress: true,
 *   separator: "-"
 * });
 *
 * const result = tokenizer.tokenizeToString("hello");
 * ```
 */
function createTokenizer(options = {}) {
    const { Tokenizer } = tokenizer_1;
    return new Tokenizer(options);
}
/**
* Aggregate object published as the module's `default` export.
* It bundles every public function of this file together with the Tokenizer
* class, so the same API that is available through the named exports can
* also be reached through a single value.
*/
const phonemizer = {
// Text-to-phoneme conversion entry points
phonemize,
toIPA,
toARPABET,
toZhuyin,
// Custom dictionary entry and tokenizer factory
addPronunciation,
createTokenizer,
// Tokenizer class as provided by ./tokenizer (captured once, at module load)
Tokenizer: tokenizer_1.Tokenizer,
};
// Expose the aggregate as `exports.default`.
exports.default = phonemizer;