UNPKG

phonemize

Version:

Fast phonemizer with rule-based G2P prediction. Pure JavaScript implementation.

150 lines (149 loc) 4.82 kB
"use strict";
/**
 * Phonemize Library - Main API
 *
 * A comprehensive text-to-phoneme conversion library supporting:
 * - IPA (International Phonetic Alphabet) output
 * - ARPABET phonetic notation
 * - Multilingual text processing (Chinese, Japanese, Korean, Thai, Arabic, Russian)
 * - Rule-based G2P conversion with dictionary lookup
 * - Compound word decomposition
 * - Number and abbreviation expansion
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.Tokenizer = void 0;
exports.phonemize = phonemize;
exports.toIPA = toIPA;
exports.toARPABET = toARPABET;
exports.toZhuyin = toZhuyin;
exports.addPronunciation = addPronunciation;
exports.createTokenizer = createTokenizer;
const g2p_1 = require("./g2p");
const tokenizer_1 = require("./tokenizer");
Object.defineProperty(exports, "Tokenizer", { enumerable: true, get: function () { return tokenizer_1.Tokenizer; } });

/**
 * Tokenize `text` to a string with the output format forced to `format`.
 *
 * The caller's options object is copied, never mutated; a `format` key in
 * `options` is overridden. `null`/`undefined` options are accepted because
 * `Object.assign` skips nullish sources.
 *
 * @param text - Input text to convert
 * @param options - Caller-supplied tokenizer options (may be omitted)
 * @param format - Output format to enforce ("ipa", "arpabet" or "zhuyin")
 * @returns Result of `Tokenizer#tokenizeToString` for the merged options
 */
function tokenizeWithFormat(text, options, format) {
    const merged = Object.assign({}, options, { format });
    return new tokenizer_1.Tokenizer(merged).tokenizeToString(text);
}

/**
 * Report whether `value` is a string with at least one non-whitespace character.
 *
 * @param value - Value to inspect
 * @returns `true` only for non-blank strings
 */
function isNonBlankString(value) {
    return typeof value === "string" && value.trim() !== "";
}

/**
 * Main phonemize function implementation
 *
 * @param text - Input text to convert
 * @param options - Tokenizer options. The legacy boolean form `true` is
 *   treated as `{ returnArray: true }`; `null` is treated like omitted options.
 * @returns The result of `tokenizeToTokens` when `options.returnArray` is
 *   truthy, otherwise the result of `tokenizeToString`
 */
function phonemize(text, options = {}) {
    let resolved = options;
    if (resolved === true) {
        // Handle legacy boolean parameter
        resolved = { returnArray: true };
    }
    else if (resolved === null) {
        // Default parameters only cover `undefined`; without this guard the
        // `returnArray` lookup below would throw on an explicit `null`.
        resolved = {};
    }
    const tokenizer = new tokenizer_1.Tokenizer(resolved);
    if (resolved.returnArray) {
        return tokenizer.tokenizeToTokens(text);
    }
    return tokenizer.tokenizeToString(text);
}

/**
 * Convert text to International Phonetic Alphabet (IPA) notation
 *
 * @param text - Input text to convert
 * @param options - Configuration options (format will be overridden to 'ipa')
 * @returns IPA phonetic string
 *
 * @example
 * ```typescript
 * toIPA("hello world") // "həloʊ wɝld"
 * toIPA("中文", { anyAscii: false }) // "ʈʂʊŋ˥˥ wən˧˥"
 * ```
 */
function toIPA(text, options) {
    return tokenizeWithFormat(text, options, "ipa");
}

/**
 * Convert text to ARPABET phonetic notation
 *
 * @param text - Input text to convert
 * @param options - Configuration options (format will be overridden to 'arpabet')
 * @returns ARPABET phonetic string
 *
 * @example
 * ```typescript
 * toARPABET("hello world") // "HH AH L OW W ER L D"
 * toARPABET("testing", { stripStress: true }) // "T EH S T IH NG"
 * ```
 */
function toARPABET(text, options) {
    return tokenizeWithFormat(text, options, "arpabet");
}

/**
 * Convert text to Zhuyin (Bopomofo) notation
 * Chinese characters are converted to Zhuyin with tone numbers,
 * non-Chinese characters are converted to IPA as fallback.
 *
 * @param text - Input text to convert
 * @param options - Configuration options (format will be overridden to 'zhuyin')
 * @returns Zhuyin phonetic string with tone numbers
 *
 * @example
 * ```typescript
 * toZhuyin("中文") // "ㄓㄨㄥ1 ㄨㄣ2"
 * toZhuyin("中文 hello") // "ㄓㄨㄥ1 ㄨㄣ2 həˈloʊ"
 * toZhuyin("測試", { stripStress: true }) // "ㄘㄜ4 ㄕ4"
 * ```
 */
function toZhuyin(text, options) {
    return tokenizeWithFormat(text, options, "zhuyin");
}

/**
 * Add custom pronunciation to the internal dictionary
 *
 * The word is trimmed and lower-cased before being registered; the
 * pronunciation is stored exactly as given.
 *
 * @param word - Word to add pronunciation for
 * @param pronunciation - IPA pronunciation string
 * @throws Error if either argument is not a string or is blank
 *
 * @example
 * ```typescript
 * addPronunciation("github", "ɡɪthʌb");
 * toIPA("github") // "ɡɪthʌb"
 * ```
 */
function addPronunciation(word, pronunciation) {
    // The explicit type check makes non-string input (numbers, objects, ...)
    // fail with the documented error rather than "trim is not a function".
    if (!isNonBlankString(word) || !isNonBlankString(pronunciation)) {
        throw new Error("Both word and pronunciation must be non-empty strings");
    }
    // Validation ignores surrounding whitespace, so the stored key does too.
    g2p_1.g2pModel.addPronunciation(word.trim().toLowerCase(), pronunciation);
}

/**
 * Create a custom tokenizer instance with specific configuration
 *
 * @param options - Tokenizer configuration options
 * @returns Configured Tokenizer instance
 *
 * @example
 * ```typescript
 * const tokenizer = createTokenizer({
 *   format: "ipa",
 *   stripStress: true,
 *   separator: "-"
 * });
 *
 * const result = tokenizer.tokenizeToString("hello");
 * ```
 */
function createTokenizer(options = {}) {
    return new tokenizer_1.Tokenizer(options);
}

/**
 * Phonemize library default export
 * Provides all core functions and classes for CommonJS compatibility
 */
const phonemizer = {
    // === Core Functions ===
    phonemize,
    toIPA,
    toARPABET,
    toZhuyin,
    // === Utilities ===
    addPronunciation,
    createTokenizer,
    // === Classes ===
    Tokenizer: tokenizer_1.Tokenizer,
};
exports.default = phonemizer;