phonemize
Fast phonemizer with rule-based G2P prediction. Pure JavaScript implementation.
421 lines (420 loc) • 17.4 kB
JavaScript
"use strict";
/**
* Text tokenization and phoneme processing system
* Handles language detection, preprocessing, and format conversion
*/
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.Tokenizer = void 0;
exports.tokenizeText = tokenizeText;
exports.textToIPA = textToIPA;
exports.textToARPABET = textToARPABET;
const any_ascii_1 = __importDefault(require("any-ascii"));
const g2p_1 = require("./g2p");
const expand_1 = require("./expand");
const pos_tagger_1 = require("./pos-tagger");
const consts_1 = require("./consts");
const multilingual_processor_1 = require("./multilingual-processor");
const zh_g2p_1 = require("./zh-g2p");
const utils_1 = require("./utils");
/**
* Fast ARPABET to IPA conversion for legacy compatibility
*/
function arpabetToIpa(arpabet) {
var _a;
const stress = (_a = arpabet.match(/[012]$/)) === null || _a === void 0 ? void 0 : _a[0];
const arpabetWithoutStress = arpabet.replace(/[012]$/, "");
const ipa = consts_1.ARPABET_TO_IPA[arpabetWithoutStress];
return stress ? `${consts_1.IPA_STRESS_MAP[stress]}${ipa}` : ipa;
}
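// Illustrative only - actual values depend on the ARPABET_TO_IPA and IPA_STRESS_MAP tables in ./consts:
//   arpabetToIpa("HH")  → e.g. "h"
//   arpabetToIpa("IY1") → e.g. "ˈi" (the trailing stress digit is mapped to an IPA stress mark)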
/**
* Main tokenizer class for phoneme processing
*/
class Tokenizer {
constructor(options = {}) {
this.options = Object.assign({ stripStress: false, format: "ipa", separator: " ", anyAscii: false, homograph: {}, toneFormat: "unicode" }, options);
}
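    // Usage sketch: every option is optional and is merged over the defaults above, e.g.
    //   new Tokenizer({ format: "arpabet", stripStress: true, separator: "-" })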
/**
* Preprocess text with language detection and segmentation
*/
_preprocess(text) {
const segments = this._segmentByLanguage(text);
if (!this.options.anyAscii) {
return {
text,
languageMap: {},
segments,
};
}
// Apply anyAscii conversion while preserving Chinese for G2P
const words = text.split(/(\s+)/);
const languageMap = {};
let processedText = '';
for (const word of words) {
const trimmed = word.trim();
if (trimmed && !consts_1.PUNCTUATION.includes(trimmed)) {
const detectedLang = (0, multilingual_processor_1.detectLanguage)(trimmed);
if (detectedLang) {
if (detectedLang === 'zh' && zh_g2p_1.chineseG2P.isChineseText(trimmed)) {
// Preserve Chinese text for G2P processing
processedText += word;
languageMap[trimmed.toLowerCase()] = detectedLang;
}
else {
// Convert non-Chinese multilingual text to ASCII
const asciiWord = (0, any_ascii_1.default)(trimmed);
processedText += word.replace(trimmed, asciiWord);
languageMap[asciiWord.toLowerCase()] = detectedLang;
}
}
else {
// Convert non-multilingual text to ASCII
processedText += (0, any_ascii_1.default)(word);
}
}
else {
// Preserve whitespace and punctuation
processedText += word;
}
}
return {
text: processedText,
languageMap,
segments,
};
}
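    // Illustrative: with { anyAscii: true }, _preprocess("café 世界") converts "café" to "cafe"
    // via any-ascii, keeps "世界" intact for Chinese G2P, and records detected languages for
    // multilingual words in languageMap (assuming detectLanguage reports "世界" as "zh").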
/**
* Segment text by character-level language detection
*/
_segmentByLanguage(text) {
const segments = [];
let currentSegment = '';
let currentLanguage = '';
let segmentStartIndex = 0;
for (let i = 0; i < text.length; i++) {
const char = text[i];
const charLang = this._detectCharLanguage(char);
if (charLang !== currentLanguage) {
// Language changed - save current segment if not empty
if (currentSegment.trim()) {
segments.push({
text: currentSegment,
language: currentLanguage || 'en',
startIndex: segmentStartIndex
});
}
// Start new segment
currentSegment = char;
currentLanguage = charLang;
segmentStartIndex = i;
}
else {
currentSegment += char;
}
}
// Add final segment
if (currentSegment.trim()) {
segments.push({
text: currentSegment,
language: currentLanguage || 'en',
startIndex: segmentStartIndex
});
}
return segments;
}
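    // Illustrative: _segmentByLanguage("hello 世界") →
    //   [{ text: "hello ", language: "en", startIndex: 0 },
    //    { text: "世界", language: "zh", startIndex: 6 }]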
/**
* Fast character-level language detection
*/
_detectCharLanguage(char) {
const code = char.charCodeAt(0);
// Chinese (CJK) - most common ranges first
if ((code >= 0x4e00 && code <= 0x9fff) || // CJK Unified Ideographs
(code >= 0x3400 && code <= 0x4dbf) || // CJK Extension A
(code >= 0x20000 && code <= 0x2a6df)) { // CJK Extension B
return 'zh';
}
// Japanese
if ((code >= 0x3040 && code <= 0x309f) || // Hiragana
(code >= 0x30a0 && code <= 0x30ff)) { // Katakana
return 'ja';
}
// Korean
if ((code >= 0xac00 && code <= 0xd7af) || // Hangul Syllables
(code >= 0x1100 && code <= 0x11ff) || // Hangul Jamo
(code >= 0x3130 && code <= 0x318f)) { // Hangul Compatibility Jamo
return 'ko';
}
// Thai
if (code >= 0x0e00 && code <= 0x0e7f) {
return 'th';
}
// Arabic
if ((code >= 0x0600 && code <= 0x06ff) || // Arabic
(code >= 0x0750 && code <= 0x077f) || // Arabic Supplement
(code >= 0xfb50 && code <= 0xfdff) || // Arabic Presentation Forms-A
(code >= 0xfe70 && code <= 0xfeff)) { // Arabic Presentation Forms-B
return 'ar';
}
// Cyrillic (Russian, etc.)
if (code >= 0x0400 && code <= 0x04ff) {
return 'ru';
}
// Default to English/Latin
return 'en';
}
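    // Illustrative: _detectCharLanguage("界") → "zh", "あ" → "ja", "한" → "ko", "ج" → "ar", "ж" → "ru", "x" → "en"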
/**
* Post-process phonemes for format conversion and cleanup
*/
_postProcess(phonemes) {
if (this.options.format === "arpabet") {
// Convert to ARPABET format
phonemes = (0, utils_1.ipaToArpabet)(phonemes);
// Remove ARPABET stress markers if requested
if (this.options.stripStress) {
phonemes = phonemes.replace(/[012]/g, "");
}
}
else if (this.options.format === "zhuyin") {
// Zhuyin format processing - handled per token, not here
// This is a placeholder for any global zhuyin post-processing
// The actual conversion happens in the tokenize method
return phonemes;
}
else {
// IPA format processing
// Convert Chinese tone format if requested
if (this.options.toneFormat === "arrow") {
phonemes = (0, utils_1.convertChineseTonesToArrows)(phonemes);
}
// Remove IPA stress markers if requested
if (this.options.stripStress) {
phonemes = phonemes.replace(/[ˈˌ]/g, "");
}
}
return phonemes;
}
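    // Illustrative: with { format: "arpabet", stripStress: true } an IPA string such as "həˈloʊ"
    // is mapped through ipaToArpabet and any 0/1/2 stress digits are then removed; with
    // { format: "ipa", toneFormat: "arrow" } Chinese tones are rewritten via convertChineseTonesToArrows.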
/**
* Core tokenization method - converts text to phoneme array
*/
tokenize(text) {
var _a, _b;
if (!(text === null || text === void 0 ? void 0 : text.trim()))
return [];
const { text: processedText, languageMap } = this._preprocess(text);
const expandedText = (0, expand_1.expandText)(processedText);
// Improved tokenization for better Chinese word preservation
const tokens = this._smartTokenize(expandedText);
// Get POS tags for homograph disambiguation
const cleanWords = tokens.filter(token => token.trim() && !consts_1.PUNCTUATION.includes(token.trim()));
const posResults = pos_tagger_1.simplePOSTagger.tagWords(cleanWords);
const phonemes = [];
let cleanWordIndex = 0;
for (const token of tokens) {
const cleanToken = token.trim();
// Handle punctuation - preserve it
if (consts_1.PUNCTUATION.includes(cleanToken)) {
phonemes.push(cleanToken);
continue;
}
// Get POS tag for homograph disambiguation
const pos = (_a = posResults[cleanWordIndex]) === null || _a === void 0 ? void 0 : _a.pos;
cleanWordIndex++;
// Check for custom pronunciations
const customPronunciation = (_b = this.options.homograph) === null || _b === void 0 ? void 0 : _b[cleanToken.toLowerCase()];
if (customPronunciation) {
let processed = this._postProcess(customPronunciation);
// Apply custom separator to individual phonemes if needed
if (this.options.separator !== " ") {
processed = processed.split(' ').join(this.options.separator);
}
phonemes.push(processed);
continue;
}
// Check language map for multilingual words
const detectedLanguage = languageMap[cleanToken.toLowerCase()];
// Handle Zhuyin format specially
if (this.options.format === "zhuyin") {
let pronunciation;
// Check if it's Chinese text
if (zh_g2p_1.chineseG2P.isChineseText(cleanToken)) {
// Convert Chinese to Zhuyin
pronunciation = zh_g2p_1.chineseG2P.textToZhuyin(cleanToken);
}
else {
// Convert non-Chinese to IPA as fallback
pronunciation = (0, g2p_1.predict)(cleanToken, pos, detectedLanguage);
// Apply IPA post-processing but not tone format conversion
if (this.options.stripStress) {
pronunciation = pronunciation.replace(/[ˈˌ]/g, "");
}
}
// Apply custom separator
if (this.options.separator !== " ") {
pronunciation = pronunciation.split(' ').join(this.options.separator);
}
phonemes.push(pronunciation);
}
else {
// Regular IPA/ARPABET processing
let pronunciation = (0, g2p_1.predict)(cleanToken, pos, detectedLanguage);
pronunciation = this._postProcess(pronunciation);
// Apply custom separator to individual phonemes if needed
if (this.options.separator !== " ") {
pronunciation = pronunciation.split(' ').join(this.options.separator);
}
phonemes.push(pronunciation);
}
}
return phonemes;
}
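    // Usage sketch (phoneme strings are approximate - the exact output depends on ./g2p):
    //   new Tokenizer().tokenize("Hello, world!") → something like ["həˈloʊ", ",", "wɝld", "!"]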
/**
* Smart tokenization using efficient regex patterns
*/
_smartTokenize(text) {
        const tokenRegex = /([\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff]+|\w+['‘’]?\w*|[^\w\s\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff])/g;
const tokens = [];
let match;
while ((match = tokenRegex.exec(text)) !== null) {
const token = match[1];
// Skip pure whitespace tokens
if (/^\s+$/.test(token)) {
continue;
}
// Handle punctuation - only add if it's in our known punctuation list
if (token.length === 1 && consts_1.PUNCTUATION.includes(token)) {
tokens.push(token);
continue;
}
// Add word tokens (Chinese, English, numbers, contractions, etc.)
if (token.trim()) {
tokens.push(token.trim());
}
}
return tokens;
}
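    // Illustrative: _smartTokenize("Hi, 世界!") → ["Hi", ",", "世界", "!"]
    // (assuming "," and "!" appear in the PUNCTUATION list from ./consts)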
/**
* Convert text to phoneme string with specified separator
*/
tokenizeToString(text) {
const phonemes = this.tokenize(text);
// Join phonemes, handling punctuation attachment properly
const result = [];
for (let i = 0; i < phonemes.length; i++) {
const phoneme = phonemes[i];
if (consts_1.PUNCTUATION.includes(phoneme)) {
// Attach punctuation to previous phoneme without space
if (result.length > 0) {
result[result.length - 1] += phoneme;
}
else {
result.push(phoneme);
}
}
else {
// For custom separators, split phonemes into characters
if (this.options.separator !== " ") {
result.push(phoneme.split('').join(this.options.separator));
}
else {
result.push(phoneme);
}
}
}
        // Word-level phonemes are always joined with a space; custom separators apply within words above
        return result.join(" ");
}
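    // Usage sketch: new Tokenizer().tokenizeToString("Hello, world") returns a single string,
    // roughly "həˈloʊ, wɝld" - punctuation is attached to the preceding word without a space.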
/**
* Convert text to detailed phoneme tokens with metadata
*/
tokenizeToTokens(text) {
var _a, _b;
if (!(text === null || text === void 0 ? void 0 : text.trim()))
return [];
const { text: processedText, languageMap } = this._preprocess(text);
const expandedText = (0, expand_1.expandText)(processedText);
// Use regex to get tokens with their positions in original text
        const tokenRegex = /([\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff]+|\w+['‘’]?\w*|[^\w\s\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff])/g;
const tokenMatches = [];
let match;
while ((match = tokenRegex.exec(expandedText)) !== null) {
const token = match[1];
// Skip pure whitespace tokens
if (/^\s+$/.test(token)) {
continue;
}
// Only process non-whitespace tokens
if (token.trim()) {
tokenMatches.push({
token: token.trim(),
position: match.index
});
}
}
// Get POS tags for homograph disambiguation
const cleanWords = tokenMatches.filter(({ token }) => !consts_1.PUNCTUATION.includes(token));
const posResults = pos_tagger_1.simplePOSTagger.tagWords(cleanWords.map(({ token }) => token));
const results = [];
let cleanWordIndex = 0;
for (const { token, position } of tokenMatches) {
if (!consts_1.PUNCTUATION.includes(token)) {
// Get POS tag for homograph disambiguation
const pos = (_a = posResults[cleanWordIndex]) === null || _a === void 0 ? void 0 : _a.pos;
cleanWordIndex++;
// Check for custom pronunciations
const customPronunciation = (_b = this.options.homograph) === null || _b === void 0 ? void 0 : _b[token.toLowerCase()];
let phoneme;
if (customPronunciation) {
phoneme = this._postProcess(customPronunciation);
}
else {
// Check language map for multilingual words
const detectedLanguage = languageMap[token.toLowerCase()];
// Handle Zhuyin format specially
if (this.options.format === "zhuyin") {
if (zh_g2p_1.chineseG2P.isChineseText(token)) {
// Convert Chinese to Zhuyin
phoneme = zh_g2p_1.chineseG2P.textToZhuyin(token);
}
else {
// Convert non-Chinese to IPA as fallback
phoneme = (0, g2p_1.predict)(token, pos, detectedLanguage);
// Apply IPA post-processing but not tone format conversion
if (this.options.stripStress) {
phoneme = phoneme.replace(/[ˈˌ]/g, "");
}
}
}
else {
// Regular IPA/ARPABET processing
const pronunciation = (0, g2p_1.predict)(token, pos, detectedLanguage);
phoneme = this._postProcess(pronunciation);
}
}
results.push({
phoneme,
word: token,
position
});
}
}
return results;
}
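    // Usage sketch (phoneme strings depend on ./g2p; positions index into the expanded text):
    //   new Tokenizer().tokenizeToTokens("Hi there") →
    //   [{ phoneme: "...", word: "Hi", position: 0 }, { phoneme: "...", word: "there", position: 3 }]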
}
exports.Tokenizer = Tokenizer;
// Legacy function exports for backward compatibility
function tokenizeText(text, _g2pPredict, // Deprecated parameter
options = {}) {
const tokenizer = new Tokenizer(options);
return tokenizer.tokenizeToTokens(text);
}
function textToIPA(text, _g2pPredict, // Deprecated parameter
options = {}) {
const tokenizer = new Tokenizer(Object.assign(Object.assign({}, options), { format: "ipa" }));
return tokenizer.tokenizeToString(text);
}
function textToARPABET(text, _g2pPredict, // Deprecated parameter
options = {}) {
const tokenizer = new Tokenizer(Object.assign(Object.assign({}, options), { format: "arpabet" }));
return tokenizer.tokenizeToString(text);
}
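// Usage sketch for the legacy helpers (the second argument is deprecated and ignored):
//   textToIPA("Hello world", undefined, { stripStress: true });
//   textToARPABET("3 cats", undefined, { separator: "-" });
//   tokenizeText("Hello world", undefined, {}); // → array of { phoneme, word, position } tokens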