UNPKG

phonemize

Version:

Fast phonemizer with rule-based G2P prediction. Pure JavaScript implementation.

github.com/hans00/phonemize

hans00/phonemize

247 lines (246 loc) • 7.59 kB

JavaScript

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.simplePOSTagger = exports.SimplePOSTagger = void 0; class SimplePOSTagger { /** * Check if a word is likely a noun based on its endings */ isLikelyNoun(word) { const lowerWord = word.toLowerCase(); // Check common noun endings for (const ending of SimplePOSTagger.NOUN_ENDINGS) { if (lowerWord.endsWith(ending)) { return true; } } // Common nouns that don't follow patterns const commonNouns = ['way', 'book', 'books', 'paper', 'time', 'people', 'world', 'life', 'hand', 'part', 'child', 'eye', 'woman', 'place', 'work', 'week', 'case', 'point', 'company', 'number', 'group', 'problem', 'fact']; return commonNouns.includes(lowerWord); } /** * Tag a single word with its most likely POS */ tagWord(word, context) { var _a, _b; const lowerWord = word.toLowerCase(); // Check if current word is determiner first if (SimplePOSTagger.DETERMINERS.includes(lowerWord)) { return { word, pos: "DT", confidence: 0.9 }; } // Check context clues first (higher confidence) if (context && context.length >= 1) { const prevWord = (_a = context[0]) === null || _a === void 0 ? void 0 : _a.toLowerCase(); // First element is previous word const nextWord = context.length >= 2 ? (_b = context[1]) === null || _b === void 0 ? void 0 : _b.toLowerCase() : undefined; // Second element is next word // Enhanced detection patterns - highest priority first // Previous word is determiner -> likely noun (HIGHEST priority for structural patterns) if (prevWord && SimplePOSTagger.DETERMINERS.includes(prevWord)) { return { word, pos: "!V", confidence: 0.95 }; } // Imperative patterns: Please/Don't + word -> likely verb if (prevWord && ['please', "don't", 'do', "doesn't", 'never'].includes(prevWord)) { return { word, pos: "V", confidence: 0.9 }; } // Modal + word -> likely verb (can read, will lead, etc.) if (prevWord && ['can', 'will', 'would', 'should', 'could', 'may', 'might', 'must'].includes(prevWord)) { return { word, pos: "V", confidence: 0.9 }; } // Subject pronoun + word -> likely verb (I read, he leads, etc.) if (prevWord && ['i', 'you', 'he', 'she', 'it', 'we', 'they'].includes(prevWord)) { return { word, pos: "V", confidence: 0.85 }; } // Previous word is auxiliary verb -> likely verb if (prevWord && SimplePOSTagger.AUX_VERBS.includes(prevWord)) { return { word, pos: "V", confidence: 0.8 }; } // Word + determiner/article -> current word likely verb (read the, lead the, etc.) if (nextWord && SimplePOSTagger.DETERMINERS.includes(nextWord)) { return { word, pos: "V", confidence: 0.8 }; } // Word + noun -> current word likely verb/adjective if (nextWord && this.isLikelyNoun(nextWord)) { return { word, pos: "V", confidence: 0.75 }; } // Word followed by 'to' -> likely verb (infinitive) if (nextWord === "to") { return { word, pos: "V", confidence: 0.7 }; } // Previous word is preposition -> likely noun if (prevWord && SimplePOSTagger.PREPOSITIONS.includes(prevWord)) { return { word, pos: "!V", confidence: 0.7 }; } } // Check word endings (medium confidence) for (const ending of SimplePOSTagger.VERB_ENDINGS) { if (lowerWord.endsWith(ending)) { // Special cases for ambiguous endings if (ending === "ed") { return { word, pos: "VBD", confidence: 0.6 }; // Past tense } if (ending === "ing") { return { word, pos: "V", confidence: 0.6 }; } if (ending === "s" && lowerWord.length > 2) { // Could be verb (3rd person) or plural noun return { word, pos: "V", confidence: 0.4 }; } return { word, pos: "V", confidence: 0.5 }; } } for (const ending of SimplePOSTagger.NOUN_ENDINGS) { if (lowerWord.endsWith(ending)) { return { word, pos: "!V", confidence: 0.5 }; } } for (const ending of SimplePOSTagger.ADJECTIVE_ENDINGS) { if (lowerWord.endsWith(ending)) { if (ending === "ly") { return { word, pos: "ADJ", confidence: 0.6 }; // Adverb, but we'll treat as adjective } return { word, pos: "!V", confidence: 0.5 }; } } // Default fallback - assume noun (most common for homographs) return { word, pos: "!V", confidence: 0.3 }; } /** * Tag multiple words in sequence with context */ tagWords(words) { const results = []; for (let i = 0; i < words.length; i++) { const word = words[i]; const context = [ i > 0 ? words[i - 1] : "", i < words.length - 1 ? words[i + 1] : "", ].filter((w) => w); results.push(this.tagWord(word, context)); } return results; } /** * Simple sentence-level POS tagging */ tagSentence(text) { // Simple tokenization - split by spaces and punctuation const words = text .toLowerCase() .split(/[\s,.!?;:()]+/) .filter((word) => word.length > 0); return this.tagWords(words); } } exports.SimplePOSTagger = SimplePOSTagger; // Common verb endings SimplePOSTagger.VERB_ENDINGS = [ "ed", "ing", "es", "s", "en", "er", "ize", "ise", "fy", "ate", ]; // Common noun endings SimplePOSTagger.NOUN_ENDINGS = [ "tion", "sion", "ness", "ment", "ity", "ty", "er", "or", "ist", "ian", "ism", "age", "ure", "ence", "ance", ]; // Common adjective endings SimplePOSTagger.ADJECTIVE_ENDINGS = [ "able", "ible", "al", "ial", "ed", "en", "er", "est", "ful", "ic", "ish", "ive", "less", "ly", "ous", "y", ]; // Common function words that indicate following word might be a noun SimplePOSTagger.DETERMINERS = [ "the", "a", "an", "this", "that", "these", "those", "my", "your", "his", "her", "its", "our", "their", ]; // Common auxiliary verbs and modal verbs SimplePOSTagger.AUX_VERBS = [ "am", "is", "are", "was", "were", "be", "being", "been", "have", "has", "had", "having", "do", "does", "did", "doing", "will", "would", "shall", "should", "can", "could", "may", "might", "must", ]; // Common prepositions that indicate following word might be a noun SimplePOSTagger.PREPOSITIONS = [ "in", "on", "at", "by", "for", "with", "from", "to", "of", "about", "under", "over", "through", "between", "among", ]; exports.simplePOSTagger = new SimplePOSTagger();