/*
 * phonemize
 * Version: (not specified in the listing)
 * Fast phonemizer with rule-based G2P prediction. Pure JavaScript implementation.
 * Listing metadata: 247 lines (246 loc), 7.59 kB, JavaScript.
 */
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.simplePOSTagger = exports.SimplePOSTagger = void 0;
// Frequent nouns matched by exact (lower-cased) comparison, in addition to the
// suffix rules in SimplePOSTagger.NOUN_ENDINGS.
const COMMON_NOUNS = ['way', 'book', 'books', 'paper', 'time', 'people', 'world', 'life', 'hand', 'part', 'child', 'eye', 'woman', 'place', 'work', 'week', 'case', 'point', 'company', 'number', 'group', 'problem', 'fact'];
// Previous-word cues for the "imperative" rule (Please/Don't + word).
const IMPERATIVE_CUES = ['please', "don't", 'do', "doesn't", 'never'];
// Previous-word cues for the "modal + word" rule.
const MODAL_CUES = ['can', 'will', 'would', 'should', 'could', 'may', 'might', 'must'];
// Previous-word cues for the "subject pronoun + word" rule.
const SUBJECT_PRONOUNS = ['i', 'you', 'he', 'she', 'it', 'we', 'they'];
/**
 * Rule-based part-of-speech tagger.
 *
 * Every tag is an object `{ word, pos, confidence }`. The `pos` values produced
 * here are "DT", "V", "VBD", "ADJ" and "!V"; "!V" is returned by the rules whose
 * comments say "likely noun" and by the final fallback.
 *
 * The lookup tables (VERB_ENDINGS, NOUN_ENDINGS, ADJECTIVE_ENDINGS, DETERMINERS,
 * AUX_VERBS, PREPOSITIONS) are static properties attached to the class after
 * its declaration.
 */
class SimplePOSTagger {
    /**
     * Check if a word is likely a noun based on its ending or on a short list
     * of frequent nouns.
     *
     * @param {string} word - Word to test (any casing).
     * @returns {boolean} True when the lower-cased word ends with one of
     *   NOUN_ENDINGS or is one of the listed common nouns.
     */
    isLikelyNoun(word) {
        const lowerWord = word.toLowerCase();
        if (SimplePOSTagger.NOUN_ENDINGS.some((ending) => lowerWord.endsWith(ending))) {
            return true;
        }
        return COMMON_NOUNS.includes(lowerWord);
    }
    /**
     * Tag a single word with its most likely POS.
     *
     * Rules are tried in order and the first match wins: determiner lookup,
     * then neighbour-based rules (when a context is given), then suffix rules,
     * then the "!V" fallback.
     *
     * @param {string} word - Word to tag; returned unchanged in the result.
     * @param {Array<string>} [context] - Positional neighbours:
     *   `context[0]` is the previous word and `context[1]` is the next word.
     *   Use an empty string (or null/undefined) for a missing neighbour.
     * @returns {{word: string, pos: string, confidence: number}} The tag.
     */
    tagWord(word, context) {
        const lowerWord = word.toLowerCase();
        // Check if current word is determiner first
        if (SimplePOSTagger.DETERMINERS.includes(lowerWord)) {
            return { word, pos: "DT", confidence: 0.9 };
        }
        // Check context clues first (higher confidence)
        if (context && context.length >= 1) {
            const rawPrev = context[0];
            const rawNext = context.length >= 2 ? context[1] : undefined;
            // `== null` matches both null and undefined neighbours.
            const prevWord = rawPrev == null ? undefined : rawPrev.toLowerCase();
            const nextWord = rawNext == null ? undefined : rawNext.toLowerCase();
            // Previous word is determiner -> likely noun (highest priority structural pattern)
            if (prevWord && SimplePOSTagger.DETERMINERS.includes(prevWord)) {
                return { word, pos: "!V", confidence: 0.95 };
            }
            // Imperative patterns: Please/Don't + word -> likely verb
            if (prevWord && IMPERATIVE_CUES.includes(prevWord)) {
                return { word, pos: "V", confidence: 0.9 };
            }
            // Modal + word -> likely verb (can read, will lead, etc.)
            if (prevWord && MODAL_CUES.includes(prevWord)) {
                return { word, pos: "V", confidence: 0.9 };
            }
            // Subject pronoun + word -> likely verb (I read, he leads, etc.)
            if (prevWord && SUBJECT_PRONOUNS.includes(prevWord)) {
                return { word, pos: "V", confidence: 0.85 };
            }
            // Previous word is auxiliary verb -> likely verb
            if (prevWord && SimplePOSTagger.AUX_VERBS.includes(prevWord)) {
                return { word, pos: "V", confidence: 0.8 };
            }
            // Word + determiner/article -> current word likely verb (read the, lead the, etc.)
            if (nextWord && SimplePOSTagger.DETERMINERS.includes(nextWord)) {
                return { word, pos: "V", confidence: 0.8 };
            }
            // Word + noun -> current word likely verb/adjective
            if (nextWord && this.isLikelyNoun(nextWord)) {
                return { word, pos: "V", confidence: 0.75 };
            }
            // Word followed by 'to' -> likely verb (infinitive)
            if (nextWord === "to") {
                return { word, pos: "V", confidence: 0.7 };
            }
            // Previous word is preposition -> likely noun
            if (prevWord && SimplePOSTagger.PREPOSITIONS.includes(prevWord)) {
                return { word, pos: "!V", confidence: 0.7 };
            }
        }
        // Check word endings (medium confidence). The tables are scanned in
        // their declared order, so an earlier entry shadows a later one that
        // also matches (e.g. "es" is reached before "s").
        for (const ending of SimplePOSTagger.VERB_ENDINGS) {
            if (lowerWord.endsWith(ending)) {
                // Special cases for ambiguous endings
                if (ending === "ed") {
                    return { word, pos: "VBD", confidence: 0.6 }; // Past tense
                }
                if (ending === "ing") {
                    return { word, pos: "V", confidence: 0.6 };
                }
                if (ending === "s" && lowerWord.length > 2) {
                    // Could be verb (3rd person) or plural noun
                    return { word, pos: "V", confidence: 0.4 };
                }
                return { word, pos: "V", confidence: 0.5 };
            }
        }
        for (const ending of SimplePOSTagger.NOUN_ENDINGS) {
            if (lowerWord.endsWith(ending)) {
                return { word, pos: "!V", confidence: 0.5 };
            }
        }
        for (const ending of SimplePOSTagger.ADJECTIVE_ENDINGS) {
            if (lowerWord.endsWith(ending)) {
                if (ending === "ly") {
                    return { word, pos: "ADJ", confidence: 0.6 }; // Adverb, but we'll treat as adjective
                }
                return { word, pos: "!V", confidence: 0.5 };
            }
        }
        // Default fallback - assume noun (most common for homographs)
        return { word, pos: "!V", confidence: 0.3 };
    }
    /**
     * Tag multiple words in sequence, giving each word its neighbours as context.
     *
     * The context passed to tagWord is always `[previous, next]`, with an empty
     * string standing in for a missing neighbour. The slots must stay
     * positional: dropping the empty "previous" slot of the first word would
     * make tagWord read the next word as the previous one.
     *
     * @param {Array<string>} words - Words in sentence order.
     * @returns {Array<{word: string, pos: string, confidence: number}>} One tag
     *   per input word, in the same order.
     */
    tagWords(words) {
        const results = [];
        const lastIndex = words.length - 1;
        for (let i = 0; i < words.length; i++) {
            const previous = i > 0 ? words[i - 1] : "";
            const next = i < lastIndex ? words[i + 1] : "";
            results.push(this.tagWord(words[i], [previous, next]));
        }
        return results;
    }
    /**
     * Simple sentence-level POS tagging.
     *
     * The text is lower-cased and split on runs of whitespace and the
     * punctuation characters `, . ! ? ; : ( )`; empty tokens are discarded.
     * Because of the lower-casing, the `word` field of every returned tag is
     * lower-case.
     *
     * @param {string} text - Sentence to tag.
     * @returns {Array<{word: string, pos: string, confidence: number}>} Tags
     *   for each token, in order.
     */
    tagSentence(text) {
        // Simple tokenization - split by spaces and punctuation
        const words = text
            .toLowerCase()
            .split(/[\s,.!?;:()]+/)
            .filter((word) => word.length > 0);
        return this.tagWords(words);
    }
}
exports.SimplePOSTagger = SimplePOSTagger;
// Static lookup tables used by the tagging rules. Entry order is significant:
// the suffix tables are scanned front to back and the first match is used.
Object.assign(SimplePOSTagger, {
    // Common verb endings
    VERB_ENDINGS: ["ed", "ing", "es", "s", "en", "er", "ize", "ise", "fy", "ate"],
    // Common noun endings
    NOUN_ENDINGS: [
        "tion", "sion", "ness", "ment", "ity", "ty", "er", "or",
        "ist", "ian", "ism", "age", "ure", "ence", "ance",
    ],
    // Common adjective endings
    ADJECTIVE_ENDINGS: [
        "able", "ible", "al", "ial", "ed", "en", "er", "est",
        "ful", "ic", "ish", "ive", "less", "ly", "ous", "y",
    ],
    // Common function words that indicate following word might be a noun
    DETERMINERS: [
        "the", "a", "an",
        "this", "that", "these", "those",
        "my", "your", "his", "her", "its", "our", "their",
    ],
    // Common auxiliary verbs and modal verbs
    AUX_VERBS: [
        "am", "is", "are", "was", "were", "be", "being", "been",
        "have", "has", "had", "having",
        "do", "does", "did", "doing",
        "will", "would", "shall", "should", "can", "could", "may", "might", "must",
    ],
    // Common prepositions that indicate following word might be a noun
    PREPOSITIONS: [
        "in", "on", "at", "by", "for", "with", "from", "to", "of",
        "about", "under", "over", "through", "between", "among",
    ],
});
exports.simplePOSTagger = new SimplePOSTagger();