@raven-js/cortex
Version:
Zero-dependency machine learning, AI, and data processing library for modern JavaScript
455 lines (409 loc) • 10.8 kB
JavaScript
/**
* @author Anonyfox <max@anonyfox.com>
* @license MIT
* @see {@link https://github.com/Anonyfox/ravenjs}
* @see {@link https://ravenjs.dev}
* @see {@link https://anonyfox.com}
*/
/**
* @file Porter2 English stemmer implementation.
*
* Implements the Porter2 (Snowball English) stemming algorithm for reducing
* English words to their morphological root forms. Uses the official Porter2
* rules with optimized performance for high-throughput text processing.
*/
// Single-character classes shared by the region and syllable helpers.
// Lowercase "y" is always classified as a vowel by these patterns.
const VOWELS = /[aeiouy]/;
const NON_VOWELS = /[^aeiouy]/;

// Whole-word exceptions looked up before any stemming step runs:
// irregular forms map to a fixed stem, invariant words map to themselves.
const SPECIAL_WORDS = new Map([
  ...Object.entries({
    skis: "ski",
    skies: "sky",
    dying: "die",
    lying: "lie",
    tying: "tie",
    idly: "idl",
    gently: "gentl",
    ugly: "ugli",
    early: "earli",
    only: "onli",
    singly: "singl",
  }),
  ...["sky", "news", "howe", "atlas", "cosmos", "bias", "andes"].map(
    (invariant) => [invariant, invariant],
  ),
]);
// Words the Porter2 specification leaves untouched once step 1a has run
// (e.g. "proceed" must not be reduced to "procee" by the "eed" rule).
const STEP1A_INVARIANTS = new Set([
  "inning",
  "outing",
  "canning",
  "herring",
  "earring",
  "proceed",
  "exceed",
  "succeed",
]);

/**
 * Applies the Porter2 stemming algorithm to reduce an English word to its stem.
 *
 * Pipeline: lowercase the input, consult the whole-word exception table,
 * strip a leading apostrophe and trailing possessives, locate the R1/R2
 * regions once, then run steps 1a through 5. The R1/R2 indices stay valid
 * across steps because every step only edits the end of the word.
 *
 * Inputs that are falsy or shorter than three characters are returned
 * unchanged (they are not lowercased).
 *
 * NOTE: the specification's consonant-"Y" marking (initial "y" or "y" after a
 * vowel) is not performed here; "y" is always treated as a vowel.
 *
 * @param {string} word - The English word to stem
 * @returns {string} The stemmed root form
 *
 * @example
 * // Common word stemming
 * stemPorter2('running'); // 'run'
 * stemPorter2('flies'); // 'fli'
 *
 * @example
 * // Preserve root meaning
 * stemPorter2('dogs'); // 'dog'
 * stemPorter2('churches'); // 'church'
 *
 * @example
 * // Handle complex suffixes
 * stemPorter2('nationalism'); // 'nation'
 * stemPorter2('rationalization'); // 'ration'
 */
export function stemPorter2(word) {
  if (!word || word.length < 3) return word;

  let stem = word.toLowerCase();

  // Whole-word exceptions short-circuit the entire algorithm.
  const special = SPECIAL_WORDS.get(stem);
  if (special !== undefined) return special;

  // The specification drops an initial apostrophe before anything else.
  if (stem.startsWith("'")) {
    stem = stem.slice(1);
  }

  // Step 0: remove possessives.
  stem = step0(stem);

  // Regions are computed once, before any suffix is removed.
  const r1 = calculateR1(stem);
  const r2 = calculateR2(stem);

  // Step 1a: plural suffixes.
  stem = step1a(stem);

  // Exceptional forms that must survive the remaining steps untouched.
  if (STEP1A_INVARIANTS.has(stem)) return stem;

  // Step 1b: past tense and gerund suffixes.
  stem = step1b(stem, r1);
  // Step 1c: trailing y/Y -> i.
  stem = step1c(stem);
  // Step 2: derivational suffixes.
  stem = step2(stem, r1);
  // Step 3: additional suffixes.
  stem = step3(stem, r1, r2);
  // Step 4: common word endings.
  stem = step4(stem, r2);
  // Step 5: final "e" and doubled "l".
  return step5(stem, r1, r2);
}
/**
 * Step 0: Strip the longest trailing possessive marker (', 's or 's').
 * @param {string} word - Word to process
 * @returns {string} Word without its possessive marker, or the word itself
 * when it carries none
 */
function step0(word) {
  // Ordered longest-first so that "'s'" is preferred over a bare "'".
  for (const marker of ["'s'", "'s", "'"]) {
    if (word.endsWith(marker)) {
      return word.slice(0, word.length - marker.length);
    }
  }
  return word;
}
/**
 * Step 1a: Remove plural suffixes.
 *
 * Rules, tried in this order:
 * - "sses" -> "ss"
 * - "ied" / "ies" -> "i" when more than one letter precedes, otherwise "ie"
 * - "us" / "ss" -> unchanged
 * - "s" -> removed when the part before it contains a vowel that is NOT the
 *   letter immediately before the "s" (so "gaps" -> "gap" but "gas" stays)
 *
 * @param {string} word - Word to process
 * @returns {string} Word with plural suffixes removed
 */
function step1a(word) {
  if (word.endsWith("sses")) {
    return word.slice(0, -2);
  }

  if (word.endsWith("ied") || word.endsWith("ies")) {
    // Both suffixes are three characters long.
    const stem = word.slice(0, -3);
    return stem.length > 1 ? `${stem}i` : `${stem}ie`;
  }

  if (word.endsWith("us") || word.endsWith("ss")) {
    return word;
  }

  // Drop the last two characters before testing: the letter right before
  // the "s" must not be the only vowel that justifies the removal.
  if (
    word.endsWith("s") &&
    word.length > 2 &&
    containsVowel(word.slice(0, -2))
  ) {
    return word.slice(0, -1);
  }

  return word;
}
/**
 * Step 1b: Handle past tense and gerund endings.
 *
 * "eedly" and "eed" are shortened to "ee" when they lie inside R1. The
 * endings "ingly", "edly", "ing" and "ed" are dropped when the remaining
 * part contains a vowel; the remainder is then repaired by appending "e"
 * after "at"/"bl"/"iz", undoubling a final double consonant (except l, s, z),
 * or appending "e" to a short word.
 *
 * @param {string} word - Word to process
 * @param {number} r1 - R1 region start index
 * @returns {string} Word with the ending handled
 */
function step1b(word, r1) {
  const size = word.length;

  if (word.endsWith("eedly")) {
    if (r1 <= size - 5) return word.slice(0, -3);
    return word;
  }
  if (word.endsWith("eed")) {
    if (r1 <= size - 3) return word.slice(0, -1);
    return word;
  }

  // Longest endings first.
  for (const ending of ["ingly", "edly", "ing", "ed"]) {
    if (!word.endsWith(ending)) continue;

    const base = word.slice(0, size - ending.length);
    if (!containsVowel(base)) continue;

    const needsE = ["at", "bl", "iz"].some((tail) => base.endsWith(tail));
    if (needsE) {
      return `${base}e`;
    }
    if (isDoubleConsonant(base) && !/[lsz]$/.test(base)) {
      return base.slice(0, -1);
    }
    return isShortWord(base) ? `${base}e` : base;
  }

  return word;
}
/**
 * Step 1c: Turn a trailing y/Y into "i" when the letter before it is a
 * non-vowel and the word is longer than two characters.
 * @param {string} word - Word to process
 * @returns {string} Word with the trailing y/Y replaced, or the word itself
 */
function step1c(word) {
  const lastIndex = word.length - 1;
  const endsInY = word.endsWith("y") || word.endsWith("Y");
  if (!endsInY || word.length <= 2) {
    return word;
  }
  return NON_VOWELS.test(word[lastIndex - 1])
    ? `${word.slice(0, -1)}i`
    : word;
}
// Step 2 suffix table as [suffix, replacement], ordered longest-first so the
// first hit is the longest matching suffix.
const STEP2_SUFFIXES = [
  ["ization", "ize"],
  ["ational", "ate"],
  ["fulness", "ful"],
  ["ousness", "ous"],
  ["iveness", "ive"],
  ["tional", "tion"],
  ["biliti", "ble"],
  ["lessli", "less"],
  ["entli", "ent"],
  ["ation", "ate"],
  ["alism", "al"],
  ["aliti", "al"],
  ["ousli", "ous"],
  ["iviti", "ive"],
  ["fulli", "ful"],
  ["enci", "ence"],
  ["anci", "ance"],
  ["abli", "able"],
  ["izer", "ize"],
  ["ator", "ate"],
  ["alli", "al"],
  ["bli", "ble"],
];

/**
 * Step 2: Replace derivational suffixes.
 *
 * The longest matching suffix is selected first; it is replaced only when it
 * lies inside R1. If the longest match is outside R1 the word is returned
 * unchanged (shorter suffixes are not tried). Two conditional rules follow
 * the table: "ogi" -> "og" when preceded by "l", and "li" is deleted when
 * preceded by one of c d e g h k m n r t.
 *
 * @param {string} word - Word to process
 * @param {number} r1 - R1 region start index
 * @returns {string} Word with derivational suffixes replaced
 */
function step2(word, r1) {
  const size = word.length;

  const hit = STEP2_SUFFIXES.find(([suffix]) => word.endsWith(suffix));
  if (hit !== undefined) {
    const [suffix, replacement] = hit;
    const cut = size - suffix.length;
    return r1 <= cut ? word.slice(0, cut) + replacement : word;
  }

  if (word.endsWith("ogi")) {
    const inR1 = r1 <= size - 3;
    // Dropping the final "i" turns "...logi" into "...log".
    return inR1 && word[size - 4] === "l" ? word.slice(0, -1) : word;
  }

  if (word.endsWith("li") && size > 2) {
    const precedingChar = word[size - 3];
    if ("cdeghkmnrt".includes(precedingChar) && r1 <= size - 2) {
      return word.slice(0, -2);
    }
  }

  return word;
}
// Step 3 suffix table as [suffix, replacement], ordered longest-first so the
// first hit is the longest matching suffix.
const STEP3_SUFFIXES = [
  ["ational", "ate"],
  ["tional", "tion"],
  ["alize", "al"],
  ["icate", "ic"],
  ["iciti", "ic"],
  ["ative", ""],
  ["ical", "ic"],
  ["ness", ""],
  ["ful", ""],
];

/**
 * Step 3: Replace or remove additional suffixes.
 *
 * The longest matching suffix is selected; every rule requires the suffix to
 * lie inside R1, except "ative", which must lie inside R2. When the region
 * test fails the word is returned unchanged.
 *
 * @param {string} word - Word to process
 * @param {number} r1 - R1 region start index
 * @param {number} r2 - R2 region start index
 * @returns {string} Word with additional suffixes removed
 */
function step3(word, r1, r2) {
  const hit = STEP3_SUFFIXES.find(([suffix]) => word.endsWith(suffix));
  if (hit === undefined) {
    return word;
  }

  const [suffix, replacement] = hit;
  const cut = word.length - suffix.length;
  // "ful" and "ness" are R1 rules like the rest; only "ative" needs R2.
  const regionStart = suffix === "ative" ? r2 : r1;
  return regionStart <= cut ? word.slice(0, cut) + replacement : word;
}
// Step 4 suffixes, ordered longest-first so the first hit is the longest
// matching suffix.
const STEP4_SUFFIXES = [
  "ement",
  "ance",
  "ence",
  "able",
  "ible",
  "ment",
  "ant",
  "ent",
  "ism",
  "ate",
  "iti",
  "ous",
  "ive",
  "ize",
  "ion",
  "al",
  "er",
  "ic",
];

/**
 * Step 4: Delete common word endings.
 *
 * The longest matching suffix is selected and deleted only when it lies
 * inside R2; if it does not, the word is returned unchanged rather than
 * retrying with a shorter suffix (so "agreement" is not cut to "agreem").
 * "ion" is additionally deleted only when preceded by "s" or "t".
 *
 * @param {string} word - Word to process
 * @param {number} r2 - R2 region start index
 * @returns {string} Word with common endings removed
 */
function step4(word, r2) {
  const suffix = STEP4_SUFFIXES.find((candidate) => word.endsWith(candidate));
  if (suffix === undefined) {
    return word;
  }

  const cut = word.length - suffix.length;
  if (r2 > cut) {
    return word;
  }

  if (suffix === "ion") {
    const precedingChar = word[cut - 1];
    if (precedingChar !== "s" && precedingChar !== "t") {
      return word;
    }
  }

  return word.slice(0, cut);
}
/**
 * Step 5: Final clean-up of a trailing "e" or a doubled "l".
 *
 * A final "e" is dropped when it lies in R2, or when it lies in R1 and the
 * text before it does not end in a short syllable. Otherwise a word ending
 * in "ll" loses its last "l" when that letter lies in R2.
 *
 * @param {string} word - Word to process
 * @param {number} r1 - R1 region start index
 * @param {number} r2 - R2 region start index
 * @returns {string} Word with final processing applied
 */
function step5(word, r1, r2) {
  const lastIndex = word.length - 1;
  const withoutLast = word.slice(0, -1);

  // Step 5a: trailing "e".
  if (word.endsWith("e")) {
    const inR2 = r2 <= lastIndex;
    if (inR2 || (r1 <= lastIndex && !isShortSyllable(withoutLast))) {
      return withoutLast;
    }
  }

  // Step 5b: trailing "ll".
  if (word.endsWith("ll") && r2 <= lastIndex) {
    return withoutLast;
  }

  return word;
}
/**
 * Calculate where the R1 region starts.
 *
 * R1 is the part of the word after the first non-vowel that follows a vowel.
 * Words beginning with "gener", "commun" or "arsen" are exceptions: R1 starts
 * directly after that prefix (index 5, 6 and 5 respectively). When no
 * vowel/non-vowel pair exists, R1 is empty and the word length is returned.
 *
 * @param {string} word - Word to analyze
 * @returns {number} R1 region start index
 */
function calculateR1(word) {
  // Exceptional prefixes: the region begins right after the prefix itself.
  for (const prefix of ["commun", "gener", "arsen"]) {
    if (word.startsWith(prefix)) {
      return prefix.length;
    }
  }

  for (let i = 1; i < word.length; i++) {
    if (VOWELS.test(word[i - 1]) && NON_VOWELS.test(word[i])) {
      return i + 1;
    }
  }
  return word.length;
}
/**
 * Calculate where the R2 region starts: the same vowel/non-vowel search used
 * for R1, but restricted to the text inside R1. Returns the word length when
 * no such pair exists there.
 * @param {string} word - Word to analyze
 * @returns {number} R2 region start index
 */
function calculateR2(word) {
  const end = word.length;
  // Start one past the R1 boundary so both letters of the pair lie in R1.
  let pos = calculateR1(word) + 1;
  while (pos < end) {
    if (VOWELS.test(word[pos - 1]) && NON_VOWELS.test(word[pos])) {
      return pos + 1;
    }
    pos += 1;
  }
  return end;
}
/**
 * Report whether any character of the given text matches the vowel pattern.
 * @param {string} word - Word to check
 * @returns {boolean} True if word contains vowel
 */
function containsVowel(word) {
  const firstVowel = VOWELS.exec(word);
  return firstVowel !== null;
}
/**
 * Report whether the last two characters are the same non-vowel.
 * @param {string} word - Word to check
 * @returns {boolean} True if word ends with double consonant
 */
function isDoubleConsonant(word) {
  const size = word.length;
  if (size < 2) {
    return false;
  }
  const tail = word[size - 1];
  if (tail !== word[size - 2]) {
    return false;
  }
  return NON_VOWELS.test(tail);
}
/**
 * Report whether a word is "short": its R1 region is empty and it ends in a
 * short syllable.
 * @param {string} word - Word to check
 * @returns {boolean} True if word is short
 */
function isShortWord(word) {
  // A non-empty R1 rules the word out before the syllable test is needed.
  if (calculateR1(word) < word.length) {
    return false;
  }
  return isShortSyllable(word);
}
/**
 * Report whether a word ends with a short syllable.
 *
 * For three or more characters: non-vowel, vowel, non-vowel at the end,
 * where the final letter is not w, x or y. For exactly two characters:
 * a vowel followed by a non-vowel. Shorter input is never a short syllable.
 *
 * @param {string} word - Word to check
 * @returns {boolean} True if word ends with short syllable
 */
function isShortSyllable(word) {
  const size = word.length;
  if (size < 2) {
    return false;
  }

  if (size === 2) {
    return VOWELS.test(word[0]) && NON_VOWELS.test(word[1]);
  }

  const before = word[size - 3];
  const middle = word[size - 2];
  const after = word[size - 1];
  return (
    NON_VOWELS.test(before) &&
    VOWELS.test(middle) &&
    NON_VOWELS.test(after) &&
    !"wxy".includes(after)
  );
}