UNPKG

devcontext

Version:

DevContext is a cutting-edge Model Context Protocol (MCP) server designed to provide developers with continuous, project-centric context awareness.

344 lines (306 loc) 9.11 kB
/**
 * Text processing utilities
 * Provides lightweight implementations of stemming and other text processing functions
 */

/** Letters treated as vowels. Note: 'y' is always treated as a consonant in this simplified stemmer. */
const VOWELS = new Set(['a', 'e', 'i', 'o', 'u']);

/** Step 2 suffix rewrites, tried in order; the first suffix that matches decides the outcome. */
const STEP2_RULES = [
  ['ational', 'ate'],
  ['tional', 'tion'],
  ['enci', 'ence'],
  ['anci', 'ance'],
  ['izer', 'ize'],
  ['abli', 'able'],
  ['alli', 'al'],
  ['entli', 'ent'],
  ['eli', 'e'],
  ['ousli', 'ous'],
  ['ization', 'ize'],
  ['ation', 'ate'],
  ['ator', 'ate'],
  ['alism', 'al'],
  ['iveness', 'ive'],
  ['fulness', 'ful'],
  ['ousness', 'ous'],
  ['aliti', 'al'],
  ['iviti', 'ive'],
  ['biliti', 'ble'],
];

/** Step 3 suffix rewrites, tried in order; the first suffix that matches decides the outcome. */
const STEP3_RULES = [
  ['icate', 'ic'],
  ['ative', ''],
  ['alize', 'al'],
  ['iciti', 'ic'],
  ['ical', 'ic'],
  ['ful', ''],
  ['ness', ''],
];

/** Step 4 suffixes that are dropped outright; longer variants precede shorter ones (ement, ment, ent). */
const STEP4_SUFFIXES = [
  'al', 'ance', 'ence', 'er', 'ic', 'able', 'ible', 'ant', 'ement', 'ment',
  'ent', 'ion', 'ou', 'ism', 'ate', 'iti', 'ous', 'ive', 'ize',
];

/**
 * Checks if a character is a vowel
 * @param {string} char - Character to check
 * @returns {boolean} True if the character is one of a, e, i, o, u
 * @private
 */
const isVowel = (char) => VOWELS.has(char);

/**
 * Computes the Porter "measure" m of a word: the number of vowel-run followed by
 * consonant-run (VC) sequences, i.e. m in the form [C](VC)^m[V].
 * Examples: "tree" -> 0, "trouble" -> 1, "oats" -> 1, "troubles" -> 2.
 * @param {string} word - Lower-case word (or stem) to measure
 * @returns {number} The number of VC sequences
 * @private
 */
const measure = (word) => {
  let sequences = 0;
  let previousWasVowel = false;

  for (const char of word) {
    const currentIsVowel = isVowel(char);
    // Every vowel -> consonant transition closes exactly one VC sequence.
    if (previousWasVowel && !currentIsVowel) {
      sequences += 1;
    }
    previousWasVowel = currentIsVowel;
  }

  return sequences;
};

/**
 * Checks if a word contains a vowel
 * @param {string} word - Word to check
 * @returns {boolean} True if the word contains a vowel, false otherwise
 * @private
 */
const containsVowel = (word) => /[aeiou]/.test(word);

/**
 * Checks if a word ends with a double consonant (the same consonant twice)
 * @param {string} word - Word to check
 * @returns {boolean} True if the word ends with a double consonant, false otherwise
 * @private
 */
const endsWithDoubleConsonant = (word) => {
  if (word.length < 2) return false;
  const last = word[word.length - 1];
  return last === word[word.length - 2] && !isVowel(last);
};

/**
 * Checks if a word ends with a consonant-vowel-consonant pattern where the
 * final consonant is not w, x, or y
 * @param {string} word - Word to check
 * @returns {boolean} True if the pattern is matched, false otherwise
 * @private
 */
const endsWithCVC = (word) => {
  if (word.length < 3) return false;
  const [first, middle, last] = word.slice(-3);
  return (
    !isVowel(first) &&
    isVowel(middle) &&
    !isVowel(last) &&
    !['w', 'x', 'y'].includes(last)
  );
};

/**
 * Rewrites the first suffix in `rules` that the word ends with, provided the
 * remaining stem has a measure greater than zero. Only the first matching
 * suffix is considered; if its stem is too short the word is returned as is.
 * @param {string} word - Word to process
 * @param {Array<[string, string]>} rules - Ordered [suffix, replacement] pairs
 * @returns {string} Modified word
 * @private
 */
const applySuffixRules = (word, rules) => {
  const rule = rules.find(([suffix]) => word.endsWith(suffix));
  if (!rule) return word;

  const [suffix, replacement] = rule;
  const base = word.slice(0, -suffix.length);
  return measure(base) > 0 ? base + replacement : word;
};

/**
 * Applies Porter stemmer step 1a: simplify plural forms
 * @param {string} word - Word to process
 * @returns {string} Modified word
 * @private
 */
const step1a = (word) => {
  // "sses" -> "ss" and "ies" -> "i" both drop the last two characters.
  if (word.endsWith('sses') || word.endsWith('ies')) return word.slice(0, -2);
  if (word.endsWith('ss')) return word;
  if (word.endsWith('s')) return word.slice(0, -1);
  return word;
};

/**
 * Post-processes words after removing -ed or -ing in step 1b
 * @param {string} word - Word to process
 * @returns {string} Modified word
 * @private
 */
const step1bPostProcess = (word) => {
  if (['at', 'bl', 'iz'].some((ending) => word.endsWith(ending))) {
    return word + 'e';
  }
  if (endsWithDoubleConsonant(word) && !['l', 's', 'z'].includes(word[word.length - 1])) {
    // e.g. "hopp" -> "hop"
    return word.slice(0, -1);
  }
  if (measure(word) === 1 && endsWithCVC(word)) {
    // e.g. "hop" (from "hoping") -> "hope"
    return word + 'e';
  }
  return word;
};

/**
 * Applies Porter stemmer step 1b: handle -ed and -ing endings
 * @param {string} word - Word to process
 * @returns {string} Modified word
 * @private
 */
const step1b = (word) => {
  if (word.endsWith('eed')) {
    // "eed" -> "ee" only when the preceding stem has measure > 0; never falls through to "ed".
    return measure(word.slice(0, -3)) > 0 ? word.slice(0, -1) : word;
  }
  if (word.endsWith('ed') && containsVowel(word.slice(0, -2))) {
    return step1bPostProcess(word.slice(0, -2));
  }
  if (word.endsWith('ing') && containsVowel(word.slice(0, -3))) {
    return step1bPostProcess(word.slice(0, -3));
  }
  return word;
};

/**
 * Applies Porter stemmer step 1c: -y endings
 * @param {string} word - Word to process
 * @returns {string} Modified word
 * @private
 */
const step1c = (word) => {
  if (word.endsWith('y') && containsVowel(word.slice(0, -1))) {
    return word.slice(0, -1) + 'i';
  }
  return word;
};

/**
 * Applies Porter stemmer step 2: handle common suffixes
 * @param {string} word - Word to process
 * @returns {string} Modified word
 * @private
 */
const step2 = (word) => applySuffixRules(word, STEP2_RULES);

/**
 * Applies Porter stemmer step 3: more suffixes
 * @param {string} word - Word to process
 * @returns {string} Modified word
 * @private
 */
const step3 = (word) => applySuffixRules(word, STEP3_RULES);

/**
 * Applies Porter stemmer step 4: long suffixes
 * Drops the first matching suffix when the remaining stem has measure > 1.
 * @param {string} word - Word to process
 * @returns {string} Modified word
 * @private
 */
const step4 = (word) => {
  const suffix = STEP4_SUFFIXES.find((candidate) => word.endsWith(candidate));
  if (!suffix) return word;

  const base = word.slice(0, -suffix.length);
  if (measure(base) <= 1) return word;

  // Special case: "ion" is only removed when the stem ends in 's' or 't'.
  if (suffix === 'ion' && !['s', 't'].includes(base[base.length - 1])) {
    return word;
  }
  return base;
};

/**
 * Applies Porter stemmer step 5a: e endings
 * @param {string} word - Word to process
 * @returns {string} Modified word
 * @private
 */
const step5a = (word) => {
  if (!word.endsWith('e')) return word;

  const base = word.slice(0, -1);
  const m = measure(base);
  if (m > 1 || (m === 1 && !endsWithCVC(base))) {
    return base;
  }
  return word;
};

/**
 * Applies Porter stemmer step 5b: double l endings
 * @param {string} word - Word to process
 * @returns {string} Modified word
 * @private
 */
const step5b = (word) => {
  if (word.endsWith('ll') && measure(word) > 1) {
    return word.slice(0, -1);
  }
  return word;
};

/**
 * Stems a word using a simplified Porter stemmer algorithm
 * Reduces words to their base/root form by removing common suffixes.
 * The input is lower-cased first; words shorter than three characters are
 * returned lower-cased without further processing.
 * @param {string} word - Word to stem
 * @returns {string} The stemmed word
 */
const stem = (word) => {
  if (word.length < 3) return word.toLowerCase();

  const steps = [step1a, step1b, step1c, step2, step3, step4, step5a, step5b];
  return steps.reduce((current, step) => step(current), word.toLowerCase());
};

/**
 * Generates n-grams from an array of tokens
 * @param {string[]} tokens - Array of strings (words/tokens)
 * @param {number} n - Size of n-grams to generate (e.g., 2 for bigrams, 3 for trigrams)
 * @returns {string[]} Array of n-grams (strings joined by spaces). Empty when
 *   `tokens` is empty/missing or `n <= 0`; when there are fewer than `n` tokens,
 *   a single entry containing all tokens joined by spaces.
 */
const generateNgrams = (tokens, n) => {
  // Handle edge cases
  if (!tokens || tokens.length === 0) return [];
  if (n <= 0) return [];
  if (tokens.length < n) return [tokens.join(' ')];

  const ngrams = [];

  // Slide a window of size n over the tokens array
  for (let i = 0; i <= tokens.length - n; i++) {
    ngrams.push(tokens.slice(i, i + n).join(' '));
  }

  return ngrams;
};

/**
 * Calculates Term Frequency (TF) for a given array of tokens
 * TF is defined as count of token / total number of tokens in the document
 * @param {string[]} tokens - Array of strings (words/tokens) representing a document
 * @returns {Object.<string, number>} Object where keys are unique tokens and values are their term frequencies
 */
const calculateTF = (tokens) => {
  if (!tokens || tokens.length === 0) {
    return {};
  }

  const totalTokens = tokens.length;

  // Count in a Map so tokens such as "constructor" or "__proto__" are not
  // confused with members inherited from Object.prototype.
  const counts = new Map();
  for (const token of tokens) {
    counts.set(token, (counts.get(token) ?? 0) + 1);
  }

  // Object.fromEntries defines own data properties, so every token (including
  // "__proto__") becomes a regular key on the returned plain object.
  return Object.fromEntries(
    [...counts].map(([token, count]) => [token, count / totalTokens]),
  );
};

export { stem, generateNgrams, calculateTF };
export default stem;