@raven-js/cortex
Version:
Zero-dependency machine learning, AI, and data processing library for modern JavaScript
205 lines (175 loc) • 6.09 kB
JavaScript
/**
* @author Anonyfox <max@anonyfox.com>
* @license MIT
* @see {@link https://github.com/Anonyfox/ravenjs}
* @see {@link https://ravenjs.dev}
* @see {@link https://anonyfox.com}
*/
/**
* @file Unified N-gram feature extraction for character and word level analysis.
*
* Provides a single ngrams() function that can extract character n-grams, word n-grams,
* or both based on configuration. Defaults to word n-grams as most commonly used.
* Reuses battle-tested segmentation functions for consistent tokenization.
*/
import { foldCase, normalizeUnicode } from "../normalization/index.js";
import { tokenizeWords } from "../segmentation/tokenize-words.js";
/**
 * Single entry point for n-gram extraction at character level, word level, or both.
 *
 * The `type` option picks the extractor; the remaining options are forwarded to it.
 * For "chars" and "words" the window size comes from `n`. For "mixed" the value of
 * `n` is not used: `charN` and `wordN` size the two extractions, and one `stride`
 * is shared by both.
 *
 * @param {string} text - Input text to extract n-grams from
 * @param {Object} [options] - Configuration options
 * @param {string} [options.type="words"] - Which n-grams to produce: "chars", "words", or "mixed"
 * @param {number} [options.n] - N-gram size; defaults to 3 for "chars" and 2 otherwise
 * @param {number} [options.stride=1] - Step size between consecutive n-grams
 * @param {boolean} [options.normalize=true] - Apply Unicode normalization before extraction
 * @param {boolean} [options.lowercase=true] - Apply case folding before extraction
 * @param {string} [options.separator=" "] - String placed between words of a word n-gram
 * @param {number} [options.charN=3] - Character n-gram size ("mixed" only)
 * @param {number} [options.wordN=2] - Word n-gram size ("mixed" only)
 * @returns {string[]|Object} Array of n-grams, or `{ char, word }` arrays for "mixed"
 * @throws {Error} When `type` is not "chars", "words", or "mixed"
 *
 * @example
 * // Default: word bigrams
 * const wordBigrams = ngrams("machine learning algorithms");
 * console.log(wordBigrams); // ["machine learning", "learning algorithms"]
 *
 * @example
 * // Character trigrams
 * const charTrigrams = ngrams("hello", { type: "chars" });
 * console.log(charTrigrams); // ["hel", "ell", "llo"]
 *
 * @example
 * // Mixed: character and word n-grams in one call
 * const mixed = ngrams("hello world", { type: "mixed" });
 * console.log(mixed); // { char: ["hel", "ell", ...], word: ["hello world"] }
 *
 * @example
 * // Custom size, original casing preserved
 * const trigrams = ngrams("natural language processing", {
 *   type: "words",
 *   n: 3,
 *   lowercase: false
 * });
 * console.log(trigrams); // ["natural language processing"]
 */
export function ngrams(text, options = {}) {
  const {
    type = "words",
    stride = 1,
    normalize = true,
    lowercase = true,
    separator = " ",
    charN = 3,
    wordN = 2,
  } = options;

  // The default window size depends on the selected type, so it is resolved after `type`.
  const { n = type === "chars" ? 3 : 2 } = options;

  if (type === "chars") {
    const preprocessing = { normalize, lowercase };
    return extractCharNgrams(text, n, stride, preprocessing);
  }

  if (type === "words") {
    const wordOptions = { normalize, lowercase, separator };
    return extractWordNgrams(text, n, stride, wordOptions);
  }

  if (type === "mixed") {
    // Mixed mode is sized by charN/wordN only; `n` plays no part here.
    const sharedOptions = { normalize, lowercase, separator };
    return extractMixedNgrams(text, {
      charN,
      wordN,
      stride,
      options: sharedOptions,
    });
  }

  throw new Error(`Unknown n-gram type: "${type}". Must be "chars", "words", or "mixed"`);
}
// Internal helpers that implement each mode of the unified ngrams() API
/**
 * Extracts character n-grams from a string using a sliding window.
 *
 * Windows are measured in Unicode code points rather than UTF-16 code units, so
 * characters encoded as surrogate pairs (for example emoji) are never cut in half.
 *
 * @param {string} text - Input text; a non-string or empty value yields an empty array
 * @param {number} n - Window size in code points (positive integer)
 * @param {number} stride - Code points to advance between windows (positive integer)
 * @param {{normalize?: boolean, lowercase?: boolean}} options - Preprocessing switches:
 *   `normalize` runs normalizeUnicode() and `lowercase` runs foldCase(); both default to true
 * @returns {string[]} N-grams in order of appearance; empty when the text has fewer than `n` code points
 * @throws {Error} When `n` or `stride` is not a positive integer
 */
function extractCharNgrams(text, n = 3, stride = 1, options = {}) {
  const { normalize = true, lowercase = true } = options;

  // Unusable input means "no n-grams"; this is decided before the size arguments are checked.
  if (typeof text !== "string" || text.length === 0) {
    return [];
  }

  // Reject fractional, NaN and non-numeric values too: they would otherwise
  // produce duplicated or misaligned windows instead of an error.
  if (!Number.isInteger(n) || !Number.isInteger(stride) || n < 1 || stride < 1) {
    throw new Error("N-gram size and stride must be positive integers");
  }

  let processedText = text;

  // Apply normalization if requested
  if (normalize) {
    processedText = normalizeUnicode(processedText);
  }

  // Apply case folding if requested
  if (lowercase) {
    processedText = foldCase(processedText);
  }

  // String iteration yields whole code points, keeping surrogate pairs together.
  const codePoints = Array.from(processedText);
  const ngramsList = [];

  for (let i = 0; i <= codePoints.length - n; i += stride) {
    ngramsList.push(codePoints.slice(i, i + n).join(""));
  }

  return ngramsList;
}
/**
 * Extracts word n-grams from a string using a sliding window over its tokens.
 *
 * The text is optionally normalized and case-folded, split into words with
 * tokenizeWords(), and each window of `n` consecutive words is joined with `separator`.
 *
 * @param {string} text - Input text; a non-string or empty value yields an empty array
 * @param {number} n - Number of words per n-gram (positive integer)
 * @param {number} stride - Words to advance between windows (positive integer)
 * @param {{normalize?: boolean, lowercase?: boolean, separator?: string}} options -
 *   `normalize` runs normalizeUnicode() and `lowercase` runs foldCase() (both default to true);
 *   `separator` is placed between the words of each n-gram (default " ")
 * @returns {string[]} N-grams in order of appearance; empty when fewer than `n` words are found
 * @throws {Error} When `n` or `stride` is not a positive integer
 */
function extractWordNgrams(text, n = 2, stride = 1, options = {}) {
  const { normalize = true, lowercase = true, separator = " " } = options;

  // Unusable input means "no n-grams"; this is decided before the size arguments are checked.
  if (typeof text !== "string" || text.length === 0) {
    return [];
  }

  // Reject fractional, NaN and non-numeric values too: a fractional stride would
  // index between array slots and emit "undefined" tokens instead of failing.
  if (!Number.isInteger(n) || !Number.isInteger(stride) || n < 1 || stride < 1) {
    throw new Error("N-gram size and stride must be positive integers");
  }

  let processedText = text;

  // Apply normalization if requested
  if (normalize) {
    processedText = normalizeUnicode(processedText);
  }

  // Apply case folding if requested
  if (lowercase) {
    processedText = foldCase(processedText);
  }

  const words = tokenizeWords(processedText);

  if (words.length < n) {
    return [];
  }

  const ngramsList = [];

  for (let i = 0; i <= words.length - n; i += stride) {
    // Build the n-gram by appending in place rather than allocating a subarray per window.
    let token = words[i];
    for (let j = 1; j < n; j++) {
      token += separator + words[i + j];
    }
    ngramsList.push(token);
  }

  return ngramsList;
}
/**
 * Runs both the character and the word extractor over the same text.
 *
 * The character pass runs first, then the word pass; both receive the same
 * `stride` and the same `options` object.
 *
 * @param {string} text - Input text handed to both extractors
 * @param {{charN?: number, wordN?: number, stride?: number, options?: object}} config -
 *   `charN` (default 3) and `wordN` (default 2) size the two passes, `stride` (default 1)
 *   is shared, and `options` (default `{}`) is forwarded unchanged to both extractors
 * @returns {{char: string[], word: string[]}} Character n-grams under `char`, word n-grams under `word`
 */
function extractMixedNgrams(text, config = {}) {
  const {
    charN: charSize = 3,
    wordN: wordSize = 2,
    stride: step = 1,
    options: forwarded = {},
  } = config;

  const char = extractCharNgrams(text, charSize, step, forwarded);
  const word = extractWordNgrams(text, wordSize, step, forwarded);

  return { char, word };
}