// cmpstr
// Version:
// CmpStr is a lightweight, fast and well performing package for calculating string similarity
// 1,304 lines (1,299 loc) • 195 kB
// JavaScript
/**
* CmpStr v3.0.1 dev-052fa0c-250614
* This is a lightweight, fast and well performing library for calculating string similarity.
* (c) 2023-2025 Paul Köhler @komed3 / MIT License
* Visit https://github.com/komed3/cmpstr and https://npmjs.org/package/cmpstr
*/
/**
* Deep Merge Utility
* src/utils/DeepMerge.ts
*
* This module provides utility functions for deep merging objects, getting values by path,
* and setting values by path in a deeply nested object structure.
*
* It supports dot and bracket notation (e.g. `a.b[0].c`) as well as escaped keys.
*
* Included functions:
* - `get`: Retrieve a deeply nested value by path
* - `set`: Assign a value to a nested path
* - `merge`: Deeply merge two objects
* - `has`: Check whether a path exists
* - `rmv`: Delete a value at a path
*
* @module Utils/DeepMerge
* @author Paul Köhler
* @license MIT
*/
/**
 * Split a path string into its individual keys.
 *
 * Bracketed numeric indices (e.g. `[0]`) are rewritten to dot notation first, then
 * the string is split on dots. Segments consisting only of digits become numbers.
 *
 * @param {string} p - The path string, e.g. `a.b.c` or `a[0].b`
 * @returns {(string|number)[]} - The keys in order, e.g. `['a', 'b', 'c']` or `['a', 0, 'b']`
 */
const parse = (p) => {
  // Normalise `a[0]` to `a.0` so that a single split handles both notations
  const dotted = p.replace(/\[(\d+)]/g, '.$1');
  const keys = [];
  for (const segment of dotted.split('.')) {
    keys.push(/^\d+$/.test(segment) ? Number(segment) : segment);
  }
  return keys;
};
/**
 * Deeply get a value from an object by a path string.
 *
 * The path is walked key by key. As soon as an intermediate value is `null` or
 * `undefined`, the fallback is returned; the fallback itself is never traversed.
 * A final value of `null` or `undefined` is also replaced by the fallback.
 *
 * @template T - The type of the object to get the value from
 * @param {T} t - The object to get the value from
 * @param {string} path - The path string, e.g. `a.b.c`
 * @param {any} fallback - The default value to return if the path does not exist
 * @returns {T|R|undefined} - The value at the specified path, otherwise the default value
 */
function get(t, path, fallback) {
  let current = t;
  for (const key of parse(path)) {
    // Stop at the first missing link instead of indexing into the fallback
    if (current == null) {
      return fallback;
    }
    current = current[key];
  }
  return current ?? fallback;
}
/**
 * Deeply set a value in an object by a path string.
 *
 * Missing intermediate containers are created on the fly: an array when the next
 * key is numeric, otherwise a prototype-less object. An existing target is mutated
 * in place and returned.
 *
 * @template T - The type of the object to set the value in
 * @param {T} t - The object to set the value in
 * @param {string} path - The path string, e.g. `a.b.c`
 * @param {any} value - The value to set at the specified path
 * @returns {T} - The modified object with the value set at the specified path
 * @throws {Error} - If a key on the path is `__proto__` or `constructor`, or if a
 *                   value on the path exists but is not an object
 */
function set(t, path, value) {
  // An empty path means the recursion has reached the leaf: return the value itself
  if (path === '') {
    return value;
  }
  // Split the path into the first key and the remaining keys
  const [k, ...r] = parse(path);
  // Refuse keys that would reach into the prototype chain (same keys `merge` skips)
  if (k === '__proto__' || k === 'constructor') {
    throw new Error(`cannot set unsafe property <${k}>`);
  }
  // A primitive or null on the path cannot hold properties
  if (t !== undefined && (typeof t !== 'object' || t === null)) {
    throw new Error(`cannot set property <${k}> of <${JSON.stringify(t)}>`);
  }
  // Create the container if necessary, then assign the recursively built child
  return Object.assign(t ?? (typeof k === 'number' ? [] : Object.create(null)), {
    [k]: set(t?.[k], r.join('.'), value)
  });
}
/**
 * Deeply merge two objects, where the second object overrides the first.
 *
 * Non-array objects are merged recursively; every other value (primitives, arrays,
 * `null`) replaces the target value. The keys `__proto__` and `constructor` are
 * skipped to prevent prototype pollution. The target is mutated and returned.
 *
 * @template T - The type of the objects to merge
 * @param {T} t - The target object to merge into
 * @param {T} o - The source object to merge from
 * @param {boolean} [mergeUndefined=false] - Whether to merge undefined values
 * @returns {T} - The merged object
 */
function merge(t = Object.create(null), o = Object.create(null), mergeUndefined = false) {
  // `typeof null` is 'object', so null has to be excluded explicitly
  const isMergeable = (v) => typeof v === 'object' && v !== null && !Array.isArray(v);
  const target = t ?? Object.create(null);
  for (const k of Object.keys(o ?? {})) {
    const val = o[k];
    // Skip undefined values unless the caller asked to merge them
    if (!mergeUndefined && val === undefined) {
      continue;
    }
    // Skip dangerous property names to prevent prototype pollution
    if (k === '__proto__' || k === 'constructor') {
      continue;
    }
    // Recurse into nested objects (passing the flag along), otherwise overwrite
    target[k] = isMergeable(val)
      ? merge(isMergeable(target[k]) ? target[k] : Object.create(null), val, mergeUndefined)
      : val;
  }
  return target;
}
/**
 * Delete a value at a specified path in an object.
 *
 * Only own properties are followed while descending, so a path such as
 * `__proto__.x` can never delete from a prototype. Unless `preserveEmpty` is set,
 * containers left empty by the deletion are removed on the way back up.
 *
 * @template T - The type of the object to delete the value from
 * @param {T} t - The object to delete the value from
 * @param {string} path - The path string, e.g. `a.b.c`
 * @param {boolean} [preserveEmpty=false] - Whether to preserve empty objects/arrays
 * @returns {T} - The modified object with the value deleted at the specified path
 */
function rmv(t, path, preserveEmpty = false) {
  const keys = parse(path);
  // Returns true when the deletion at or below `node` was carried out
  const remove = (node, depth) => {
    // Nothing to delete from primitives, null or undefined
    if (!node || typeof node !== 'object') {
      return false;
    }
    const key = keys[depth];
    // Last key of the path: delete it from the current container
    if (depth === keys.length - 1) {
      return delete node[key];
    }
    // Never descend through inherited properties (prototype chain)
    if (!Object.hasOwn(node, key)) {
      return false;
    }
    if (!remove(node[key], depth + 1)) {
      return false;
    }
    if (!preserveEmpty) {
      const child = node[key];
      // An array counts as empty when it only holds holes, null or undefined
      const isEmpty = child !== null && typeof child === 'object' && (Array.isArray(child)
        ? child.every((v) => v == null)
        : Object.keys(child).length === 0);
      if (isEmpty) {
        delete node[key];
      }
    }
    return true;
  };
  remove(t, 0);
  return t;
}
/**
* Profiler Utility
* src/utils/profiler.ts
*
* @see https://en.wikipedia.org/wiki/Profiling_(computer_programming)
*
* This class provides methods to run synchronous and asynchronous functions, capturing
* their execution time and memory usage, and storing the results in a set of profiler
* entries. It supports both Node.js and browser environments, detecting the environment
* automatically.
*
* The class is optimized for minimal overhead and can be used for fine-grained
* performance profiling.
*
* @module Utils/Profiler
* @author Paul Köhler (komed3)
* @license MIT
*/
/**
 * Profiler class for measuring execution time and memory usage of functions.
 *
 * A shared instance is obtained through `Profiler.getInstance()`. While the profiler
 * is inactive, `run` and `runAsync` simply invoke the given function.
 */
class Profiler {
  // Detected runtime: 'nodejs', 'browser' or 'unknown' (set by detectEnv)
  static ENV;
  // Shared instance handed out by getInstance
  static instance;
  // Store for profiler entries (insertion ordered)
  store = new Set();
  // Total time and memory consumption over all recorded entries
  totalTime = 0;
  totalMem = 0;
  // The Profiler active state
  active;

  /**
   * Sets the environment based on the available global objects.
   *
   * Node.js is only reported when the APIs used by `now()` and `mem()` are really
   * present; a bare `process` shim (as injected by some browser bundlers) falls
   * through to the browser / unknown branches instead of crashing later.
   */
  static detectEnv() {
    const hasNodeApis = typeof process !== 'undefined' &&
      typeof process?.hrtime?.bigint === 'function' &&
      typeof process?.memoryUsage === 'function';
    if (hasNodeApis) {
      Profiler.ENV = 'nodejs';
    } else if (typeof performance !== 'undefined') {
      Profiler.ENV = 'browser';
    } else {
      Profiler.ENV = 'unknown';
    }
  }

  /**
   * Returns the singleton instance of the Profiler class.
   * If the instance does not exist, it creates a new one.
   *
   * @param {boolean} [enable=false] - Enables the profiler upon instantiation; ignored
   *                                   when the instance already exists
   * @returns {Profiler} - Singleton Profiler instance
   */
  static getInstance(enable) {
    // Ensure the environment is detected
    if (!Profiler.ENV) {
      Profiler.detectEnv();
    }
    // Create the shared instance on first use
    if (!Profiler.instance) {
      Profiler.instance = new Profiler(enable);
    }
    return Profiler.instance;
  }

  /**
   * Creates a profiler. Prefer `Profiler.getInstance()` to share a single instance.
   *
   * @param {boolean} [enable=false] - Optional parameter to enable the profiler
   */
  constructor(enable) {
    this.active = enable ?? false;
  }

  /**
   * Gets the current time based on the environment.
   *
   * Uses process.hrtime.bigint() for Node.js, performance.now() for browsers,
   * and Date.now() as a fallback.
   *
   * @returns {number} - Current time in milliseconds
   */
  now() {
    switch (Profiler.ENV) {
      // Node.js environment: nanoseconds converted to milliseconds
      case 'nodejs': return Number(process.hrtime.bigint()) / 1e6;
      // Browser environment
      case 'browser': return performance.now();
      // Fallback
      default: return Date.now();
    }
  }

  /**
   * Gets the current memory usage based on the environment.
   *
   * Uses process.memoryUsage().heapUsed for Node.js, performance.memory.usedJSHeapSize
   * for browsers (0 when that non-standard property is missing), and 0 as a fallback.
   *
   * @returns {number} - Current memory usage in bytes
   */
  mem() {
    switch (Profiler.ENV) {
      // Node.js environment
      case 'nodejs': return process.memoryUsage().heapUsed;
      // Browser environment
      case 'browser': return performance.memory?.usedJSHeapSize ?? 0;
      // Fallback
      default: return 0;
    }
  }

  /**
   * Enables the profiler.
   * Sets the active state to true, allowing profiling to occur.
   */
  enable() {
    this.active = true;
  }

  /**
   * Disables the profiler.
   * Sets the active state to false, preventing further profiling.
   */
  disable() {
    this.active = false;
  }

  /**
   * Resets the profiler by clearing the store, total time and memory consumption.
   * This method is useful for starting a new profiling session.
   */
  clear() {
    this.store.clear();
    this.totalTime = 0;
    this.totalMem = 0;
  }

  /**
   * Runs a synchronous function and profiles its execution time and memory usage.
   * If the profiler is not active, it simply executes the function without profiling.
   * Nothing is recorded when the function throws.
   *
   * @param {() => T} fn - Function to be executed and profiled
   * @param {Record<string, any>} meta - Metadata to be associated with the profiling entry
   * @returns {T} - The result of the executed function
   */
  run(fn, meta = {}) {
    // If the profiler is not active, simply execute the function without profiling
    if (!this.active) {
      return fn();
    }
    // Capture the start time and memory usage
    const startTime = this.now();
    const startMem = this.mem();
    // Execute the function and capture the result
    const res = fn();
    // Calculate the time and memory consumption
    const deltaTime = this.now() - startTime;
    const deltaMem = this.mem() - startMem;
    // Add the profiling entry to the store and update the totals
    this.store.add({ time: deltaTime, mem: deltaMem, res, meta });
    this.totalTime += deltaTime;
    this.totalMem += deltaMem;
    return res;
  }

  /**
   * Runs an asynchronous function and profiles its execution time and memory usage.
   * If the profiler is not active, it simply executes the function without profiling.
   * Nothing is recorded when the returned promise rejects.
   *
   * @param {() => Promise<T>} fn - Asynchronous function to be executed and profiled
   * @param {Record<string, any>} meta - Metadata to be associated with the profiling entry
   * @returns {Promise<T>} - A promise that resolves to the result of the executed function
   */
  async runAsync(fn, meta = {}) {
    // If the profiler is not active, simply execute the function without profiling
    if (!this.active) {
      return await fn();
    }
    // Capture the start time and memory usage
    const startTime = this.now();
    const startMem = this.mem();
    // Execute the asynchronous function and wait for its result
    const res = await fn();
    // Calculate the time and memory consumption
    const deltaTime = this.now() - startTime;
    const deltaMem = this.mem() - startMem;
    // Add the profiling entry to the store and update the totals
    this.store.add({ time: deltaTime, mem: deltaMem, res, meta });
    this.totalTime += deltaTime;
    this.totalMem += deltaMem;
    return res;
  }

  /**
   * Retrieves all profiler entries stored in the profiler.
   *
   * @returns {ProfilerEntry<any>[]} - A new array of the profiler entries
   */
  getAll() {
    return [...this.store];
  }

  /**
   * Retrieves the last profiler entry stored in the profiler.
   *
   * @returns {ProfilerEntry<any> | undefined} - The last profiler entry or undefined if no entries exist
   */
  getLast() {
    return this.getAll().pop();
  }

  /**
   * Retrieves the total time and memory consumption recorded by the profiler.
   *
   * @returns {{ time: number, mem: number }} - An object containing total time and memory usage
   */
  getTotal() {
    return { time: this.totalTime, mem: this.totalMem };
  }

  /**
   * The services provided by the Profiler class: the control and reporting
   * methods pre-bound to this instance, so they can be passed around freely.
   *
   * @type {ProfilerService<any>}
   */
  services = {
    enable: this.enable.bind(this),
    disable: this.disable.bind(this),
    clear: this.clear.bind(this),
    report: this.getAll.bind(this),
    last: this.getLast.bind(this),
    total: this.getTotal.bind(this)
  };
}
/**
* TextAnalyzer Utility
* src/utils/TextAnalyzer.ts
*
* The TextAnalyzer class provides a comprehensive set of methods for analyzing and
* extracting statistics from a given text. It supports word and sentence tokenization,
* character and word frequency analysis, syllable estimation, readability metrics
* (Flesch, Kincaid, LIX, WSTF), and various ratios and histograms. Designed for
* efficiency and flexibility, it is suitable for linguistic research, readability
* scoring, and text preprocessing tasks.
*
* @module Utils/TextAnalyzer
* @author Paul Köhler (komed3)
* @license MIT
*/
/**
 * Analyzes a text and exposes counts, frequencies, ratios and readability scores.
 * Tokenization and frequency counting happen once, in the constructor.
 */
class TextAnalyzer {
  // The (trimmed) text to analyze
  text;
  // Tokenized words (lowercased) and sentences
  words = [];
  sentences = [];
  // Frequency maps for characters and words
  charFrequency = new Map();
  wordHistogram = new Map();
  // Memoized syllable estimates per word
  syllableCache = new Map();

  /**
   * Constructs a new TextAnalyzer instance with the provided input text.
   *
   * @param {string} input - The text to analyze
   */
  constructor(input) {
    this.text = input.trim();
    this.tokenize();
    this.computeFrequencies();
  }

  /**
   * Tokenizes the input text into words and sentences.
   *
   * Words are runs of Unicode letters, stored lowercased. Sentences are split at
   * whitespace that follows `.`, `!` or `?`.
   */
  tokenize() {
    const text = this.text;
    // Tokenize words using Unicode property escapes for letters
    this.words = (text.match(/\p{L}+/gu) ?? []).map((w) => w.toLowerCase());
    // Tokenize sentences using punctuation marks as delimiters
    this.sentences = text.split(/(?<=[.!?])\s+/).filter(Boolean);
  }

  /**
   * Computes character and word frequencies from the tokenized text.
   * Existing counts are discarded first, so a repeated call does not double-count.
   */
  computeFrequencies() {
    this.charFrequency.clear();
    this.wordHistogram.clear();
    // Compute character frequencies (iterates by code point)
    for (const char of this.text) {
      this.charFrequency.set(char, (this.charFrequency.get(char) ?? 0) + 1);
    }
    // Compute word frequencies
    for (const word of this.words) {
      this.wordHistogram.set(word, (this.wordHistogram.get(word) ?? 0) + 1);
    }
  }

  /**
   * Estimates the number of syllables in a word using a simple heuristic:
   * the number of vowel groups (a, e, i, o, u, y, ä, ö, ü), with a minimum of one.
   *
   * @param {string} word - The word to estimate syllables for
   * @returns {number} - Estimated syllable count
   */
  estimateSyllables(word) {
    // Check cache first to avoid redundant calculations
    if (this.syllableCache.has(word)) {
      return this.syllableCache.get(word);
    }
    // Normalize the word: lowercase and remove characters outside a-z/ä/ö/ü/ß
    const clean = word.toLowerCase().replace(/[^a-zäöüß]/g, '');
    const matches = clean.match(/[aeiouyäöü]+/g);
    // Count syllables based on vowel groups
    const count = matches ? matches.length : 1;
    this.syllableCache.set(word, count);
    return count;
  }

  /**
   * Gets the text length in UTF-16 code units (after trimming).
   *
   * @return {number} - Length of the text
   */
  getLength() {
    return this.text.length;
  }

  /**
   * Gets the number of words in the text.
   *
   * @return {number} - Count of words
   */
  getWordCount() {
    return this.words.length;
  }

  /**
   * Gets the number of sentences in the text.
   *
   * @return {number} - Count of sentences
   */
  getSentenceCount() {
    return this.sentences.length;
  }

  /**
   * Gets the average word length in the text.
   *
   * @return {number} - Average length of words, 0 for a text without words
   */
  getAvgWordLength() {
    let totalLen = 0;
    for (const w of this.words) {
      totalLen += w.length;
    }
    return this.words.length ? totalLen / this.words.length : 0;
  }

  /**
   * Gets the average sentence length in words.
   *
   * @return {number} - Average length of sentences, 0 for a text without sentences
   */
  getAvgSentenceLength() {
    return this.sentences.length ? this.words.length / this.sentences.length : 0;
  }

  /**
   * Gets a histogram of word frequencies in the text.
   *
   * @returns {Record<string, number>} - A histogram of word frequencies
   */
  getWordHistogram() {
    return Object.fromEntries(this.wordHistogram);
  }

  /**
   * Gets the most common words in the text, limited to a specified number.
   *
   * @param {number} [limit=5] - Maximum number of common words to return
   * @returns {string[]} - Array of the most common words, most frequent first
   */
  getMostCommonWords(limit = 5) {
    return [...this.wordHistogram.entries()]
      .sort((a, b) => b[1] - a[1])
      .slice(0, limit)
      .map((e) => e[0]);
  }

  /**
   * Gets the least common words (hapax legomena) in the text.
   *
   * Hapax legomena are words that occur only once in the text.
   *
   * @returns {string[]} - Array of hapax legomena
   */
  getHapaxLegomena() {
    return [...this.wordHistogram.entries()]
      .filter(([, c]) => c === 1)
      .map((e) => e[0]);
  }

  /**
   * Checks if the text contains any numbers.
   *
   * @returns {boolean} - True if numbers are present, false otherwise
   */
  hasNumbers() {
    return /\d/.test(this.text);
  }

  /**
   * Calculates the ratio of uppercase letters to total letters in the text.
   * Only A-Z, a-z and the German letters ä, ö, ü, ß (and their capitals) are counted.
   *
   * @return {number} - Ratio of uppercase letters to total letters
   */
  getUpperCaseRatio() {
    let upper = 0;
    let letters = 0;
    for (let i = 0, len = this.text.length; i < len; i++) {
      const c = this.text[i];
      if (/[A-Za-zÄÖÜäöüß]/.test(c)) {
        letters++;
        if (/[A-ZÄÖÜ]/.test(c)) {
          upper++;
        }
      }
    }
    return letters ? upper / letters : 0;
  }

  /**
   * Gets the frequency of each character in the text.
   *
   * @returns {Record<string, number>} - A record of character frequencies
   */
  getCharFrequency() {
    return Object.fromEntries(this.charFrequency);
  }

  /**
   * Gets character frequencies keyed by Unicode code point.
   *
   * Keys are the uppercase hexadecimal code point, zero-padded to at least four
   * digits (e.g. `0061` for `a`, `1F600` for an emoji outside the BMP).
   *
   * @returns {Record<string, number>} - A record of code point frequencies
   */
  getUnicodeStats() {
    const result = {};
    for (const [char, count] of this.charFrequency) {
      // charFrequency is keyed by whole code points, so read the full code point
      // (charCodeAt would only return the high surrogate for astral characters)
      const block = char
        .codePointAt(0).toString(16)
        .padStart(4, '0').toUpperCase();
      result[block] = (result[block] ?? 0) + count;
    }
    return result;
  }

  /**
   * Gets the ratio of long words (words with length >= len) to total words.
   *
   * @param {number} [len=7] - Minimum length for a word to be considered long
   * @returns {number} - Ratio of long words to total words
   */
  getLongWordRatio(len = 7) {
    let long = 0;
    for (const w of this.words) {
      if (w.length >= len) {
        long++;
      }
    }
    return this.words.length ? long / this.words.length : 0;
  }

  /**
   * Gets the ratio of short words (words with length <= len) to total words.
   *
   * @param {number} [len=3] - Maximum length for a word to be considered short
   * @returns {number} - Ratio of short words to total words
   */
  getShortWordRatio(len = 3) {
    let short = 0;
    for (const w of this.words) {
      if (w.length <= len) {
        short++;
      }
    }
    return this.words.length ? short / this.words.length : 0;
  }

  /**
   * Estimates the number of syllables in the text.
   *
   * @returns {number} - Total estimated syllable count
   */
  getSyllablesCount() {
    let count = 0;
    for (const w of this.words) {
      count += this.estimateSyllables(w);
    }
    return count;
  }

  /**
   * Gets the number of monosyllabic words (words with exactly one syllable).
   *
   * @returns {number} - Count of monosyllabic words
   */
  getMonosyllabicWordCount() {
    let count = 0;
    for (const w of this.words) {
      if (this.estimateSyllables(w) === 1) {
        count++;
      }
    }
    return count;
  }

  /**
   * Gets the number of words with at least a specified minimum syllable count.
   *
   * @param {number} min - Minimum syllable count for a word to be included
   * @returns {number} - Count of words meeting the syllable criteria
   */
  getMinSyllablesWordCount(min) {
    let count = 0;
    for (const w of this.words) {
      if (this.estimateSyllables(w) >= min) {
        count++;
      }
    }
    return count;
  }

  /**
   * Gets the number of words with at most a specified maximum syllable count.
   *
   * @param {number} max - Maximum syllable count for a word to be included
   * @returns {number} - Count of words meeting the syllable criteria
   */
  getMaxSyllablesWordCount(max) {
    let count = 0;
    for (const w of this.words) {
      if (this.estimateSyllables(w) <= max) {
        count++;
      }
    }
    return count;
  }

  /**
   * Calculates the Honore's R statistic for the text as a measure of lexical richness:
   * `100 * ln(N) / (1 - V1 / V)` with N words, V distinct words and V1 hapax legomena.
   *
   * Returns 0 for a text without words. When every distinct word occurs exactly once
   * the denominator is zero and the result is `Infinity`.
   *
   * @returns {number} - The Honore's R statistic
   */
  getHonoresR() {
    const total = this.words.length;
    const distinct = this.wordHistogram.size;
    // Without words both the logarithm and the ratio are undefined
    if (total === 0 || distinct === 0) {
      return 0;
    }
    return (100 * Math.log(total)) / (1 - (this.getHapaxLegomena().length / distinct));
  }

  /**
   * Estimates the reading time for the text based on words per minute (WPM).
   * The result is never below one minute; a missing or zero rate is treated as 1.
   *
   * @param {number} [wpm=200] - Words per minute for the calculation
   * @returns {number} - Estimated reading time in minutes
   */
  getReadingTime(wpm = 200) {
    // `||` (not `??`) so that a rate of 0 cannot cause a division by zero
    return Math.max(1, this.words.length / (wpm || 1));
  }

  /**
   * Calculates various readability scores based on the text.
   *
   * This method supports multiple readability metrics:
   * - Flesch Reading Ease
   * - Flesch Reading Ease (German variant)
   * - Flesch-Kincaid Grade Level
   *
   * Word, sentence and syllable counts of zero are treated as one. An unrecognized
   * metric name yields `undefined`.
   *
   * @param {'flesch'|'fleschde'|'kincaid'} [metric='flesch'] - The readability metric to calculate
   * @returns {number} - The calculated readability score
   */
  getReadabilityScore(metric = 'flesch') {
    const w = this.words.length || 1;
    const s = this.sentences.length || 1;
    const y = this.getSyllablesCount() || 1;
    // Average sentence length (words) and average syllables per word
    const asl = w / s;
    const asw = y / w;
    switch (metric) {
      // Flesch Reading Ease formula
      case 'flesch': return 206.835 - (1.015 * asl) - (84.6 * asw);
      // Flesch Reading Ease formula for German texts
      case 'fleschde': return 180 - asl - (58.5 * asw);
      // Flesch-Kincaid Grade Level formula
      case 'kincaid': return (0.39 * asl) + (11.8 * asw) - 15.59;
    }
  }

  /**
   * Calculates the LIX (Lesbarhetsindex) score for the text.
   *
   * The score is the average sentence length plus the percentage of long words
   * (default threshold of `getLongWordRatio`).
   *
   * @returns {number} - The LIX score
   */
  getLIXScore() {
    const w = this.words.length || 1;
    const s = this.sentences.length || 1;
    const l = this.getLongWordRatio() * w;
    return (w / s) + (l / w * 100);
  }

  /**
   * Calculates the Wiener Sachtextformel (WSTF) scores for the text.
   *
   * The four variants combine the percentage of words with three or more syllables,
   * the average sentence length, the percentage of long words and the percentage of
   * monosyllabic words.
   *
   * @returns {[number, number, number, number]} - An array of WSTF scores
   */
  getWSTFScore() {
    const w = this.words.length || 1;
    const h = this.getMinSyllablesWordCount(3) / w * 100;
    const s = this.getAvgSentenceLength();
    const l = this.getLongWordRatio() * 100;
    const m = this.getMonosyllabicWordCount() / w * 100;
    return [
      0.1935 * h + 0.1672 * s + 0.1297 * l - 0.0327 * m - 0.8750,
      0.2007 * h + 0.1682 * s + 0.1373 * l - 2.7790,
      0.2963 * h + 0.1905 * s - 1.1144,
      0.2744 * h + 0.2656 * s - 1.6930
    ];
  }
}
/**
* DiffChecker Utility
* src/utils/DiffChecker.ts
*
* The DiffChecker class provides a robust and efficient utility for comparing two
* texts and extracting their differences (full lines or word mode). It supports
* context-aware grouping of changes, unified diff output (with CLI color or ASCII
* markup), and detailed change magnitude metrics. The class is highly configurable,
* allowing users to choose the diff granularity, case sensitivity, context lines,
* grouping, and output style. It is suitable for text comparison, code review
* tools, document versioning, and any application requiring precise and human-
* readable difference reporting.
*
* Features:
* - Line and word-based diffing
* - Case-insensitive comparison option
* - Context lines and grouping of adjacent changes
* - Unified diff output (ASCII or colored CLI)
* - Highlighting of changed segments within lines
* - Change magnitude calculation (relative to group or line)
* - Expand-all mode for full file context
*
* @module Utils/DiffChecker
* @author Paul Köhler (komed3)
* @license MIT
*/
/**
* The DiffChecker class provides methods to compare two texts and generate
* structured diffs, grouped diffs, and unified diff outputs.
*/
class DiffChecker {
// Original input texts and options
a;
b;
options;
// Computed diff entries and groups
entries = [];
grouped = [];
// Flag to indicate if the diff has already been computed
diffRun = false;
/**
* Constructs a new DiffChecker instance for comparing two texts.
*
* @param {string} a - The first (original) text
* @param {string} b - The second (modified) text
* @param {DiffOptions} [opt] - Optional diff configuration
*/
constructor(a, b, opt = {}) {
// Set the two texts to compare
this.a = a, this.b = b;
// Merge default with user-provided options
this.options = { ...{
mode: 'word',
caseInsensitive: false,
contextLines: 1,
groupedLines: true,
expandLines: false,
showChangeMagnitude: true,
maxMagnitudeSymbols: 5,
lineBreak: '\n'
}, ...opt };
// Run the diff computation immediately
this.computeDiff();
}
/**
* Splits both input texts into arrays of lines and returns them
* with the maximum line count.
*
* @returns { linesA: string[], linesB: string[], maxLen: number }
*/
text2lines() {
// Trim and split the input texts into lines
const linesA = this.a.trim().split(/\r?\n/);
const linesB = this.b.trim().split(/\r?\n/);
return { linesA, linesB, maxLen: Math.max(linesA.length, linesB.length) };
}
/**
* Tokenizes a string according to the current diff mode (line or word).
*
* @param {string} input - The string to tokenize
* @returns {string[]} - Array of tokens
*/
tokenize(input) {
const { mode } = this.options;
switch (mode) {
// Tokenize by lines
case 'line': return [input];
// Tokenize by words
case 'word': return input.split(/\s+/);
}
}
/**
* Concatenates an array of tokens back into a string, respecting the diff mode.
*
* @param {string[]} input - Array of tokens
* @returns {string} - Concatenated string
*/
concat(input) {
const { mode } = this.options;
return input.join(mode === 'word' ? ' ' : '');
}
/**
* Computes the diff between the two input texts and populates the
* entries and grouped arrays.
*/
computeDiff() {
if (!this.diffRun) {
// Get the lines from both texts
const { linesA, linesB, maxLen } = this.text2lines();
// Loop through each line and compare them
for (let i = 0; i < maxLen; i++) {
const a = linesA[i] || '';
const b = linesB[i] || '';
// Perform line diffing
this.lineDiff(a, b, i);
}
// Find groups of adjacent changes
this.findGroups();
// Set the diff run flag to true
this.diffRun = true;
}
}
/**
* Compares two lines and records their differences at the configured granularity.
*
* @param {string} a - Line from the first text
* @param {string} b - Line from the second text
* @param {number} line - Line number
*/
lineDiff(a, b, line) {
const { mode, caseInsensitive } = this.options;
const baseLen = Math.max(a.length, b.length);
let A = a, B = b;
// If case-insensitive mode is enabled, convert both lines to lowercase
if (caseInsensitive)
A = a.toLowerCase(), B = b.toLowerCase();
let diffs = [];
let delSize = 0, insSize = 0;
if (mode === 'line') {
// For line mode, compare the entire lines directly
if (A !== B) {
diffs.push({
posA: 0, posB: 0,
del: a, ins: b,
size: b.length - a.length
});
delSize = a.length;
insSize = b.length;
}
}
else {
// For word mode, find precise diffs between tokenized lines
diffs = this.preciseDiff(a, A, b, B);
// Calculate total sizes of deletions and insertions
for (const d of diffs)
delSize += d.del.length, insSize += d.ins.length;
}
if (diffs.length) {
// Add the diff entry for this line
this.entries.push({
line, diffs, delSize, insSize, baseLen,
totalSize: insSize - delSize,
magnitude: this.magnitude(delSize, insSize, baseLen)
});
}
}
/**
* Finds the differing token blocks between two lines and returns them with the
* original (case preserved) text and character positions.
*
* Matching is a single greedy forward scan with a look-ahead of up to three
* tokens; it is a heuristic, not a full longest-common-subsequence computation.
* The comparison uses the tokens of `A` / `B`, while the reported text is taken
* from the tokens of `a` / `b` at the same indices.
*
* @param {string} a - Original line (case preserved)
* @param {string} A - Original line (possibly lowercased)
* @param {string} b - Modified line (case preserved)
* @param {string} B - Modified line (possibly lowercased)
* @returns {DiffEntry[]} - Array of diff entries for this line
*/
preciseDiff(a, A, b, B) {
// Start offset of every token: previous offset + previous token length + 1,
// i.e. exactly one separator character is assumed between two tokens
const posIndex = (t) => t.reduce((p, _, i) => (p.push(i ? p[i - 1] + t[i - 1].length + 1 : 0), p), []);
// Tokens of the case-preserved lines (used for output) and of the comparison
// lines (used for matching), plus their lengths and position arrays
const origA = this.tokenize(a);
const origB = this.tokenize(b);
const tokenA = this.tokenize(A);
const tokenB = this.tokenize(B);
const lenA = tokenA.length;
const lenB = tokenB.length;
const posArrA = posIndex(origA);
const posArrB = posIndex(origB);
// Collect matching blocks with a greedy forward scan over both token lists
const matches = [];
let ai = 0, bi = 0;
while (ai < lenA && bi < lenB) {
// If tokens match, find the length of the match
if (tokenA[ai] === tokenB[bi]) {
let len = 1;
// Extend the match as long as tokens continue to match
while (ai + len < lenA && bi + len < lenB &&
tokenA[ai + len] === tokenB[bi + len])
len++;
matches.push({ ai, bi, len });
ai += len, bi += len;
}
else {
let found = false;
// Look ahead up to three tokens for the next sync point; the first hit wins
for (let offset = 1; offset <= 3 && !found; offset++) {
// Token `offset` ahead in A equals the current token in B:
// record a one-token match and skip the tokens of A in between
if (ai + offset < lenA && tokenA[ai + offset] === tokenB[bi]) {
matches.push({ ai: ai + offset, bi, len: 1 });
ai += offset + 1, bi += 1, found = true;
}
// Current token in A equals the token `offset` ahead in B:
// record a one-token match and skip the tokens of B in between
else if (bi + offset < lenB && tokenA[ai] === tokenB[bi + offset]) {
matches.push({ ai, bi: bi + offset, len: 1 });
ai += 1, bi += offset + 1, found = true;
}
}
// If no match was found, advance both pointers by one
if (!found)
ai++, bi++;
}
}
// Walk through the matches and emit everything between them as diffs;
// i and j point at the first token not yet covered in A and B
const diffs = [];
let i = 0, j = 0;
for (const m of matches) {
// If there are unmatched tokens before the match, record them
if (i < m.ai || j < m.bi) {
// Slice the original arrays to get the unmatched tokens
const delArr = origA.slice(i, m.ai);
const insArr = origB.slice(j, m.bi);
// `size` counts token characters only (separators excluded)
diffs.push({
posA: posArrA[i] ?? 0,
posB: posArrB[j] ?? 0,
del: this.concat(delArr),
ins: this.concat(insArr),
size: insArr.join('').length - delArr.join('').length
});
}
// Advance to after the match
i = m.ai + m.len, j = m.bi + m.len;
}
// Tail diffs after the last match
if (i < lenA || j < lenB) {
// Slice the original arrays to get the unmatched tokens
const delArr = origA.slice(i);
const insArr = origB.slice(j);
// Push the diff entry for unmatched tokens at the end
diffs.push({
posA: posArrA[i] ?? 0,
posB: posArrB[j] ?? 0,
del: this.concat(delArr),
ins: this.concat(insArr),
size: insArr.join('').length - delArr.join('').length
});
}
// Drop entries whose deleted and inserted text are both empty strings
return diffs.filter(d => d.del.length > 0 || d.ins.length > 0);
}
/**
* Groups adjacent changed lines together, including context lines,
* and calculates group metrics.
*/
findGroups() {
const { contextLines } = this.options;
// Helper function to add a group to the grouped array
const addGroup = (group, start, end) => {
// Calculate total sizes and base length for the group
const [delSize, insSize, totalSize, baseLen] = [
'delSize', 'insSize', 'totalSize', 'baseLen'
].map(k => group.reduce((sum, e) => sum + e[k], 0));
// Push the group to the grouped array
this.grouped.push({
start, end, delSize, insSize, totalSize,
line: group[0].line, entries: group,
magnitude: this.magnitude(delSize, insSize, baseLen)
});
};
let group = [];
let start = 0, end = 0;
// Iterate through each diff entry to find groups
for (const entry of this.entries) {
const s = Math.max(0, entry.line - contextLines);
const e = entry.line + contextLines;
// If the group is empty or the current entry is adjacent to the last one
if (!group.length || s <= end + 1) {
// If this is the first entry, set the start position
if (!group.length)
start = s;
end = Math.max(end, e);
group.push(entry);
}
else {
// If the group is not empty, finalize it and start a new one
addGroup(group, start, end);
group = [entry], start = s, end = e;
}
}
// If there is a remaining group, finalize it
if (group.length)
addGroup(group, start, end);
}
/**
* Calculates the change magnitude string for a group or line.
*
* @param {number} del - Number of deleted characters
* @param {number} ins - Number of inserted characters
* @param {number} baseLen - Base length for normalization
* @returns {string} - Magnitude string (e.g. "++-")
*/
magnitude(del, ins, baseLen) {
const { maxMagnitudeSymbols } = this.options;
const total = del + ins;
// If there are no changes or base length is zero, return empty string
if (total === 0 || baseLen === 0)
return '';
// Calculate the length of the magnitude string based on the full length
const magLen = Math.min(maxMagnitudeSymbols, Math.max(Math.round(total / baseLen * maxMagnitudeSymbols), 1));
// Calculate the number of plus and minus symbols
const plus = Math.round((ins / total) * magLen);
const minus = magLen - plus;
// Return the magnitude string with plus and minus symbols
return '+'.repeat(plus) + '-'.repeat(minus);
}
/**
* Generates a unified diff output as a string, with optional CLI coloring.
*
* @param {boolean} cli - If true, use CLI colors; otherwise, ASCII markup
* @returns {string} - Unified diff output
*/
output(cli) {
const { mode, contextLines, groupedLines, expandLines, showChangeMagnitude, lineBreak } = this.options;
// Get the lines and maximum length from the input texts
const { linesA, linesB, maxLen } = this.text2lines();
const linePad = Math.max(4, maxLen.toString().length);
// Helper functions for coloring and formatting (ASCII or CLI colored)
const highlight = (s, ansi) => cli ? `\x1b[${ansi}m${s}\x1b[0m` : s;
const cy = (s) => highlight(s, '36');
const gy = (s) => highlight(s, '90');
const gn = (s) => highlight(s, '32');
const rd = (s) => highlight(s, '31');
const ye = (s) => highlight(s, '33');
const del = (s) => cli ? `\x1b[37;41m${s}\x1b[31;49m` : `-[${s}]`;
const ins = (s) => cli ? `\x1b[37;42m${s}\x1b[32;49m` : `+[${s}]`;
// Function to output a block of lines with optional header
const block = (start, end, forced, headerEntry) => {
// If there is a header entry, output the header
if (headerEntry)
header(headerEntry);
// Loop through the range and output lines
for (let i = start; i <= end; i++)
line(i, forced ?? i);
out.push('');
};
// Function to output a header for a group or line
const header = (e) => {
out.push(`${(' '.repeat(linePad))} ${(cy(`@@ -${(e.line + 1)},${e.delSize} +${(e.line + 1)},${e.insSize} @@`))} ${(showChangeMagnitude ? ye(e.magnitude) : '')}`);
};
// Function to output a single line with optional diff highlighting
const line = (i, forced) => {
// If the line exists in either text, output it
if (linesA[i] || linesB[i]) {
// Find the diff entry for this line, if it exists
const entry = this.entries.find(e => e.line === i);
// Format the line number with padding
const lineNo = (i + 1).toString().padStart(linePad, ' ');
if (entry && forced === i) {
// If there is an entry, output the line with diff highlighting
out.push(`${lineNo} ${rd(`- ${mark(linesA[i], entry.diffs, 'del')}`)}`);
out.push(`${' '.repeat(linePad)} ${gn(`+ ${mark(linesB[i], entry.diffs, 'ins')}`)}`);
}
else {
// If no entry, just output the line without diff (context lines)
out.push(`${lineNo} ${gy(linesA[i])}`);
}
}
};
// Function to mark changes in a line based on the diffs
const mark = (line, diffs, type) => {
// If there are no diffs or the mode is line, return the line as is
if (!diffs.length || mode === 'line')
return line;
let res = '', idx = 0;
// Loop through each diff entry and apply the changes
for (const d of diffs) {
// Get the position and value based on the type
const pos = type === 'del' ? d.posA : d.posB;
const val = type === 'del' ? d.del : d.ins;
// If the value is empty, skip it
if (!val)
continue;
// Add the unchanged part of the line before the change
if (pos > idx)
res += line.slice(idx, pos);
// Add the changed part of the line with appropriate formatting
res += (type === 'del' ? del(val) : ins(val));
idx = pos + val.length;
}
// Return the marked line with any remaining unchanged part
return res + line.slice(idx);
};
let out = [''];
switch (true) {
// For expandLines, output the entire file context
case expandLines:
block(0, maxLen);
break;
// For groupedLines, output each group with its start and end
case groupedLines:
for (const group of this.grouped)
block(group.start, group.end, undefined, group);
break;
// For individual lines, output each entry with context lines
default:
for (const entry of this.entries)
block(entry.line - contextLines, entry.line + contextLines, entry.line, entry);
break;
}
// Output the final diff as a string (ASCII or CLI colored)
return out.join(lineBreak);
}
/**
* Returns the structured diff as an array of DiffLine objects.
*
* @returns {DiffLine[]} - Array of line-level diffs
*/
getStructuredDiff() { return this.entries; }
/**
* Returns the grouped diff as an array of DiffGroup objects.
*
* @returns {DiffGroup[]} - Array of grouped diffs
*/
getGroupedDiff() { return this.grouped; }
/**
* Returns the unified diff as a plain ASCII string.
*
* @returns {string} - Unified diff (ASCII)
*/
getASCIIDiff() { return this.output(false); }
/**
* Returns the unified diff as a CLI-colored string.
*
* @returns {string} - Unified diff (CLI colors)
*/
getCLIDiff() { return this.output(true); }
}
/**
* Hash Table Utility
* src/utils/HashTable.ts
*
* @see https://en.wikipedia.org/wiki/Fowler–Noll–Vo_hash_function
* @see https://en.wikipedia.org/wiki/Hash_table
*
* This module implements an instantiable hash table/cache using the FNV-1a hash algorithm.
* It allows for multiple independent caches (e.g. for metrics, normalization, etc.) with
* type safety and high performance. The FNV-1a algorithm is factored out into its own
* static utility class to avoid code duplication and memory overhead.
*
* The key() method accepts an array containing any number of strings, enabling flexible
* cache keys for different use cases (e.g. normalization, metrics, etc.).
*
* @module Utils/HashTable
* @author Paul Köhler (komed3)
* @license MIT
*/
/**
* Hasher Utility
* Static class for FNV-1a hash calculation.
*/
class Hasher {
    // Constants for the FNV-1a hash algorithm (32-bit prime and offset basis)
    static FNV_PRIME = 0x01000193;
    static HASH_OFFSET = 0x811c9dc5;
    /**
     * Computes a hash value for a given string using an FNV-1a style xor/multiply loop.
     *
     * The string is consumed two UTF-16 code units at a time. Two 16-bit code units
     * fit into one 32-bit chunk without overlapping bits, so distinct pairs always
     * yield distinct chunks (packing more code units per chunk would let the bits of
     * non-Latin characters overlap and cause systematic collisions).
     *
     * All multiplications use Math.imul so they are exact 32-bit integer products;
     * a plain `*` would exceed 2^53 and silently lose the low bits.
     *
     * The constants are read from `Hasher` directly rather than `this`, so the method
     * also works when it is passed around as an unbound callback.
     *
     * @param {string} str - The string to hash
     * @return {number} - The computed hash value as an unsigned 32-bit integer
     */
    static fnv1a(str) {
        const len = str.length;
        let hash = Hasher.HASH_OFFSET;
        // Number of code units that can be consumed in complete pairs
        const paired = len - (len % 2);
        for (let pos = 0; pos < paired; pos += 2) {
            // Pack two code units into one 32-bit chunk (low half | high half)
            hash ^= str.charCodeAt(pos) | (str.charCodeAt(pos + 1) << 16);
            hash = Math.imul(hash, Hasher.FNV_PRIME);
        }
        // Handle the trailing code unit of odd-length strings
        if (paired < len) {
            hash ^= str.charCodeAt(paired);
            hash = Math.imul(hash, Hasher.FNV_PRIME);
        }
        // Final mixing (xor-shift / multiply rounds) to improve distribution
        hash ^= hash >>> 16;
        hash = Math.imul(hash, 0x85ebca6b);
        hash ^= hash >>> 13;
        hash = Math.imul(hash, 0xc2b2ae35);
        hash ^= hash >>> 16;
        // Convert to unsigned 32-bit integer
        return hash >>> 0;
    }
}
/**
* HashTable class implements an instantiable hash table/cache.
* Allows for multiple independent caches with type safety and high performance.
*
* @template K - The type of the label for the key (e.g. string, MetricName, …)
* @template T - The type of value to be stored in the hash table (e.g. MetricCompute, string, …)
*/
class HashTable {
    // The max. length of a single string accepted by key(), set to 2048 characters.
    static MAX_LEN = 2048;
    // The max. number of entries held by the hash table, set to 10,000.
    static TABLE_SIZE = 10_000;
    /**
     * The internal map to store entries.
     * The key is a string generated from the label and any number of hashed strings.
     * The value is of type T.
     */
    table = new Map();
    /**
     * Generates a hash key for a label and any number of strings.
     * The key is in the format "label-H1-H2-H3-..."
     *
     * @param {K} label - Label for this key (e.g. metric name, normalization flags, …)
     * @param {string[]} strs - Array of strings to hash (e.g. input, params, …)
     * @param {boolean} [sorted=false] - Whether to sort the hashes before creating the key,
     *   which makes the key independent of the order of `strs`
     * @returns {string|false} - The hash key or false if any string is too long
     */
    key(label, strs, sorted = false) {
        // Return false if any string exceeds the maximum length
        for (const str of strs) {
            if (str.length > HashTable.MAX_LEN)
                return false;
        }
        // Hash all strings
        const hashes = strs.map(s => Hasher.fnv1a(s));
        // Sort them in ascending numeric order; the explicit comparator is required
        // because the default sort compares numbers as strings
        if (sorted)
            hashes.sort((a, b) => a - b);
        // Build key: label-H1-H2-H3-...
        return [label, ...hashes].join('-');
    }
    /**
     * Checks if a key exists in the hash table.
     *
     * @param {string} key - The key to check
     * @returns {boolean} - True if the key exists, false otherwise
     */
    has(key) { return this.table.has(key); }
    /**
     * Retrieves the entry from the hash table by its key.
     *
     * @param {string} key - The key to look up
     * @returns {T|undefined} - The entry if found, undefined otherwise
     */
    get(key) { return this.table.get(key); }
    /**
     * Adds an entry to the hash table.
     *
     * A new key is only accepted while the table holds fewer than TABLE_SIZE entries.
     * An existing key is overwritten when `update` is true, even if the table is full,
     * because overwriting does not grow the table.
     *
     * @param {string} key - The hashed key for the entry
     * @param {T} entry - The entry itself to add
     * @param {boolean} [update=true] - Whether to update the entry if it already exists
     * @returns {boolean} - True if the entry was stored, false if the table is full
     *   (for new keys) or the key exists and `update` is false
     */
    set(key, entry, update = true) {
        const exists = this.table.has(key);
        // Existing keys depend on the update flag only; new keys depend on capacity only
        const allowed = exists ? update : this.table.size < HashTable.TABLE_SIZE;
        if (!allowed)
            return false;
        this.table.set(key, entry);
        return true;
    }
    /**
     * Deletes an entry from the hash table by its key.
     *
     * @param {string} key - The key of the entry to delete
     */
    delete(key) { this.table.delete(key); }
    /**
     * Clears the hash table.
     * This method removes all entries from the hash table.
     */
    clear() { this.table.clear(); }
    /**
     * Returns the current size of the hash table.
     *
     * @returns {number} - The number of entries in the hash table
     */
    size() { return this.table.size; }
}
/**
* Normalizer Utility
* src/utils/Normalizer.ts
*
* @see https://en.wikipedia.org/wiki/Text_normalization
* @see https://en.wikipedia.org/wiki/Unicode_equivalence
*
* This module provides a