UNPKG

cmpstr

Version:

CmpStr is a lightweight, fast, and well-performing package for calculating string similarity

github.com/komed3/cmpstr

1,304 lines (1,299 loc) • 195 kB

JavaScript

/**
 * CmpStr v3.0.1 dev-052fa0c-250614
 * This is a lightweight, fast and well performing library for calculating string similarity.
 * (c) 2023-2025 Paul Köhler @komed3 / MIT License
 * Visit https://github.com/komed3/cmpstr and https://npmjs.org/package/cmpstr
 */

/**
 * Deep Merge Utility
 * src/utils/DeepMerge.ts
 *
 * This module provides utility functions for deep merging objects, getting values by path,
 * and setting values by path in a deeply nested object structure.
 *
 * Paths use dot and bracket notation (e.g. `a.b[0].c`).
 *
 * Included functions:
 *  - `get`: Retrieve a deeply nested value by path
 *  - `set`: Assign a value to a nested path
 *  - `merge`: Deeply merge two objects
 *  - `rmv`: Delete a value at a path
 *
 * @module Utils/DeepMerge
 * @author Paul Köhler
 * @license MIT
 */

/**
 * Parse a path string into an array of keys.
 *
 * Bracketed indices (`[0]`) are rewritten to dot segments first; segments made up
 * of digits only are converted to numbers.
 *
 * @param {string} p - The path string, e.g. `a.b.c` or `a[0].b`
 * @returns {(string|number)[]} - An array of keys, e.g. `['a', 'b', 'c']` or `['a', 0, 'b']`
 */
const parse = (p) => p
    .replace(/\[(\d+)]/g, '.$1')
    .split('.')
    .map((s) => (/^\d+$/.test(s) ? +s : s));

/**
 * Deeply get a value from an object by a path string.
 *
 * The fallback is applied once, to the final result. Applying it at every step
 * would make later keys index into the fallback itself (e.g. reading `.length`
 * of a string fallback).
 *
 * @template T - The type of the object to get the value from
 * @param {T} t - The object to get the value from
 * @param {string} path - The path string, e.g. `a.b.c`
 * @param {any} fallback - The value to return if the path resolves to `null` or `undefined`
 * @returns {any} - The value at the specified path, otherwise the fallback
 */
function get(t, path, fallback) {
    const value = parse(path).reduce((o, k) => o?.[k], t);

    return value ?? fallback;
}

/**
 * Deeply set a value in an object by a path string.
 *
 * Missing intermediate containers are created on the fly: an array when the next
 * key is numeric, a prototype-less object otherwise.
 *
 * @template T - The type of the object to set the value in
 * @param {T} t - The object to set the value in
 * @param {string} path - The path string, e.g. `a.b.c`
 * @param {any} value - The value to set at the specified path
 * @returns {T} - The modified object with the value set at the specified path
 * @throws {Error} - If a segment of the path points into a non-object value, or
 *                   if the path contains `__proto__` or `constructor`
 */
function set(t, path, value) {
    // An empty path means the recursion has arrived at the target position
    if (path === '') return value;

    // Split the path into the first key and the rest of the path
    const [k, ...r] = parse(path);

    // Primitives and null cannot carry nested properties
    if (t !== undefined && (typeof t !== 'object' || t === null)) {
        throw new Error(`cannot set property <${k}> of <${JSON.stringify(t)}>`);
    }

    // Same guard as in `merge`: walking through these keys would write to shared prototypes
    if (k === '__proto__' || k === 'constructor') {
        throw new Error(`cannot set unsafe property <${k}>`);
    }

    // Assign the (recursively built) value to the key, creating the container if needed
    return Object.assign(t ?? (typeof k === 'number' ? [] : Object.create(null)), {
        [k]: set(t?.[k], r.join('.'), value)
    });
}

/**
 * Deeply merge two objects, where the second object overrides the first.
 *
 * Non-array objects are merged recursively; arrays, primitives and `null` are
 * copied over as they are. The target object is modified in place.
 *
 * @template T - The type of the objects to merge
 * @param {T} t - The target object to merge into
 * @param {T} o - The source object to merge from
 * @param {boolean} [mergeUndefined=false] - Whether to merge undefined values (applies to all levels)
 * @returns {T} - The merged (target) object
 */
function merge(t = Object.create(null), o = Object.create(null), mergeUndefined = false) {
    // `null` bypasses default parameters, so normalize it here
    const target = t ?? Object.create(null);

    // `typeof null` is 'object', so null has to be excluded explicitly
    const isRecord = (v) => v !== null && typeof v === 'object' && !Array.isArray(v);

    for (const k of Object.keys(o ?? {})) {
        const val = o[k];

        // If the value is undefined and mergeUndefined is false, skip it
        if (!mergeUndefined && val === undefined) continue;

        // Skip dangerous property names to prevent prototype pollution
        if (k === '__proto__' || k === 'constructor') continue;

        // Merge nested records recursively, everything else overrides the target value
        target[k] = isRecord(val)
            ? merge(isRecord(target[k]) ? target[k] : Object.create(null), val, mergeUndefined)
            : val;
    }

    return target;
}
/**
 * Delete a value at a specified path in an object.
 *
 * Paths containing `__proto__` or `constructor` are ignored, so that a deletion
 * can never reach into shared prototypes (same guard as in `merge`).
 *
 * @template T - The type of the object to delete the value from
 * @param {T} t - The object to delete the value from
 * @param {string} path - The path string, e.g. `a.b.c`
 * @param {boolean} [preserveEmpty=false] - Whether to preserve empty objects/arrays
 * @returns {T} - The modified object with the value deleted at the specified path
 */
function rmv(t, path, preserveEmpty = false) {
    const keys = parse(path);

    // Never walk through prototype-related keys
    if (keys.some((k) => k === '__proto__' || k === 'constructor')) return t;

    // Recursively descend along the keys; returns whether the deletion took place
    const remove = (o, i) => {
        // Nothing can be deleted from primitives, null or undefined
        if (!o || typeof o !== 'object') return false;

        const key = keys[i];

        // The last key of the path is the one to delete
        if (i === keys.length - 1) return delete o[key];

        if (!remove(o[key], i + 1)) return false;

        // Unless empty containers are preserved, drop the child if it became empty
        if (!preserveEmpty) {
            const val = o[key];

            if (typeof val === 'object' && (
                (Array.isArray(val) && val.every((v) => v == null)) ||
                (!Array.isArray(val) && Object.keys(val).length === 0)
            )) delete o[key];
        }

        return true;
    };

    remove(t, 0);

    return t;
}

/**
 * Profiler Utility
 * src/utils/profiler.ts
 *
 * @see https://en.wikipedia.org/wiki/Profiling_(computer_programming)
 *
 * This class provides methods to run synchronous and asynchronous functions, capturing
 * their execution time and memory usage, and storing the results in a set of profiler
 * entries. It supports both Node.js and browser environments, detecting the environment
 * automatically.
 *
 * @module Utils/Profiler
 * @author Paul Köhler (komed3)
 * @license MIT
 */

/**
 * Profiler class for measuring execution time and memory usage of functions.
 */
class Profiler {

    // Detected environment: 'nodejs', 'browser' or 'unknown'
    static ENV;

    // Singleton instance
    static instance;

    // Store for profiler entries
    store = new Set();

    // Total time and memory consumption
    totalTime = 0;
    totalMem = 0;

    // The Profiler active state
    active;

    /**
     * Detects the environment and sets the ENV property accordingly.
     *
     * Node.js is only assumed if the APIs used by `now()` and `mem()` exist. A bare
     * `process` global is not sufficient, since bundlers may shim it in browsers.
     */
    static detectEnv() {
        const hasNodeApis = typeof process !== 'undefined' &&
            typeof process.hrtime?.bigint === 'function' &&
            typeof process.memoryUsage === 'function';

        if (hasNodeApis) Profiler.ENV = 'nodejs';
        else if (typeof performance !== 'undefined' && typeof performance.now === 'function') Profiler.ENV = 'browser';
        else Profiler.ENV = 'unknown';
    }

    /**
     * Returns the singleton instance of the Profiler class.
     * If the instance does not exist, it creates a new one.
     *
     * @param {boolean} [enable=false] - Enables the profiler upon instantiation;
     *                                   ignored if the instance already exists
     * @returns {Profiler} - Singleton Profiler instance
     */
    static getInstance(enable) {
        // Ensure the environment is detected
        if (!Profiler.ENV) Profiler.detectEnv();

        // If instance does not exist, create a new one
        if (!Profiler.instance) Profiler.instance = new Profiler(enable);

        return Profiler.instance;
    }

    /**
     * Creates a profiler. Use `Profiler.getInstance()` to obtain the shared instance.
     *
     * @param {boolean} [enable=false] - Optional parameter to enable the profiler
     */
    constructor(enable) {
        this.active = enable ?? false;
    }

    /**
     * Gets the current time based on the environment.
     *
     * Uses process.hrtime.bigint() for Node.js, performance.now() for browsers,
     * and Date.now() as a fallback.
     *
     * @returns {number} - Current time in milliseconds
     */
    now() {
        switch (Profiler.ENV) {
            // hrtime is reported in nanoseconds
            case 'nodejs': return Number(process.hrtime.bigint()) / 1e6;
            case 'browser': return performance.now();
            default: return Date.now();
        }
    }

    /**
     * Gets the current memory usage based on the environment.
     *
     * Uses process.memoryUsage().heapUsed for Node.js, performance.memory.usedJSHeapSize
     * for browsers (where available), and returns 0 as a fallback.
     *
     * @returns {number} - Current memory usage in bytes
     */
    mem() {
        switch (Profiler.ENV) {
            case 'nodejs': return process.memoryUsage().heapUsed;
            case 'browser': return performance.memory?.usedJSHeapSize ?? 0;
            default: return 0;
        }
    }

    /**
     * Enables the profiler, allowing profiling to occur.
     */
    enable() {
        this.active = true;
    }

    /**
     * Disables the profiler, preventing further profiling.
     */
    disable() {
        this.active = false;
    }

    /**
     * Resets the profiler by clearing the store, total time and memory consumption.
     */
    clear() {
        this.store.clear();
        this.totalTime = 0;
        this.totalMem = 0;
    }

    /**
     * Runs a synchronous function and profiles its execution time and memory usage.
     * If the profiler is not active, it simply executes the function without profiling.
     * No entry is recorded if the function throws.
     *
     * @param {() => T} fn - Function to be executed and profiled
     * @param {Record<string, any>} meta - Metadata to be associated with the profiling entry
     * @returns {T} - The result of the executed function
     */
    run(fn, meta = {}) {
        if (!this.active) return fn();

        // Capture the start time and memory usage
        const startTime = this.now();
        const startMem = this.mem();

        const res = fn();

        // Calculate the time and memory consumption
        const deltaTime = this.now() - startTime;
        const deltaMem = this.mem() - startMem;

        // Add the profiling entry to the store and update the totals
        this.store.add({ time: deltaTime, mem: deltaMem, res, meta });
        this.totalTime += deltaTime;
        this.totalMem += deltaMem;

        return res;
    }

    /**
     * Runs an asynchronous function and profiles its execution time and memory usage.
     * If the profiler is not active, it simply executes the function without profiling.
     * No entry is recorded if the function rejects.
     *
     * @param {() => Promise<T>} fn - Asynchronous function to be executed and profiled
     * @param {Record<string, any>} meta - Metadata to be associated with the profiling entry
     * @returns {Promise<T>} - A promise that resolves to the result of the executed function
     */
    async runAsync(fn, meta = {}) {
        if (!this.active) return await fn();

        // Capture the start time and memory usage
        const startTime = this.now();
        const startMem = this.mem();

        const res = await fn();

        // Calculate the time and memory consumption
        const deltaTime = this.now() - startTime;
        const deltaMem = this.mem() - startMem;

        // Add the profiling entry to the store and update the totals
        this.store.add({ time: deltaTime, mem: deltaMem, res, meta });
        this.totalTime += deltaTime;
        this.totalMem += deltaMem;

        return res;
    }

    /**
     * Retrieves all profiler entries stored in the profiler.
     *
     * @returns {ProfilerEntry<any>[]} - An array of profiler entries (in insertion order)
     */
    getAll() {
        return [...this.store];
    }

    /**
     * Retrieves the last profiler entry stored in the profiler.
     *
     * @returns {ProfilerEntry<any> | undefined} - The last profiler entry or undefined if no entries exist
     */
    getLast() {
        return this.getAll().pop();
    }

    /**
     * Retrieves the total time and memory consumption recorded by the profiler.
     *
     * @returns {{ time: number, mem: number }} - An object containing total time and memory usage
     */
    getTotal() {
        return { time: this.totalTime, mem: this.totalMem };
    }

    /**
     * The services provided by the Profiler class, bound to this instance so they
     * can be handed out and called without the profiler object itself.
     *
     * @type {ProfilerService<any>}
     */
    services = {
        enable: this.enable.bind(this),
        disable: this.disable.bind(this),
        clear: this.clear.bind(this),
        report: this.getAll.bind(this),
        last: this.getLast.bind(this),
        total: this.getTotal.bind(this)
    };

}

/**
 * TextAnalyzer Utility
 * src/utils/TextAnalyzer.ts
 *
 * The TextAnalyzer class provides a comprehensive set of methods for analyzing and
 * extracting statistics from a given text. It supports word and sentence tokenization,
 * character and word frequency analysis, syllable estimation, readability metrics
 * (Flesch, Kincaid, LIX, WSTF), and various ratios and histograms.
 *
 * @module Utils/TextAnalyzer
 * @author Paul Köhler (komed3)
 * @license MIT
 */
class TextAnalyzer {

    // The (trimmed) text to analyze
    text;

    // Tokenized words (lowercased) and sentences
    words = [];
    sentences = [];

    // Frequency maps for characters and words, cache for syllable estimates
    charFrequency = new Map();
    wordHistogram = new Map();
    syllableCache = new Map();

    /**
     * Constructs a new TextAnalyzer instance with the provided input text.
     *
     * @param {string} input - The text to analyze
     */
    constructor(input) {
        this.text = input.trim();
        this.tokenize();
        this.computeFrequencies();
    }

    /**
     * Tokenizes the input text into words and sentences.
     */
    tokenize() {
        this.words = [];
        this.sentences = [];

        // Words are runs of Unicode letters, stored in lower case
        for (const match of this.text.matchAll(/\p{L}+/gu)) {
            this.words.push(match[0].toLowerCase());
        }

        // Sentences end at `.`, `!` or `?` followed by whitespace
        this.sentences = this.text.split(/(?<=[.!?])\s+/).filter(Boolean);
    }

    /**
     * Computes character and word frequencies from the tokenized text.
     * The maps are reset first, so calling this method again does not double the counts.
     */
    computeFrequencies() {
        this.charFrequency.clear();
        this.wordHistogram.clear();

        // Character frequencies (iterating by code point)
        for (const char of this.text) {
            this.charFrequency.set(char, (this.charFrequency.get(char) ?? 0) + 1);
        }

        // Word frequencies
        for (const word of this.words) {
            this.wordHistogram.set(word, (this.wordHistogram.get(word) ?? 0) + 1);
        }
    }

    /**
     * Estimates the number of syllables in a word by counting its vowel groups.
     * Only the letters a-z, ä, ö, ü and ß are taken into account.
     *
     * @param {string} word - The word to estimate syllables for
     * @returns {number} - Estimated syllable count (at least 1)
     */
    estimateSyllables(word) {
        // Check cache first to avoid redundant calculations
        if (this.syllableCache.has(word)) return this.syllableCache.get(word);

        // Normalize the word: lowercase and remove all other characters
        const clean = word.toLowerCase().replace(/[^a-zäöüß]/g, '');
        const matches = clean.match(/[aeiouyäöü]+/g);

        // Each vowel group counts as one syllable
        const count = matches ? matches.length : 1;

        this.syllableCache.set(word, count);

        return count;
    }

    /**
     * Gets the text length in characters (UTF-16 code units).
     *
     * @return {number} - Length of the text
     */
    getLength() {
        return this.text.length;
    }

    /**
     * Gets the number of words in the text.
     *
     * @return {number} - Count of words
     */
    getWordCount() {
        return this.words.length;
    }

    /**
     * Gets the number of sentences in the text.
     *
     * @return {number} - Count of sentences
     */
    getSentenceCount() {
        return this.sentences.length;
    }

    /**
     * Gets the average word length in the text.
     *
     * @return {number} - Average length of words, 0 if there are no words
     */
    getAvgWordLength() {
        let totalLen = 0;

        for (const w of this.words) totalLen += w.length;

        return this.words.length ? totalLen / this.words.length : 0;
    }

    /**
     * Gets the average sentence length in words.
     *
     * @return {number} - Average length of sentences, 0 if there are no sentences
     */
    getAvgSentenceLength() {
        return this.sentences.length ? this.words.length / this.sentences.length : 0;
    }

    /**
     * Gets a histogram of word frequencies in the text.
     *
     * @returns {Record<string, number>} - A histogram of word frequencies
     */
    getWordHistogram() {
        return Object.fromEntries(this.wordHistogram);
    }

    /**
     * Gets the most common words in the text, limited to a specified number.
     *
     * @param {number} [limit=5] - Maximum number of common words to return
     * @returns {string[]} - Array of the most common words, most frequent first
     */
    getMostCommonWords(limit = 5) {
        return [...this.wordHistogram.entries()]
            .sort((a, b) => b[1] - a[1])
            .slice(0, limit)
            .map((e) => e[0]);
    }

    /**
     * Gets the hapax legomena, i.e. the words that occur only once in the text.
     *
     * @returns {string[]} - Array of hapax legomena
     */
    getHapaxLegomena() {
        return [...this.wordHistogram.entries()]
            .filter(([, c]) => c === 1)
            .map((e) => e[0]);
    }

    /**
     * Checks if the text contains any digits.
     *
     * @returns {boolean} - True if digits are present, false otherwise
     */
    hasNumbers() {
        return /\d/.test(this.text);
    }

    /**
     * Calculates the ratio of uppercase letters to total letters in the text.
     * Only the letters A-Z, a-z, umlauts and ß are counted.
     *
     * @return {number} - Ratio of uppercase letters to total letters
     */
    getUpperCaseRatio() {
        let upper = 0;
        let letters = 0;

        for (let i = 0, len = this.text.length; i < len; i++) {
            const c = this.text[i];

            if (/[A-Za-zÄÖÜäöüß]/.test(c)) {
                letters++;

                if (/[A-ZÄÖÜ]/.test(c)) upper++;
            }
        }

        return letters ? upper / letters : 0;
    }

    /**
     * Gets the frequency of each character in the text.
     *
     * @returns {Record<string, number>} - A record of character frequencies
     */
    getCharFrequency() {
        return Object.fromEntries(this.charFrequency);
    }

    /**
     * Gets the character frequencies keyed by Unicode code point.
     *
     * Keys are the code points as upper case hex strings, padded to at least four
     * digits (e.g. `0041` for `A`). `codePointAt` is used because the frequency map
     * is keyed by whole code points; `charCodeAt` would map every astral character
     * to its high surrogate.
     *
     * @returns {Record<string, number>} - A record of code point frequencies
     */
    getUnicodeStats() {
        const result = {};

        for (const [char, count] of this.charFrequency) {
            const codePoint = char.codePointAt(0)
                .toString(16)
                .padStart(4, '0')
                .toUpperCase();

            result[codePoint] = (result[codePoint] ?? 0) + count;
        }

        return result;
    }

    /**
     * Gets the ratio of long words (words with length >= len) to total words.
     *
     * @param {number} [len=7] - Minimum length for a word to be considered long
     * @returns {number} - Ratio of long words to total words
     */
    getLongWordRatio(len = 7) {
        let long = 0;

        for (const w of this.words) if (w.length >= len) long++;

        return this.words.length ? long / this.words.length : 0;
    }

    /**
     * Gets the ratio of short words (words with length <= len) to total words.
     *
     * @param {number} [len=3] - Maximum length for a word to be considered short
     * @returns {number} - Ratio of short words to total words
     */
    getShortWordRatio(len = 3) {
        let short = 0;

        for (const w of this.words) if (w.length <= len) short++;

        return this.words.length ? short / this.words.length : 0;
    }

    /**
     * Estimates the number of syllables in the text.
     *
     * @returns {number} - Total estimated syllable count
     */
    getSyllablesCount() {
        let count = 0;

        for (const w of this.words) count += this.estimateSyllables(w);

        return count;
    }

    /**
     * Gets the number of monosyllabic words (words with exactly one syllable).
     *
     * @returns {number} - Count of monosyllabic words
     */
    getMonosyllabicWordCount() {
        let count = 0;

        for (const w of this.words) if (this.estimateSyllables(w) === 1) count++;

        return count;
    }

    /**
     * Gets the number of words with at least a specified minimum syllable count.
     *
     * @param {number} min - Minimum syllable count for a word to be included
     * @returns {number} - Count of words meeting the syllable criteria
     */
    getMinSyllablesWordCount(min) {
        let count = 0;

        for (const w of this.words) if (this.estimateSyllables(w) >= min) count++;

        return count;
    }

    /**
     * Gets the number of words with at most a specified maximum syllable count.
     *
     * @param {number} max - Maximum syllable count for a word to be included
     * @returns {number} - Count of words meeting the syllable criteria
     */
    getMaxSyllablesWordCount(max) {
        let count = 0;

        for (const w of this.words) if (this.estimateSyllables(w) <= max) count++;

        return count;
    }

    /**
     * Calculates the Honore's R statistic for the text as a measure of lexical richness.
     *
     * Computed as `100 * ln(N) / (1 - V1 / V)` with N words, V distinct words and
     * V1 hapax legomena. Returns 0 for a text without words. The result is not
     * finite if every distinct word occurs exactly once (the denominator is 0).
     *
     * @returns {number} - The Honore's R statistic
     */
    getHonoresR() {
        const total = this.words.length;

        if (total === 0) return 0;

        // `size` is never nullish, so `||` (not `??`) is needed to guard against 0
        const hapaxShare = this.getHapaxLegomena().length / (this.wordHistogram.size || 1);

        return (100 * Math.log(total)) / (1 - hapaxShare);
    }

    /**
     * Estimates the reading time for the text based on words per minute (WPM).
     *
     * @param {number} [wpm=200] - Words per minute for the calculation; 0 is treated as 1
     * @returns {number} - Estimated reading time in minutes (at least 1)
     */
    getReadingTime(wpm = 200) {
        return Math.max(1, this.words.length / (wpm || 1));
    }

    /**
     * Calculates various readability scores based on the text.
     *
     * Supported metrics:
     *  - `flesch`: Flesch Reading Ease
     *  - `fleschde`: Flesch Reading Ease for German texts
     *  - `kincaid`: Flesch-Kincaid Grade Level
     *
     * @param {'flesch'|'fleschde'|'kincaid'} [metric='flesch'] - The readability metric to calculate
     * @returns {number} - The calculated readability score (undefined for an unknown metric)
     */
    getReadabilityScore(metric = 'flesch') {
        // Counts fall back to 1 to avoid divisions by zero
        const w = this.words.length || 1;
        const s = this.sentences.length || 1;
        const y = this.getSyllablesCount() || 1;

        // Average sentence length (words) and average syllables per word
        const asl = w / s;
        const asw = y / w;

        switch (metric) {
            case 'flesch': return 206.835 - (1.015 * asl) - (84.6 * asw);
            case 'fleschde': return 180 - asl - (58.5 * asw);
            case 'kincaid': return (0.39 * asl) + (11.8 * asw) - 15.59;
            default: return undefined;
        }
    }

    /**
     * Calculates the LIX (Lesbarhetsindex) score for the text.
     *
     * The score is the average sentence length in words plus the percentage of
     * long words (default threshold of `getLongWordRatio`).
     *
     * @returns {number} - The LIX score
     */
    getLIXScore() {
        const w = this.words.length || 1;
        const s = this.sentences.length || 1;
        const l = this.getLongWordRatio() * w;

        return (w / s) + (l / w * 100);
    }

    /**
     * Calculates the Wiener Sachtextformel (WSTF) scores for the text.
     *
     * @returns {[number, number, number, number]} - The four WSTF scores
     */
    getWSTFScore() {
        const w = this.words.length || 1;

        // Percentage of words with three or more syllables
        const h = this.getMinSyllablesWordCount(3) / w * 100;
        // Average sentence length in words
        const s = this.getAvgSentenceLength();
        // Percentage of long words
        const l = this.getLongWordRatio() * 100;
        // Percentage of monosyllabic words
        const m = this.getMonosyllabicWordCount() / w * 100;

        return [
            0.1935 * h + 0.1672 * s + 0.1297 * l - 0.0327 * m - 0.8750,
            0.2007 * h + 0.1682 * s + 0.1373 * l - 2.7790,
            0.2963 * h + 0.1905 * s - 1.1144,
            0.2744 * h + 0.2656 * s - 1.6930
        ];
    }

}
/**
 * DiffChecker Utility
 * src/utils/DiffChecker.ts
 *
 * The DiffChecker class provides a utility for comparing two texts and extracting
 * their differences (full lines or word mode). It supports context-aware grouping
 * of changes, unified diff output (with CLI color or ASCII markup), and change
 * magnitude metrics.
 *
 * Features:
 *  - Line and word-based diffing
 *  - Case-insensitive comparison option
 *  - Context lines and grouping of adjacent changes
 *  - Unified diff output (ASCII or colored CLI)
 *  - Highlighting of changed segments within lines
 *  - Change magnitude calculation (relative to group or line)
 *  - Expand-all mode for full file context
 *
 * @module Utils/DiffChecker
 * @author Paul Köhler (komed3)
 * @license MIT
 */

/**
 * The DiffChecker class provides methods to compare two texts and generate
 * structured diffs, grouped diffs, and unified diff outputs.
 */
class DiffChecker {

    // Original input texts and options
    a;
    b;
    options;

    // Computed diff entries and groups
    entries = [];
    grouped = [];

    // Flag to indicate if the diff has already been computed
    diffRun = false;

    /**
     * Constructs a new DiffChecker instance for comparing two texts.
     * The diff is computed immediately.
     *
     * @param {string} a - The first (original) text
     * @param {string} b - The second (modified) text
     * @param {DiffOptions} [opt] - Optional diff configuration
     */
    constructor(a, b, opt = {}) {
        this.a = a;
        this.b = b;

        // Merge default with user-provided options
        this.options = {
            mode: 'word',
            caseInsensitive: false,
            contextLines: 1,
            groupedLines: true,
            expandLines: false,
            showChangeMagnitude: true,
            maxMagnitudeSymbols: 5,
            lineBreak: '\n',
            ...opt
        };

        this.computeDiff();
    }

    /**
     * Splits both (trimmed) input texts into arrays of lines and returns them
     * with the maximum line count.
     *
     * @returns {{ linesA: string[], linesB: string[], maxLen: number }}
     */
    text2lines() {
        const linesA = this.a.trim().split(/\r?\n/);
        const linesB = this.b.trim().split(/\r?\n/);

        return { linesA, linesB, maxLen: Math.max(linesA.length, linesB.length) };
    }

    /**
     * Tokenizes a string according to the current diff mode.
     *
     * Every mode other than `line` is tokenized by words, which matches how
     * `lineDiff` selects the diff strategy.
     *
     * @param {string} input - The string to tokenize
     * @returns {string[]} - Array of tokens
     */
    tokenize(input) {
        return this.options.mode === 'line' ? [input] : input.split(/\s+/);
    }

    /**
     * Concatenates an array of tokens back into a string, respecting the diff mode.
     *
     * @param {string[]} input - Array of tokens
     * @returns {string} - Concatenated string (word tokens are joined by a single space)
     */
    concat(input) {
        return input.join(this.options.mode === 'line' ? '' : ' ');
    }

    /**
     * Computes the diff between the two input texts and populates the
     * entries and grouped arrays. Runs only once per instance.
     */
    computeDiff() {
        if (this.diffRun) return;

        const { linesA, linesB, maxLen } = this.text2lines();

        // Compare the texts line by line; missing lines count as empty
        for (let i = 0; i < maxLen; i++) {
            this.lineDiff(linesA[i] || '', linesB[i] || '', i);
        }

        // Find groups of adjacent changes
        this.findGroups();

        this.diffRun = true;
    }

    /**
     * Compares two lines and records their differences at the configured granularity.
     *
     * @param {string} a - Line from the first text
     * @param {string} b - Line from the second text
     * @param {number} line - Zero-based line number
     */
    lineDiff(a, b, line) {
        const { mode, caseInsensitive } = this.options;
        const baseLen = Math.max(a.length, b.length);

        // The comparison may ignore case, the recorded texts keep it
        const A = caseInsensitive ? a.toLowerCase() : a;
        const B = caseInsensitive ? b.toLowerCase() : b;

        let diffs = [];
        let delSize = 0;
        let insSize = 0;

        if (mode === 'line') {
            // For line mode, compare the entire lines directly
            if (A !== B) {
                diffs.push({ posA: 0, posB: 0, del: a, ins: b, size: b.length - a.length });
                delSize = a.length;
                insSize = b.length;
            }
        } else {
            // For word mode, find precise diffs between tokenized lines
            diffs = this.preciseDiff(a, A, b, B);

            for (const d of diffs) {
                delSize += d.del.length;
                insSize += d.ins.length;
            }
        }

        if (diffs.length) {
            this.entries.push({
                line, diffs, delSize, insSize, baseLen,
                totalSize: insSize - delSize,
                magnitude: this.magnitude(delSize, insSize, baseLen)
            });
        }
    }

    /**
     * Finds the diff blocks between two tokenized strings, returning original
     * text and positions.
     *
     * Matching is greedy with a look-ahead of up to three tokens. Positions are
     * derived from the token lengths and assume that tokens are separated by
     * exactly one whitespace character.
     *
     * @param {string} a - Original line (case preserved)
     * @param {string} A - Original line (possibly lowercased)
     * @param {string} b - Modified line (case preserved)
     * @param {string} B - Modified line (possibly lowercased)
     * @returns {DiffEntry[]} - Array of diff entries for this line
     */
    preciseDiff(a, A, b, B) {
        // Start offset of each token: previous offset + previous length + one separator
        const posIndex = (tokens) => {
            const positions = [];
            let offset = 0;

            for (const token of tokens) {
                positions.push(offset);
                offset += token.length + 1;
            }

            return positions;
        };

        // Original tokens (for output) and comparison tokens (for matching)
        const origA = this.tokenize(a);
        const origB = this.tokenize(b);
        const tokenA = this.tokenize(A);
        const tokenB = this.tokenize(B);
        const lenA = tokenA.length;
        const lenB = tokenB.length;
        const posArrA = posIndex(origA);
        const posArrB = posIndex(origB);

        // Find matching blocks
        const matches = [];
        let ai = 0;
        let bi = 0;

        while (ai < lenA && bi < lenB) {
            if (tokenA[ai] === tokenB[bi]) {
                // Extend the match as long as tokens continue to match
                let len = 1;

                while (ai + len < lenA && bi + len < lenB && tokenA[ai + len] === tokenB[bi + len]) len++;

                matches.push({ ai, bi, len });
                ai += len;
                bi += len;
                continue;
            }

            // Look ahead for the next sync point (greedy, but avoids long tails)
            let found = false;

            for (let offset = 1; offset <= 3 && !found; offset++) {
                if (ai + offset < lenA && tokenA[ai + offset] === tokenB[bi]) {
                    // A later token in A matches the current token in B
                    matches.push({ ai: ai + offset, bi, len: 1 });
                    ai += offset + 1;
                    bi += 1;
                    found = true;
                } else if (bi + offset < lenB && tokenA[ai] === tokenB[bi + offset]) {
                    // A later token in B matches the current token in A
                    matches.push({ ai, bi: bi + offset, len: 1 });
                    ai += 1;
                    bi += offset + 1;
                    found = true;
                }
            }

            // If no sync point was found, advance both pointers by one
            if (!found) {
                ai++;
                bi++;
            }
        }

        // Emit a diff for the unmatched tokens in [fromA, toA) and [fromB, toB)
        const diffs = [];
        const emit = (fromA, toA, fromB, toB) => {
            const delArr = origA.slice(fromA, toA);
            const insArr = origB.slice(fromB, toB);

            diffs.push({
                posA: posArrA[fromA] ?? 0,
                posB: posArrB[fromB] ?? 0,
                del: this.concat(delArr),
                ins: this.concat(insArr),
                size: insArr.join('').length - delArr.join('').length
            });
        };

        // Walk through the matches and emit the gaps between them
        let i = 0;
        let j = 0;

        for (const m of matches) {
            if (i < m.ai || j < m.bi) emit(i, m.ai, j, m.bi);

            // Advance to after the match
            i = m.ai + m.len;
            j = m.bi + m.len;
        }

        // Tail diff after the last match
        if (i < lenA || j < lenB) emit(i, undefined, j, undefined);

        // Remove empty diffs
        return diffs.filter((d) => d.del.length > 0 || d.ins.length > 0);
    }

    /**
     * Groups adjacent changed lines together, including context lines,
     * and calculates group metrics.
     */
    findGroups() {
        const { contextLines } = this.options;

        // Finalize a group: sum up the metrics of its entries and store it
        const addGroup = (group, start, end) => {
            const [delSize, insSize, totalSize, baseLen] = ['delSize', 'insSize', 'totalSize', 'baseLen']
                .map((k) => group.reduce((sum, e) => sum + e[k], 0));

            this.grouped.push({
                start, end, delSize, insSize, totalSize,
                line: group[0].line,
                entries: group,
                magnitude: this.magnitude(delSize, insSize, baseLen)
            });
        };

        let group = [];
        let start = 0;
        let end = 0;

        for (const entry of this.entries) {
            // Range of lines covered by this entry including its context
            const s = Math.max(0, entry.line - contextLines);
            const e = entry.line + contextLines;

            if (!group.length || s <= end + 1) {
                // First entry, or the range touches the current group: extend it
                if (!group.length) start = s;

                end = Math.max(end, e);
                group.push(entry);
            } else {
                // Otherwise finalize the current group and start a new one
                addGroup(group, start, end);
                group = [entry];
                start = s;
                end = e;
            }
        }

        // If there is a remaining group, finalize it
        if (group.length) addGroup(group, start, end);
    }

    /**
     * Calculates the change magnitude string for a group or line.
     *
     * @param {number} del - Number of deleted characters
     * @param {number} ins - Number of inserted characters
     * @param {number} baseLen - Base length for normalization
     * @returns {string} - Magnitude string (e.g. "++-")
     */
    magnitude(del, ins, baseLen) {
        const { maxMagnitudeSymbols } = this.options;
        const total = del + ins;

        // If there are no changes or base length is zero, return empty string
        if (total === 0 || baseLen === 0) return '';

        // Number of symbols: relative change, clamped to [1, maxMagnitudeSymbols]
        const magLen = Math.min(
            maxMagnitudeSymbols,
            Math.max(Math.round(total / baseLen * maxMagnitudeSymbols), 1)
        );

        // Split the symbols according to the share of insertions
        const plus = Math.round((ins / total) * magLen);
        const minus = magLen - plus;

        return '+'.repeat(plus) + '-'.repeat(minus);
    }

    /**
     * Generates a unified diff output as a string, with optional CLI coloring.
     *
     * Lines that are empty in both texts are omitted. A line that exists in only
     * one of the texts is treated as empty in the other one.
     *
     * @param {boolean} cli - If true, use CLI colors; otherwise, ASCII markup
     * @returns {string} - Unified diff output
     */
    output(cli) {
        const { mode, contextLines, groupedLines, expandLines, showChangeMagnitude, lineBreak } = this.options;
        const { linesA, linesB, maxLen } = this.text2lines();
        const linePad = Math.max(4, maxLen.toString().length);
        const gutter = ' '.repeat(linePad);

        // Index the entries by line once instead of searching them for every output line
        const entryByLine = new Map(this.entries.map((e) => [e.line, e]));

        const out = [''];

        // Helper functions for coloring and formatting (ASCII or CLI colored)
        const highlight = (s, ansi) => (cli ? `\x1b[${ansi}m${s}\x1b[0m` : s);
        const cy = (s) => highlight(s, '36');
        const gy = (s) => highlight(s, '90');
        const gn = (s) => highlight(s, '32');
        const rd = (s) => highlight(s, '31');
        const ye = (s) => highlight(s, '33');
        const del = (s) => (cli ? `\x1b[37;41m${s}\x1b[31;49m` : `-[${s}]`);
        const ins = (s) => (cli ? `\x1b[37;42m${s}\x1b[32;49m` : `+[${s}]`);

        // Mark the changed segments of a line based on the diffs
        const mark = (text, diffs, type) => {
            // In line mode the whole line is the change, nothing to mark
            if (!diffs.length || mode === 'line') return text;

            let res = '';
            let idx = 0;

            for (const d of diffs) {
                const pos = type === 'del' ? d.posA : d.posB;
                const val = type === 'del' ? d.del : d.ins;

                // Pure insertions have nothing to mark on the deleted side and vice versa
                if (!val) continue;

                // Unchanged part before the change, then the change itself
                if (pos > idx) res += text.slice(idx, pos);

                res += (type === 'del' ? del(val) : ins(val));
                idx = pos + val.length;
            }

            // Remaining unchanged part
            return res + text.slice(idx);
        };

        // Output a header for a group or line
        const header = (e) => {
            out.push(`${gutter} ${cy(`@@ -${e.line + 1},${e.delSize} +${e.line + 1},${e.insSize} @@`)} ${showChangeMagnitude ? ye(e.magnitude) : ''}`);
        };

        // Output a single line, with diff highlighting if it is the forced line
        const line = (i, forced) => {
            // Skip lines that are missing or empty in both texts
            if (!linesA[i] && !linesB[i]) return;

            // The texts may differ in line count, so one side can be missing
            const textA = linesA[i] ?? '';
            const textB = linesB[i] ?? '';
            const entry = entryByLine.get(i);
            const lineNo = (i + 1).toString().padStart(linePad, ' ');

            if (entry && forced === i) {
                out.push(`${lineNo} ${rd(`- ${mark(textA, entry.diffs, 'del')}`)}`);
                out.push(`${gutter} ${gn(`+ ${mark(textB, entry.diffs, 'ins')}`)}`);
            } else {
                // Context line without diff
                out.push(`${lineNo} ${gy(textA)}`);
            }
        };

        // Output a block of lines with optional header, followed by an empty line
        const block = (start, end, forced, headerEntry) => {
            if (headerEntry) header(headerEntry);

            for (let i = start; i <= end; i++) line(i, forced ?? i);

            out.push('');
        };

        if (expandLines === true) {
            // Output the entire file context
            block(0, maxLen);
        } else if (groupedLines === true) {
            // Output each group with its start and end
            for (const group of this.grouped) block(group.start, group.end, undefined, group);
        } else {
            // Output each entry on its own, surrounded by context lines
            for (const entry of this.entries) {
                block(entry.line - contextLines, entry.line + contextLines, entry.line, entry);
            }
        }

        return out.join(lineBreak);
    }

    /**
     * Returns the structured diff as an array of DiffLine objects.
     *
     * @returns {DiffLine[]} - Array of line-level diffs
     */
    getStructuredDiff() {
        return this.entries;
    }

    /**
     * Returns the grouped diff as an array of DiffGroup objects.
     *
     * @returns {DiffGroup[]} - Array of grouped diffs
     */
    getGroupedDiff() {
        return this.grouped;
    }

    /**
     * Returns the unified diff as a plain ASCII string.
     *
     * @returns {string} - Unified diff (ASCII)
     */
    getASCIIDiff() {
        return this.output(false);
    }

    /**
     * Returns the unified diff as a CLI-colored string.
     *
     * @returns {string} - Unified diff (CLI colors)
     */
    getCLIDiff() {
        return this.output(true);
    }

}
/**
 * The FNV-1a algorithm is factored out into its own static utility class
 * to avoid code duplication and memory overhead.
 *
 * The key() method supports any number of string arguments, enabling flexible cache keys
 * for different use cases (e.g. normalization, metrics, etc.).
 *
 * @module Utils/HashTable
 * @author Paul Köhler (komed3)
 * @license MIT
 */

/**
 * Hasher Utility
 * Static class for FNV-1a hash calculation.
 */
class Hasher {

    // Constants for the FNV-1a hash algorithm
    static FNV_PRIME = 0x01000193;
    static HASH_OFFSET = 0x811c9dc5;

    /**
     * Computes a hash value for a given string using the FNV-1a algorithm.
     * Processes the string in chunks of 4 characters for better performance.
     *
     * All multiplications use `Math.imul`, i.e. exact 32-bit modular
     * arithmetic. A plain `*` on doubles exceeds 2^53 for these operands and
     * silently rounds away the low bits of the product before it is truncated
     * to 32 bits.
     *
     * @param {string} str - The string to hash
     * @return {number} - The computed hash value as an unsigned 32-bit integer
     */
    static fnv1a(str) {
        const len = str.length;
        const prime = this.FNV_PRIME;
        let hash = this.HASH_OFFSET;
        // Process 4 characters at a time for better performance
        const chunks = Math.floor(len / 4);
        for (let i = 0; i < chunks; i++) {
            const pos = i * 4;
            // Combine 4 UTF-16 code units into a single 32-bit number
            const chunk = (
                (str.charCodeAt(pos)) |
                (str.charCodeAt(pos + 1) << 8) |
                (str.charCodeAt(pos + 2) << 16) |
                (str.charCodeAt(pos + 3) << 24)
            );
            hash = Math.imul(hash ^ chunk, prime);
        }
        // Handle the remaining (at most 3) characters one by one
        for (let pos = chunks * 4; pos < len; pos++) {
            hash = Math.imul(hash ^ str.charCodeAt(pos), prime);
        }
        // Final mixing to improve distribution
        hash ^= hash >>> 16;
        hash = Math.imul(hash, 0x85ebca6b);
        hash ^= hash >>> 13;
        hash = Math.imul(hash, 0xc2b2ae35);
        hash ^= hash >>> 16;
        // Convert to unsigned 32-bit integer
        return hash >>> 0;
    }

}
/**
 * HashTable class implements an instantiable hash table/cache.
 * Allows for multiple independent caches with type safety and high performance.
 *
 * @template K - The type of the label for the key (e.g. string, MetricName, …)
 * @template T - The type of value to be stored in the hash table (e.g. MetricCompute, string, …)
 */
class HashTable {

    // The max. length of a string to hash, which is set to 2048 characters.
    static MAX_LEN = 2048;

    // The max. size of the hash table, which is set to 10,000.
    static TABLE_SIZE = 10_000;

    /**
     * The internal map to store entries.
     * The key is a string generated from the label and any number of hashed strings.
     * The value is of type T.
     */
    table = new Map();

    /**
     * Generates a unique hash key for any number of string arguments.
     * The key is in the format "label-H1-H2-H3-..."
     *
     * @param {K} label - Label for this key (e.g. metric name, normalization flags, …)
     * @param {string[]} strs - Array of strings to hash (e.g. input, params, …)
     * @param {boolean} [sorted=false] - Whether to sort the hashes before creating the key
     * @returns {string|false} - A unique hash key or false if any string is too long
     */
    key(label, strs, sorted = false) {
        // Return false if any string exceeds the maximum length
        if (strs.some(str => str.length > HashTable.MAX_LEN)) return false;
        // Hash all strings
        const hashes = strs.map(s => Hasher.fnv1a(s));
        // Sort them in ascending numeric order; without a comparator
        // Array#sort compares the numbers as strings (e.g. 10 before 9)
        if (sorted) hashes.sort((a, b) => a - b);
        // Build key: label-H1-H2-H3-...
        return [label, ...hashes].join('-');
    }

    /**
     * Checks if a key exists in the hash table.
     *
     * @param {string} key - The key to check
     * @returns {boolean} - True if the key exists, false otherwise
     */
    has(key) {
        return this.table.has(key);
    }

    /**
     * Retrieves the entry from the hash table by its key.
     *
     * @param {string} key - The key to look up
     * @returns {T|undefined} - The entry if found, undefined otherwise
     */
    get(key) {
        return this.table.get(key);
    }

    /**
     * Adds an entry to the hash table.
     *
     * A new key is only accepted while the table holds fewer than
     * `HashTable.TABLE_SIZE` entries. Overwriting an existing key does not grow
     * the table and is therefore allowed even when the table is full.
     *
     * @param {string} key - The hashed key for the entry
     * @param {T} entry - The entry itself to add
     * @param {boolean} [update=true] - Whether to update the entry if it already exists
     * @returns {boolean} - True if the entry was stored; false if the table is full
     *                      or the key exists and `update` is false
     */
    set(key, entry, update = true) {
        if (this.table.has(key)) {
            // Existing key: only overwrite when explicitly allowed
            if (!update) return false;
        } else if (this.table.size >= HashTable.TABLE_SIZE) {
            // New key, but no room left
            return false;
        }
        this.table.set(key, entry);
        return true;
    }

    /**
     * Deletes an entry from the hash table by its key.
     *
     * @param {string} key - The key of the entry to delete
     */
    delete(key) {
        this.table.delete(key);
    }

    /**
     * Clears the hash table.
     * This method removes all entries from the hash table.
     */
    clear() {
        this.table.clear();
    }

    /**
     * Returns the current size of the hash table.
     *
     * @returns {number} - The number of entries in the hash table
     */
    size() {
        return this.table.size;
    }

}