UNPKG

cmpstr

Version:

CmpStr is a lightweight, fast, and well-performing package for calculating string similarity.

664 lines (660 loc) 22.2 kB
// CmpStr v3.0.1 dev-052fa0c-250614 by Paul Köhler @komed3 / MIT License
'use strict';

var DeepMerge = require('./utils/DeepMerge.cjs');
var Profiler = require('./utils/Profiler.cjs');
var TextAnalyzer = require('./utils/TextAnalyzer.cjs');
var DiffChecker = require('./utils/DiffChecker.cjs');
var Normalizer = require('./utils/Normalizer.cjs');
var Filter = require('./utils/Filter.cjs');
var Registry = require('./utils/Registry.cjs');
require('./metric/Cosine.cjs');
require('./metric/DamerauLevenshtein.cjs');
require('./metric/DiceSorensen.cjs');
require('./metric/Hamming.cjs');
require('./metric/Jaccard.cjs');
require('./metric/JaroWinkler.cjs');
require('./metric/LCS.cjs');
require('./metric/Levenshtein.cjs');
require('./metric/NeedlemanWunsch.cjs');
require('./metric/qGram.cjs');
require('./metric/SmithWaterman.cjs');
var Metric = require('./metric/Metric.cjs');
require('./phonetic/Cologne.cjs');
require('./phonetic/Metaphone.cjs');
require('./phonetic/Soundex.cjs');
var Phonetic = require('./phonetic/Phonetic.cjs');

/**
 * CmpStr Main API
 * src/CmpStr.ts
 *
 * The CmpStr class provides a comprehensive, highly abstracted, and type-safe interface
 * for string comparison, similarity measurement, phonetic indexing, filtering, normalization,
 * and text analysis. It unifies all core features of the CmpStr package and exposes a
 * consistent, user-friendly API for both single and batch operations.
 *
 * Features:
 *  - Centralized management of metrics, phonetic algorithms, and filters
 *  - Flexible normalization and filtering pipeline for all inputs
 *  - Batch, pairwise, and single string comparison with detailed results
 *  - Phonetic indexing and phonetic-aware search and comparison
 *  - Text analysis and unified diff utilities
 *  - Full TypeScript type safety and extensibility
 *
 * @module CmpStr
 * @author Paul Köhler (komed3)
 * @license MIT
 */

// Import the Profiler instance for global profiling
const profiler = Profiler.Profiler.getInstance();

/**
 * The main CmpStr class that provides a unified interface for string comparison,
 * phonetic indexing, filtering, and text analysis.
 *
 * @template R - The type of the metric result, defaults to MetricRaw
 */
class CmpStr {

    /**
     * --------------------------------------------------------------------------------
     * Static methods and properties for global access to CmpStr features
     * --------------------------------------------------------------------------------
     *
     * These static methods provide a convenient way to access the core features of
     * the CmpStr package without needing to instantiate a CmpStr object.
     */

    /**
     * Adds, removes, pauses, resumes, lists, or clears global filters.
     *
     * @see Filter
     */
    static filter = {
        add: Filter.Filter.add,
        remove: Filter.Filter.remove,
        pause: Filter.Filter.pause,
        resume: Filter.Filter.resume,
        list: Filter.Filter.list,
        clear: Filter.Filter.clear
    };

    /**
     * Adds, removes, checks, or lists available metrics.
     *
     * @see MetricRegistry
     */
    static metric = {
        add: Metric.MetricRegistry.add,
        remove: Metric.MetricRegistry.remove,
        has: Metric.MetricRegistry.has,
        list: Metric.MetricRegistry.list
    };

    /**
     * Adds, removes, checks, or lists available phonetic algorithms and mappings.
     *
     * @see PhoneticRegistry
     */
    static phonetic = {
        add: Phonetic.PhoneticRegistry.add,
        remove: Phonetic.PhoneticRegistry.remove,
        has: Phonetic.PhoneticRegistry.has,
        list: Phonetic.PhoneticRegistry.list,
        map: {
            add: Phonetic.PhoneticMappingRegistry.add,
            remove: Phonetic.PhoneticMappingRegistry.remove,
            has: Phonetic.PhoneticMappingRegistry.has,
            list: Phonetic.PhoneticMappingRegistry.list
        }
    };

    /**
     * Provides access to the global profiler services.
     *
     * @see Profiler
     */
    static profiler = profiler.services;

    /**
     * Clears the caches for normalizer, metric, and phonetic modules.
     */
    static clearCache = {
        normalizer: Normalizer.Normalizer.clear,
        metric: Metric.Metric.clear,
        phonetic: Phonetic.Phonetic.clear
    };

    /**
     * Returns a TextAnalyzer instance for the given input string.
     *
     * @param {string} [input] - The input string
     * @returns {TextAnalyzer} - The text analyzer
     */
    static analyze(input) {
        return new TextAnalyzer.TextAnalyzer(input);
    }

    /**
     * Returns a DiffChecker instance for computing the unified diff between two texts.
     *
     * @param {string} a - The first (original) text
     * @param {string} b - The second (modified) text
     * @param {DiffOptions} [opt] - Optional diff configuration
     * @returns {DiffChecker} - The diff checker instance
     */
    static diff(a, b, opt) {
        return new DiffChecker.DiffChecker(a, b, opt);
    }

    /**
     * --------------------------------------------------------------------------------
     * Instantiate the CmpStr class
     * --------------------------------------------------------------------------------
     *
     * Methods to create a new CmpStr instance with the given options.
     * Using the static `create` method is recommended to ensure proper instantiation.
     */

    /**
     * Creates a new CmpStr instance with the given options.
     *
     * @param {string|CmpStrOptions} [opt] - Optional serialized or options object
     * @returns {CmpStr<R>} - A new CmpStr instance
     */
    static create(opt) {
        return new CmpStr(opt);
    }

    // The options object that holds the configuration for this CmpStr instance
    options = Object.create(null);

    /**
     * Creates a new CmpStr instance with the given options.
     *
     * A string argument is treated as serialized (JSON) options, any other truthy
     * value as an options object. Using the static `create` method is the
     * recommended way to instantiate.
     *
     * @param {string|CmpStrOptions} [opt] - Optional serialized or options object
     */
    constructor(opt) {
        if (opt) {
            if (typeof opt === 'string') this.setSerializedOptions(opt);
            else this.setOptions(opt);
        }
    }

    /**
     * ---------------------------------------------------------------------------------
     * Protected utility methods for internal use
     * ---------------------------------------------------------------------------------
     *
     * These methods provide utility functions for converting inputs, merging options,
     * normalizing inputs, filtering, and preparing inputs for comparison.
     */

    /**
     * Asserts a condition and throws if the condition is not met.
     *
     * @param {string} cond - The condition to meet ('metric' or 'phonetic')
     * @param {any} [test] - Value to test for
     * @throws {Error} If the condition is not met or the condition name is unknown
     */
    assert(cond, test) {
        switch (cond) {
            // Check if the metric exists
            case 'metric':
                if (!CmpStr.metric.has(test)) throw new Error(
                    `CmpStr <metric> must be set, call .setMetric(), ` +
                    `use CmpStr.metric.list() for available metrics`
                );
                break;
            // Check if the phonetic algorithm exists
            case 'phonetic':
                if (!CmpStr.phonetic.has(test)) throw new Error(
                    `CmpStr <phonetic> must be set, call .setPhonetic(), ` +
                    `use CmpStr.phonetic.list() for available phonetic algorithms`
                );
                break;
            // Throw an error for unknown conditions
            default:
                throw new Error(`Cmpstr condition <${cond}> unknown`);
        }
    }

    /**
     * Asserts multiple conditions.
     *
     * @param {[ string, any? ][]} cond - Array of [ condition, value ] pairs
     * @throws {Error} If any of the conditions is not met
     */
    assertMany(...cond) {
        for (const [c, test] of cond) this.assert(c, test);
    }

    /**
     * Resolves the options for the CmpStr instance, merging the provided options with
     * the existing options.
     *
     * @param {CmpStrOptions} [opt] - Optional options to merge
     * @returns {CmpStrOptions} - The resolved options
     */
    resolveOptions(opt) {
        // NOTE(review): the spread is a shallow copy, so nested option objects are still
        // shared with this.options — confirm DeepMerge.merge does not write into them.
        return DeepMerge.merge({ ...(this.options ?? Object.create(null)) }, opt);
    }

    /**
     * Normalizes the input string or array using the configured or provided flags.
     *
     * @param {MetricInput} input - The input string or array
     * @param {NormalizeFlags} [flags] - Normalization flags; falls back to the
     *                                   instance flags, then to an empty string
     * @returns {MetricInput} - The normalized input
     */
    normalize(input, flags) {
        return Normalizer.Normalizer.normalize(
            input, flags ?? this.options.flags ?? ''
        );
    }

    /**
     * Applies all active filters to the input string or array.
     *
     * @param {MetricInput} input - The input string or array
     * @param {string} [hook='input'] - The filter hook
     * @returns {MetricInput} - The filtered string(s)
     */
    filter(input, hook = 'input') {
        return Filter.Filter.apply(hook, input);
    }

    /**
     * Prepares the input by normalizing and filtering.
     *
     * @param {MetricInput} [input] - The input string or array
     * @param {CmpStrOptions} [opt] - Optional options to use (defaults to the
     *                                instance options)
     * @returns {MetricInput} - The prepared input
     */
    prepare(input, opt) {
        const { flags, processors } = opt ?? this.options;

        // Normalize the input using flags (i.e., 'itw')
        if (flags?.length) input = this.normalize(input, flags);

        // Filter the input using hooked up filters
        input = this.filter(input, 'input');

        // Apply phonetic processors if configured
        if (processors?.phonetic) input = this.index(input, processors.phonetic);

        return input;
    }

    /**
     * Post-process the results of the metric computation.
     *
     * @param {MetricResult<R>} result - The metric result
     * @param {CmpStrOptions} [opt] - Options; only `removeZero` is read here
     * @returns {MetricResult<R>} - The post-processed results
     */
    postProcess(result, opt) {
        // Remove "zero similarity" from batch results if configured
        if (opt?.removeZero && Array.isArray(result))
            result = result.filter((r) => r.res > 0);

        return result;
    }

    /**
     * Computes the phonetic index for the given input using the specified phonetic algorithm.
     *
     * @param {MetricInput} input - The input string or array
     * @param {{ algo: string, opt?: PhoneticOptions }} options - The phonetic algorithm and options
     * @returns {MetricInput} - The phonetic index for the given input
     * @throws {Error} If the phonetic algorithm is not registered
     */
    index(input, { algo, opt }) {
        this.assert('phonetic', algo);

        const phonetic = Registry.factory.phonetic(algo, opt);
        // Index parts are joined with the configured delimiter (single space by default)
        const delimiter = opt?.delimiter ?? ' ';

        return Array.isArray(input)
            ? input.map((s) => phonetic.getIndex(s).join(delimiter))
            : phonetic.getIndex(input).join(delimiter);
    }

    /**
     * Computes the metric result for the given inputs, applying normalization and
     * filtering as configured.
     *
     * @template T - The type of the metric result
     * @param {MetricInput} a - The first input string or array
     * @param {MetricInput} b - The second input string or array
     * @param {CmpStrOptions} [opt] - Optional options to use
     * @param {MetricMode} [mode='single'] - The metric mode to use
     * @param {boolean} [raw=false] - Whether to return raw results
     * @param {boolean} [skip=false] - Whether to skip normalization and filtering
     * @returns {T} - The computed metric result
     * @throws {Error} If the resolved metric is not registered
     */
    compute(a, b, opt, mode, raw, skip) {
        const resolved = this.resolveOptions(opt);

        this.assert('metric', resolved.metric);

        // Prepare the input
        const A = skip ? a : this.prepare(a, resolved);
        const B = skip ? b : this.prepare(b, resolved);

        // Get the metric class
        const metric = Registry.factory.metric(resolved.metric, A, B, resolved.opt);

        // Pass the original inputs to the metric
        if (resolved.output !== 'prep') metric.setOriginal(a, b);

        // Compute the metric result
        metric.run(mode);

        // Post-process the results
        const result = this.postProcess(metric.getResults(), resolved);

        // Resolve and return the result based on the raw flag
        return this.output(result, raw ?? resolved.raw);
    }

    /**
     * Resolves the result format (raw or formatted).
     *
     * @template T - The type of the metric result
     * @param {MetricResult<R>} result - The metric result
     * @param {boolean} [raw] - Whether to return raw results; falls back to the
     *                          instance `raw` option
     * @returns {T} - The resolved result
     */
    output(result, raw) {
        return (raw ?? this.options.raw)
            ? result
            : Array.isArray(result)
                ? result.map((r) => ({ source: r.a, target: r.b, match: r.res }))
                : { source: result.a, target: result.b, match: result.res };
    }

    /**
     * ---------------------------------------------------------------------------------
     * Managing methods for CmpStr
     * ---------------------------------------------------------------------------------
     *
     * These methods provide an interface to set and get properties of the CmpStr
     * instance, such as options, metric, phonetic algorithm, and more.
     */

    /**
     * Creates a shallow clone of the current instance.
     *
     * Because the copy is shallow, the clone references the same `options`
     * object as the original instance.
     *
     * @returns {CmpStr<R>} - The cloned instance
     */
    clone() {
        return Object.assign(Object.create(Object.getPrototypeOf(this)), this);
    }

    /**
     * Resets the instance, clearing all data and options.
     *
     * @returns {this}
     */
    reset() {
        for (const k in this.options) delete this.options[k];

        return this;
    }

    /**
     * Sets / replaces the full options object.
     *
     * @param {CmpStrOptions} opt - The options
     * @returns {this}
     */
    setOptions(opt) {
        this.options = opt;

        return this;
    }

    /**
     * Deep merges and sets new options.
     *
     * @param {CmpStrOptions} opt - The options to merge
     * @returns {this}
     */
    mergeOptions(opt) {
        DeepMerge.merge(this.options, opt);

        return this;
    }

    /**
     * Sets the serialized options from a JSON string.
     *
     * @param {string} opt - The serialized options
     * @returns {this}
     * @throws {SyntaxError} If the string is not valid JSON
     */
    setSerializedOptions(opt) {
        this.options = JSON.parse(opt);

        return this;
    }

    /**
     * Sets a specific option at the given path.
     *
     * @param {string} path - The path to the option
     * @param {any} value - The value to set
     * @returns {this}
     */
    setOption(path, value) {
        DeepMerge.set(this.options, path, value);

        return this;
    }

    /**
     * Removes an option at the given path.
     *
     * @param {string} path - The path to the option
     * @returns {this}
     */
    rmvOption(path) {
        DeepMerge.rmv(this.options, path);

        return this;
    }

    /**
     * Enable or disable raw output.
     *
     * @param {boolean} enable - Whether to enable or disable raw output
     * @returns {this}
     */
    setRaw(enable) {
        return this.setOption('raw', enable);
    }

    /**
     * Sets the similarity metric to use (e.g., 'levenshtein', 'dice').
     *
     * @param {string} name - The metric name
     * @returns {this}
     */
    setMetric(name) {
        return this.setOption('metric', name);
    }

    /**
     * Sets the normalization flags (e.g., 'itw', 'nfc').
     *
     * @param {NormalizeFlags} flags - The normalization flags
     * @returns {this}
     */
    setFlags(flags) {
        return this.setOption('flags', flags);
    }

    /**
     * Removes the normalization flags entirely.
     *
     * @returns {this}
     */
    rmvFlags() {
        return this.rmvOption('flags');
    }

    /**
     * Sets the pre-processors to use for preparing the input.
     *
     * @param {CmpStrProcessors} opt - The processors to set
     * @returns {this}
     */
    setProcessors(opt) {
        return this.setOption('processors', opt);
    }

    /**
     * Removes the processors entirely.
     *
     * @returns {this}
     */
    rmvProcessors() {
        return this.rmvOption('processors');
    }

    /**
     * Returns the current options object.
     *
     * @returns {CmpStrOptions} - The options
     */
    getOptions() {
        return this.options;
    }

    /**
     * Returns the options as a JSON string.
     *
     * @returns {string} - The serialized options
     */
    getSerializedOptions() {
        return JSON.stringify(this.options);
    }

    /**
     * Returns a specific option value by path.
     *
     * @param {string} path - The path to the option
     * @returns {any} - The option value
     */
    getOption(path) {
        return DeepMerge.get(this.options, path);
    }

    /**
     * ---------------------------------------------------------------------------------
     * Public core methods for string comparison
     * ---------------------------------------------------------------------------------
     *
     * These methods provide the core functionality of the CmpStr class, allowing for
     * string comparison, phonetic indexing, filtering, and text search.
     */

    /**
     * Performs a single metric comparison between the source and target.
     *
     * @template T - The type of the metric result
     * @param {string} a - The source string
     * @param {string} b - The target string
     * @param {CmpStrOptions} [opt] - Optional options
     * @returns {T} - The metric result
     */
    test(a, b, opt) {
        return this.compute(a, b, opt, 'single');
    }

    /**
     * Performs a single metric comparison and returns only the numeric score.
     *
     * @param {string} a - The source string
     * @param {string} b - The target string
     * @param {CmpStrOptions} [opt] - Optional options
     * @returns {number} - The similarity score (0..1)
     */
    compare(a, b, opt) {
        // Always request the raw result so that `.res` is available
        return this.compute(a, b, opt, 'single', true).res;
    }

    /**
     * Performs a batch metric comparison between source and target strings
     * or array of strings.
     *
     * @template T - The type of the metric result
     * @param {MetricInput} a - The source string or array of strings
     * @param {MetricInput} b - The target string or array of strings
     * @param {CmpStrOptions} [opt] - Optional options
     * @returns {T} - The batch metric results
     */
    batchTest(a, b, opt) {
        return this.compute(a, b, opt, 'batch');
    }

    /**
     * Performs a batch metric comparison and returns results sorted by score.
     *
     * @template T - The type of the metric result
     * @param {MetricInput} a - The source string or array of strings
     * @param {MetricInput} b - The target string or array of strings
     * @param {'desc'|'asc'} [dir='desc'] - Sort direction (desc, asc)
     * @param {CmpStrOptions} [opt] - Optional options
     * @returns {T} - The sorted batch results
     */
    batchSorted(a, b, dir = 'desc', opt) {
        // Compute raw so the results can be sorted by `.res`, then format afterwards
        return this.output(
            this.compute(a, b, opt, 'batch', true).sort((a, b) =>
                dir === 'asc' ? a.res - b.res : b.res - a.res
            ),
            opt?.raw ?? this.options.raw
        );
    }

    /**
     * Performs a pairwise metric comparison between source and target strings
     * or array of strings.
     *
     * Input arrays need to be of the same length to perform pairwise comparison,
     * otherwise the method will throw an error.
     *
     * @template T - The type of the metric result
     * @param {MetricInput} a - The source string or array of strings
     * @param {MetricInput} b - The target string or array of strings
     * @param {CmpStrOptions} [opt] - Optional options
     * @returns {T} - The pairwise metric results
     */
    pairs(a, b, opt) {
        return this.compute(a, b, opt, 'pairwise');
    }

    /**
     * Performs a batch comparison and returns only results at or above the
     * threshold, sorted by score in descending order.
     *
     * @template T - The type of the metric result
     * @param {MetricInput} a - The source string or array of strings
     * @param {MetricInput} b - The target string or array of strings
     * @param {number} threshold - The similarity threshold (0..1)
     * @param {CmpStrOptions} [opt] - Optional options
     * @returns {T} - The filtered batch results
     */
    match(a, b, threshold, opt) {
        return this.output(
            this.compute(a, b, opt, 'batch', true)
                .filter((r) => r.res >= threshold)
                .sort((a, b) => b.res - a.res),
            opt?.raw ?? this.options.raw
        );
    }

    /**
     * Returns the n closest matches from a batch comparison.
     *
     * @template T - The type of the metric result
     * @param {MetricInput} a - The source string or array of strings
     * @param {MetricInput} b - The target string or array of strings
     * @param {number} [n=1] - Number of closest matches
     * @param {CmpStrOptions} [opt] - Optional options
     * @returns {T} - The closest matches
     */
    closest(a, b, n = 1, opt) {
        return this.batchSorted(a, b, 'desc', opt).slice(0, n);
    }

    /**
     * Returns the n furthest matches from a batch comparison.
     *
     * @template T - The type of the metric result
     * @param {MetricInput} a - The source string or array of strings
     * @param {MetricInput} b - The target string or array of strings
     * @param {number} [n=1] - Number of furthest matches
     * @param {CmpStrOptions} [opt] - Optional options
     * @returns {T} - The furthest matches
     */
    furthest(a, b, n = 1, opt) {
        return this.batchSorted(a, b, 'asc', opt).slice(0, n);
    }

    /**
     * Performs a normalized and filtered substring search.
     *
     * @param {string} needle - The search string
     * @param {string[]} haystack - The array to search in
     * @param {NormalizeFlags} [flags] - Normalization flags
     * @param {CmpStrProcessors} [processors] - Pre-processors to apply
     * @returns {string[]} - Array of matching entries (original, unprepared strings)
     */
    search(needle, haystack, flags, processors) {
        const resolved = this.resolveOptions({ flags, processors });

        // Prepare the needle and haystack, normalizing and filtering them
        const test = this.prepare(needle, resolved);
        const hstk = this.prepare(haystack, resolved);

        // Filter the haystack based on the normalized test string
        return haystack.filter((_, i) => hstk[i].includes(test));
    }

    /**
     * Computes a similarity matrix for the given input array.
     *
     * The input is prepared once up front; each row is then computed with
     * preparation skipped. The per-call options are forwarded to the metric
     * computation so that settings such as `metric` are honoured and not only
     * the preparation-related ones.
     *
     * Note: if `removeZero` is enabled, zero-score entries are dropped from the
     * batch results, so rows may be shorter than the input.
     *
     * @param {string[]} input - The input array
     * @param {CmpStrOptions} [opt] - Optional options
     * @returns {number[][]} - The similarity matrix
     */
    matrix(input, opt) {
        input = this.prepare(input, this.resolveOptions(opt));

        return input.map((a) =>
            this.compute(a, input, opt, 'batch', true, true).map(
                (b) => b.res ?? 0
            )
        );
    }

    /**
     * Computes the phonetic index for a string using the configured
     * or given algorithm.
     *
     * @param {string} [input] - The input string
     * @param {string} [algo] - The phonetic algorithm to use
     * @param {PhoneticOptions} [opt] - Optional phonetic options
     * @returns {string} - The phonetic index as a string
     * @throws {Error} If no registered phonetic algorithm is given or configured
     */
    phoneticIndex(input, algo, opt) {
        // Fall back to the phonetic processor configured on the instance, if any
        const { algo: a, opt: o } = this.options.processors?.phonetic ?? {};

        return this.index(input, { algo: algo ?? a, opt: opt ?? o });
    }

}

exports.CmpStr = CmpStr;
//# sourceMappingURL=CmpStr.cjs.map