cmpstr
Version:
CmpStr is a lightweight, fast, and well-performing package for calculating string similarity
664 lines (660 loc) • 22.2 kB
JavaScript
// CmpStr v3.0.1 dev-052fa0c-250614 by Paul Köhler @komed3 / MIT License
'use strict';
var DeepMerge = require('./utils/DeepMerge.cjs');
var Profiler = require('./utils/Profiler.cjs');
var TextAnalyzer = require('./utils/TextAnalyzer.cjs');
var DiffChecker = require('./utils/DiffChecker.cjs');
var Normalizer = require('./utils/Normalizer.cjs');
var Filter = require('./utils/Filter.cjs');
var Registry = require('./utils/Registry.cjs');
require('./metric/Cosine.cjs');
require('./metric/DamerauLevenshtein.cjs');
require('./metric/DiceSorensen.cjs');
require('./metric/Hamming.cjs');
require('./metric/Jaccard.cjs');
require('./metric/JaroWinkler.cjs');
require('./metric/LCS.cjs');
require('./metric/Levenshtein.cjs');
require('./metric/NeedlemanWunsch.cjs');
require('./metric/qGram.cjs');
require('./metric/SmithWaterman.cjs');
var Metric = require('./metric/Metric.cjs');
require('./phonetic/Cologne.cjs');
require('./phonetic/Metaphone.cjs');
require('./phonetic/Soundex.cjs');
var Phonetic = require('./phonetic/Phonetic.cjs');
/**
* CmpStr Main API
* src/CmpStr.ts
*
* The CmpStr class provides a comprehensive, highly abstracted, and type-safe interface
* for string comparison, similarity measurement, phonetic indexing, filtering, normalization,
* and text analysis. It unifies all core features of the CmpStr package and exposes a
* consistent, user-friendly API for both single and batch operations.
*
* Features:
* - Centralized management of metrics, phonetic algorithms, and filters
* - Flexible normalization and filtering pipeline for all inputs
* - Batch, pairwise, and single string comparison with detailed results
* - Phonetic indexing and phonetic-aware search and comparison
* - Text analysis and unified diff utilities
* - Full TypeScript type safety and extensibility
*
* @module CmpStr
* @author Paul Köhler (komed3)
* @license MIT
*/
// Obtain the Profiler instance through its getInstance() accessor; the instance's
// `services` are re-exported further down as the static `CmpStr.profiler`
const profiler = Profiler.Profiler.getInstance();
/**
 * The main CmpStr class that provides a unified interface for string comparison,
 * phonetic indexing, filtering, and text analysis.
 *
 * Static members expose the global registries (filters, metrics, phonetic
 * algorithms), the profiler services and cache clearing. Instances carry an
 * `options` object that configures normalization, pre-processing and the metric
 * used by the comparison methods.
 *
 * @template R - The type of the metric result, defaults to MetricRaw
 */
class CmpStr {
  /**
   * --------------------------------------------------------------------------------
   * Static methods and properties for global access to CmpStr features
   * --------------------------------------------------------------------------------
   *
   * These static methods provide a convenient way to access the core features of
   * the CmpStr package without needing to instantiate a CmpStr object.
   */

  /**
   * Adds, removes, pauses, resumes, lists, or clears global filters.
   *
   * NOTE(review): the Filter methods are exposed as unbound references; this
   * presumes they do not depend on `this` — confirm against the Filter module.
   *
   * @see Filter
   */
  static filter = {
    add: Filter.Filter.add,
    remove: Filter.Filter.remove,
    pause: Filter.Filter.pause,
    resume: Filter.Filter.resume,
    list: Filter.Filter.list,
    clear: Filter.Filter.clear
  };

  /**
   * Adds, removes, checks, or lists available metrics.
   *
   * @see MetricRegistry
   */
  static metric = {
    add: Metric.MetricRegistry.add,
    remove: Metric.MetricRegistry.remove,
    has: Metric.MetricRegistry.has,
    list: Metric.MetricRegistry.list
  };

  /**
   * Adds, removes, checks, or lists available phonetic algorithms and mappings.
   *
   * @see PhoneticRegistry
   */
  static phonetic = {
    add: Phonetic.PhoneticRegistry.add,
    remove: Phonetic.PhoneticRegistry.remove,
    has: Phonetic.PhoneticRegistry.has,
    list: Phonetic.PhoneticRegistry.list,
    map: {
      add: Phonetic.PhoneticMappingRegistry.add,
      remove: Phonetic.PhoneticMappingRegistry.remove,
      has: Phonetic.PhoneticMappingRegistry.has,
      list: Phonetic.PhoneticMappingRegistry.list
    }
  };

  /**
   * Provides access to the global profiler services.
   *
   * @see Profiler
   */
  static profiler = profiler.services;

  /**
   * Clears the caches for normalizer, metric, and phonetic modules.
   */
  static clearCache = {
    normalizer: Normalizer.Normalizer.clear,
    metric: Metric.Metric.clear,
    phonetic: Phonetic.Phonetic.clear
  };

  /**
   * Returns a TextAnalyzer instance for the given input string.
   *
   * @param {string} [input] - The input string
   * @returns {TextAnalyzer} - The text analyzer
   */
  static analyze(input) {
    return new TextAnalyzer.TextAnalyzer(input);
  }

  /**
   * Returns a DiffChecker instance for computing the unified diff between two texts.
   *
   * @param {string} a - The first (original) text
   * @param {string} b - The second (modified) text
   * @param {DiffOptions} [opt] - Optional diff configuration
   * @returns {DiffChecker} - The diff checker instance
   */
  static diff(a, b, opt) {
    return new DiffChecker.DiffChecker(a, b, opt);
  }

  /**
   * --------------------------------------------------------------------------------
   * Instantiate the CmpStr class
   * --------------------------------------------------------------------------------
   *
   * Methods to create a new CmpStr instance with the given options.
   * Using the static `create` method is recommended to ensure proper instantiation.
   */

  /**
   * Creates a new CmpStr instance with the given options.
   *
   * @param {string|CmpStrOptions} [opt] - Optional serialized or options object
   * @returns {CmpStr<R>} - A new CmpStr instance
   */
  static create(opt) {
    return new CmpStr(opt);
  }

  // The options object that holds the configuration for this CmpStr instance
  options = Object.create(null);

  /**
   * Creates a new CmpStr instance with the given options.
   *
   * A string argument is treated as serialized (JSON) options, anything else
   * truthy as an options object. The original comment marks the constructor as
   * protected; plain JavaScript does not enforce this, so prefer `CmpStr.create`.
   *
   * @param {string|CmpStrOptions} [opt] - Optional serialized or options object
   */
  constructor(opt) {
    if (opt) {
      if (typeof opt === 'string') this.setSerializedOptions(opt);
      else this.setOptions(opt);
    }
  }

  /**
   * ---------------------------------------------------------------------------------
   * Protected utility methods for internal use
   * ---------------------------------------------------------------------------------
   *
   * These methods provide utility functions for converting inputs, merging options,
   * normalizing inputs, filtering, and preparing inputs for comparison.
   */

  /**
   * Asserts a condition and throws if the condition is not met.
   *
   * Supported conditions are 'metric' and 'phonetic'; both check the given
   * name against the corresponding registry.
   *
   * @param {string} cond - The condition to meet
   * @param {any} [test] - Value to test for
   * @throws {Error} If the condition is not met or the condition is unknown
   */
  assert(cond, test) {
    switch (cond) {
      // Check if the metric exists
      case 'metric':
        if (!CmpStr.metric.has(test))
          throw new Error(
            `CmpStr <metric> must be set, call .setMetric(), ` +
              `use CmpStr.metric.list() for available metrics`
          );
        break;
      // Check if the phonetic algorithm exists
      case 'phonetic':
        if (!CmpStr.phonetic.has(test))
          throw new Error(
            `CmpStr <phonetic> must be set, call .setPhonetic(), ` +
              `use CmpStr.phonetic.list() for available phonetic algorithms`
          );
        break;
      // Throw an error for unknown conditions
      default:
        throw new Error(`Cmpstr condition <${cond}> unknown`);
    }
  }

  /**
   * Asserts multiple conditions, stopping at the first one that fails.
   *
   * @param {...[ string, any? ]} cond - [ condition, value ] pairs
   * @throws {Error} If any condition is not met
   */
  assertMany(...cond) {
    for (const [c, test] of cond) this.assert(c, test);
  }

  /**
   * Resolves the options for a single call by merging the provided options on top
   * of a copy of the instance options.
   *
   * The instance options are first merged into a fresh object so that the second
   * merge works on a separate object tree. A plain shallow spread would leave
   * nested objects (e.g. `processors`, `opt`) shared with `this.options`, and
   * since DeepMerge.merge mutates its target (see `mergeOptions`), per-call
   * overrides could leak into the instance configuration.
   * NOTE(review): assumes DeepMerge.merge creates new nested objects for keys
   * missing on the target — confirm against utils/DeepMerge.
   *
   * @param {CmpStrOptions} [opt] - Optional options to merge
   * @returns {CmpStrOptions} - The resolved options
   */
  resolveOptions(opt) {
    const base = DeepMerge.merge(
      {},
      { ...(this.options ?? Object.create(null)) }
    );
    return DeepMerge.merge(base, opt);
  }

  /**
   * Normalizes the input string or array using the provided flags, falling back
   * to the configured flags and finally to an empty flag string.
   *
   * @param {MetricInput} input - The input string or array
   * @param {NormalizeFlags} [flags] - Normalization flags
   * @returns {MetricInput} - The normalized input
   */
  normalize(input, flags) {
    return Normalizer.Normalizer.normalize(
      input,
      flags ?? this.options.flags ?? ''
    );
  }

  /**
   * Applies the filters registered for the given hook to the input string or array.
   *
   * @param {MetricInput} input - The input string or array
   * @param {string} [hook='input'] - The filter hook
   * @returns {MetricInput} - The filtered string(s)
   */
  filter(input, hook = 'input') {
    return Filter.Filter.apply(hook, input);
  }

  /**
   * Prepares the input by normalizing, filtering and (optionally) phonetic indexing.
   *
   * @param {MetricInput} [input] - The input string or array
   * @param {CmpStrOptions} [opt] - Optional options to use instead of the instance options
   * @returns {MetricInput} - The prepared input
   */
  prepare(input, opt) {
    const { flags, processors } = opt ?? this.options;
    // Normalize the input using flags (i.e., 'itw'); skipped for empty flags
    if (flags?.length) input = this.normalize(input, flags);
    // Filter the input using hooked up filters
    input = this.filter(input, 'input');
    // Apply phonetic processors if configured
    if (processors?.phonetic) input = this.index(input, processors.phonetic);
    return input;
  }

  /**
   * Post-processes the results of the metric computation.
   *
   * @param {MetricResult<R>} result - The metric result
   * @param {CmpStrOptions} [opt] - Options; `removeZero` drops batch entries with `res <= 0`
   * @returns {MetricResult<R>} - The post-processed results
   */
  postProcess(result, opt) {
    // Remove "zero similarity" from batch results if configured
    if (opt?.removeZero && Array.isArray(result))
      result = result.filter((r) => r.res > 0);
    return result;
  }

  /**
   * Computes the phonetic index for the given input using the specified phonetic algorithm.
   * The index parts are joined with `opt.delimiter` (a single space by default).
   *
   * @param {MetricInput} input - The input string or array
   * @param {{ algo: string, opt?: PhoneticOptions }} options - The phonetic algorithm and options
   * @returns {MetricInput} - The phonetic index for the given input
   * @throws {Error} If the phonetic algorithm is not registered
   */
  index(input, { algo, opt }) {
    this.assert('phonetic', algo);
    const phonetic = Registry.factory.phonetic(algo, opt);
    const delimiter = opt?.delimiter ?? ' ';
    const toIndex = (s) => phonetic.getIndex(s).join(delimiter);
    return Array.isArray(input) ? input.map((s) => toIndex(s)) : toIndex(input);
  }

  /**
   * Computes the metric result for the given inputs, applying normalization and
   * filtering as configured.
   *
   * @template T - The type of the metric result
   * @param {MetricInput} a - The first input string or array
   * @param {MetricInput} b - The second input string or array
   * @param {CmpStrOptions} [opt] - Optional options to use
   * @param {MetricMode} [mode='single'] - The metric mode to use
   * @param {boolean} [raw=false] - Whether to return raw results
   * @param {boolean} [skip=false] - Whether to skip normalization and filtering
   * @returns {T} - The computed metric result
   * @throws {Error} If the resolved metric is not registered
   */
  compute(a, b, opt, mode, raw, skip) {
    const resolved = this.resolveOptions(opt);
    this.assert('metric', resolved.metric);
    // Prepare the input
    const A = skip ? a : this.prepare(a, resolved);
    const B = skip ? b : this.prepare(b, resolved);
    // Get the metric class
    const metric = Registry.factory.metric(resolved.metric, A, B, resolved.opt);
    // Pass the original inputs to the metric unless prepared output is requested
    if (resolved.output !== 'prep') metric.setOriginal(a, b);
    // Compute the metric result
    metric.run(mode);
    // Post-process the results
    const result = this.postProcess(metric.getResults(), resolved);
    // Resolve and return the result based on the raw flag
    return this.output(result, raw ?? resolved.raw);
  }

  /**
   * Resolves the result format (raw or formatted).
   *
   * Formatted results map `{ a, b, res }` to `{ source, target, match }`.
   *
   * @template T - The type of the metric result
   * @param {MetricResult<R>} result - The metric result
   * @param {boolean} [raw] - Whether to return raw results; defaults to the `raw` option
   * @returns {T} - The resolved result
   */
  output(result, raw) {
    if (raw ?? this.options.raw) return result;
    const format = (r) => ({ source: r.a, target: r.b, match: r.res });
    return Array.isArray(result)
      ? result.map((r) => format(r))
      : format(result);
  }

  /**
   * ---------------------------------------------------------------------------------
   * Managing methods for CmpStr
   * ---------------------------------------------------------------------------------
   *
   * These methods provide an interface to set and get properties of the CmpStr
   * instance, such as options, metric, phonetic algorithm, and more.
   */

  /**
   * Creates a shallow clone of the current instance.
   *
   * Own properties are copied by reference, so the clone shares the same
   * `options` object with the original until one of them replaces it.
   *
   * @returns {CmpStr<R>} - The cloned instance
   */
  clone() {
    return Object.assign(Object.create(Object.getPrototypeOf(this)), this);
  }

  /**
   * Resets the instance, clearing all data and options.
   *
   * A fresh options object is assigned instead of deleting keys in place, so an
   * object handed in through `setOptions` / `create` (or shared with a clone)
   * is not emptied as a side effect.
   *
   * @returns {this}
   */
  reset() {
    this.options = Object.create(null);
    return this;
  }

  /**
   * Sets / replaces the full options object.
   * The object is stored by reference, not copied.
   *
   * @param {CmpStrOptions} opt - The options
   * @returns {this}
   */
  setOptions(opt) {
    this.options = opt;
    return this;
  }

  /**
   * Deep merges the given options into the current options.
   *
   * @param {CmpStrOptions} opt - The options to merge
   * @returns {this}
   */
  mergeOptions(opt) {
    DeepMerge.merge(this.options, opt);
    return this;
  }

  /**
   * Sets the serialized options from a JSON string.
   *
   * @param {string} opt - The serialized options
   * @returns {this}
   * @throws {SyntaxError} If the string is not valid JSON
   */
  setSerializedOptions(opt) {
    this.options = JSON.parse(opt);
    return this;
  }

  /**
   * Sets a specific option at the given path.
   *
   * @param {string} path - The path to the option
   * @param {any} value - The value to set
   * @returns {this}
   */
  setOption(path, value) {
    DeepMerge.set(this.options, path, value);
    return this;
  }

  /**
   * Removes an option at the given path.
   *
   * @param {string} path - The path to the option
   * @returns {this}
   */
  rmvOption(path) {
    DeepMerge.rmv(this.options, path);
    return this;
  }

  /**
   * Enables or disables raw output.
   *
   * @param {boolean} enable - Whether to enable or disable raw output
   * @returns {this}
   */
  setRaw(enable) {
    return this.setOption('raw', enable);
  }

  /**
   * Sets the similarity metric to use (e.g., 'levenshtein', 'dice').
   *
   * @param {string} name - The metric name
   * @returns {this}
   */
  setMetric(name) {
    return this.setOption('metric', name);
  }

  /**
   * Sets the normalization flags (e.g., 'itw', 'nfc').
   *
   * @param {NormalizeFlags} flags - The normalization flags
   * @returns {this}
   */
  setFlags(flags) {
    return this.setOption('flags', flags);
  }

  /**
   * Removes the normalization flags entirely.
   *
   * @returns {this}
   */
  rmvFlags() {
    return this.rmvOption('flags');
  }

  /**
   * Sets the pre-processors to use for preparing the input.
   *
   * @param {CmpStrProcessors} opt - The processors to set
   * @returns {this}
   */
  setProcessors(opt) {
    return this.setOption('processors', opt);
  }

  /**
   * Removes the processors entirely.
   *
   * @returns {this}
   */
  rmvProcessors() {
    return this.rmvOption('processors');
  }

  /**
   * Returns the current options object (by reference).
   *
   * @returns {CmpStrOptions} - The options
   */
  getOptions() {
    return this.options;
  }

  /**
   * Returns the options as a JSON string.
   *
   * @returns {string} - The serialized options
   */
  getSerializedOptions() {
    return JSON.stringify(this.options);
  }

  /**
   * Returns a specific option value by path.
   *
   * @param {string} path - The path to the option
   * @returns {any} - The option value
   */
  getOption(path) {
    return DeepMerge.get(this.options, path);
  }

  /**
   * ---------------------------------------------------------------------------------
   * Public core methods for string comparison
   * ---------------------------------------------------------------------------------
   *
   * These methods provide the core functionality of the CmpStr class, allowing for
   * string comparison, phonetic indexing, filtering, and text search.
   */

  /**
   * Performs a single metric comparison between the source and target.
   *
   * @template T - The type of the metric result
   * @param {string} a - The source string
   * @param {string} b - The target string
   * @param {CmpStrOptions} [opt] - Optional options
   * @returns {T} - The metric result
   */
  test(a, b, opt) {
    return this.compute(a, b, opt, 'single');
  }

  /**
   * Performs a single metric comparison and returns only the numeric score
   * (the `res` field of the raw result).
   *
   * @param {string} a - The source string
   * @param {string} b - The target string
   * @param {CmpStrOptions} [opt] - Optional options
   * @returns {number} - The similarity score (0..1)
   */
  compare(a, b, opt) {
    return this.compute(a, b, opt, 'single', true).res;
  }

  /**
   * Performs a batch metric comparison between source and target strings
   * or array of strings.
   *
   * @template T - The type of the metric result
   * @param {MetricInput} a - The source string or array of strings
   * @param {MetricInput} b - The target string or array of strings
   * @param {CmpStrOptions} [opt] - Optional options
   * @returns {T} - The batch metric results
   */
  batchTest(a, b, opt) {
    return this.compute(a, b, opt, 'batch');
  }

  /**
   * Performs a batch metric comparison and returns results sorted by score.
   *
   * @template T - The type of the metric result
   * @param {MetricInput} a - The source string or array of strings
   * @param {MetricInput} b - The target string or array of strings
   * @param {'desc'|'asc'} [dir='desc'] - Sort direction (desc, asc)
   * @param {CmpStrOptions} [opt] - Optional options
   * @returns {T} - The sorted batch results
   */
  batchSorted(a, b, dir = 'desc', opt) {
    // Always sort on the raw results, then format according to the raw setting
    const sorted = this.compute(a, b, opt, 'batch', true).sort((x, y) =>
      dir === 'asc' ? x.res - y.res : y.res - x.res
    );
    return this.output(sorted, opt?.raw ?? this.options.raw);
  }

  /**
   * Performs a pairwise metric comparison between source and target strings
   * or array of strings.
   *
   * Input arrays need to be of the same length to perform a pairwise comparison;
   * otherwise the underlying metric is expected to throw an error.
   *
   * @template T - The type of the metric result
   * @param {MetricInput} a - The source string or array of strings
   * @param {MetricInput} b - The target string or array of strings
   * @param {CmpStrOptions} [opt] - Optional options
   * @returns {T} - The pairwise metric results
   */
  pairs(a, b, opt) {
    return this.compute(a, b, opt, 'pairwise');
  }

  /**
   * Performs a batch comparison and returns only results at or above the
   * threshold, sorted by descending score.
   *
   * @template T - The type of the metric result
   * @param {MetricInput} a - The source string or array of strings
   * @param {MetricInput} b - The target string or array of strings
   * @param {number} threshold - The similarity threshold (0..1)
   * @param {CmpStrOptions} [opt] - Optional options
   * @returns {T} - The filtered batch results
   */
  match(a, b, threshold, opt) {
    const hits = this.compute(a, b, opt, 'batch', true)
      .filter((r) => r.res >= threshold)
      .sort((x, y) => y.res - x.res);
    return this.output(hits, opt?.raw ?? this.options.raw);
  }

  /**
   * Returns the n closest matches from a batch comparison.
   *
   * @template T - The type of the metric result
   * @param {MetricInput} a - The source string or array of strings
   * @param {MetricInput} b - The target string or array of strings
   * @param {number} [n=1] - Number of closest matches
   * @param {CmpStrOptions} [opt] - Optional options
   * @returns {T} - The closest matches
   */
  closest(a, b, n = 1, opt) {
    return this.batchSorted(a, b, 'desc', opt).slice(0, n);
  }

  /**
   * Returns the n furthest matches from a batch comparison.
   *
   * @template T - The type of the metric result
   * @param {MetricInput} a - The source string or array of strings
   * @param {MetricInput} b - The target string or array of strings
   * @param {number} [n=1] - Number of furthest matches
   * @param {CmpStrOptions} [opt] - Optional options
   * @returns {T} - The furthest matches
   */
  furthest(a, b, n = 1, opt) {
    return this.batchSorted(a, b, 'asc', opt).slice(0, n);
  }

  /**
   * Performs a normalized and filtered substring search.
   *
   * Needle and haystack are prepared the same way; the returned entries are the
   * original (unprepared) haystack items whose prepared form contains the
   * prepared needle.
   *
   * @param {string} needle - The search string
   * @param {string[]} haystack - The array to search in
   * @param {NormalizeFlags} [flags] - Normalization flags
   * @param {CmpStrProcessors} [processors] - Pre-processors to apply
   * @returns {string[]} - Array of matching entries
   */
  search(needle, haystack, flags, processors) {
    // Only pass arguments that were actually provided, so an omitted argument
    // cannot mask the flags / processors configured on the instance
    const overrides = {};
    if (flags !== undefined) overrides.flags = flags;
    if (processors !== undefined) overrides.processors = processors;
    const resolved = this.resolveOptions(overrides);
    // Prepare the needle and haystack, normalizing and filtering them
    const test = this.prepare(needle, resolved);
    const hstk = this.prepare(haystack, resolved);
    // Filter the haystack based on the normalized test string
    return haystack.filter((_, i) => hstk[i].includes(test));
  }

  /**
   * Computes a similarity matrix for the given input array.
   *
   * The input is prepared once; every row is then computed in batch mode with
   * preparation skipped. The per-call options are forwarded to the computation
   * so that a metric (or metric options) given via `opt` is honored.
   *
   * @param {string[]} input - The input array
   * @param {CmpStrOptions} [opt] - Optional options
   * @returns {number[][]} - The similarity matrix
   */
  matrix(input, opt) {
    const prepared = this.prepare(input, this.resolveOptions(opt));
    // Zero scores must stay in place, otherwise rows would lose cells
    const rowOpt = { ...opt, removeZero: false };
    return prepared.map((a) =>
      this.compute(a, prepared, rowOpt, 'batch', true, true).map(
        (b) => b.res ?? 0
      )
    );
  }

  /**
   * Computes the phonetic index for a string using the configured
   * or given algorithm.
   *
   * @param {string} [input] - The input string
   * @param {string} [algo] - The phonetic algorithm to use
   * @param {PhoneticOptions} [opt] - Optional phonetic options
   * @returns {string} - The phonetic index as a string
   * @throws {Error} If no registered phonetic algorithm is given or configured
   */
  phoneticIndex(input, algo, opt) {
    const { algo: a, opt: o } = this.options.processors?.phonetic ?? {};
    return this.index(input, { algo: algo ?? a, opt: opt ?? o });
  }
}
// Expose the CmpStr class as a named CommonJS export
exports.CmpStr = CmpStr;
//# sourceMappingURL=CmpStr.cjs.map