UNPKG

cmpstr

Version:

CmpStr is a lightweight, fast, and well-performing package for calculating string similarity.

446 lines (442 loc) 16.2 kB
// CmpStr v3.0.1 dev-052fa0c-250614 by Paul Köhler @komed3 / MIT License
'use strict';

const Registry = require('../utils/Registry.cjs');
const HashTable = require('../utils/HashTable.cjs');
const Profiler = require('../utils/Profiler.cjs');

/**
 * Abstract Metric
 * src/metric/Metric.ts
 *
 * This module defines an abstract class for string metrics, providing a framework for
 * computing various string similarity metrics. It includes methods for running metrics
 * in different modes (single, batch, pairwise) synchronous or asynchronous and caching
 * results to optimize performance. The class is designed to be extended by specific
 * metric implementations like the Levenshtein distance or Jaro-Winkler similarity.
 *
 * It provides:
 *  - A base class for string metrics with common functionality
 *  - Methods for running metrics in different modes
 *  - Pre-computation for trivial cases to optimize performance
 *  - Caching of metric computations to avoid redundant calculations
 *  - Support for symmetrical metrics (same result for inputs in any order)
 *  - Performance tracking capabilities (Profiler)
 *  - Asynchronous execution support for metrics
 *
 * This class is intended to be extended by specific metric implementations that will
 * implement the `compute` method to define the specific metric computation logic.
 *
 * @module Metric
 * @author Paul Köhler (komed3)
 * @license MIT
 */

// Get the singleton profiler instance for performance monitoring
const profiler = Profiler.Profiler.getInstance();

/**
 * Abstract class representing a generic string metric.
 *
 * @abstract
 * @template R - The type of the raw result, defaulting to `MetricRaw`.
 */
class Metric {

    // Cache for metric computations, shared by all Metric instances and subclasses
    static cache = new HashTable.HashTable();

    // Metric name for identification (also part of the cache key)
    metric;

    // Inputs for the metric computation, always stored as arrays
    a;
    b;

    // Original inputs used when building result objects (see setOriginal)
    origA = [];
    origB = [];

    // Options for the metric computation (only `options.mode` is read in this class)
    options;

    // Indicates whether the metric is symmetric (same result for inputs in any order)
    symmetric;

    /**
     * Result of the metric computation, which can be a single result or an array of results.
     * This is populated by `run()` / `runAsync()` and reset by `clear()`.
     */
    results;

    /**
     * Static method to clear the shared cache of metric computations.
     */
    static clear() {
        this.cache.clear();
    }

    /**
     * Orders two strings so that the shorter one comes first.
     * The pair is swapped (together with the lengths) only if `m > n`.
     *
     * @param {string} a - First string
     * @param {string} b - Second string
     * @param {number} m - Length of the first string
     * @param {number} n - Length of the second string
     * @returns {[string, string, number, number]} - Strings and lengths, shorter string first
     */
    static swap(a, b, m, n) {
        return m > n ? [b, a, n, m] : [a, b, m, n];
    }

    /**
     * Clamps the similarity result between 0 and 1.
     *
     * @param {number} res - The input similarity to clamp
     * @returns {number} - The clamped similarity (0 to 1)
     */
    static clamp(res) {
        return Math.max(0, Math.min(1, res));
    }

    /**
     * Constructor for the Metric class.
     * Initializes the metric with two inputs (strings or arrays of strings) and options.
     * Non-array inputs are wrapped into single-element arrays.
     *
     * @param {string} metric - The name of the metric (e.g. 'levenshtein')
     * @param {MetricInput} a - First input string or array of strings
     * @param {MetricInput} b - Second input string or array of strings
     * @param {MetricOptions} [opt] - Options for the metric computation
     * @param {boolean} [symmetric=false] - Whether the metric is symmetric (same result for inputs in any order)
     * @throws {Error} - If `a` or `b` is an empty array
     */
    constructor(metric, a, b, opt = {}, symmetric = false) {
        // Set the metric name
        this.metric = metric;

        // Set the inputs
        this.a = Array.isArray(a) ? a : [a];
        this.b = Array.isArray(b) ? b : [b];

        // Validate inputs: ensure they are not empty
        if (this.a.length === 0 || this.b.length === 0) {
            throw new Error(`inputs <a> and <b> must not be empty`);
        }

        // Set options
        this.options = opt;
        this.symmetric = symmetric;
    }

    /**
     * Pre-compute the metric for two strings.
     * This method is called before the actual computation to handle trivial cases.
     *
     * @param {string} a - First string
     * @param {string} b - Second string
     * @param {number} m - Length of the first string
     * @param {number} n - Length of the second string
     * @returns {MetricCompute<R>|undefined} - Pre-computed result or undefined if not applicable
     */
    preCompute(a, b, m, n) {
        // Identical strings: similarity of 1
        if (a === b) return { res: 1 };

        // One string is empty, or both are shorter than two characters
        // (and, as checked above, not identical): similarity of 0
        if (m === 0 || n === 0 || (m < 2 && n < 2)) return { res: 0 };

        return undefined;
    }

    /**
     * Abstract method to be implemented by subclasses to perform the metric computation.
     * This method should contain the logic for computing the metric between two strings.
     *
     * @param {string} a - First string
     * @param {string} b - Second string
     * @param {number} m - Length of the first string
     * @param {number} n - Length of the second string
     * @param {number} maxLen - Maximum length of the strings
     * @returns {MetricCompute<R>} - The result of the metric computation
     * @throws {Error} - If not overridden in a subclass
     */
    compute(a, b, m, n, maxLen) {
        throw new Error(`method compute() must be overridden in a subclass`);
    }

    /**
     * Run the metric computation for single inputs (two strings).
     * Applies preCompute for trivial cases before cache lookup and computation.
     *
     * The non-trivial path is executed through `profiler.run()`.
     *
     * @param {number} i - Pointer to the first string
     * @param {number} j - Pointer to the second string
     * @returns {MetricResultSingle<R>} - The result of the metric computation
     */
    runSingle(i, j) {
        // Type safety: convert inputs to strings. `a` / `b` stay untouched for the
        // result object, while `A` / `B` may be swapped for symmetric metrics.
        const a = String(this.a[i]);
        const b = String(this.b[j]);
        let A = a;
        let B = b;

        // Get lengths
        let m = A.length;
        let n = B.length;

        // Pre-compute trivial cases (identical, empty, etc.)
        let result = this.preCompute(A, B, m, n);

        if (!result) {
            result = profiler.run(() => {
                // Generate a cache key based on the metric and pair of strings
                const key = Metric.cache.key(this.metric, [A, B], this.symmetric);

                // Reuse a cached result if there is one (null / undefined count as a miss)
                const cached = Metric.cache.get(key || '');
                if (cached != null) return cached;

                // If the metric is symmetrical, put the shorter string first
                if (this.symmetric) [A, B, m, n] = Metric.swap(A, B, m, n);

                // Compute the similarity using the algorithm
                const res = this.compute(A, B, m, n, Math.max(m, n));

                // If a key was generated, store the result in the cache
                if (key) Metric.cache.set(key, res);

                return res;
            });
        }

        // Build metric result object, preferring the original inputs when they were set
        return {
            metric: this.metric,
            a: this.origA[i] ?? a,
            b: this.origB[j] ?? b,
            ...result
        };
    }

    /**
     * Run the metric computation for single inputs (two strings) asynchronously.
     *
     * @param {number} i - Pointer to the first string
     * @param {number} j - Pointer to the second string
     * @returns {Promise<MetricResultSingle<R>>} - Promise resolving the result of the metric computation
     */
    async runSingleAsync(i, j) {
        // An async function already wraps its return value in a promise
        return this.runSingle(i, j);
    }

    /**
     * Run the metric computation for batch inputs (arrays of strings).
     *
     * It iterates through each string in the first array and computes the metric
     * against each string in the second array.
     */
    runBatch() {
        const results = [];

        // Loop through each combination of strings in a[] and b[]
        for (let i = 0; i < this.a.length; i++) {
            for (let j = 0; j < this.b.length; j++) {
                results.push(this.runSingle(i, j));
            }
        }

        // `this.results` will be an array of MetricResultSingle
        this.results = results;
    }

    /**
     * Run the metric computation for batch inputs (arrays of strings) asynchronously.
     * Pairs are awaited one after another, in the same order as `runBatch()`.
     */
    async runBatchAsync() {
        const results = [];

        // Loop through each combination of strings in a[] and b[]
        for (let i = 0; i < this.a.length; i++) {
            for (let j = 0; j < this.b.length; j++) {
                results.push(await this.runSingleAsync(i, j));
            }
        }

        // `this.results` will be an array of MetricResultSingle
        this.results = results;
    }

    /**
     * Run the metric computation for pairwise inputs (A[i] vs B[i]).
     *
     * This method assumes that both `a` and `b` are arrays of equal length
     * and computes the metric only for corresponding index pairs. It iterates
     * over the length of `a` and does not check the length of `b` itself.
     */
    runPairwise() {
        const results = [];

        // Compute metric for each corresponding pair
        for (let i = 0; i < this.a.length; i++) {
            results.push(this.runSingle(i, i));
        }

        // `this.results` will be an array of MetricResultSingle
        this.results = results;
    }

    /**
     * Run the metric computation for pairwise inputs (A[i] vs B[i]) asynchronously.
     */
    async runPairwiseAsync() {
        const results = [];

        // Compute metric for each corresponding pair
        for (let i = 0; i < this.a.length; i++) {
            results.push(await this.runSingleAsync(i, i));
        }

        // `this.results` will be an array of MetricResultSingle
        this.results = results;
    }

    /**
     * Set the original inputs to which the results of the metric calculation will refer.
     *
     * Note: the arguments are checked for truthiness, so a falsy value
     * (e.g. an empty string) leaves the corresponding originals unchanged.
     *
     * @param {MetricInput} [a] - original input(s) for a
     * @param {MetricInput} [b] - original input(s) for b
     * @returns {this} - The metric instance, for chaining
     */
    setOriginal(a, b) {
        if (a) this.origA = Array.isArray(a) ? a : [a];
        if (b) this.origB = Array.isArray(b) ? b : [b];

        return this;
    }

    /**
     * Check if the inputs are in batch mode.
     *
     * This method checks if either `a` or `b` contains more than one string,
     * indicating that the metric is being run in batch mode.
     *
     * @returns {boolean} - True if either input is an array with more than one element
     */
    isBatch() {
        return this.a.length > 1 || this.b.length > 1;
    }

    /**
     * Check if the inputs are in single mode.
     *
     * This is the negation of `isBatch()`: both `a` and `b` hold exactly one string.
     *
     * @returns {boolean} - True if both inputs hold a single string
     */
    isSingle() {
        return !this.isBatch();
    }

    /**
     * Check if the inputs are in pairwise mode.
     *
     * This method checks if the inputs are in batch mode and `a` and `b` have
     * the same length, indicating that the metric can be run on corresponding
     * pairs of strings.
     *
     * @param {boolean} [safe=false] - If true, returns false instead of throwing
     * @returns {boolean} - True if the inputs qualify for pairwise mode
     * @throws {Error} - If `safe` is false and the inputs do not qualify
     */
    isPairwise(safe = false) {
        if (this.isBatch() && this.a.length === this.b.length) return true;
        if (safe) return false;

        throw new Error(`mode <pairwise> requires arrays of equal length`);
    }

    /**
     * Check if the metric is symmetrical.
     *
     * This method returns whether the metric is symmetric, meaning it produces the same
     * result regardless of the order of inputs (e.g., Levenshtein distance).
     *
     * @returns {boolean} - True if the metric is symmetric
     */
    isSymmetrical() {
        return this.symmetric;
    }

    /**
     * Determine which mode to run the metric in.
     *
     * The explicit `mode` argument wins, then `options.mode`, then 'default'.
     *
     * @param {MetricMode} [mode] - The mode to run the metric in (optional)
     * @returns {MetricMode} - The determined mode
     */
    whichMode(mode) {
        return mode ?? this.options?.mode ?? 'default';
    }

    /**
     * Clear the stored results of the metric.
     *
     * This method resets the `results` property to `undefined`. It does not
     * touch the shared computation cache (use the static `Metric.clear()` for that).
     */
    clear() {
        this.results = undefined;
    }

    /**
     * Run the metric computation based on the specified mode.
     *
     * @param {MetricMode} [mode] - The mode to run the metric in (optional)
     * @param {boolean} [clear=true] - Whether to clear previous results before running
     * @throws {Error} - If an unsupported mode is specified
     */
    run(mode, clear = true) {
        // Clear previous results if requested
        if (clear) this.clear();

        // Resolve once so the error message below reports the mode actually used,
        // even when it came from the options rather than the argument
        const resolved = this.whichMode(mode);

        switch (resolved) {
            // Default mode runs the metric on single inputs or falls back to batch mode
            case 'default':
                if (this.isSingle()) {
                    this.results = this.runSingle(0, 0);
                    break;
                }
            // falls through

            // Batch mode runs the metric on all combinations of a[] and b[]
            case 'batch':
                this.runBatch();
                break;

            // Single mode runs the metric on the first elements of a[] and b[]
            case 'single':
                this.results = this.runSingle(0, 0);
                break;

            // Pairwise mode runs the metric on corresponding pairs of a[] and b[]
            case 'pairwise':
                if (this.isPairwise()) this.runPairwise();
                break;

            // Unsupported mode
            default:
                throw new Error(`unsupported mode <${resolved}>`);
        }
    }

    /**
     * Run the metric computation based on the specified mode asynchronously.
     *
     * @param {MetricMode} [mode] - The mode to run the metric in (optional)
     * @param {boolean} [clear=true] - Whether to clear previous results before running
     * @returns {Promise<void>} - A promise that resolves when the metric computation is complete
     * @throws {Error} - If an unsupported mode is specified
     */
    async runAsync(mode, clear = true) {
        // Clear previous results if requested
        if (clear) this.clear();

        // Resolve once so the error message below reports the mode actually used
        const resolved = this.whichMode(mode);

        switch (resolved) {
            // Default mode runs the metric on single inputs or falls back to batch mode
            case 'default':
                if (this.isSingle()) {
                    this.results = await this.runSingleAsync(0, 0);
                    break;
                }
            // falls through

            // Batch mode runs the metric on all combinations of a[] and b[]
            case 'batch':
                await this.runBatchAsync();
                break;

            // Single mode runs the metric on the first elements of a[] and b[]
            case 'single':
                this.results = await this.runSingleAsync(0, 0);
                break;

            // Pairwise mode runs the metric on corresponding pairs of a[] and b[]
            case 'pairwise':
                if (this.isPairwise()) await this.runPairwiseAsync();
                break;

            // Unsupported mode
            default:
                throw new Error(`unsupported async mode <${resolved}>`);
        }
    }

    /**
     * Get the name of the metric.
     *
     * @returns {string} - The name of the metric
     */
    getMetricName() {
        return this.metric;
    }

    /**
     * Get the result of the metric computation.
     *
     * @returns {MetricResult<R>} - The result of the metric computation
     * @throws {Error} - If no results are available (the metric has not been run, or results were cleared)
     */
    getResults() {
        // Ensure that the metric has been run before getting the result
        if (this.results === undefined) {
            throw new Error(`run() must be called before getResults()`);
        }

        // Return the result(s)
        return this.results;
    }

}

/**
 * Metric registry service for managing metric implementations.
 *
 * This registry allows for dynamic registration and retrieval of metric classes,
 * enabling the use of various string similarity metrics in a consistent manner.
 */
const MetricRegistry = Registry.Registry('metric', Metric);

exports.Metric = Metric;
exports.MetricRegistry = MetricRegistry;
//# sourceMappingURL=Metric.cjs.map