UNPKG

cmpstr

Version:

CmpStr is a lightweight, fast, and well-performing package for calculating string similarity.

115 lines (111 loc) 3.9 kB
// CmpStr v3.0.1 dev-052fa0c-250614 by Paul Köhler @komed3 / MIT License 'use strict'; var Metric = require('./Metric.cjs'); var Pool = require('../utils/Pool.cjs'); /** * Jaro-Winkler Distance * src/metric/JaroWinkler.ts * * @see https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance * * The Jaro-Winkler distance is a string similarity metric that gives more weight * to matching characters at the start of the strings. It is especially effective * for short strings and typographical errors, and is widely used in record linkage * and duplicate detection. * * @module Metric/JaroWinkler * @author Paul Köhler (komed3) * @license MIT */ /** * JaroWinklerDistance class extends the Metric class to implement the Jaro-Winkler algorithm. */ class JaroWinklerDistance extends Metric.Metric { /** * Constructor for the JaroWinklerDistance class. * * Initializes the Jaro-Winkler metric with two input strings or * arrays of strings and optional options. * * @param {MetricInput} a - First input string or array of strings * @param {MetricInput} b - Second input string or array of strings * @param {MetricOptions} [opt] - Options for the metric computation */ constructor(a, b, opt = {}) { // Call the parent Metric constructor with the metric name and inputs // Metric is symmetrical super('jaro-winkler', a, b, opt, true); } /** * Calculates the Jaro-Winkler similarity between two strings. 
* * @param {string} a - First string * @param {string} b - Second string * @param {number} m - Length of the first string * @param {number} n - Length of the second string * @return {MetricCompute<JaroWinklerRaw>} - Object containing the similarity result and raw values */ compute(a, b, m, n) { // Find matches const matchWindow = Math.max(0, Math.floor(n / 2) - 1); // Use Pool for boolean arrays const matchA = Pool.Pool.acquire('uint16', m); const matchB = Pool.Pool.acquire('uint16', n); // Initialize match arrays for (let i = 0; i < m; i++) matchA[i] = 0; for (let i = 0; i < n; i++) matchB[i] = 0; // Find matches within the match window let matches = 0; for (let i = 0; i < m; i++) { const start = Math.max(0, i - matchWindow); const end = Math.min(i + matchWindow + 1, n); for (let j = start; j < end; j++) { if (!matchB[j] && a[i] === b[j]) { matchA[i] = 1; matchB[j] = 1; matches++; break; } } } // Set initial values for transpositions, jaro distance, prefix and result let transpos = 0, jaro = 0, prefix = 0, res = 0; // If matches are found, proceed with further calculations if (matches > 0) { // Count transpositions let k = 0; for (let i = 0; i < m; i++) { if (matchA[i]) { while (!matchB[k]) k++; if (a[i] !== b[k]) transpos++; k++; } } transpos /= 2; // Calculate Jaro similarity jaro = (matches / m + matches / n + (matches - transpos) / matches) / 3; // Calculate common prefix length (max 4) for (let i = 0; i < Math.min(4, m, n); i++) { if (a[i] === b[i]) prefix++; else break; } // Step 5: Calculate Jaro-Winkler similarity res = jaro + prefix * 0.1 * (1 - jaro); } // Release arrays back to the pool Pool.Pool.release('uint16', matchA, m); Pool.Pool.release('uint16', matchB, n); // Return the result as a MetricCompute object return { res: Metric.Metric.clamp(res), raw: { matchWindow, matches, transpos, jaro, prefix } }; } } // Register the Jaro-Winkler distance in the metric registry Metric.MetricRegistry.add('jaroWinkler', JaroWinklerDistance); 
// Expose the metric class as a named CommonJS export.
exports.JaroWinklerDistance = JaroWinklerDistance;
//# sourceMappingURL=JaroWinkler.cjs.map