UNPKG

cmpstr

Version:

CmpStr is a lightweight, fast, and well-performing package for calculating string similarity.

92 lines (88 loc) 3.41 kB
// CmpStr v3.0.1 dev-052fa0c-250614 by Paul Köhler @komed3 / MIT License
'use strict';

var Metric = require('./Metric.cjs');
var Pool = require('../utils/Pool.cjs');

/**
 * Dice-Sørensen Coefficient
 * src/metric/DiceSorensen.ts
 *
 * @see https://en.wikipedia.org/wiki/Dice-S%C3%B8rensen_coefficient
 *
 * This module implements the Dice-Sørensen coefficient, a statistic used to gauge
 * the similarity of two samples. It is commonly used in natural language processing
 * and information retrieval to compare the similarity between two sets of data,
 * such as text documents. The coefficient is defined as twice the size of the
 * intersection divided by the sum of the sizes of the two sets.
 *
 * The implementation includes methods to compute bigrams from strings and calculate
 * the coefficient based on these bigrams. It handles edge cases, such as empty
 * strings and identical strings, to ensure accurate results.
 *
 * @module Metric/DiceSorensenCoefficient
 * @author Paul Köhler (komed3)
 * @license MIT
 */

/**
 * DiceSorensenCoefficient class extends the Metric class to implement the
 * Dice-Sørensen coefficient over sets of character bigrams.
 */
class DiceSorensenCoefficient extends Metric.Metric {

    /**
     * Constructor for the DiceSorensen class.
     *
     * Initializes the DiceSorensen metric with two input strings or
     * arrays of strings and optional options.
     *
     * @param {MetricInput} a - First input string or array of strings
     * @param {MetricInput} b - Second input string or array of strings
     * @param {MetricOptions} [opt] - Options for the metric computation
     */
    constructor(a, b, opt = {}) {
        // Register under the name 'dice'; the trailing `true` marks the
        // metric as symmetrical for the parent Metric class.
        super('dice', a, b, opt, true);
    }

    /**
     * Computes the distinct bigrams of a given string.
     *
     * The container is acquired from the shared pool, so the caller is
     * responsible for releasing it again once it is no longer needed.
     *
     * @param {string} str - The input string
     * @return {Set<string>} - A set of bigrams (two-character sequences) from the string
     */
    _bigrams(str) {
        // A string of length n has n - 1 overlapping bigrams
        const len = str.length - 1;
        const bigrams = Pool.Pool.acquire('set', len);

        // Generate bigrams by sliding a two-character window over the string
        for (let i = 0; i < len; i++) {
            bigrams.add(str.substring(i, i + 2));
        }

        return bigrams;
    }

    /**
     * Calculates the Dice-Sørensen coefficient between two strings.
     *
     * The result is `2 * |A ∩ B| / (|A| + |B|)`, where A and B are the sets
     * of distinct bigrams of the two strings. If neither string yields a
     * bigram (both are shorter than two characters) the formula is undefined;
     * in that case the strings are compared directly, giving 1 for identical
     * strings and 0 otherwise.
     *
     * @param {string} a - First string
     * @param {string} b - Second string
     * @return {MetricCompute<DiceRaw>} - Object containing the similarity result
     *   (`res`) and the raw values (`raw.intersection`, `raw.size`)
     */
    compute(a, b) {
        // Generate bigrams for both strings
        const setA = this._bigrams(a);
        const setB = this._bigrams(b);

        // Count the bigrams that occur in both sets
        let intersection = 0;
        for (const bigram of setA) {
            if (setB.has(bigram)) intersection++;
        }

        // Denominator of the coefficient: the SUM of both set sizes
        // (not the size of their union). Sizes are read before release.
        const sizeA = setA.size;
        const sizeB = setB.size;
        const size = sizeA + sizeB;

        // Release sets back to the pool
        Pool.Pool.release('set', setA, sizeA);
        Pool.Pool.release('set', setB, sizeB);

        let res;
        if (size === 0) {
            // No bigrams on either side: avoid 0 / 0 and do not report
            // different short strings (e.g. 'a' vs 'b') as a perfect match.
            res = a === b ? 1 : 0;
        } else {
            res = Metric.Metric.clamp((2 * intersection) / size);
        }

        // Return the result as a MetricCompute object
        return {
            res,
            raw: { intersection, size }
        };
    }

}

// Register the Dice-Sørensen coefficient in the metric registry
Metric.MetricRegistry.add('dice', DiceSorensenCoefficient);

exports.DiceSorensenCoefficient = DiceSorensenCoefficient;
//# sourceMappingURL=DiceSorensen.cjs.map