UNPKG

cmpstr

Version:

CmpStr is a lightweight, fast, and well-performing package for calculating string similarity

100 lines (97 loc) 3.57 kB
// CmpStr v3.0.1 dev-052fa0c-250614 by Paul Köhler @komed3 / MIT License
import { MetricRegistry, Metric } from './Metric.js';
import { Pool } from '../utils/Pool.js';

/**
 * Cosine Similarity
 * src/metric/Cosine.ts
 *
 * @see https://en.wikipedia.org/wiki/Cosine_similarity
 *
 * Cosine similarity measures how alike two vectors are independent of their
 * magnitude. For text, each input is turned into a term frequency vector and
 * the cosine of the angle between the two vectors is computed.
 *
 * The result lies between 0 and 1: 1 means the vectors point in the same
 * direction, 0 means they are orthogonal (no shared terms).
 *
 * @module Metric/CosineSimilarity
 * @author Paul Köhler (komed3)
 * @license MIT
 */

/**
 * Cosine similarity metric, implemented on top of the Metric base class.
 */
class CosineSimilarity extends Metric {

    /**
     * Creates a Cosine similarity metric for two inputs.
     *
     * @param {MetricInput} a - First input string or array of strings
     * @param {MetricInput} b - Second input string or array of strings
     * @param {MetricOptions} [opt] - Options for the metric computation
     */
    constructor(a, b, opt = {}) {
        // Register under the name 'cosine'; the trailing `true` flags the
        // metric as symmetrical.
        super('cosine', a, b, opt, true);
    }

    /**
     * Builds the term frequency vector of a string.
     *
     * The string is split on the delimiter and every resulting token is
     * counted. Tokens are taken exactly as `String.prototype.split` yields
     * them, so empty tokens (e.g. from consecutive delimiters or an empty
     * input) are counted as terms too.
     *
     * The returned map is acquired from the Pool.
     *
     * @param {string} str - The input string
     * @param {string} delimiter - The delimiter to split terms
     * @return {Map<string, number>} - Map from term to number of occurrences
     */
    _termFreq(str, delimiter) {
        const tokens = str.split(delimiter);
        const counts = Pool.acquire('map', tokens.length);

        for (const token of tokens) {
            const seen = counts.get(token);
            counts.set(token, seen ? seen + 1 : 1);
        }

        return counts;
    }

    /**
     * Computes the Cosine similarity of two strings.
     *
     * @param {string} a - First string
     * @param {string} b - Second string
     * @return {MetricCompute<CosineRaw>} - Similarity result plus the raw
     *   dot product and both vector magnitudes
     */
    compute(a, b) {
        // Terms are separated by the configured delimiter (a space by default)
        const { delimiter = ' ' } = this.options;

        const vecA = this._termFreq(a, delimiter);
        const vecB = this._termFreq(b, delimiter);

        // One pass over A yields both the dot product and A's squared norm;
        // terms missing from B contribute nothing to the dot product.
        let dotProduct = 0;
        let sumSqA = 0;
        for (const [token, countA] of vecA) {
            const countB = vecB.get(token) || 0;
            dotProduct += countA * countB;
            sumSqA += countA * countA;
        }

        // B's squared norm needs every term of B, shared with A or not
        let sumSqB = 0;
        for (const countB of vecB.values()) {
            sumSqB += countB * countB;
        }

        const magnitudeA = Math.sqrt(sumSqA);
        const magnitudeB = Math.sqrt(sumSqB);

        // Both maps are no longer needed; release them back to the pool
        Pool.release('map', vecA, vecA.size);
        Pool.release('map', vecB, vecB.size);

        const raw = { dotProduct, magnitudeA, magnitudeB };

        // A zero-length vector has no direction: report no similarity
        // instead of dividing by zero.
        if (!magnitudeA || !magnitudeB) {
            return { res: 0, raw };
        }

        return { res: Metric.clamp(dotProduct / (magnitudeA * magnitudeB)), raw };
    }

}

// Register the Cosine similarity in the metric registry
MetricRegistry.add('cosine', CosineSimilarity);

export { CosineSimilarity };
//# sourceMappingURL=Cosine.js.map