cmpstr
Version:
CmpStr is a lightweight, fast and well performing package for calculating string similarity
100 lines (97 loc) • 3.57 kB
JavaScript
// CmpStr v3.0.1 dev-052fa0c-250614 by Paul Köhler @komed3 / MIT License
import { MetricRegistry, Metric } from './Metric.js';
import { Pool } from '../utils/Pool.js';
/**
* Cosine Similarity
* src/metric/Cosine.ts
*
* @see https://en.wikipedia.org/wiki/Cosine_similarity
*
* Cosine similarity is a metric used to measure how similar two vectors are, regardless
* of their magnitude. In text analysis, it is commonly used to compare documents or
* strings by representing them as term frequency vectors and computing the cosine of
* the angle between these vectors.
*
* The result is a value between 0 and 1, where 1 means the vectors point in the same
* direction (identical relative term frequencies, regardless of magnitude) and 0 means
* they are orthogonal (no terms in common).
*
* @module Metric/CosineSimilarity
* @author Paul Köhler (komed3)
* @license MIT
*/
/**
 * Cosine similarity metric built on top of the Metric base class.
 *
 * Both inputs are split into terms, turned into term frequency maps and
 * compared via the cosine of the angle between the two frequency vectors.
 */
class CosineSimilarity extends Metric {

    /**
     * Creates a new Cosine similarity metric instance.
     *
     * The inputs and options are handed to the Metric base class unchanged,
     * together with the metric name `cosine` and the flag marking the
     * metric as symmetrical.
     *
     * @param {MetricInput} a - First input string or array of strings
     * @param {MetricInput} b - Second input string or array of strings
     * @param {MetricOptions} [opt] - Options for the metric computation
     */
    constructor(a, b, opt = {}) {
        super('cosine', a, b, opt, true);
    }

    /**
     * Builds the term frequency map of a string.
     *
     * The string is split at every occurrence of the delimiter; each resulting
     * piece (including empty pieces produced by adjacent delimiters) counts
     * as one term. The map is taken from the Pool, so the caller is
     * responsible for releasing it again.
     *
     * @param {string} str - The input string
     * @param {string} delimiter - The delimiter to split terms
     * @return {Map<string, number>} - Map of term => number of occurrences
     */
    _termFreq(str, delimiter) {
        const tokens = str.split(delimiter);
        const counts = Pool.acquire('map', tokens.length);

        tokens.forEach((token) => {
            const seen = counts.get(token) || 0;
            counts.set(token, seen + 1);
        });

        return counts;
    }

    /**
     * Computes the Cosine similarity of two strings.
     *
     * @param {string} a - First string
     * @param {string} b - Second string
     * @return {MetricCompute<CosineRaw>} - Similarity in `res`, plus the dot product
     *                                      and both vector magnitudes in `raw`
     */
    compute(a, b) {
        // Fall back to a single space when no delimiter option is set
        const configured = this.options.delimiter;
        const separator = configured === undefined ? ' ' : configured;

        // Term frequency vectors of both inputs (pooled maps)
        const vecA = this._termFreq(a, separator);
        const vecB = this._termFreq(b, separator);

        // One pass over A yields the dot product and the squared length of A;
        // terms missing in B contribute nothing to the dot product
        let dotProduct = 0;
        let squaresA = 0;

        vecA.forEach((countA, term) => {
            const countB = vecB.get(term) || 0;
            dotProduct += countA * countB;
            squaresA += countA * countA;
        });

        // Squared length of B needs its own pass (terms not present in A)
        let squaresB = 0;

        vecB.forEach((countB) => {
            squaresB += countB * countB;
        });

        const magnitudeA = Math.sqrt(squaresA);
        const magnitudeB = Math.sqrt(squaresB);

        // Hand both maps back to the pool before building the result
        Pool.release('map', vecA, vecA.size);
        Pool.release('map', vecB, vecB.size);

        // A zero-length vector has no direction, similarity is 0 in that case
        let res = 0;

        if (magnitudeA && magnitudeB) {
            res = Metric.clamp(dotProduct / (magnitudeA * magnitudeB));
        }

        return { res, raw: { dotProduct, magnitudeA, magnitudeB } };
    }

}
// Register the CosineSimilarity class with the MetricRegistry under the key 'cosine'
// (the same name the constructor passes to the Metric base class).
// This is a top-level statement, so it runs as a side effect when the module is evaluated.
MetricRegistry.add('cosine', CosineSimilarity);
export { CosineSimilarity };
//# sourceMappingURL=Cosine.js.map