cmpstr
Version:
lightweight npm package to calculate string similarity
86 lines (65 loc) • 1.96 kB
JavaScript
/**
* Cosine Similarity
* CmpStr module
*
* Cosine similarity is a measure of how similar two vectors are. It's often used
* in text analysis to compare texts based on the words they contain.
*
* @author Paul Köhler (komed3)
* @license MIT
*/
;
/**
 * private helper function
 * get term frequency from string
 * @private
 *
 * Splits the string at the delimiter and counts how often each resulting
 * term occurs. Empty terms (e.g. from an empty string or from consecutive
 * delimiters) are counted like any other term.
 *
 * @param {String} str string
 * @param {String} delimiter term delimiter
 * @returns {Object} term frequency (prototype-less map: term => count)
 */
const _termFreq = ( str, delimiter ) => {

    /* A prototype-less object is required here: with a plain `{}` a term
       such as "constructor", "toString" or "__proto__" would resolve to an
       inherited Object.prototype member and corrupt the count. */
    const freq = Object.create( null );

    for ( const term of str.split( delimiter ) ) {

        freq[ term ] = ( freq[ term ] || 0 ) + 1;

    }

    return freq;

};
/**
* module exports
* @public
*
* @param {String} a string a
* @param {String} b string b
* @param {Object} options having {
* @param {String} [delimiter=' '] term delimiter
* }
* @returns {Number} similarity score (0..1)
*/
module.exports = ( a, b, { delimiter = ' ' } = {} ) => {
/* step 1: count the frequency of chars per string */
let termsA = _termFreq( a, delimiter ),
termsB = _termFreq( b, delimiter );
let allTerms = new Set ( [
...Object.keys( termsA ),
...Object.keys( termsB )
] );
/* step 2: calculate the dot product */
let dotProduct = [ ...allTerms ].reduce(
( sum, char ) => sum + ( termsA[ char ] || 0 ) * ( termsB[ char ] || 0 ),
0
);
/* step 3: calculate the vector magnitudes */
let magnitudeA = Math.sqrt( [ ...allTerms ].reduce(
( sum, char ) => sum + ( termsA[ char ] || 0 ) ** 2,
0
) );
let magnitudeB = Math.sqrt( [ ...allTerms ].reduce(
( sum, char ) => sum + ( termsB[ char ] || 0 ) ** 2,
0
) );
/* step 4: calculate Cosine similarity */
return magnitudeA && magnitudeB
? dotProduct / ( magnitudeA * magnitudeB )
: 0;
};