UNPKG

wink-nlp

Version:

Developer friendly Natural Language Processing ✨

winkjs/wink-nlp

207 lines (190 loc) • 7.15 kB

JavaScript

// wink-nlp // // Copyright (C) GRAYPE Systems Private Limited // // This file is part of “wink-nlp”. // // Permission is hereby granted, free of charge, to any // person obtaining a copy of this software and // associated documentation files (the "Software"), to // deal in the Software without restriction, including // without limitation the rights to use, copy, modify, // merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to // whom the Software is furnished to do so, subject to // the following conditions: // // The above copyright notice and this permission notice // shall be included in all copies or substantial // portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF // ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED // TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A // PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, // DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF // CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER // DEALINGS IN THE SOFTWARE. // var similarity = Object.create( null ); similarity.bow = Object.create( null ); similarity.set = Object.create( null ); similarity.vector = Object.create( null ); /** * * Computes the cosine similarity between the input bag of words (bow) * `bowA` and `bowB` and returns a value between 0 and 1. * * @param {object} bowA the first bow i.e word (i.e. key) and it's frequency * (i.e. value) pairs. * @param {object} bowB the second bow. * @return {number} cosine similarity between `bowA` and `bowB`. */ similarity.bow.cosine = function ( bowA, bowB ) { // `ab` & `ba` additional variables are required as you dont want to corrupt // `bowA` & `bowB`! // Updated `a` with words in `b` set as 0 in `a`. var ab = Object.create( null ); // Updated `b` with words in `a` set as 0 in `b`. var ba = Object.create( null ); // Similarly score. var ss; var w; // a word! // Fill up `ab` and `ba` for ( w in bowA ) { // eslint-disable-line guard-for-in ab[ w ] = bowA[ w ]; ba[ w ] = 0; } for ( w in bowB ) { // eslint-disable-line guard-for-in ba[ w ] = bowB[ w ]; ab[ w ] = ab[ w ] || 0; } // With `ab` & `ba` in hand, its easy to transform in to // vector: its a frequency of each word found in both strings // We do not need to store these vectors in arrays, instead we can perform // processing in the same loop. var sa2 = 0, // sum of ai^2 saxb = 0, // sum of ai x bi sb2 = 0, // sum of bi^2 va, vb; // value of ai and bi // One could have used `ba`, as both have same words now! for ( w in ab ) { // eslint-disable-line guard-for-in va = ab[ w ]; vb = ba[ w ]; sa2 += va * va; sb2 += vb * vb; saxb += va * vb; } // Compute cosine similarity; ensure you dont get `NaN i.e. 0/0` by testing for // `sa2` and `sb2`. ss = ( ( sa2 && sb2 ) ? // Compute cosine if both of them are non-zero. +( saxb / ( Math.sqrt( sa2 ) * Math.sqrt( sb2 ) ) ).toFixed( 6 ) : // If one of them is 0 means **0** ss otherwise a ss of **1**. ( !sa2 ^ !sb2 ) ? 0 : 1 // eslint-disable-line no-bitwise ); return ss; }; // similarity.bow.cosine() // ### tversky /** * * Computes the tversky similarity between input sets `setA` and `setB`. * This similarity is always between 0 and 1. Tversky calls `setA` as * **prototype** and `setB` as **variant**. The `alpha` corresponds * to the weight of prototype, whereas `beta` corresponds to the * weight of variant. * * @param {set} setA the first set or the prototype. * @param {set} setB the second set or the variant. * @param {number} [alpha=0.5] the prototype weight. * @param {number} [beta=0.5] the variant weight. * @return {number} the tversky similarity between `setA` and `setB`. */ similarity.set.tversky = function ( setA, setB, alpha, beta ) { // Early return. if ( setA.size === 0 && setB.size === 0 ) return 1; if ( setA.size === 0 || setB.size === 0 ) return 0; // Contains `alpha` & `beta` values respectively after the validations. var a, b; // Size of the intersection between set `setA` and `setB`. var intersectSize = 0; // Their differences! var saDIFFsb, sbDIFFsa; // The similarity score between `setA` and `setB`. var ss; a = ( isNaN( alpha ) ) ? 0.5 : +alpha; b = ( isNaN( beta ) ) ? 0.5 : +beta; if ( a < 0 || b < 0 ) { throw Error( 'wink-nlp: tversky requires aplha & beta to be positive numbers.' ); } // Use smaller sized set for iteration. if ( setA.size < setB.size ) { setA.forEach( function ( element ) { if ( setB.has( element ) ) intersectSize += 1; } ); } else { setB.forEach( function ( element ) { if ( setA.has( element ) ) intersectSize += 1; } ); } saDIFFsb = setA.size - intersectSize; sbDIFFsa = setB.size - intersectSize; // Compute Tversky similarity. ss = ( intersectSize / ( intersectSize + ( a * saDIFFsb ) + ( b * sbDIFFsa ) ) ); return +ss.toFixed( 6 ); }; // tversky() // ### oo /** * * Computes the Otsuka-Ochiai similarity between input sets `setA` and `setB`. * This similarity is always between 0 and 1. It is equivalent to binary bow * cosine similarity. * * @param {set} setA the first set. * @param {set} setB the second set. * @return {number} the oo similarity between `setA` and `setB`. */ similarity.set.oo = function ( setA, setB ) { // Early return. if ( setA.size === 0 && setB.size === 0 ) return 1; if ( setA.size === 0 || setB.size === 0 ) return 0; // Size of the intersection between set `setA` and `setB`. var intersectSize = 0; // Use smaller sized set for iteration. if ( setA.size < setB.size ) { setA.forEach( function ( element ) { if ( setB.has( element ) ) intersectSize += 1; } ); } else { setB.forEach( function ( element ) { if ( setA.has( element ) ) intersectSize += 1; } ); } return +( intersectSize / ( Math.sqrt( setA.size * setB.size ) ) ).toFixed( 6 ); }; // similarity.set.oo() /** * * Computes the cosine similarity between the input vectors * `vectorA` and `vectorB` and returns a value between 0 and 1. * Note, in winkNLP all vectors contain the `l2Norm` as the last * element of the array. * * @param {object} vectorA the first vector * @param {object} vectorB the second vector. * @return {number} cosine similarity between `vectorA` and `vectorB`. */ similarity.vector.cosine = function ( vectorA, vectorB ) { let sumOfProducts = 0; // Recall, the last element is always the `l2Norm`. const length = vectorA.length - 1; for ( let i = 0; i < length; i += 1 ) { sumOfProducts += vectorA[ i ] * vectorB[ i ]; } // Use `l2Norm` directly from each vector. return +( sumOfProducts / ( vectorA[ length ] * vectorB[ length ] ) ).toFixed( 6 ); }; // similarity.vector.cosine() // Export similarity module.exports = similarity;