UNPKG

epsilonjs

Version:

A lightweight Node.js module that implements the most common Approximate String Matching algorithms

320 lines (223 loc) 6.64 kB
/* epsilon Copyright(c) 2015 Gabriele Salvatori http://www.salvatorigabriele.com salvatorigabriele@gmail.com This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ /** * Find minimum value of all the arguments * much faster than Math.min * * @return min minimum value */ var levenMin = function(){ var length = arguments.length; var min = Infinity; while (length--) { if (arguments[length] < min) { min = arguments[length]; } } return min; }; /** * Create a new nxm matrix fill with zeros * * @param {int} n row dimension * @param {int} m column dimension * @return {matrix} matrix a nxm matrix */ var matrix = function(n,m){ var matrix = []; for (i=0; i<n; i++) { matrix[i] = []; for (j=0; j<m; j++) { matrix[i][j]=0; } } return matrix; }; /** * Create an array range from start to edge * * @param {int} start description * @param {int} edge description * @param {int} step description * @return {array} ret description */ var createRange = function(start,edge,step){ // If only one number was passed in make it the edge and 0 the start. if (arguments.length == 1) { edge = start; start = 0; } // Validate the edge and step numbers. edge = edge || 0; step = step || 1; // Create the array of numbers, stopping befor the edge. for (var ret = []; (edge - start) * step > 0; start += step) { ret.push(start); } return ret; }; /** * The standard Levenshtein function * * @param {string} source source string * @param {string} target target string * @return {int} range the levenshtein distance */ exports.leven = function(source,target) { var sourceLength = source.length; var targetLength = target.length; var range = createRange(0,targetLength+1,1); for (var i = 1; i <= sourceLength; i++) { var newRange = []; newRange[0] = i; for (var j = 1; j <= targetLength; j++) { // if the characther of the first string is equal to the second, then the LD is 0 // else is 1 var compare = ( source[i-1] == target[j-1] ) ? 0 : 1; newRange[j] = levenMin( range[j]+1, newRange[j-1]+1, range[j-1] + compare ); } range = newRange; } return range[targetLength]; } /** * Standard levenshtein function with dinamic programming * * @param {string} source source string * @param {string} target target string * @return {int} distances DP levenshtein distance */ exports.DPLeven = function(source,target){ var sourceLength = source.length; var targetLength = target.length; // we hold all "short" distances between characters // in a matrix that will be source.lenght x target.lenght in dimension var distances = matrix(sourceLength,targetLength); for (var k = 0; k <= targetLength-1; k++){ distances[k][0]= k; } for (var y = 0; y <= sourceLength-1; y++){ distances[0][y] = y; } for (y = 1; y < sourceLength; y++) { for (k = 1; k < targetLength; k++) { // caso base if (source.charAt(k) == target.charAt(y)){ distances[k][y] = distances[k-1][y-1]; } else{ distances[k][y] = levenMin(distances[k-1][y]+1,distances[k][y-1]+1,distances[k-1][y-1]+1); } } } return distances[sourceLength-1][targetLength-1]; } /** * Calculates the Optimal String Alignment * between two strings * * @param {string} source description * @param {string} target description * @return {int} table description */ exports.OSADistance = function(source,target){ var sourceLength = source.length; var targetLength = target.length; var table = matrix(sourceLength,targetLength); var cost = 0; for (var i = 0; i < sourceLength; i++) { table[i][0] = i; } for (var j = 0; j < targetLength; j++) { table[j][0] = j; } for (var k = 1; k < sourceLength; k++) { for (var y = 1; y < targetLength; y++) { cost = ( source[k] == target[y] ) ? 0 : 1; table[k][y] = levenMin(table[k-1][y] + 1, table[k][y-1] + 1, table[k-1][y-1] + cost); // this is the actual optimization if(k > 1 & y > 1 & source[k] == target[y-1] & source[k-1] == target[y]){ table[k][y] = levenMin(table[k][y], table[k-2][y-2] + cost, null); } } } return table[sourceLength-1][targetLength-1]; } /** * Damerau Levenshtein distance * * @param {string} source description * @param {string} target description * @return {int} description */ exports.DLevenshtein = function(source,target){ var sourceLength = source.length; var targetLength = target.length; if (sourceLength === 0 || targetLength === 0){ return Math.max(sourceLength,targetLength); }else{ var cost = (source[sourceLength-1] != target[targetLength-1]) ? 1 : 0; var H1 = source.substring(0,sourceLength-1); var H2 = target.substring(0,targetLength-1); if (sourceLength>1 && targetLength>1 && source[sourceLength-1] == target[targetLength-2] && source[sourceLength-2] == target[targetLength-1]) { return levenMin( this.DLevenshtein(H1,target)+1, this.DLevenshtein(source,H2)+1, this.DLevenshtein(H1,H2)+cost, this.DLevenshtein(source.substring(0,sourceLength-2),target.substring(0,targetLength-2)+1) ); } return levenMin( this.DLevenshtein(H1,target)+1, this.DLevenshtein(source,H2)+1, this.DLevenshtein(H1,H2)+cost ); } } /** * Hamming Distance * * @param {string} source description * @param {string} target description * @return {int} distance description */ exports.hamming = function(source,target){ if ( source.length == target.length){ throw new Error("String must be equal length"); } var distance = 0; var value = source ^ target; while(value !== 0){ distance++; value &= value-1; } return distance; } /** * Calculate the Cosin Similarity of two vectors * (in this case, two strings), the range is [0,1]: * * - 0 : no similarity * - 1 : total similarity * * @param {string} source description * @param {string} target description * @return {int} cosinSim description */ // exports.cosinSim = function(source,target){ // }