UNPKG

fuzzball

Version:

Fuzzy string matching algorithms and utilities, port of the TheFuzz Python library.

539 lines (503 loc) 25.9 kB
(function () { /** @module fuzzball */ 'use strict'; var nativeUtils = require('../lib/native_utils.js'); var _intersect = nativeUtils._intersect; var _difference = nativeUtils._difference; var _uniq = nativeUtils._uniq; var leven = require('../lib/leven'); var utils = require('../lib/utils_ultra_lite.js')(_uniq); var validate = utils.validate; var process_and_sort = utils.process_and_sort; var tokenize = utils.tokenize; var full_process = utils.full_process; var clone_and_set_option_defaults = utils.clone_and_set_option_defaults; var isCustomFunc = utils.isCustomFunc; if (typeof setImmediate !== 'function') { require('setimmediate'); } // didn't run in tiny-worker without extra check // isArray polyfill if (typeof Array.isArray === 'undefined') { Array.isArray = function (obj) { return Object.prototype.toString.call(obj) === '[object Array]'; } }; /** Mostly follows after python fuzzywuzzy, https://github.com/seatgeek/fuzzywuzzy */ /** Public functions */ function distance(str1, str2, options_p) { /** * Calculate levenshtein distance of the two strings. * * @function distance * @param {string} str1 - the first string. * @param {string} str2 - the second string. * @param {Object} [options_p] - Additional options. * @param {boolean} [options_p.full_process] - Apply basic cleanup, non-alphanumeric to whitespace etc. if true. default true * @param {boolean} [options_p.force_ascii] - Strip non-ascii in full_process if true (non-ascii will not become whtespace), only applied if full_process is true as well, default true * @param {boolean} [options_p.collapseWhitespace] - Collapse consecutive white space during full_process, default true * @returns {number} - the levenshtein distance (0 and above). */ var options = clone_and_set_option_defaults(options_p); str1 = options.full_process ? full_process(str1, options) : str1; str2 = options.full_process ? full_process(str2, options) : str2; return leven(str1, str2, options); // falls back to leven if no wildcards } function QRatio(str1, str2, options_p) { /** * Calculate levenshtein ratio of the two strings. * * @function ratio * @param {string} str1 - the first string. * @param {string} str2 - the second string. * @param {Object} [options_p] - Additional options. * @param {boolean} [options_p.full_process] - Apply basic cleanup, non-alphanumeric to whitespace etc. if true. default true * @param {boolean} [options_p.force_ascii] - Strip non-ascii in full_process if true (non-ascii will not become whtespace), only applied if full_process is true as well, default true * @param {boolean} [options_p.collapseWhitespace] - Collapse consecutive white space during full_process, default true * @returns {number} - the levenshtein ratio (0-100). */ var options = clone_and_set_option_defaults(options_p); str1 = options.full_process ? full_process(str1, options) : str1; str2 = options.full_process ? full_process(str2, options) : str2; if (!validate(str1)) return 0; if (!validate(str2)) return 0; return _ratio(str1, str2, options); } function token_set_ratio(str1, str2, options_p) { /** * Calculate token set ratio of the two strings. * * @function token_set_ratio * @param {string} str1 - the first string. * @param {string} str2 - the second string. * @param {Object} [options_p] - Additional options. * @param {boolean} [options_p.full_process] - Apply basic cleanup, non-alphanumeric to whitespace etc. if true. default true * @param {boolean} [options_p.force_ascii] - Strip non-ascii in full_process if true (non-ascii will not become whtespace), only applied if full_process is true as well, default true * @returns {number} - the levenshtein ratio (0-100). */ var options = clone_and_set_option_defaults(options_p); str1 = options.full_process ? full_process(str1, options) : str1; str2 = options.full_process ? full_process(str2, options) : str2; if (!validate(str1)) return 0; if (!validate(str2)) return 0; return _token_set(str1, str2, options); } function token_sort_ratio(str1, str2, options_p) { /** * Calculate token sort ratio of the two strings. * * @function token_sort_ratio * @param {string} str1 - the first string. * @param {string} str2 - the second string. * @param {Object} [options_p] - Additional options. * @param {boolean} [options_p.full_process] - Apply basic cleanup, non-alphanumeric to whitespace etc. if true. default true * @param {boolean} [options_p.force_ascii] - Strip non-ascii in full_process if true (non-ascii will not become whtespace), only applied if full_process is true as well, default true * @returns {number} - the levenshtein ratio (0-100). */ var options = clone_and_set_option_defaults(options_p); str1 = options.full_process ? full_process(str1, options) : str1; str2 = options.full_process ? full_process(str2, options) : str2; if (!validate(str1)) return 0; if (!validate(str2)) return 0; if (!options.proc_sorted) { str1 = process_and_sort(str1); str2 = process_and_sort(str2); } return _ratio(str1, str2, options); } function extract(query, choices, options_p) { /** * Return the top scoring items from an array (or assoc array) of choices * * @param {string} query - the search term. * @param {String[]|Object[]|Object} choices - array of strings, or array of choice objects if processor is supplied, or object of form {key: choice} * @param {Object} [options_p] - Additional options. * @param {function} [options_p.scorer] - takes two values and returns a score * @param {function} [options_p.processor] - takes each choice and outputs a value to be used for Scoring * @param {number} [options_p.limit] - optional max number of results to return, returns all if not supplied * @param {number} [options_p.cutoff] - minimum score that will get returned 0-100 * @param {boolean} [options_p.full_process] - Apply basic cleanup, non-alphanumeric to whitespace etc. if true. default true * @param {boolean} [options_p.force_ascii] - Strip non-ascii in full_process if true (non-ascii will not become whtespace), only applied if full_process is true as well, default false * @param {boolean} [options_p.collapseWhitespace] - Collapse consecutive white space during full_process, default true * @param {boolean} [options_p.trySimple] - try simple/partial ratio as part of (parial_)token_set_ratio test suited * @param {boolean} [options_p.returnObjects] - return array of object instead of array of tuples; default false * @returns {Object[]} - array of choice results with their computed ratios (0-100). */ var options = clone_and_set_option_defaults(options_p); var isArray = false; var numchoices; if (choices && choices.length && Array.isArray(choices)) { numchoices = choices.length; isArray = true; //if array don't check hasOwnProperty every time below } else if (!(choices instanceof Object)) { throw new Error("Invalid choices"); } else numchoices = Object.keys(choices).length; if (!choices || numchoices === 0) { if (typeof console !== undefined) console.warn("No choices"); return []; } if (options.processor && typeof options.processor !== "function") { throw new Error("Invalid Processor"); } if (!options.processor) options.processor = function (x) { return x; } if (options.scorer && typeof options.scorer !== "function") { throw new Error("Invalid Scorer"); } if (!options.scorer) { options.scorer = QRatio; } var isCustom = isCustomFunc(options.scorer); // check if func name is one of fuzzball's, so don't use same names.. if (!options.cutoff || typeof options.cutoff !== "number") { options.cutoff = -1;} var pre_processor = function(choice, force_ascii) {return choice;} if (options.full_process) { pre_processor = full_process; if (!isCustom) options.processed = true; // to let wildcardLeven know and not run again after we set fp to false below } if (!isCustom) { // if custom scorer func let scorer handle it query = pre_processor(query, options); options.full_process = false; if (query.length === 0) if (typeof console !== undefined) console.warn("Processed query is empty string"); } var results = []; var anyblank = false; var tsort = false; var tset = false; if (options.scorer.name === "token_sort_ratio" || options.scorer.name === "partial_token_sort_ratio") { var proc_sorted_query = process_and_sort(query); tsort = true; } else if (options.scorer.name === "token_set_ratio" || options.scorer.name === "partial_token_set_ratio") { var query_tokens = tokenize(query); tset = true; } var idx, mychoice, result, cmpSort; if (options.returnObjects) { cmpSort = function (a, b) { return b.score - a.score; }; } else { cmpSort = function (a, b) { return b[1] - a[1]; }; } for (var c in choices) { if (isArray || choices.hasOwnProperty(c)) { options.tokens = undefined; options.proc_sorted = false; if (tsort) { options.proc_sorted = true; if (choices[c] && choices[c].proc_sorted) mychoice = choices[c].proc_sorted; else { mychoice = pre_processor(options.processor(choices[c]), options); mychoice = process_and_sort(mychoice); } result = options.scorer(proc_sorted_query, mychoice, options); } else if (tset) { mychoice = "x"; //dummy string so it validates if (choices[c] && choices[c].tokens) { options.tokens = [query_tokens, choices[c].tokens]; if (options.trySimple) mychoice = pre_processor(options.processor(choices[c]), options); } else { mychoice = pre_processor(options.processor(choices[c]), options); options.tokens = [query_tokens, tokenize(mychoice)] } //query and mychoice only used for validation here unless trySimple = true result = options.scorer(query, mychoice, options); } else if (isCustom) { // options.full_process should be unmodified, don't pre-process here since mychoice maybe not string mychoice = options.processor(choices[c]); result = options.scorer(query, mychoice, options); } else { mychoice = pre_processor(options.processor(choices[c]), options); if (typeof mychoice !== "string" || mychoice.length === 0) anyblank = true; result = options.scorer(query, mychoice, options); } if (isArray) idx = parseInt(c); else idx = c; if (result > options.cutoff) { if (options.returnObjects) results.push({ choice: choices[c], score: result, key: idx }); else results.push([choices[c], result, idx]);; } } } if (anyblank) if (typeof console !== undefined) console.log("One or more choices were empty. (post-processing if applied)") if (options.limit && typeof options.limit === "number" && options.limit > 0 && options.limit < numchoices && !options.unsorted) { results = results.sort(cmpSort).slice(0, options.limit); } else if (!options.unsorted) { results = results.sort(cmpSort); } return results; } function extractAsync(query, choices, options_p, callback) { /** * Return the top scoring items from an array (or assoc array) of choices * * @param {string} query - the search term. * @param {String[]|Object[]|Object} choices - array of strings, or array of choice objects if processor is supplied, or object of form {key: choice} * @param {Object} [options_p] - Additional options. * @param {function} [options_p.scorer] - takes two values and returns a score * @param {function} [options_p.processor] - takes each choice and outputs a value to be used for Scoring * @param {number} [options_p.limit] - optional max number of results to return, returns all if not supplied * @param {number} [options_p.cutoff] - minimum score that will get returned 0-100 * @param {boolean} [options_p.full_process] - Apply basic cleanup, non-alphanumeric to whitespace etc. if true. default true * @param {boolean} [options_p.force_ascii] - Strip non-ascii in full_process if true (non-ascii will not become whtespace), only applied if full_process is true as well, default false * @param {boolean} [options_p.collapseWhitespace] - Collapse consecutive white space during full_process, default true * @param {boolean} [options_p.trySimple] - try simple/partial ratio as part of (parial_)token_set_ratio test suite * @param {boolean} [options_p.returnObjects] - return array of object instead of array of tuples; default false * @param {Object} [options_p.abortController] - track abortion * @param {Object} [options_p.cancelToken] - track cancellation * @param {number} [options_p.asyncLoopOffset] - number of rows to run in between every async loop iteration, default 256 * @param {function} callback - node style callback (err, arrayOfResults) */ var options = clone_and_set_option_defaults(options_p); var abortController; if (typeof options_p.abortController === "object") { abortController = options_p.abortController; } var cancelToken; if (typeof options_p.cancelToken === "object") { cancelToken = options_p.cancelToken; } var loopOffset = 256; if (typeof options.asyncLoopOffset === 'number') { if (options.asyncLoopOffset < 1) loopOffset = 1; else loopOffset = options.asyncLoopOffset; } var isArray = false; var numchoices; if (choices && choices.length && Array.isArray(choices)) { numchoices = choices.length; isArray = true; //if array don't check hasOwnProperty every time below } else if (!(choices instanceof Object)) { callback(new Error("Invalid choices")); return; } else numchoices = Object.keys(choices).length; if (!choices || numchoices === 0) { if (typeof console !== undefined) console.warn("No choices"); callback(null, []); return; } if (options.processor && typeof options.processor !== "function") { callback(new Error("Invalid Processor")); return; } if (!options.processor) options.processor = function (x) { return x; } if (options.scorer && typeof options.scorer !== "function") { callback(new Error("Invalid Scorer")); return; } if (!options.scorer) { options.scorer = QRatio; } var isCustom = isCustomFunc(options.scorer); // check if func name is one of fuzzball's, so don't use same names.. if (!options.cutoff || typeof options.cutoff !== "number") { options.cutoff = -1; } var pre_processor = function (choice, force_ascii) { return choice; } if (options.full_process) { pre_processor = full_process; if (!isCustom) options.processed = true; // to let wildcardLeven know and not run again after we set fp to false below } if (!isCustom) { // if custom scorer func let scorer handle it query = pre_processor(query, options); options.full_process = false; if (query.length === 0) if (typeof console !== undefined) console.warn("Processed query is empty string"); } var results = []; var anyblank = false; var tsort = false; var tset = false; if (options.scorer.name === "token_sort_ratio" || options.scorer.name === "partial_token_sort_ratio") { var proc_sorted_query = process_and_sort(query); tsort = true; } else if (options.scorer.name === "token_set_ratio" || options.scorer.name === "partial_token_set_ratio") { var query_tokens = tokenize(query); tset = true; } var idx, mychoice, result, cmpSort; if (options.returnObjects) { cmpSort = function (a, b) { return b.score - a.score; }; } else { cmpSort = function (a, b) { return b[1] - a[1]; }; } var keys = Object.keys(choices); isArray ? searchLoop(0) : searchLoop(keys[0], 0); function searchLoop (c, i) { if (isArray || choices.hasOwnProperty(c)) { options.tokens = undefined; options.proc_sorted = false; if (tsort) { options.proc_sorted = true; if (choices[c].proc_sorted) mychoice = choices[c].proc_sorted; else { mychoice = pre_processor(options.processor(choices[c]), options); mychoice = process_and_sort(mychoice); } result = options.scorer(proc_sorted_query, mychoice, options); } else if (tset) { mychoice = "x"; //dummy string so it validates if (choices[c].tokens) { options.tokens = [query_tokens, choices[c].tokens]; if (options.trySimple) mychoice = pre_processor(options.processor(choices[c]), options); } else { mychoice = pre_processor(options.processor(choices[c]), options); options.tokens = [query_tokens, tokenize(mychoice)] } //query and mychoice only used for validation here unless trySimple = true result = options.scorer(query, mychoice, options); } else if (isCustom) { // options.full_process should be unmodified, don't pre-process here since mychoice maybe not string mychoice = options.processor(choices[c]); result = options.scorer(query, mychoice, options); } else { mychoice = pre_processor(options.processor(choices[c]), options); if (typeof mychoice !== "string" || mychoice.length === 0) anyblank = true; result = options.scorer(query, mychoice, options); } if (isArray) idx = parseInt(c); else idx = c; if (result > options.cutoff) { if (options.returnObjects) results.push({ choice: choices[c], score: result, key: idx }); else results.push([choices[c], result, idx]);; } } if (abortController && abortController.signal.aborted === true) { callback(new Error("aborted")); return; } if (cancelToken && cancelToken.canceled === true) { callback(new Error("canceled")); return; } if (isArray && c < choices.length - 1) { if (c % loopOffset === 0) { setImmediate(function () { searchLoop(c + 1); }); } else searchLoop(c + 1); } else if (i < keys.length - 1) { if (i % loopOffset === 0) { setImmediate(function () { searchLoop(keys[i + 1], i + 1); }); } else { searchLoop(keys[i + 1], i + 1); } } else { if (anyblank) if (typeof console !== undefined) console.log("One or more choices were empty. (post-processing if applied)") if (options.limit && typeof options.limit === "number" && options.limit > 0 && options.limit < numchoices && !options.unsorted) { results = results.sort(cmpSort).slice(0, options.limit); } else if (!options.unsorted) { results = results.sort(cmpSort); } callback(null, results); } } } /** Main Scoring Code */ function _token_set(str1, str2, options) { if (!options.tokens) { var tokens1 = tokenize(str1); var tokens2 = tokenize(str2); } else { var tokens1 = options.tokens[0]; var tokens2 = options.tokens[1]; } var intersection = _intersect(tokens1, tokens2); var diff1to2 = _difference(tokens1, tokens2); var diff2to1 = _difference(tokens2, tokens1); var sorted_sect = intersection.sort().join(" "); var sorted_1to2 = diff1to2.sort().join(" "); var sorted_2to1 = diff2to1.sort().join(" "); var combined_1to2 = sorted_sect + " " + sorted_1to2; var combined_2to1 = sorted_sect + " " + sorted_2to1; sorted_sect = sorted_sect.trim(); combined_1to2 = combined_1to2.trim(); combined_2to1 = combined_2to1.trim(); var ratio_func = _ratio; var pairwise = [ ratio_func(sorted_sect, combined_1to2, options), ratio_func(sorted_sect, combined_2to1, options), ratio_func(combined_1to2, combined_2to1, options) ] if (options.trySimple) { pairwise.push(ratio_func(str1, str2, options)); } return Math.max.apply(null, pairwise); } function _ratio(str1, str2, options) { if (!validate(str1)) return 0; if (!validate(str2)) return 0; //to match behavior of python-Levenshtein/fuzzywuzzy, substitution cost is 2 var levdistance, lensum; if (typeof options.subcost === "undefined") options.subcost = 2; levdistance = leven(str1, str2, options); lensum = str1.length + str2.length; return Math.round(100 * ((lensum - levdistance) / lensum)); } //polyfill for Object.keys // From https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Object/keys if (!Object.keys) { Object.keys = (function () { 'use strict'; var hasOwnProperty = Object.prototype.hasOwnProperty, hasDontEnumBug = !({ toString: null }).propertyIsEnumerable('toString'), dontEnums = [ 'toString', 'toLocaleString', 'valueOf', 'hasOwnProperty', 'isPrototypeOf', 'propertyIsEnumerable', 'constructor' ], dontEnumsLength = dontEnums.length; return function (obj) { if (typeof obj !== 'object' && (typeof obj !== 'function' || obj === null)) { throw new TypeError('Object.keys called on non-object'); } var result = [], prop, i; for (prop in obj) { if (hasOwnProperty.call(obj, prop)) { result.push(prop); } } if (hasDontEnumBug) { for (i = 0; i < dontEnumsLength; i++) { if (hasOwnProperty.call(obj, dontEnums[i])) { result.push(dontEnums[i]); } } } return result; }; } ()); } var extractAsPromised = undefined; if (typeof Promise !== 'undefined') { extractAsPromised = function (query, choices, options) { return new Promise(function (resolve, reject) { extractAsync(query, choices, options, function (err, response) { if (err) reject(err); else resolve(response); }); }); }; } var fuzzball = { distance: distance, ratio: QRatio, token_set_ratio: token_set_ratio, token_sort_ratio: token_sort_ratio, full_process: full_process, extract: extract, extractAsync: extractAsync, extractAsPromised: extractAsPromised, process_and_sort: process_and_sort, unique_tokens: tokenize }; module.exports = fuzzball; } ());