fuzzball
Version:
Fuzzy string matching algorithms and utilities, port of the TheFuzz Python library.
130 lines (113 loc) • 5.41 kB
JavaScript
module.exports = function (_uniq, _uniqWith, _partialRight) {
var module = {};
var wildLeven = require('./wildcardLeven.js');
var leven = require('./leven.js');
function escapeRegExp(string) {
return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); // from MDN
}
function validate(str) {
if ((typeof str === "string" || str instanceof String) && str.length > 0) return true;
else return false;
}
module.validate = validate;
module.process_and_sort = function process_and_sort(str) {
if (!validate(str)) return "";
return str.match(/\S+/g).sort().join(" ").trim();
}
module.tokenize = function unique_tokens(str, options) {
if (options && options.wildcards && _uniqWith && _partialRight) {
var partWild = _partialRight(wildLeven, options, leven);
var wildCompare = function (a, b) { return partWild(a, b) === 0; }
return _uniqWith(str.match(/\S+/g), wildCompare);
}
else return _uniq(str.match(/\S+/g));
}
// Native Unicode property escapes for alphanumeric check
const alphaNumUnicode = /[^\p{L}\p{N}]/gu;
module.full_process = function full_process(str, options) {
if (!(str instanceof String) && typeof str !== "string") return "";
var processedtext;
if (options && typeof options === "object" && options.wildcards && typeof options.wildcards === "string" && options.wildcards.length > 0) {
var wildcards = options.wildcards.toLowerCase();
str = str.toLowerCase();
if (options.force_ascii) {
// replace non-ascii non-wildcards
var pattern = '[^\x00 -\x7F|' + escapeRegExp(wildcards) + ']';
str = str.replace(new RegExp(pattern, "g"), "");
// replace wildcards with wildchar
var wildpattern = '[' + escapeRegExp(wildcards) + ']';
var wildchar = wildcards[0];
str = str.replace(new RegExp(wildpattern, "g"), wildchar);
// replace non alpha-num non-wildcards with space
var alphanumPat = '[^A-Za-z0-9' + escapeRegExp(wildcards) + ']';
str = str.replace(new RegExp(alphanumPat, "g"), " ");
str = str.replace(/_/g, ' ');
// wildcards are case insensitive as of now
// would need to make sure lower version of wildcards didnt get turned into wildcards
processedtext = str.trim();
}
else {
// replace non-alphanum non-wildcards
var upattern = '[^\\p{L}\\p{N}|' + escapeRegExp(wildcards) + ']';
str = str.replace(new RegExp(upattern, 'gu'), ' ');
// replace wildcards with wildchar
var wildpattern = '[' + escapeRegExp(wildcards) + ']';
var wildchar = wildcards[0];
str = str.replace(new RegExp(wildpattern, "g"), wildchar);
// wildcards are case insensitive as of now
// would need to make sure lower version of wildcards didnt get turned into wildcards
processedtext = str.trim();
}
}
else {
// Non-ascii won't turn into whitespace if not force_ascii
if (options && (options.force_ascii || options === true)) { //support old behavior just passing true
str = str.replace(/[^\x00-\x7F]/g, "");
processedtext = str.replace(/\W|_/g, ' ').toLowerCase().trim();
}
else {
processedtext = str.replace(alphaNumUnicode, ' ').toLowerCase().trim();
}
}
if (options && options.collapseWhitespace) {
processedtext = processedtext.replace(/\s+/g, ' ');
}
return processedtext;
}
// clone/shallow copy whatev
module.clone_and_set_option_defaults = function(options) {
// don't run more than once if usign extract functions
if (options && options.isAClone) return options;
var optclone = { isAClone: true };
if (options) {
var i, keys = Object.keys(options);
for (i = 0; i < keys.length; i++) {
optclone[keys[i]] = options[keys[i]];
}
}
if (!(optclone.full_process === false)) optclone.full_process = true;
if (!(optclone.force_ascii === true)) optclone.force_ascii = false;
// normalize option not used unless astral is true, so true + no astral = no normalize
if (!(optclone.normalize === false) && optclone.astral === true) { optclone.normalize = true; }
if (!(optclone.collapseWhitespace === false)) optclone.collapseWhitespace = true;
return optclone;
}
module.isCustomFunc = function(func) {
if (typeof func === "function" && (
func.name === "token_set_ratio" ||
func.name === "partial_token_set_ratio" ||
func.name === "token_sort_ratio" ||
func.name === "partial_token_sort_ratio" ||
func.name === "QRatio" ||
func.name === "WRatio" ||
func.name === "distance" ||
func.name === "partial_ratio"
)) {
return false;
}
else {
return true;
}
}
return module;
}