UNPKG

fuzzy-match-utils

Version:

A collection of string matching algorithms built with React Select in mind

github.com/Khan/fuzzy-match-utils

Khan/fuzzy-match-utils

220 lines (194 loc) • 6.54 kB

JavaScript

// @flow /** * A collection of string matching algorithms built with React Select in mind. */ // Option type from React Select and similar libraries. export type Option = { label?: string, value?: any, }; type MapOfStrings = {[key: string]: string}; /** * Filters React Select options and sorts by similarity to a search filter. * Handles partial matches, eg. searching for "Waberg High" will find "Raoul * Wallenberg Traditional High School". Case insensitive. Ignores * non-alphanumeric characters. * * @param options An unfiltered list of Options. * @param? filter A string to compare against Option labels. * @param? substitutions Strings with multiple spellings or variations that we * expect to match, eg. accented characters or abbreviated words. * * @return A filtered and sorted array of Options. */ export function filterOptions( options: Array<Option>, filter?: string, substitutions?: MapOfStrings, ): Array<Option> { // If the filter is blank, return the full list of Options. if (!filter) { return options; } const cleanFilter = cleanUpText(filter, substitutions); return options // Filter out undefined or null Options. .filter(({label, value}) => label != null && value != null) // Create a {score, Option} pair for each Option based on its label's // similarity to the filter text. .map(option => ({ option: option, score: typeaheadSimilarity( cleanUpText(option.label, substitutions), cleanFilter), })) // Only include matches of the entire substring, with a slight // affordance for transposition or extra characters. .filter(pair => pair.score >= cleanFilter.length - 2) // Sort 'em by order of their score. .sort((a, b) => b.score - a.score) // …and grab the original Options back from their pairs. .map(pair => pair.option); } /** * Scores the similarity between two strings by returning the length of the * longest common subsequence. Intended for comparing strings of different * lengths; eg. when matching a typeahead search input with a school name. * Meant for use in an instant search box where results are being fetched * as a user is typing. * * @param a The longer string (though, we flip them if it's shorter). * @param b The shorter string, eg. a typeahead search input. * * @return The length of the longest common subsequence. Higher scores indicate * closer matches. */ export function typeaheadSimilarity(a: string, b: string): number { const aLength = a.length; const bLength = b.length; const table = []; if (!aLength || !bLength) { return 0; } // Ensure `a` isn't shorter than `b`. if (aLength < bLength) { [a, b] = [b, a]; } // Early exit if `a` includes `b`; these will be scored higher than any // other options with the same `b` (filter string), with a preference for // shorter `a` strings (option labels). if (a.indexOf(b) !== -1) { return bLength + 1 / aLength; } // Initialize the table axes: // // 0 0 0 0 ... bLength // 0 // 0 // // ... // // aLength // for (let x = 0; x <= aLength; ++x) { table[x] = [0]; } for (let y = 0; y <= bLength; ++y) { table[0][y] = 0; } // Populate the rest of the table with a dynamic programming algorithm. for (let x = 1; x <= aLength; ++x) { for (let y = 1; y <= bLength; ++y) { table[x][y] = a[x - 1] === b[y - 1] ? 1 + table[x - 1][y - 1] : Math.max(table[x][y - 1], table[x - 1][y]); } } return table[aLength][bLength]; } /** * Returns the Levenshtein distance between two strings. * * NOTE: The Jaro-Winkler distance also worked well and is slightly more * performant. Levenshtein seems to match more reliably, which is more * important here. * * @param a The first string for comparison. * @param b The second string for comparison. * * @return The Levenshtein distance, where lower distance indicates higher * similarity. */ export function fullStringDistance(a: string, b: string): number { const aLength = a.length; const bLength = b.length; const table = []; if (!aLength) { return bLength; } if (!bLength) { return aLength; } // Initialize the table axes: // // 0 1 2 3 4 ... bLength // 1 // 2 // // ... // // aLength // for (let x = 0; x <= aLength; ++x) { table[x] = [x]; } for (let y = 0; y <= bLength; ++y) { table[0][y] = y; } // Populate the rest of the table with a dynamic programming algorithm. for (let x = 1; x <= aLength; ++x) { for (let y = 1; y <= bLength; ++y) { table[x][y] = a[x - 1] === b[y - 1] ? table[x - 1][y - 1] : 1 + Math.min( table[x - 1][y], // Substitution, table[x][y - 1], // insertion, table[x - 1][y - 1]); // and deletion. } } return table[aLength][bLength]; } /** * Apply string substitutions, remove non-alphanumeric characters, and convert * all letters to uppercase. * * eg. 'Scoil Bhríde Primary School' may become 'SCOILBHRIDEPRIMARYSCHOOL'. * * @param input An unsanitized input string. * @param substitutions Strings with multiple spellings or variations that we * expect to match, for example accented characters or abbreviated * words. * * @return The sanitized text. */ export function cleanUpText( input?: string, substitutions?: MapOfStrings, ): string { if (!input) { return ''; } // Uppercase and remove all non-alphanumeric, non-accented characters. // Also remove underscores. input = input.toUpperCase().replace(/((?=[^\u00E0-\u00FC])\W)|_/g, ''); if (!substitutions) { return input; } const safeSubstitutions: MapOfStrings = substitutions; // For Flow. // Replace all strings in `safeSubstitutions` with their standardized // counterparts. return Object.keys(safeSubstitutions) .reduce((output, substitution) => { const unsubbed = new RegExp(substitution, 'g'); return output.replace(unsubbed, safeSubstitutions[substitution]); }, input); }