UNPKG

@nozbe/microfuzz

Version:

A tiny, simple, fast fuzzy search library

306 lines (291 loc) 11.8 kB
"use strict"; exports.__esModule = true; exports.aggressiveFuzzyMatch = aggressiveFuzzyMatch; exports.createFuzzySearchImpl = createFuzzySearchImpl; exports.experimentalSmartFuzzyMatch = experimentalSmartFuzzyMatch; exports.fuzzyMatchImpl = fuzzyMatchImpl; var _normalizeText = _interopRequireDefault(require("./normalizeText")); function _interopRequireDefault(obj) { return obj && obj.__esModule ? obj : { "default": obj }; } // @flow /* eslint-disable no-continue */ /*:: import type { Range, FuzzySearcher, FuzzySearchOptions, FuzzySearchStrategy, FuzzyResult, HighlightRanges, FuzzyMatches, } from './index'*/ var MAX_SAFE_INTEGER = Number.MAX_SAFE_INTEGER; var sortByScore = function sortByScore(a /*: FuzzyResult<T>*/, b /*: FuzzyResult<T>*/) { return (/*: number*/a.score - b.score ); } /*:: <T>*/; var sortRangeTuple = function sortRangeTuple(a /*: Range*/, b /*: Range*/) { return (/*: number*/a[0] - b[0] ); }; var validWordBoundaries = new Set('  []()-–—\'"“”'.split('')); function isValidWordBoundary(character /*: string*/) /*: boolean*/{ return validWordBoundaries.has(character); } function matchesFuzzily(item /*: string*/, normalizedItem /*: string*/, itemWords /*: Set<string>*/, query /*: string*/, normalizedQuery /*: string*/, queryWords /*: string[]*/, strategy /*: FuzzySearchStrategy*/) /*: ?[number, HighlightRanges]*/{ // quick matches if (item === query) { return [0, [[0, item.length - 1]]]; } var queryLen = query.length; var normalizedItemLen = normalizedItem.length; var normalizedQueryLen = normalizedQuery.length; if (normalizedItem === normalizedQuery) { return [0.1, [[0, normalizedItemLen - 1]]]; } else if (normalizedItem.startsWith(normalizedQuery)) { return [0.5, [[0, normalizedQueryLen - 1]]]; } // contains query (starting at word boundary) // NOTE: It would be more correct to do a regex search, than to check previous character, since // it could be that the item found does _not_ start at a word boundary, but there is another match // that does. However, this is faster and should rarely be a problem, while fuzzy search will still // find other matches (just ranked lower) var exactContainsIdx = item.indexOf(query); if (exactContainsIdx > -1 && isValidWordBoundary(item[exactContainsIdx - 1])) { return [0.9, [[exactContainsIdx, exactContainsIdx + queryLen - 1]]]; } var containsIdx = normalizedItem.indexOf(normalizedQuery); if (containsIdx > -1 && isValidWordBoundary(normalizedItem[containsIdx - 1])) { return [1, [[containsIdx, containsIdx + queryLen - 1]]]; } // Match by words included // Score: 1.5 + 0.2*words (so that it's better than two non-word chunks) var queryWordCount = queryWords.length; if (queryWordCount > 1) { if (queryWords.every(function (word) { return itemWords.has(word); })) { var score = 1.5 + queryWordCount * 0.2; return [score, queryWords.map(function (word) { var wordIndex = normalizedItem.indexOf(word); return ([wordIndex, wordIndex + word.length - 1] /*: Range*/); }).sort(sortRangeTuple)]; } } // Contains query (at any position) if (containsIdx > -1) { return [2, [[containsIdx, containsIdx + queryLen - 1]]]; } // Match by consecutive letters (fuzzy) if (strategy === 'aggressive') { return aggressiveFuzzyMatch(normalizedItem, normalizedQuery); } else if (strategy === 'smart') { return experimentalSmartFuzzyMatch(normalizedItem, normalizedQuery); } return null; } function aggressiveFuzzyMatch(normalizedItem /*: string*/, normalizedQuery /*: string*/) /*: ?[number, HighlightRanges]*/{ var normalizedItemLen = normalizedItem.length; var normalizedQueryLen = normalizedQuery.length; var queryIdx = 0; var queryChar = normalizedQuery[queryIdx]; var indices /*: HighlightRanges*/ = []; var chunkFirstIdx = -1; var chunkLastIdx = -2; // TODO: May improve performance by early exits (less to go than remaining query) // and by using .indexOf(x, fromIndex) for (var itemIdx = 0; itemIdx < normalizedItemLen; itemIdx += 1) { // DEBUG: // console.log(`${itemIdx} (${normalizedItem[itemIdx]}), ${queryIdx} (${queryChar}), ${chunkLastIdx}, score: ${consecutiveChunks}`) if (normalizedItem[itemIdx] === queryChar) { if (itemIdx !== chunkLastIdx + 1) { if (chunkFirstIdx >= 0) { indices.push([chunkFirstIdx, chunkLastIdx]); } chunkFirstIdx = itemIdx; } chunkLastIdx = itemIdx; queryIdx += 1; if (queryIdx === normalizedQueryLen) { indices.push([chunkFirstIdx, chunkLastIdx]); return scoreConsecutiveLetters(indices, normalizedItem); } queryChar = normalizedQuery[queryIdx]; } } return null; } function experimentalSmartFuzzyMatch(normalizedItem /*: string*/, normalizedQuery /*: string*/) /*: ?[number, HighlightRanges]*/{ var normalizedItemLen = normalizedItem.length; // Match by consecutive letters, but only match beginnings of words or chunks of 3+ letters // Note that there may be multiple valid ways in which such matching can be done, and we'll only // match each chunk to the first one found that matches these criteria. It's not perfect as it's // possible that later chunks will fail to match while there's a better match, for example: // - query: ABC // - item: A xABC // ^___xx (no match) // ___^^^ (better match) // But we want to limit the algorithmic complexity and this should generally work. var indices /*: HighlightRanges*/ = []; var queryIdx = 0; var queryChar = normalizedQuery[queryIdx]; var chunkFirstIdx = -1; var chunkLastIdx = -2; // eslint-disable-next-line no-constant-condition while (true) { // Find match for first letter of chunk var idx = normalizedItem.indexOf(queryChar, chunkLastIdx + 1); if (idx === -1) { break; } // Check if chunk starts at word boundary if (idx === 0 || isValidWordBoundary(normalizedItem[idx - 1])) { chunkFirstIdx = idx; } else { // Else, check if chunk is at least 3+ letters var queryCharsLeft = normalizedQuery.length - queryIdx; var itemCharsLeft = normalizedItem.length - idx; var minimumChunkLen = Math.min(3, queryCharsLeft, itemCharsLeft); var minimumQueryChunk = normalizedQuery.slice(queryIdx, queryIdx + minimumChunkLen); if (normalizedItem.slice(idx, idx + minimumChunkLen) === minimumQueryChunk) { chunkFirstIdx = idx; } else { // Move index to continue search for valid chunk chunkLastIdx += 1; continue; } } // We have first index of a valid chunk, find its last index // TODO: We could micro-optimize by setting chunkLastIdx earlier if we already know it's len 3 or more for (chunkLastIdx = chunkFirstIdx; chunkLastIdx < normalizedItemLen; chunkLastIdx += 1) { if (normalizedItem[chunkLastIdx] !== queryChar) { break; } queryIdx += 1; queryChar = normalizedQuery[queryIdx]; } // Add chunk to indices chunkLastIdx -= 1; // decrement as we've broken out of loop on non-matching char indices.push([chunkFirstIdx, chunkLastIdx]); // Check if we're done if (queryIdx === normalizedQuery.length) { return scoreConsecutiveLetters(indices, normalizedItem); } } // eslint-disable-next-line no-unreachable return null; } function scoreConsecutiveLetters(indices /*: HighlightRanges*/, normalizedItem /*: string*/) /*: ?[number, HighlightRanges]*/{ // Score: 2 + sum of chunk scores // Chunk scores: // - 0.2 for a full word // - 0.4 for chunk starting at beginning of word // - 0.8 for chunk in the middle of the word (if >=3 characters) // - 1.6 for chunk in the middle of the word (if 1 or 2 characters) var score = 2; indices.forEach(function (_ref) { var firstIdx = _ref[0], lastIdx = _ref[1]; var chunkLength = lastIdx - firstIdx + 1; var isStartOfWord = firstIdx === 0 || normalizedItem[firstIdx] === ' ' || normalizedItem[firstIdx - 1] === ' '; var isEndOfWord = lastIdx === normalizedItem.length - 1 || normalizedItem[lastIdx] === ' ' || normalizedItem[lastIdx + 1] === ' '; var isFullWord = isStartOfWord && isEndOfWord; // DEBUG: // console.log({ // firstIdx, // lastIdx, // chunkLength, // isStartOfWord, // isEndOfWord, // isFullWord, // before: normalizedItem[firstIdx - 1], // after: normalizedItem[lastIdx + 1], // }) if (isFullWord) { score += 0.2; } else if (isStartOfWord) { score += 0.4; } else if (chunkLength >= 3) { score += 0.8; } else { score += 1.6; } }); return [score, indices]; } function fuzzyMatchImpl(text /*: string*/, query /*: string*/) /*: ?FuzzyResult<string>*/{ var normalizedQuery = (0, _normalizeText["default"])(query); var queryWords = normalizedQuery.split(' '); var normalizedText = (0, _normalizeText["default"])(text); var itemWords = new Set(normalizedText.split(' ')); var result = matchesFuzzily(text, normalizedText, itemWords, query, normalizedQuery, queryWords, 'smart'); if (result) { return { item: text, score: result[0], matches: [result[1]] }; } return null; } function createFuzzySearchImpl /*:: <Element>*/(collection /*: Element[]*/, options /*: FuzzySearchOptions*/) /*: FuzzySearcher<Element>*/{ // TODO: Change default strategy to smart var _options$strategy = options.strategy, strategy = _options$strategy === void 0 ? 'aggressive' : _options$strategy, getText = options.getText; var preprocessedCollection /*: [Element, [string, string, Set<string>][]][]*/ = collection.map(function (element /*: Element*/) { var texts /*: (?string)[]*/; if (getText) { texts = getText(element); } else { // $FlowFixMe[incompatible-use] var text /*: string*/ = options.key ? element[options.key] : (element /*: any*/); texts = [text]; } var preprocessedTexts /*: [string, string, Set<string>][]*/ = texts.map(function (text) { var item = text || ''; var normalizedItem = (0, _normalizeText["default"])(item); var itemWords = new Set(normalizedItem.split(' ')); return [item, normalizedItem, itemWords]; }); return [element, preprocessedTexts]; }); return function (query /*: string*/) { // DEBUG // const b4 = Date.now() var results /*: Array<FuzzyResult<Element>>*/ = []; var normalizedQuery = (0, _normalizeText["default"])(query); var queryWords = normalizedQuery.split(' '); if (!normalizedQuery.length) { return []; } preprocessedCollection.forEach(function (_ref2) { var element = _ref2[0], texts = _ref2[1]; var bestScore = MAX_SAFE_INTEGER; var matches /*: FuzzyMatches*/ = []; for (var i = 0, len = texts.length; i < len; i += 1) { var _texts$i = texts[i], item = _texts$i[0], normalizedItem = _texts$i[1], itemWords = _texts$i[2]; var result = matchesFuzzily(item, normalizedItem, itemWords, query, normalizedQuery, queryWords, strategy); if (result) { bestScore = Math.min(bestScore, result[0]); // take the lowest score of any match matches.push(result[1]); } else { matches.push(null); } } if (bestScore < MAX_SAFE_INTEGER) { results.push({ item: element, score: bestScore, matches: matches }); } }); results.sort(sortByScore); // DEBUG // console.log(`fuzzy search complete in ${Date.now() - b4} ms`) return results; }; }