@nozbe/microfuzz

A tiny, simple, fast fuzzy search library
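For orientation before the implementation source below, here is a minimal usage sketch. It assumes the package's public createFuzzySearch entry point (which wraps the createFuzzySearchImpl factory defined in this file); the sample data and option values are made up for illustration.

// Usage sketch (hedged): createFuzzySearch is assumed to be the public wrapper
// around createFuzzySearchImpl below; the list and options are illustrative.
import createFuzzySearch from '@nozbe/microfuzz'

const people = [{ name: 'Ada Lovelace' }, { name: 'Alan Turing' }]

// Preprocess the collection once; search against the strings returned by getText
const fuzzySearch = createFuzzySearch(people, {
  getText: (person) => [person.name],
})

// Returns an array of { item, score, matches }, sorted so that lower (better) scores come first
const results = fuzzySearch('alan')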

// @flow
/* eslint-disable no-continue */

import normalizeText from './normalizeText'
import type {
  Range,
  FuzzySearcher,
  FuzzySearchOptions,
  FuzzySearchStrategy,
  FuzzyResult,
  HighlightRanges,
  FuzzyMatches,
} from './index'

const { MAX_SAFE_INTEGER } = Number

const sortByScore = <T>(a: FuzzyResult<T>, b: FuzzyResult<T>): number => a.score - b.score
const sortRangeTuple = (a: Range, b: Range): number => a[0] - b[0]

const validWordBoundaries = new Set('  []()-–—\'"“”'.split(''))

function isValidWordBoundary(character: string): boolean {
  return validWordBoundaries.has(character)
}

function matchesFuzzily(
  item: string,
  normalizedItem: string,
  itemWords: Set<string>,
  query: string,
  normalizedQuery: string,
  queryWords: string[],
  strategy: FuzzySearchStrategy,
): ?[number, HighlightRanges] {
  // quick matches
  if (item === query) {
    return [0, [[0, item.length - 1]]]
  }

  const queryLen = query.length
  const normalizedItemLen = normalizedItem.length
  const normalizedQueryLen = normalizedQuery.length

  if (normalizedItem === normalizedQuery) {
    return [0.1, [[0, normalizedItemLen - 1]]]
  } else if (normalizedItem.startsWith(normalizedQuery)) {
    return [0.5, [[0, normalizedQueryLen - 1]]]
  }

  // contains query (starting at word boundary)
  // NOTE: It would be more correct to do a regex search, than to check previous character, since
  // it could be that the item found does _not_ start at a word boundary, but there is another match
  // that does. However, this is faster and should rarely be a problem, while fuzzy search will still
  // find other matches (just ranked lower)
  const exactContainsIdx = item.indexOf(query)
  if (exactContainsIdx > -1 && isValidWordBoundary(item[exactContainsIdx - 1])) {
    return [0.9, [[exactContainsIdx, exactContainsIdx + queryLen - 1]]]
  }

  const containsIdx = normalizedItem.indexOf(normalizedQuery)
  if (containsIdx > -1 && isValidWordBoundary(normalizedItem[containsIdx - 1])) {
    return [1, [[containsIdx, containsIdx + queryLen - 1]]]
  }

  // Match by words included
  // Score: 1.5 + 0.2*words (so that it's better than two non-word chunks)
  const queryWordCount = queryWords.length
  if (queryWordCount > 1) {
    if (queryWords.every((word) => itemWords.has(word))) {
      const score = 1.5 + queryWordCount * 0.2
      return [
        score,
        queryWords
          .map((word) => {
            const wordIndex = normalizedItem.indexOf(word)
            return ([wordIndex, wordIndex + word.length - 1]: Range)
          })
          .sort(sortRangeTuple),
      ]
    }
  }

  // Contains query (at any position)
  if (containsIdx > -1) {
    return [2, [[containsIdx, containsIdx + queryLen - 1]]]
  }

  // Match by consecutive letters (fuzzy)
  if (strategy === 'aggressive') {
    return aggressiveFuzzyMatch(normalizedItem, normalizedQuery)
  } else if (strategy === 'smart') {
    return experimentalSmartFuzzyMatch(normalizedItem, normalizedQuery)
  }

  return null
}

export function aggressiveFuzzyMatch(
  normalizedItem: string,
  normalizedQuery: string,
): ?[number, HighlightRanges] {
  const normalizedItemLen = normalizedItem.length
  const normalizedQueryLen = normalizedQuery.length

  let queryIdx = 0
  let queryChar = normalizedQuery[queryIdx]
  const indices: HighlightRanges = []
  let chunkFirstIdx = -1
  let chunkLastIdx = -2

  // TODO: May improve performance by early exits (less to go than remaining query)
  // and by using .indexOf(x, fromIndex)
  for (let itemIdx = 0; itemIdx < normalizedItemLen; itemIdx += 1) {
    // DEBUG:
    // console.log(`${itemIdx} (${normalizedItem[itemIdx]}), ${queryIdx} (${queryChar}), ${chunkLastIdx}, score: ${consecutiveChunks}`)
    if (normalizedItem[itemIdx] === queryChar) {
      if (itemIdx !== chunkLastIdx + 1) {
        if (chunkFirstIdx >= 0) {
          indices.push([chunkFirstIdx, chunkLastIdx])
        }
        chunkFirstIdx = itemIdx
      }
      chunkLastIdx = itemIdx

      queryIdx += 1
      if (queryIdx === normalizedQueryLen) {
        indices.push([chunkFirstIdx, chunkLastIdx])
        return scoreConsecutiveLetters(indices, normalizedItem)
      }

      queryChar = normalizedQuery[queryIdx]
    }
  }

  return null
}

export function experimentalSmartFuzzyMatch(
  normalizedItem: string,
  normalizedQuery: string,
): ?[number, HighlightRanges] {
  const normalizedItemLen = normalizedItem.length

  // Match by consecutive letters, but only match beginnings of words or chunks of 3+ letters
  // Note that there may be multiple valid ways in which such matching can be done, and we'll only
  // match each chunk to the first one found that matches these criteria. It's not perfect as it's
  // possible that later chunks will fail to match while there's a better match, for example:
  // - query: ABC
  // - item: A xABC
  //         ^___xx (no match)
  //         ___^^^ (better match)
  // But we want to limit the algorithmic complexity and this should generally work.
  const indices: HighlightRanges = []
  let queryIdx = 0
  let queryChar = normalizedQuery[queryIdx]
  let chunkFirstIdx = -1
  let chunkLastIdx = -2

  // eslint-disable-next-line no-constant-condition
  while (true) {
    // Find match for first letter of chunk
    const idx = normalizedItem.indexOf(queryChar, chunkLastIdx + 1)
    if (idx === -1) {
      break
    }

    // Check if chunk starts at word boundary
    if (idx === 0 || isValidWordBoundary(normalizedItem[idx - 1])) {
      chunkFirstIdx = idx
    } else {
      // Else, check if chunk is at least 3+ letters
      const queryCharsLeft = normalizedQuery.length - queryIdx
      const itemCharsLeft = normalizedItem.length - idx
      const minimumChunkLen = Math.min(3, queryCharsLeft, itemCharsLeft)
      const minimumQueryChunk = normalizedQuery.slice(queryIdx, queryIdx + minimumChunkLen)

      if (normalizedItem.slice(idx, idx + minimumChunkLen) === minimumQueryChunk) {
        chunkFirstIdx = idx
      } else {
        // Move index to continue search for valid chunk
        chunkLastIdx += 1
        continue
      }
    }

    // We have first index of a valid chunk, find its last index
    // TODO: We could micro-optimize by setting chunkLastIdx earlier if we already know it's len 3 or more
    for (chunkLastIdx = chunkFirstIdx; chunkLastIdx < normalizedItemLen; chunkLastIdx += 1) {
      if (normalizedItem[chunkLastIdx] !== queryChar) {
        break
      }

      queryIdx += 1
      queryChar = normalizedQuery[queryIdx]
    }

    // Add chunk to indices
    chunkLastIdx -= 1 // decrement as we've broken out of loop on non-matching char
    indices.push([chunkFirstIdx, chunkLastIdx])

    // Check if we're done
    if (queryIdx === normalizedQuery.length) {
      return scoreConsecutiveLetters(indices, normalizedItem)
    }
  }

  // eslint-disable-next-line no-unreachable
  return null
}

function scoreConsecutiveLetters(
  indices: HighlightRanges,
  normalizedItem: string,
): ?[number, HighlightRanges] {
  // Score: 2 + sum of chunk scores
  // Chunk scores:
  // - 0.2 for a full word
  // - 0.4 for chunk starting at beginning of word
  // - 0.8 for chunk in the middle of the word (if >=3 characters)
  // - 1.6 for chunk in the middle of the word (if 1 or 2 characters)
  let score = 2
  indices.forEach(([firstIdx, lastIdx]) => {
    const chunkLength = lastIdx - firstIdx + 1
    const isStartOfWord =
      firstIdx === 0 || normalizedItem[firstIdx] === ' ' || normalizedItem[firstIdx - 1] === ' '
    const isEndOfWord =
      lastIdx === normalizedItem.length - 1 ||
      normalizedItem[lastIdx] === ' ' ||
      normalizedItem[lastIdx + 1] === ' '
    const isFullWord = isStartOfWord && isEndOfWord

    // DEBUG:
    // console.log({
    //   firstIdx,
    //   lastIdx,
    //   chunkLength,
    //   isStartOfWord,
    //   isEndOfWord,
    //   isFullWord,
    //   before: normalizedItem[firstIdx - 1],
    //   after: normalizedItem[lastIdx + 1],
    // })

    if (isFullWord) {
      score += 0.2
    } else if (isStartOfWord) {
      score += 0.4
    } else if (chunkLength >= 3) {
      score += 0.8
    } else {
      score += 1.6
    }
  })

  return [score, indices]
}

export function fuzzyMatchImpl(text: string, query: string): ?FuzzyResult<string> {
  const normalizedQuery = normalizeText(query)
  const queryWords = normalizedQuery.split(' ')

  const normalizedText = normalizeText(text)
  const itemWords = new Set(normalizedText.split(' '))

  const result = matchesFuzzily(
    text,
    normalizedText,
    itemWords,
    query,
    normalizedQuery,
    queryWords,
    'smart',
  )

  if (result) {
    return { item: text, score: result[0], matches: [result[1]] }
  }

  return null
}

export function createFuzzySearchImpl<Element>(
  collection: Element[],
  options: FuzzySearchOptions,
): FuzzySearcher<Element> {
  // TODO: Change default strategy to smart
  const { strategy = 'aggressive', getText } = options

  const preprocessedCollection: [Element, [string, string, Set<string>][]][] = collection.map(
    (element: Element) => {
      let texts: (?string)[]

      if (getText) {
        texts = getText(element)
      } else {
        // $FlowFixMe[incompatible-use]
        const text: string = options.key ? element[options.key] : (element: any)
        texts = [text]
      }

      const preprocessedTexts: [string, string, Set<string>][] = texts.map((text) => {
        const item = text || ''
        const normalizedItem = normalizeText(item)
        const itemWords = new Set(normalizedItem.split(' '))

        return [item, normalizedItem, itemWords]
      })

      return [element, preprocessedTexts]
    },
  )

  return (query: string) => {
    // DEBUG
    // const b4 = Date.now()
    const results: Array<FuzzyResult<Element>> = []
    const normalizedQuery = normalizeText(query)
    const queryWords = normalizedQuery.split(' ')

    if (!normalizedQuery.length) {
      return []
    }

    preprocessedCollection.forEach(([element, texts]) => {
      let bestScore = MAX_SAFE_INTEGER
      const matches: FuzzyMatches = []

      for (let i = 0, len = texts.length; i < len; i += 1) {
        const [item, normalizedItem, itemWords] = texts[i]
        const result = matchesFuzzily(
          item,
          normalizedItem,
          itemWords,
          query,
          normalizedQuery,
          queryWords,
          strategy,
        )

        if (result) {
          bestScore = Math.min(bestScore, result[0]) // take the lowest score of any match
          matches.push(result[1])
        } else {
          matches.push(null)
        }
      }

      if (bestScore < MAX_SAFE_INTEGER) {
        results.push({ item: element, score: bestScore, matches })
      }
    })

    results.sort(sortByScore)

    // DEBUG
    // console.log(`fuzzy search complete in ${Date.now() - b4} ms`)

    return results
  }
}
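
To make the scoring tiers in matchesFuzzily concrete, here is a hedged sketch that calls the exported fuzzyMatchImpl directly (it always uses the 'smart' strategy). The import path is a placeholder, and the comments assume normalizeText lowercases its input; the scores shown follow from the score constants above.

// Sketch of fuzzyMatchImpl behavior (assumes normalizeText lowercases input;
// the import path is a placeholder for this module's exports).
import { fuzzyMatchImpl } from './fuzzySearchImpl'

fuzzyMatchImpl('Nozbe Teams', 'nozbe teams')
// → { item: 'Nozbe Teams', score: 0.1, matches: [[[0, 10]]] } – equal after normalization

fuzzyMatchImpl('Nozbe Teams', 'noz')
// → score 0.5 – normalized item starts with the query

fuzzyMatchImpl('Nozbe Teams', 'teams')
// → score 1 – contains the query starting at a word boundary

fuzzyMatchImpl('Nozbe Teams', 'nozte')
// → score 2.8 – two consecutive-letter chunks ('noz' and 'te'), each starting a word:
//   2 + 0.4 + 0.4 per scoreConsecutiveLetters

fuzzyMatchImpl('Nozbe Teams', 'xyz')
// → null – no match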