UNPKG

@committed/trie-search

Version:

A trie implementation that maps keys to objects for rapid retrieval by phrases. Most common use will be for typeahead searches.

539 lines (449 loc) 13.2 kB
import get from "lodash.get"
// @ts-ignore
import HashArray from "hasharray"

/** Default upper bound on the number of entries kept in the `get` result cache. */
const MAX_CACHE_SIZE = 64

/** Matches strings that are empty or made up solely of whitespace. */
const IS_WHITESPACE = /^[\s]*$/

/**
 * Default expansion rules: every character matched by `regex` yields one extra
 * variant of the indexed value in which that character is replaced by
 * `alternate` (see `TrieSearch.expandString`).
 */
const DEFAULT_INTERNATIONALIZE_EXPAND_REGEXES = [
  {
    regex: /[åäàáâãæ]/gi,
    alternate: "a",
  },
  {
    regex: /[èéêë]/gi,
    alternate: "e",
  },
  {
    regex: /[ìíîï]/gi,
    alternate: "i",
  },
  {
    regex: /[òóôõö]/gi,
    alternate: "o",
  },
  {
    regex: /[ùúûü]/gi,
    alternate: "u",
  },
  {
    regex: /[æ]/gi,
    alternate: "ae",
  },
]

/**
 * Returns `original` with the `replacedLength` characters starting at `index`
 * swapped for `replacement`.
 *
 * The previous implementation skipped `replacement.length` characters instead
 * of the length of the text being replaced, so substituting "ae" for the single
 * character "æ" also deleted the character that followed it.
 *
 * @param index - Zero-based position of the first character to replace.
 * @param original - The string to copy from.
 * @param replacement - Text inserted at `index`.
 * @param replacedLength - Number of original characters removed (default 1).
 */
const replaceCharAt = function (
  index: number,
  original: string,
  replacement: string,
  replacedLength = 1
) {
  return (
    original.slice(0, index) +
    replacement +
    original.slice(index + replacedLength)
  )
}

/**
 * Counts the child nodes reachable from `node`, ignoring the reserved "value"
 * entries that hold the stored items. This mirrors how `TrieSearch.map`
 * increments `size` (once per newly created child node).
 */
const countTrieNodes = function (node: unknown): number {
  if (node === null || typeof node !== "object") {
    return 0
  }
  const children = node as Record<string, unknown>
  let count = 0
  for (const key of Object.keys(children)) {
    if (key !== "value") {
      count += 1 + countTrieNodes(children[key])
    }
  }
  return count
}

export type TrieOptions = {
  /** When true, results of `get` are memoised per phrase (and limit). */
  cache: boolean
  /** Replacement rules applied by `expandString` to every indexed value. */
  expandRegexes: typeof DEFAULT_INTERNATIONALIZE_EXPAND_REGEXES
  /** Property name or function used by `getId`; required when `get` is called with a reducer. */
  idFieldOrFunction?: string | ((obj: unknown) => string)
  /** When true, keys and search phrases are lower-cased before use. */
  ignoreCase: boolean
  /** If set, used instead of `keyFields` as the key of the HashArrays that collect results. */
  indexField?: string
  /** When true, a key that was split into words is additionally inserted unsplit. */
  insertFullUnsplitKey: boolean
  /** When true, every mapped value is also added to `indexed`. */
  keepAll: boolean
  /** Key field of the `indexed` HashArray used when `keepAll` is set. */
  keepAllKey: string
  /** Maximum number of cache entries kept by `cleanCache`. */
  maxCacheSize: number
  /** Minimum phrase length; shorter words are skipped by `_get` and `findNode`. */
  min?: number
  /**
   * How to split words when using get(). If undefined uses options.splitOnRegEx. If false, does not attempt the split.
   */
  splitOnGetRegEx?: RegExp | false
  /**
   * How to split words when they are added to the trie. If undefined, does not attempt the split.
   */
  splitOnRegEx?: RegExp
}

export type ObjectKey = string

export type TrieNode<O> = Record<string, (TrieNode<O> | O)[]>

export type KeyFields = (ObjectKey | ObjectKey[])[]

/** Serialised form produced by `toJson` and consumed by `fromJson`. */
export type TrieJson<O = unknown> = {
  root: TrieNode<O>
  keyFields: KeyFields
  options: Omit<
    TrieOptions,
    "expandRegexes" | "idFieldOrFunction" | "splitOnGetRegEx" | "splitOnRegEx"
  >
}

/**
 * Callback used by `get` to combine the matches of successive phrases.
 * `accumulator` is `undefined` for the first phrase.
 */
export type Reducer<O = {}> = (
  accumulator: O[] | undefined,
  phrase: string,
  matches: O[],
  trie: TrieSearch
) => O[]

/**
 * A trie that maps string keys (taken from configurable fields of the stored
 * objects) to those objects, for prefix lookups by phrase.
 */
export class TrieSearch<O = {}> {
  static readonly DEFAULT_OPTIONS: TrieOptions = {
    cache: true,
    expandRegexes: DEFAULT_INTERNATIONALIZE_EXPAND_REGEXES,
    idFieldOrFunction: undefined,
    ignoreCase: true,
    insertFullUnsplitKey: false,
    keepAll: false,
    keepAllKey: "id",
    maxCacheSize: MAX_CACHE_SIZE,
    splitOnRegEx: /\s/g,
  }

  /**
   * Rebuilds a trie from the output of `toJson`.
   *
   * `size` is not part of the JSON, so it is recomputed from the restored
   * root (previously it was left at 0 for a non-empty trie).
   */
  public static fromJson<O = {}>(json: TrieJson<O>): TrieSearch<O> {
    const ts = new TrieSearch<O>(json.keyFields, json.options)
    ts.root = json.root
    ts.size = countTrieNodes(json.root)
    return ts
  }

  /**
   * Reducer for `get`. For the first phrase it returns `matches` unchanged.
   * Afterwards it walks `accumulator` and `matches` in lockstep, counts how
   * often each id (via `trie.getId`) is seen, and keeps the item that brings
   * an id's count to exactly 2. With ids unique inside each list that is the
   * set of items present in both lists, despite the "UNION" name.
   */
  static readonly UNION_REDUCER: Reducer = function <O = {}>(
    accumulator: O[] | undefined,
    _phrase: string,
    matches: O[],
    trie: TrieSearch
  ) {
    if (accumulator === undefined) {
      return matches
    }

    const seen: Record<string, number> = {}
    const results: O[] = []
    const maxLength = Math.max(accumulator.length, matches.length)

    // Records one sighting of `item` and keeps it on its second sighting.
    const visit = (item: O) => {
      const id: string = trie.getId(item)
      seen[id] = (seen[id] || 0) + 1
      if (seen[id] === 2) {
        results.push(item)
      }
    }

    // One loop, O(N) for max length of accumulator or matches.
    for (let i = 0; i < maxLength; i++) {
      if (i < accumulator.length) {
        visit(accumulator[i])
      }
      if (i < matches.length) {
        visit(matches[i])
      }
    }

    return results
  }

  /** Reads the (possibly nested) path `keys` from `obj` using lodash.get. */
  static deepLookup(obj: {}, keys: ObjectKey | ObjectKey[]) {
    return get(obj, keys)
  }

  public readonly options: TrieOptions
  public readonly keyFields: KeyFields
  /** Number of trie nodes created so far (incremented in `map`). */
  public size: number
  public root: TrieNode<O>
  /** Result cache keyed by "key"; only created when `options.cache` is true. */
  public getCache: HashArray
  /** Every mapped value, populated only when `options.keepAll` is true. */
  public indexed: HashArray | undefined

  /**
   * @param keyFields - Field name, or list of field names / nested paths,
   *   whose values are indexed by `add`.
   * @param options - Overrides merged over `TrieSearch.DEFAULT_OPTIONS`.
   */
  constructor(
    keyFields?: KeyFields | string,
    options: Partial<TrieOptions> = {}
  ) {
    this.options = { ...TrieSearch.DEFAULT_OPTIONS, ...options }
    this.keyFields = keyFields
      ? Array.isArray(keyFields)
        ? keyFields
        : [keyFields]
      : []
    this.root = {}
    this.size = 0

    if (this.options.cache) {
      this.getCache = new HashArray("key")
    }

    // Searching splits phrases the same way as indexing unless told otherwise.
    if (this.options.splitOnGetRegEx === undefined) {
      this.options.splitOnGetRegEx = this.options.splitOnRegEx
    }
  }

  /**
   * Indexes `obj` under the value of each key field. Falsy field values are
   * skipped; other values are stringified, expanded with `expandString` and
   * passed to `map`.
   *
   * @param obj - The item to store.
   * @param customKeys - Key fields to use instead of `this.keyFields`.
   */
  public add(obj: O, customKeys?: KeyFields) {
    if (this.options.cache) {
      this.clearCache()
    }

    // Someone might have called add via an array forEach where the second param is a number
    const keyFields: KeyFields =
      customKeys == null || typeof customKeys === "number"
        ? this.keyFields
        : customKeys

    for (const key of keyFields) {
      let val: any
      if (key instanceof Array) {
        val = TrieSearch.deepLookup(obj, key)
      } else {
        // @ts-ignore
        val = obj[key]
      }

      if (!val) continue

      val = val.toString()

      for (const expandedValue of this.expandString(val)) {
        this.map(expandedValue, obj)
      }
    }
  }

  /**
   * Returns `value` followed by one variant per match of each
   * `options.expandRegexes` rule, where only that single match is replaced by
   * the rule's `alternate`.
   *
   * Rules whose regex lacks the `g` flag are run through a global copy, and
   * zero-length matches are skipped; both cases previously never terminated.
   */
  public expandString(value: string) {
    const values = [value]
    const expandRegexes = this.options.expandRegexes

    if (!expandRegexes || !expandRegexes.length) {
      return values
    }

    for (const er of expandRegexes) {
      const regex = er.regex.global
        ? er.regex
        : new RegExp(er.regex.source, er.regex.flags + "g")
      // The rule objects are shared, so never trust a left-over position.
      regex.lastIndex = 0

      let match: RegExpExecArray | null
      while ((match = regex.exec(value)) !== null) {
        if (match[0].length === 0) {
          // exec() does not advance on an empty match; step forward manually.
          regex.lastIndex++
          continue
        }
        values.push(
          replaceCharAt(match.index, value, er.alternate, match[0].length)
        )
      }
    }

    return values
  }

  /** Calls `add` for every element of `arr`. */
  public addAll(arr: O[], customKeys?: KeyFields) {
    for (const item of arr) {
      this.add(item, customKeys)
    }
  }

  /**
   * Discards all trie nodes and resets `size`. The result cache is dropped as
   * well so that `get` cannot return items from the discarded trie.
   */
  public reset() {
    this.root = {}
    this.size = 0
    if (this.options.cache) {
      this.clearCache()
    }
  }

  /** Replaces the result cache with an empty one. */
  public clearCache() {
    this.getCache = new HashArray("key")
  }

  /** Evicts the oldest cache entries until at most `maxCacheSize` remain. */
  public cleanCache() {
    // getCache is only created when caching is enabled.
    if (!this.getCache) {
      return
    }
    while (this.getCache.all.length > this.options.maxCacheSize) {
      this.getCache.remove(this.getCache.all[0])
    }
  }

  /**
   * Inserts `value` under `key`. When `options.splitOnRegEx` matches the key,
   * each non-blank piece is mapped separately and the unsplit key is inserted
   * only if `options.insertFullUnsplitKey` is set.
   */
  public map(key: string, value: O) {
    const splitOnRegEx = this.options.splitOnRegEx

    // search() is used instead of test(): test() on a global regex resumes
    // from the regex's lastIndex, so a match in one key could hide a match
    // near the start of the next key.
    if (splitOnRegEx && key.search(splitOnRegEx) !== -1) {
      const phrases = key.split(splitOnRegEx)
      const emptySplitMatch = phrases.filter((p) => IS_WHITESPACE.test(p))
      const selfMatch = phrases.filter((p) => p === key)
      const selfIsOnlyMatch =
        selfMatch.length + emptySplitMatch.length === phrases.length

      // There is an edge case where a RegEx with a positive lookahead like
      // /(?=[A-Z])/ (split on capital letters for a camelcase sentence)
      // matches again when we call map, creating an infinite stack loop.
      if (!selfIsOnlyMatch) {
        for (const phrase of phrases) {
          if (!IS_WHITESPACE.test(phrase)) {
            this.map(phrase, value)
          }
        }

        if (!this.options.insertFullUnsplitKey) {
          return
        }
      }
    }

    if (this.options.cache) this.clearCache()

    if (this.options.keepAll) {
      this.indexed = this.indexed || new HashArray([this.options.keepAllKey])
      this.indexed.add(value)
    }

    if (this.options.ignoreCase) {
      key = key.toLowerCase()
    }

    // Walk (and create) one node per key segment, then store the value on the
    // final node. The declared TrieNode type does not describe this runtime
    // shape, hence the loosely typed cursor.
    let node = this.root as Record<string, any>
    for (const k of this.keyToArr(key)) {
      if (!node[k]) {
        this.size++
        node[k] = {}
      }
      node = node[k]
    }
    node["value"] = node["value"] || []
    node["value"].push(value)
  }

  // used for testing - this not type safe and breaks generics
  /**
   * Adds one item per property of `obj`, shaped as
   * `{ _key_: <property name>, [valueField]: <property value> }`, registering
   * "_key_" as a key field on first use.
   */
  public addFromObject(obj: any, valueField?: string) {
    if (this.options.cache) this.clearCache()

    valueField = valueField || "value"

    if (this.keyFields.indexOf("_key_") === -1) this.keyFields.push("_key_")

    for (const key in obj) {
      const o = { _key_: key }
      // @ts-ignore
      o[valueField] = obj[key]
      // @ts-ignore
      this.add(o)
    }
  }

  /**
   * Splits `key` into trie path segments: single characters, except that with
   * `options.min > 1` the first `min` characters form one segment. Returns an
   * empty array when the key is shorter than such a `min`.
   */
  public keyToArr(key: string): string[] {
    const min = this.options.min

    if (min && min > 1) {
      if (key.length < min) return []
      return [key.slice(0, min)].concat(key.slice(min).split(""))
    }

    return key.split("")
  }

  /**
   * Returns the node reached by following `key`, or `undefined` when the path
   * does not exist or the key is shorter than `options.min`. The key is used
   * as given; no case folding happens here.
   */
  public findNode(key: string): TrieNode<O> | undefined {
    if (
      this.options.min &&
      this.options.min > 0 &&
      key.length < this.options.min
    ) {
      return undefined
    }

    let node = this.root as Record<string, any> | undefined
    for (const k of this.keyToArr(key)) {
      node = node[k]
      if (!node) return undefined
    }
    return (node as unknown) as TrieNode<O>
  }

  /** Builds the cache key for a phrase, suffixed with "_<limit>" when a limit is given. */
  public _getCacheKey(phrase: string, limit?: number) {
    return limit ? phrase + "_" + limit : phrase
  }

  /**
   * Looks up a single phrase. The phrase is split with
   * `options.splitOnGetRegEx`; each word's subtree is collected into a
   * HashArray and the per-word results are combined with
   * `HashArray.intersection`.
   *
   * @param phrase - The search phrase.
   * @param limit - Stop collecting for a word once this many items were gathered.
   * @returns The matching items (possibly served from the cache).
   */
  public _get(phrase: string, limit?: number) {
    phrase = this.options.ignoreCase ? phrase.toLowerCase() : phrase

    const cacheKey = this._getCacheKey(phrase, limit)
    if (this.options.cache) {
      const cached = this.getCache.get(cacheKey)
      if (cached) return cached.value
    }

    const haKeyFields = this.options.indexField
      ? [this.options.indexField]
      : this.keyFields
    const words = this.options.splitOnGetRegEx
      ? phrase.split(this.options.splitOnGetRegEx)
      : [phrase]

    let ret: HashArray | undefined = undefined
    for (const word of words) {
      if (this.options.min && word.length < this.options.min) continue

      const temp = new HashArray(haKeyFields)
      const node = this.findNode(word)
      if (node) aggregate(node, temp)

      ret = ret ? ret.intersection(temp) : temp
    }

    const v = ret ? ret.all : []

    if (this.options.cache) {
      this.getCache.add({ key: cacheKey, value: v })
      this.cleanCache()
    }

    return v

    // Depth-first collection of every value stored at or below `node`,
    // stopping as soon as `limit` items have been gathered.
    function aggregate(node: TrieNode<O>, ha: HashArray) {
      if (limit && ha.all.length === limit) {
        return
      }

      if (node.value && node.value.length) {
        if (!limit || ha.all.length + node.value.length < limit) {
          ha.addAll(node.value)
        } else {
          // Limit is less than the number of entries in the node.value + ha combined
          ha.addAll(node.value.slice(0, limit - ha.all.length))
          return
        }
      }

      for (const k in node) {
        if (limit && ha.all.length === limit) {
          return
        }
        if (k != "value") {
          // @ts-ignore
          aggregate(node[k], ha)
        }
      }
    }
  }

  /**
   * Looks up one or more phrases. Without a reducer, the matches of all
   * phrases are added to a single HashArray and its contents are returned.
   * With a reducer, the matches are folded through it phrase by phrase.
   *
   * @param phrases - A phrase or list of phrases.
   * @param reducer - Optional combiner; requires `options.idFieldOrFunction`.
   * @param limit - Passed through to `_get` for every phrase.
   * @returns The combined matches; an empty array when `phrases` is empty.
   * @throws Error if a reducer is given but `idFieldOrFunction` is not configured.
   */
  public get(
    phrases: string | string[],
    reducer?: Reducer<O>,
    limit?: number
  ): O[] {
    const haKeyFields = this.options.indexField
      ? [this.options.indexField]
      : this.keyFields
    let ret: HashArray | undefined = undefined
    let accumulator = undefined

    if (reducer && !this.options.idFieldOrFunction) {
      throw new Error(
        "To use the accumulator, you must specify and idFieldOrFunction"
      )
    }

    phrases = phrases instanceof Array ? phrases : [phrases]

    for (const phrase of phrases) {
      const matches = this._get(phrase, limit)

      if (reducer) {
        accumulator = reducer(accumulator, phrase, matches, this)
      } else {
        ret = ret
          ? ret.addAll(matches)
          : new HashArray(haKeyFields).addAll(matches)
      }
    }

    if (!reducer) {
      // `ret` is still undefined when no phrases were supplied.
      return ret ? ret.all : []
    }

    return accumulator ?? []
  }

  /**
   * Returns the id of `item` as defined by `options.idFieldOrFunction`.
   *
   * @throws Error if `idFieldOrFunction` is not configured.
   */
  public getId(item: O) {
    const { idFieldOrFunction } = this.options
    if (idFieldOrFunction == null) {
      throw new Error(
        "To use the accumulator, you must specify and idFieldOrFunction"
      )
    }
    return typeof idFieldOrFunction === "function"
      ? idFieldOrFunction(item)
      : // @ts-ignore
        item[idFieldOrFunction]
  }

  /**
   * Exports the trie as a plain object suitable for `fromJson`.
   *
   * @throws Error if `expandRegexes`, `idFieldOrFunction`, `splitOnGetRegEx`
   *   or `splitOnRegEx` differ from their defaults, because those options are
   *   omitted from the export.
   */
  public toJson(): TrieJson {
    if (
      this.options.expandRegexes !== TrieSearch.DEFAULT_OPTIONS.expandRegexes
    ) {
      throw new Error(
        "JSON export of non-default expandRegexes option not supported"
      )
    }
    if (
      this.options.idFieldOrFunction !==
      TrieSearch.DEFAULT_OPTIONS.idFieldOrFunction
    ) {
      throw new Error(
        "JSON export of non-default idFieldOrFunction option not supported"
      )
    }
    if (
      // NOTE: default value of splitOnGetRegEx is splitOnRegEx
      this.options.splitOnGetRegEx !== TrieSearch.DEFAULT_OPTIONS.splitOnRegEx
    ) {
      throw new Error(
        "JSON export of non-default splitOnGetRegEx option not supported"
      )
    }
    if (this.options.splitOnRegEx !== TrieSearch.DEFAULT_OPTIONS.splitOnRegEx) {
      throw new Error(
        "JSON export of non-default splitOnRegEx option not supported"
      )
    }

    const {
      expandRegexes,
      idFieldOrFunction,
      splitOnGetRegEx,
      splitOnRegEx,
      ...options
    } = this.options

    return {
      keyFields: this.keyFields,
      options,
      root: this.root,
    }
  }
}