UNPKG

@skiff-org/trawler

Version:

A modern search library for Skiff

608 lines (502 loc) 16.7 kB
/**!
 * FlexSearch.js
 * Copyright 2018-2021 Nextapps GmbH
 * Author: Thomas Wilkerling
 * Licence: Apache-2.0
 * https://github.com/nextapps-de/flexsearch
 */

import { encode as default_encoder } from './lang/latin/default.js';
import { create_object, create_object_array, concat, sort_by_length_down, is_array, parse_option } from './common.js';
import { init_stemmer_or_matcher, init_filter } from './lang.js';
import apply_async from './async.js';
import { intersect } from './intersect.js';
import Cache, { searchCache } from './cache.js';

/**
 * A single-field search index. Encoded terms (and, when a context depth is
 * configured, term pairs) are stored in score-ordered maps pointing to
 * document ids.
 *
 * @constructor
 * @param {Object=} options
 * @return {Index}
 */
export class Index {
  /**
   * @param {Object=} options reads `resolution`, `tokenize`, `context`
   *   (`depth`, `bidirectional`, `resolution`), `optimize`, `minlength`,
   *   `boost`, `rtl`, `matcher`, `stemmer`, `filter` and `cache`
   */
  constructor(options = {}) {
    this.encode = default_encoder;
    this.register = create_object();
    this.resolution = options.resolution || 9;
    this.tokenize = options.tokenize || 'strict';
    this.depth = options?.context?.depth;
    this.bidirectional = parse_option(options?.context?.bidirectional, true);
    this.optimize = parse_option(options.optimize, true);
    this.minlength = options.minlength || 1;
    this.boost = options.boost;

    // when not using the memory strategy the score array should not be
    // pre-allocated to its full length.
    // The term map is indexed by scores in the range [0, resolution), so it
    // must be sized by the index resolution (not the context resolution),
    // otherwise push_index() reads a missing slot when optimize is enabled.
    this.map = this.optimize ? create_object_array(this.resolution) : create_object();
    this.resolution_ctx = options?.context?.resolution || 1;
    this.ctx = this.optimize ? create_object_array(this.resolution_ctx) : create_object();

    this.rtl = options.rtl;
    this.matcher = options.matcher && init_stemmer_or_matcher(options.matcher, false);
    this.stemmer = options.stemmer && init_stemmer_or_matcher(options.stemmer, true);
    this.filter = options.filter && init_filter(options.filter);
    this.cache = options.cache && new Cache(options.cache);
  }

  /**
   * Adds content to an id without replacing what is already indexed for it
   * (delegates to add() with the append flag set).
   *
   * @param {!number|string} id
   * @param {!string} content
   */
  append(id, content) {
    return this.add(id, content, true);
  }

  /**
   * Indexes `content` under `id`. When the id is already registered (and this
   * is neither an append nor an internal re-add) the call is routed through
   * update() instead.
   *
   * @param {!number|string} id
   * @param {!string} content
   * @param {boolean=} _append
   * @param {boolean=} _skip_update
   * @return {Index} this
   */
  add(id, content, _append, _skip_update) {
    if (content && (id || (id === 0))) {
      if (!_skip_update && !_append && this.register[id]) {
        return this.update(id, content);
      }

      content = this.encode(content);
      const length = content.length;

      if (length) {
        // check context dupes to skip all contextual redundancy along a document
        const dupes_ctx = create_object();
        const dupes = create_object();
        const depth = this.depth;
        const resolution = this.resolution;

        for (let i = 0; i < length; i++) {
          let term = content[this.rtl ? length - 1 - i : i];
          const term_length = term.length;

          // skip dupes will break the context chain
          if (term && (term_length >= this.minlength) && (depth || !dupes[term])) {
            let score = get_score(resolution, length, i);
            let token = '';

            switch (this.tokenize) {
              case 'full':
                if (term_length > 3) {
                  for (let x = 0; x < term_length; x++) {
                    for (let y = term_length; y > x; y--) {
                      if ((y - x) >= this.minlength) {
                        const partial_score = get_score(resolution, length, i, term_length, x);
                        token = term.substring(x, y);
                        this.push_index(dupes, token, partial_score, id, _append);
                      }
                    }
                  }
                  break;
                }

              // fallthrough to next case when term length < 4
              case 'reverse':
                // skip last round (this token exist already in "forward")
                if (term_length > 2) {
                  for (let x = term_length - 1; x > 0; x--) {
                    token = term[x] + token;
                    if (token.length >= this.minlength) {
                      const partial_score = get_score(resolution, length, i, term_length, x);
                      this.push_index(dupes, token, partial_score, id, _append);
                    }
                  }
                  token = '';
                }

              // fallthrough to next case to apply forward also
              case 'forward':
                if (term_length > 1) {
                  for (let x = 0; x < term_length; x++) {
                    token += term[x];
                    if (token.length >= this.minlength) {
                      this.push_index(dupes, token, score, id, _append);
                    }
                  }
                  break;
                }

              // fallthrough to next case when token has a length of 1
              default: // case "strict":
                if (this.boost) {
                  score = Math.min((score / this.boost(content, term, i)) | 0, resolution - 1);
                }

                this.push_index(dupes, term, score, id, _append);

                // context is just supported by tokenizer "strict"
                if (depth) {
                  if ((length > 1) && (i < (length - 1))) {
                    // check inner dupes to skip repeating words in the current context
                    const dupes_inner = create_object();
                    const ctx_resolution = this.resolution_ctx;
                    const keyword = term;
                    const size = Math.min(depth + 1, length - i);

                    dupes_inner[keyword] = 1;

                    for (let x = 1; x < size; x++) {
                      term = content[this.rtl ? length - 1 - i - x : i + x];

                      if (term && (term.length >= this.minlength) && !dupes_inner[term]) {
                        dupes_inner[term] = 1;

                        const context_score = get_score(
                          ctx_resolution + ((length / 2) > ctx_resolution ? 0 : 1),
                          length,
                          i,
                          size - 1,
                          x - 1
                        );
                        const swap = this.bidirectional && (term > keyword);

                        this.push_index(
                          dupes_ctx,
                          swap ? keyword : term,
                          context_score,
                          id,
                          _append,
                          swap ? term : keyword
                        );
                      }
                    }
                  }
                }
            }
          }
        }
      }
    }

    return this;
  }

  /**
   * Stores `id` in the term map (or in the context map when `keyword` is
   * given) under `value` and `score`, unless `dupes` already marks that
   * value (or value/keyword pair) as written for the current document.
   *
   * @private
   * @param dupes
   * @param value
   * @param score
   * @param id
   * @param {boolean=} append
   * @param {string=} keyword
   */
  push_index(dupes, value, score, id, append, keyword) {
    let arr = keyword ? this.ctx : this.map;

    if (!dupes[value] || (keyword && !dupes[value][keyword])) {
      // memory-optimized layout: score slot first, then term
      if (this.optimize) {
        arr = arr[score];
      }

      if (keyword) {
        dupes = dupes[value] || (dupes[value] = create_object());
        dupes[keyword] = 1;
        arr = arr[keyword] || (arr[keyword] = create_object());
      } else {
        dupes[value] = 1;
      }

      arr = arr[value] || (arr[value] = []);

      // non-optimized layout: term first, then score slot
      if (!this.optimize) {
        arr = arr[score] || (arr[score] = []);
      }

      if (!append || (arr.indexOf(id) === -1)) {
        arr[arr.length] = id;

        // remember every id array this document was written to
        this.register[id] ||= [];
        this.register[id].push(arr);
      }
    }
  }

  /**
   * Searches the index.
   *
   * @param {string|Object} query
   * @param {Object=} options reads `limit` (falls back to 100 when missing
   *   or falsy), `offset` (defaults to 0), `context` and `suggest`
   * @returns {Array<number|string>}
   */
  search(query, options) {
    let result = [];
    let length;
    let context, suggest, offset = 0;
    let limit = 100;

    if (options) {
      // keep the default when no limit was passed, otherwise `undefined`
      // would leak into the slicing done by single_result()/intersect()
      limit = options.limit || limit;
      offset = options.offset || 0;
      context = options.context;
      suggest = options.suggest;
    }

    if (query) {
      query = /** @type {Array} */ (this.encode(query));
      length = query.length;

      // TODO: solve this in one single loop below
      if (length > 1) {
        const dupes = create_object();
        const query_new = [];

        for (let i = 0, count = 0, term; i < length; i++) {
          term = query[i];

          if (term && (term.length >= this.minlength) && !dupes[term]) {
            // this fast path just could applied when not in memory-optimized mode
            if (!this.optimize && !suggest && !this.map[term]) {
              // fast path "not found"
              return result;
            } else {
              query_new[count++] = term;
              dupes[term] = 1;
            }
          }
        }

        query = query_new;
        length = query.length;
      }
    }

    if (!length) {
      return result;
    }

    let depth = this.depth && (length > 1) && (context !== false);
    let index = 0, keyword;

    if (depth) {
      keyword = query[0];
      index = 1;
    } else if (length > 1) {
      query.sort(sort_by_length_down);
    }

    for (let arr, term; index < length; index++) {
      term = query[index];

      if (depth) {
        arr = this.add_result(result, suggest, limit, offset, length === 2, term, keyword);

        // when suggestion enabled just forward keyword if term was found
        // as long as the result is empty forward the pointer also
        if (!suggest || (arr !== false) || !result.length) {
          keyword = term;
        }
      } else {
        arr = this.add_result(result, suggest, limit, offset, length === 1, term);
      }

      if (arr) {
        return /** @type {Array<number|string>} */ (arr);
      }

      // apply suggestions on last loop or fallback
      if (suggest && (index === length - 1)) {
        let length = result.length;

        if (!length) {
          if (depth) {
            // fallback to non-contextual search when no result was found
            depth = 0;
            index = -1;
            continue;
          }

          return result;
        } else if (length === 1) {
          // fast path optimization
          return single_result(result[0], limit, offset);
        }
      }
    }

    return intersect(result, limit, offset, suggest);
  }

  /**
   * Returns an array when the result is done (to stop the process immediately),
   * returns false when suggestions is enabled and no result was found,
   * or returns nothing when a set was pushed successfully to the results
   *
   * @private
   * @param {Array} result
   * @param {Array} suggest
   * @param {number} limit
   * @param {number} offset
   * @param {boolean} single_term
   * @param {string} term
   * @param {string=} keyword
   * @return {Array<Array<string|number>>|boolean|undefined}
   */
  add_result(result, suggest, limit, offset, single_term, term, keyword) {
    let word_arr = [];
    let arr = keyword ? this.ctx : this.map;

    if (!this.optimize) {
      arr = get_array(arr, term, keyword, this.bidirectional);
    }

    if (arr) {
      let count = 0;
      const arr_len = Math.min(arr.length, keyword ? this.resolution_ctx : this.resolution);

      // relevance:
      for (let x = 0, size = 0, tmp, len; x < arr_len; x++) {
        tmp = arr[x];

        if (tmp) {
          if (this.optimize) {
            tmp = get_array(tmp, term, keyword, this.bidirectional);
          }

          if (offset) {
            if (tmp && single_term) {
              len = tmp.length;

              if (len <= offset) {
                offset -= len;
                tmp = null;
              } else {
                tmp = tmp.slice(offset);
                offset = 0;
              }
            }
          }

          if (tmp) {
            // simplified score order (no sparse array is kept):
            word_arr[count++] = tmp;

            if (single_term) {
              size += tmp.length;

              if (size >= limit) {
                // fast path optimization
                break;
              }
            }
          }
        }
      }

      if (count) {
        if (single_term) {
          // fast path optimization
          // offset was already applied at this point
          return single_result(word_arr, limit, 0);
        }

        result[result.length] = word_arr;
        return;
      }
    }

    // return an empty array will stop the loop,
    // to prevent stop when using suggestions return a false value
    return !suggest && word_arr;
  }

  /**
   * @param {number|string} id
   * @return {boolean} true when the id has an entry in the register
   */
  contain(id) {
    return !!this.register[id];
  }

  /**
   * Replaces the indexed content of `id` (remove followed by add).
   *
   * @param {number|string} id
   * @param {string} content
   * @return {Index} this
   */
  update(id, content) {
    return this.remove(id).add(id, content);
  }

  /**
   * Removes `id` from the term map, from the context map (when a depth is
   * configured), and from the cache (when one is configured).
   *
   * @param {number|string} id
   * @param {boolean=} _skip_deletion when truthy the register entry is kept
   * @return {Index} this
   */
  remove(id, _skip_deletion) {
    const refs = this.register[id];

    if (refs) {
      remove_index(this.map, id, this.resolution, this.optimize);

      if (this.depth) {
        remove_index(this.ctx, id, this.resolution_ctx, this.optimize);
      }

      _skip_deletion || delete this.register[id];

      if (this.cache) {
        this.cache.del(id);
      }
    }

    return this;
  }

  /**
   * Convert `this` into an exportable object
   *
   * @return {{reg: Object, opt: boolean, map: Object, ctx: Object, tok: string}}
   */
  serialize() {
    return {
      reg: this.register,
      opt: this.optimize,
      map: this.map,
      ctx: this.ctx,
      tok: this.tokenize
    };
  }

  /**
   * Rebuilds an Index from an object produced by serialize().
   * Only the fields written by serialize() are restored; every other setting
   * (e.g. resolution, depth) comes from `params`.
   *
   * @param {Object} obj the serialized Index object (`reg`, `opt`, `map`, `ctx`, `tok`)
   * @param {Object=} params constructor options for the new Index
   * @return {Index}
   */
  static deserialize(obj, params) {
    // TODO add extra parameter for index initialization?
    const result = new Index(params);

    result.optimize = obj.opt;
    result.register = obj.reg;
    result.map = obj.map;
    result.ctx = obj.ctx;
    result.tokenize = obj.tok;

    return result;
  }
}

/**
 * Maps a position inside a document (and optionally inside a term) to a
 * score slot.
 *
 * @param {number} resolution
 * @param {number} length
 * @param {number} i
 * @param {number=} term_length
 * @param {number=} x
 * @returns {number}
 */
function get_score(resolution, length, i, term_length, x) {
  // the first resolution slot is reserved for the best match,
  // when a query matches the first word(s).
  // also to stretch score to the whole range of resolution, the
  // calculation is shift by one and cut the floating point.
  // this needs the resolution "1" to be handled additionally.

  // do not stretch the resolution more than the term length will
  // improve performance and memory, also it improves scoring in
  // most cases between a short document and a long document
  return i && (resolution > 1) ? (
    (length + (term_length || 0)) <= resolution ?
      i + (x || 0)
      :
      ((resolution - 1) / (length + (term_length || 0)) * (i + (x || 0)) + 1) | 0
  ) : 0;
}

/**
 * Flattens a list of id arrays into one array and applies offset/limit.
 *
 * @param {Array<Array<number|string>>} result
 * @param {number} limit
 * @param {number} offset
 * @return {Array<number|string>}
 */
function single_result(result, limit, offset) {
  if (result.length === 1) {
    result = result[0];
  } else {
    result = concat(result);
  }

  return offset || (result.length > limit) ?
    result.slice(offset, offset + limit)
    :
    result;
}

/**
 * Looks up the entry for `term` in `arr`, or for the term/keyword pair when
 * a keyword is given.
 *
 * @param arr
 * @param {string} term
 * @param {string=} keyword
 * @param {boolean=} bidirectional
 */
function get_array(arr, term, keyword, bidirectional) {
  if (keyword) {
    // the frequency of the starting letter is slightly less
    // on the last half of the alphabet (m-z) in almost every latin language,
    // so we sort downwards (https://en.wikipedia.org/wiki/Letter_frequency)
    const swap = bidirectional && (term > keyword);

    arr = arr[swap ? term : keyword];
    arr = arr && arr[swap ? keyword : term];
  } else {
    arr = arr[term];
  }

  return arr;
}

/**
 * Recursively walks `map` and removes `id` from the id arrays it contains.
 *
 * @param map
 * @param id
 * @param res
 * @param optimize
 * @param {number=} resolution
 * @return {number}
 */
function remove_index(map, id, res, optimize, resolution) {
  let count = 0;

  if (is_array(map)) {
    // the first array is the score array in both strategies
    if (!resolution) {
      resolution = Math.min(map.length, res);

      for (let x = 0, arr; x < resolution; x++) {
        arr = map[x];

        if (arr) {
          count = remove_index(arr, id, res, optimize, resolution);

          if (!optimize && !count) {
            // when not memory optimized the score index should removed
            delete map[x];
          }
        }
      }
    } else {
      const pos = map.indexOf(id);

      if (pos !== -1) {
        // fast path, when length is 1 or lower then the whole field gets deleted
        if (map.length > 1) {
          map.splice(pos, 1);
          count++;
        }
      } else {
        count++;
      }
    }
  } else {
    for (let key in map) {
      count = remove_index(map[key], id, res, optimize, resolution);

      if (!count) {
        delete map[key];
      }
    }
  }

  return count;
}

// TODO bring all these functions into this file
Index.prototype.searchCache = searchCache;
apply_async(Index.prototype);