flexsearch-ts
Version:
Next-Generation full text search library with zero dependencies.
822 lines (554 loc) • 21 kB
JavaScript
/**!
* FlexSearch.js
* Copyright 2018-2022 Nextapps GmbH
* Author: Thomas Wilkerling
* Licence: Apache-2.0
* https://github.com/nextapps-de/flexsearch
*/
import {
SUPPORT_ENCODER,
SUPPORT_CACHE,
SUPPORT_ASYNC,
SUPPORT_SUGGESTION,
SUPPORT_SERIALIZE
} from "./config.js";
import { IndexInterface } from "./type.js";
import { encode as default_encoder } from "./lang/latin/default.js";
import { create_object, create_object_array, concat, sort_by_length_down, is_array, is_string, is_object, parse_option } from "./common.js";
import { pipeline, init_stemmer_or_matcher, init_filter } from "./lang.js";
import { global_lang, global_charset } from "./global.js";
import apply_async from "./async.js";
import { intersect } from "./intersect.js";
import Cache, { searchCache } from "./cache.js";
import apply_preset from "./preset.js";
import { exportIndex, importIndex } from "./serialize.js";
/**
* @constructor
* @implements IndexInterface
* @param {Object=} options
* @param {Object=} _register
* @return {Index}
*/
function Index(options, _register){
if(!(this instanceof Index)) {
return new Index(options);
}
let charset, lang, tmp;
if(options){
if(SUPPORT_ENCODER){
options = apply_preset(options);
}
charset = options["charset"];
lang = options["lang"];
if(is_string(charset)){
if(charset.indexOf(":") === -1){
charset += ":default";
}
charset = global_charset[charset];
}
if(is_string(lang)){
lang = global_lang[lang];
}
}
else{
options = {};
}
let resolution, optimize, context = options["context"] || {};
this.encode = options["encode"] || (charset && charset.encode) || default_encoder;
this.register = _register || create_object();
this.resolution = resolution = options["resolution"] || 9;
this.tokenize = tmp = (charset && charset.tokenize) || options["tokenize"] || "strict";
this.depth = (tmp === "strict") && context["depth"];
this.bidirectional = parse_option(context["bidirectional"], true);
this.optimize = optimize = parse_option(options["optimize"], true);
this.fastupdate = parse_option(options["fastupdate"], true);
this.minlength = options["minlength"] || 1;
this.boost = options["boost"];
// when not using the memory strategy the score array should not pre-allocated to its full length
this.map = optimize ? create_object_array(resolution) : create_object();
this.resolution_ctx = resolution = context["resolution"] || 1;
this.ctx = optimize ? create_object_array(resolution) : create_object();
this.rtl = (charset && charset.rtl) || options["rtl"];
this.matcher = (tmp = options["matcher"] || (lang && lang.matcher)) && init_stemmer_or_matcher(tmp, false);
this.stemmer = (tmp = options["stemmer"] || (lang && lang.stemmer)) && init_stemmer_or_matcher(tmp, true);
this.filter = (tmp = options["filter"] || (lang && lang.filter)) && init_filter(tmp);
if(SUPPORT_CACHE){
this.cache = (tmp = options["cache"]) && new Cache(tmp);
}
}
export default Index;
//Index.prototype.pipeline = pipeline;
/**
* @param {!number|string} id
* @param {!string} content
*/
Index.prototype.append = function(id, content){
return this.add(id, content, true);
};
// TODO:
// string + number as text
// boolean, null, undefined as ?
/**
* @param {!number|string} id
* @param {!string} content
* @param {boolean=} _append
* @param {boolean=} _skip_update
*/
Index.prototype.add = function(id, content, _append, _skip_update){
if(content && (id || (id === 0))){
if(!_skip_update && !_append && this.register[id]){
return this.update(id, content);
}
content = this.encode("" + content);
const length = content.length;
if(length){
// check context dupes to skip all contextual redundancy along a document
const dupes_ctx = create_object();
const dupes = create_object();
const depth = this.depth;
const resolution = this.resolution;
for(let i = 0; i < length; i++){
let term = content[this.rtl ? length - 1 - i : i];
let term_length = term.length;
// skip dupes will break the context chain
if(term && (term_length >= this.minlength) && (depth || !dupes[term])){
let score = get_score(resolution, length, i);
let token = "";
switch(this.tokenize){
case "full":
if(term_length > 2){
for(let x = 0; x < term_length; x++){
for(let y = term_length; y > x; y--){
if((y - x) >= this.minlength){
const partial_score = get_score(resolution, length, i, term_length, x);
token = term.substring(x, y);
this.push_index(dupes, token, partial_score, id, _append);
}
}
}
break;
}
// fallthrough to next case when term length < 3
case "reverse":
// skip last round (this token exist already in "forward")
if(term_length > 1){
for(let x = term_length - 1; x > 0; x--){
token = term[x] + token;
if(token.length >= this.minlength){
const partial_score = get_score(resolution, length, i, term_length, x);
this.push_index(dupes, token, partial_score, id, _append);
}
}
token = "";
}
// fallthrough to next case to apply forward also
case "forward":
if(term_length > 1){
for(let x = 0; x < term_length; x++){
token += term[x];
if(token.length >= this.minlength){
this.push_index(dupes, token, score, id, _append);
}
}
break;
}
// fallthrough to next case when token has a length of 1
default:
// case "strict":
if(this.boost){
score = Math.min((score / this.boost(content, term, i)) | 0, resolution - 1);
}
this.push_index(dupes, term, score, id, _append);
// context is just supported by tokenizer "strict"
if(depth){
if((length > 1) && (i < (length - 1))){
// check inner dupes to skip repeating words in the current context
const dupes_inner = create_object();
const resolution = this.resolution_ctx;
const keyword = term;
const size = Math.min(depth + 1, length - i);
dupes_inner[keyword] = 1;
for(let x = 1; x < size; x++){
term = content[this.rtl ? length - 1 - i - x : i + x];
if(term && (term.length >= this.minlength) && !dupes_inner[term]){
dupes_inner[term] = 1;
const context_score = get_score(resolution + ((length / 2) > resolution ? 0 : 1), length, i, size - 1, x - 1);
const swap = this.bidirectional && (term > keyword);
this.push_index(dupes_ctx, swap ? keyword : term, context_score, id, _append, swap ? term : keyword);
}
}
}
}
}
}
}
this.fastupdate || (this.register[id] = 1);
}
}
return this;
};
/**
* @param {number} resolution
* @param {number} length
* @param {number} i
* @param {number=} term_length
* @param {number=} x
* @returns {number}
*/
function get_score(resolution, length, i, term_length, x){
// console.log("resolution", resolution);
// console.log("length", length);
// console.log("term_length", term_length);
// console.log("i", i);
// console.log("x", x);
// console.log((resolution - 1) / (length + (term_length || 0)) * (i + (x || 0)) + 1);
// the first resolution slot is reserved for the best match,
// when a query matches the first word(s).
// also to stretch score to the whole range of resolution, the
// calculation is shift by one and cut the floating point.
// this needs the resolution "1" to be handled additionally.
// do not stretch the resolution more than the term length will
// improve performance and memory, also it improves scoring in
// most cases between a short document and a long document
return i && (resolution > 1) ? (
(length + (term_length || 0)) <= resolution ?
i + (x || 0)
:
((resolution - 1) / (length + (term_length || 0)) * (i + (x || 0)) + 1) | 0
):
0;
}
/**
* @private
* @param dupes
* @param value
* @param score
* @param id
* @param {boolean=} append
* @param {string=} keyword
*/
Index.prototype.push_index = function(dupes, value, score, id, append, keyword){
let arr = keyword ? this.ctx : this.map;
if(!dupes[value] || (keyword && !dupes[value][keyword])){
if(this.optimize){
arr = arr[score];
}
if(keyword){
dupes = dupes[value] || (dupes[value] = create_object());
dupes[keyword] = 1;
arr = arr[keyword] || (arr[keyword] = create_object());
}
else{
dupes[value] = 1;
}
arr = arr[value] || (arr[value] = []);
if(!this.optimize){
arr = arr[score] || (arr[score] = []);
}
if(!append || !arr.includes(id)){
arr[arr.length] = id;
// add a reference to the register for fast updates
if(this.fastupdate){
const tmp = this.register[id] || (this.register[id] = []);
tmp[tmp.length] = arr;
}
}
}
}
/**
* @param {string|Object} query
* @param {number|Object=} limit
* @param {Object=} options
* @returns {Array<number|string>}
*/
Index.prototype.search = function(query, limit, options){
if(!options){
if(!limit && is_object(query)){
options = /** @type {Object} */ (query);
query = options["query"];
}
else if(is_object(limit)){
options = /** @type {Object} */ (limit);
}
}
let result = [];
let length;
let context, suggest, offset = 0;
if(options){
query = options["query"] || query;
limit = options["limit"];
offset = options["offset"] || 0;
context = options["context"];
suggest = SUPPORT_SUGGESTION && options["suggest"];
}
if(query){
query = /** @type {Array} */ (this.encode("" + query));
length = query.length;
// TODO: solve this in one single loop below
if(length > 1){
const dupes = create_object();
const query_new = [];
for(let i = 0, count = 0, term; i < length; i++){
term = query[i];
if(term && (term.length >= this.minlength) && !dupes[term]){
// this fast path can just apply when not in memory-optimized mode
if(!this.optimize && !suggest && !this.map[term]){
// fast path "not found"
return result;
}
else{
query_new[count++] = term;
dupes[term] = 1;
}
}
}
query = query_new;
length = query.length;
}
}
if(!length){
return result;
}
limit || (limit = 100);
let depth = this.depth && (length > 1) && (context !== false);
let index = 0, keyword;
if(depth){
keyword = query[0];
index = 1;
}
else{
if(length > 1){
query.sort(sort_by_length_down);
}
}
for(let arr, term; index < length; index++){
term = query[index];
// console.log(keyword);
// console.log(term);
// console.log("");
if(depth){
arr = this.add_result(result, suggest, limit, offset, length === 2, term, keyword);
// console.log(arr);
// console.log(result);
// when suggestion enabled just forward keyword if term was found
// as long as the result is empty forward the pointer also
if(!suggest || (arr !== false) || !result.length){
keyword = term;
}
}
else{
arr = this.add_result(result, suggest, limit, offset, length === 1, term);
}
if(arr){
return /** @type {Array<number|string>} */ (arr);
}
// apply suggestions on last loop or fallback
if(suggest && (index === length - 1)){
let length = result.length;
if(!length){
if(depth){
// fallback to non-contextual search when no result was found
depth = 0;
index = -1;
continue;
}
return result;
}
else if(length === 1){
// fast path optimization
return single_result(result[0], limit, offset);
}
}
}
return intersect(result, limit, offset, suggest);
};
/**
* Returns an array when the result is done (to stop the process immediately),
* returns false when suggestions is enabled and no result was found,
* or returns nothing when a set was pushed successfully to the results
*
* @private
* @param {Array} result
* @param {Array} suggest
* @param {number} limit
* @param {number} offset
* @param {boolean} single_term
* @param {string} term
* @param {string=} keyword
* @return {Array<Array<string|number>>|boolean|undefined}
*/
Index.prototype.add_result = function(result, suggest, limit, offset, single_term, term, keyword){
let word_arr = [];
let arr = keyword ? this.ctx : this.map;
if(!this.optimize){
arr = get_array(arr, term, keyword, this.bidirectional);
}
if(arr){
let count = 0;
const arr_len = Math.min(arr.length, keyword ? this.resolution_ctx : this.resolution);
// relevance:
for(let x = 0, size = 0, tmp, len; x < arr_len; x++){
tmp = arr[x];
if(tmp){
if(this.optimize){
tmp = get_array(tmp, term, keyword, this.bidirectional);
}
if(offset){
if(tmp && single_term){
len = tmp.length;
if(len <= offset){
offset -= len;
tmp = null;
}
else{
tmp = tmp.slice(offset);
offset = 0;
}
}
}
if(tmp){
// keep score (sparse array):
//word_arr[x] = tmp;
// simplified score order:
word_arr[count++] = tmp;
if(single_term){
size += tmp.length;
if(size >= limit){
// fast path optimization
break;
}
}
}
}
}
if(count){
if(single_term){
// fast path optimization
// offset was already applied at this point
return single_result(word_arr, limit, 0);
}
result[result.length] = word_arr;
return;
}
}
// return an empty array will stop the loop,
// to prevent stop when using suggestions return a false value
return !suggest && word_arr;
};
function single_result(result, limit, offset){
if(result.length === 1){
result = result[0];
}
else{
result = concat(result);
}
return offset || (result.length > limit) ?
result.slice(offset, offset + limit)
:
result;
}
function get_array(arr, term, keyword, bidirectional){
if(keyword){
// the frequency of the starting letter is slightly less
// on the last half of the alphabet (m-z) in almost every latin language,
// so we sort downwards (https://en.wikipedia.org/wiki/Letter_frequency)
const swap = bidirectional && (term > keyword);
arr = arr[swap ? term : keyword];
arr = arr && arr[swap ? keyword : term];
}
else{
arr = arr[term];
}
return arr;
}
Index.prototype.contain = function(id){
return !!this.register[id];
};
Index.prototype.update = function(id, content){
return this.remove(id).add(id, content);
};
/**
* @param {boolean=} _skip_deletion
*/
Index.prototype.remove = function(id, _skip_deletion){
const refs = this.register[id];
if(refs){
if(this.fastupdate){
// fast updates performs really fast but did not fully cleanup the key entries
for(let i = 0, tmp; i < refs.length; i++){
tmp = refs[i];
tmp.splice(tmp.indexOf(id), 1);
}
}
else{
remove_index(this.map, id, this.resolution, this.optimize);
if(this.depth){
remove_index(this.ctx, id, this.resolution_ctx, this.optimize);
}
}
_skip_deletion || delete this.register[id];
if(SUPPORT_CACHE && this.cache){
this.cache.del(id);
}
}
return this;
};
/**
* @param map
* @param id
* @param res
* @param optimize
* @param {number=} resolution
* @return {number}
*/
function remove_index(map, id, res, optimize, resolution){
let count = 0;
if(is_array(map)){
// the first array is the score array in both strategies
if(!resolution){
resolution = Math.min(map.length, res);
for(let x = 0, arr; x < resolution; x++){
arr = map[x];
if(arr){
count = remove_index(arr, id, res, optimize, resolution);
if(!optimize && !count){
// when not memory optimized the score index should removed
delete map[x];
}
}
}
}
else{
const pos = map.indexOf(id);
if(pos !== -1){
// fast path, when length is 1 or lower then the whole field gets deleted
if(map.length > 1){
map.splice(pos, 1);
count++;
}
}
else{
count++;
}
}
}
else{
for(let key in map){
count = remove_index(map[key], id, res, optimize, resolution);
if(!count){
delete map[key];
}
}
}
return count;
}
if(SUPPORT_CACHE){
Index.prototype.searchCache = searchCache;
}
if(SUPPORT_SERIALIZE){
Index.prototype.export = exportIndex;
Index.prototype.import = importIndex;
}
if(SUPPORT_ASYNC){
apply_async(Index.prototype);
}