// minisearch
// Tiny but powerful full-text search engine for browser and Node
/** @ignore */
const ENTRIES = 'ENTRIES';
/** @ignore */
const KEYS = 'KEYS';
/** @ignore */
const VALUES = 'VALUES';
/** @ignore */
const LEAF = '';
/**
* @private
*/
class TreeIterator {
constructor(set, type) {
const node = set._tree;
const keys = Array.from(node.keys());
this.set = set;
this._type = type;
this._path = keys.length > 0 ? [{ node, keys }] : [];
}
next() {
const value = this.dive();
this.backtrack();
return value;
}
dive() {
if (this._path.length === 0) {
return { done: true, value: undefined };
}
const { node, keys } = last$1(this._path);
if (last$1(keys) === LEAF) {
return { done: false, value: this.result() };
}
const child = node.get(last$1(keys));
this._path.push({ node: child, keys: Array.from(child.keys()) });
return this.dive();
}
backtrack() {
if (this._path.length === 0) {
return;
}
const keys = last$1(this._path).keys;
keys.pop();
if (keys.length > 0) {
return;
}
this._path.pop();
this.backtrack();
}
key() {
return this.set._prefix + this._path
.map(({ keys }) => last$1(keys))
.filter(key => key !== LEAF)
.join('');
}
value() {
return last$1(this._path).node.get(LEAF);
}
result() {
switch (this._type) {
case VALUES: return this.value();
case KEYS: return this.key();
default: return [this.key(), this.value()];
}
}
[Symbol.iterator]() {
return this;
}
}
const last$1 = (array) => {
return array[array.length - 1];
};
/* eslint-disable no-labels */
/**
* @ignore
*/
const fuzzySearch = (node, query, maxDistance) => {
const results = new Map();
if (query === undefined)
return results;
// Number of columns in the Levenshtein matrix.
const n = query.length + 1;
// Matching terms can never be longer than N + maxDistance.
const m = n + maxDistance;
// Fill first matrix row and column with numbers: 0 1 2 3 ...
const matrix = new Uint8Array(m * n).fill(maxDistance + 1);
for (let j = 0; j < n; ++j)
matrix[j] = j;
for (let i = 1; i < m; ++i)
matrix[i * n] = i;
recurse(node, query, maxDistance, results, matrix, 1, n, '');
return results;
};
// Modified version of http://stevehanov.ca/blog/?id=114
// This builds a Levenshtein matrix for a given query and continuously updates
// it for nodes in the radix tree that fall within the given maximum edit
// distance. Keeping the same matrix around is beneficial especially for larger
// edit distances.
//
//           k   a   t   e   <-- query
//       0   1   2   3   4
//   c   1   1   2   3   4
//   a   2   2   1   2   3
//   t   3   3   2   1  [2]  <-- edit distance
//                       ^
//                       ^  term in radix tree, rows are added and removed as needed
const recurse = (node, query, maxDistance, results, matrix, m, n, prefix) => {
const offset = m * n;
key: for (const key of node.keys()) {
if (key === LEAF) {
// We've reached a leaf node. Check if the edit distance is acceptable and
// store the result if it is.
const distance = matrix[offset - 1];
if (distance <= maxDistance) {
results.set(prefix, [node.get(key), distance]);
}
}
else {
// Iterate over all characters in the key. Update the Levenshtein matrix
// and check if the minimum distance in the last row is still within the
// maximum edit distance. If it is, we can recurse over all child nodes.
let i = m;
for (let pos = 0; pos < key.length; ++pos, ++i) {
const char = key[pos];
const thisRowOffset = n * i;
const prevRowOffset = thisRowOffset - n;
// Set the first column based on the previous row, and initialize the
// minimum distance in the current row.
let minDistance = matrix[thisRowOffset];
const jmin = Math.max(0, i - maxDistance - 1);
const jmax = Math.min(n - 1, i + maxDistance);
// Iterate over remaining columns (characters in the query).
for (let j = jmin; j < jmax; ++j) {
const different = char !== query[j];
// It might make sense to only read the matrix positions used for
// deletion/insertion if the characters are different. But we want to
// avoid conditional reads for performance reasons.
const rpl = matrix[prevRowOffset + j] + +different;
const del = matrix[prevRowOffset + j + 1] + 1;
const ins = matrix[thisRowOffset + j] + 1;
const dist = matrix[thisRowOffset + j + 1] = Math.min(rpl, del, ins);
if (dist < minDistance)
minDistance = dist;
}
// Because distance will never decrease, we can stop. There will be no
// matching child nodes.
if (minDistance > maxDistance) {
continue key;
}
}
recurse(node.get(key), query, maxDistance, results, matrix, i, n, prefix + key);
}
}
};
/* eslint-disable no-labels */
/**
* A class implementing the same interface as a standard JavaScript
* [`Map`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Map)
* with string keys, but adding support for efficiently searching entries with
* prefix or fuzzy search. This class is used internally by {@link MiniSearch}
* as the inverted index data structure. The implementation is a radix tree
* (compressed prefix tree).
*
* Since this class can be of general utility beyond _MiniSearch_, it is
* exported by the `minisearch` package and can be imported (or required) as
* `minisearch/SearchableMap`.
*
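* ### Usage:
*
* ```javascript
* // A minimal sketch, assuming the default export of 'minisearch/SearchableMap'
* import SearchableMap from 'minisearch/SearchableMap'
*
* const map = new SearchableMap()
* map.set('hello', 1)
* map.get('hello')          // => 1
* map.fuzzyGet('hallo', 1)  // => Map(1) { 'hello' => [1, 1] }
* ```
*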
* @typeParam T The type of the values stored in the map.
*/
class SearchableMap {
/**
* The constructor is normally called without arguments, creating an empty
* map. In order to create a {@link SearchableMap} from an iterable or from an
* object, check {@link SearchableMap.from} and {@link
* SearchableMap.fromObject}.
*
* The constructor arguments are for internal use, when creating derived
* mutable views of a map at a prefix.
*/
constructor(tree = new Map(), prefix = '') {
this._size = undefined;
this._tree = tree;
this._prefix = prefix;
}
/**
* Creates and returns a mutable view of this {@link SearchableMap},
* containing only entries that share the given prefix.
*
* ### Usage:
*
* ```javascript
* let map = new SearchableMap()
* map.set("unicorn", 1)
* map.set("universe", 2)
* map.set("university", 3)
* map.set("unique", 4)
* map.set("hello", 5)
*
* let uni = map.atPrefix("uni")
* uni.get("unique") // => 4
* uni.get("unicorn") // => 1
* uni.get("hello") // => undefined
*
* let univer = map.atPrefix("univer")
* univer.get("unique") // => undefined
* univer.get("universe") // => 2
* univer.get("university") // => 3
* ```
*
* @param prefix The prefix
* @return A {@link SearchableMap} representing a mutable view of the original
* Map at the given prefix
*/
atPrefix(prefix) {
if (!prefix.startsWith(this._prefix)) {
throw new Error('Mismatched prefix');
}
const [node, path] = trackDown(this._tree, prefix.slice(this._prefix.length));
if (node === undefined) {
const [parentNode, key] = last(path);
for (const k of parentNode.keys()) {
if (k !== LEAF && k.startsWith(key)) {
const node = new Map();
node.set(k.slice(key.length), parentNode.get(k));
return new SearchableMap(node, prefix);
}
}
}
return new SearchableMap(node, prefix);
}
/**
* @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Map/clear
*/
clear() {
this._size = undefined;
this._tree.clear();
}
/**
* @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Map/delete
* @param key Key to delete
*/
delete(key) {
this._size = undefined;
return remove(this._tree, key);
}
/**
* @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Map/entries
* @return An iterator iterating through `[key, value]` entries.
*/
entries() {
return new TreeIterator(this, ENTRIES);
}
/**
* @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Map/forEach
* @param fn Iteration function
*/
forEach(fn) {
for (const [key, value] of this) {
fn(key, value, this);
}
}
/**
* Returns a Map of all the entries that have a key within the given edit
* distance from the search key. The keys of the returned Map are the matching
* keys, while the values are two-element arrays where the first element is
* the value associated to the key, and the second is the edit distance of the
* key to the search key.
*
* ### Usage:
*
* ```javascript
* let map = new SearchableMap()
* map.set('hello', 'world')
* map.set('hell', 'yeah')
* map.set('ciao', 'mondo')
*
* // Get all entries that match the key 'hallo' with a maximum edit distance of 2
* map.fuzzyGet('hallo', 2)
* // => Map(2) { 'hello' => ['world', 1], 'hell' => ['yeah', 2] }
*
* // In the example, the "hello" key has value "world" and edit distance of 1
* // (change "e" to "a"), the key "hell" has value "yeah" and edit distance of 2
* // (change "e" to "a", delete "o")
* ```
*
* @param key The search key
* @param maxEditDistance The maximum edit distance (Levenshtein)
* @return A Map of the matching keys to their value and edit distance
*/
fuzzyGet(key, maxEditDistance) {
return fuzzySearch(this._tree, key, maxEditDistance);
}
/**
* @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Map/get
* @param key Key to get
* @return Value associated to the key, or `undefined` if the key is not
* found.
*/
get(key) {
const node = lookup(this._tree, key);
return node !== undefined ? node.get(LEAF) : undefined;
}
/**
* @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Map/has
* @param key Key
* @return True if the key is in the map, false otherwise
*/
has(key) {
const node = lookup(this._tree, key);
return node !== undefined && node.has(LEAF);
}
/**
* @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Map/keys
* @return An `Iterable` iterating through keys
*/
keys() {
return new TreeIterator(this, KEYS);
}
/**
* @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Map/set
* @param key Key to set
* @param value Value to associate to the key
* @return The {@link SearchableMap} itself, to allow chaining
*/
set(key, value) {
if (typeof key !== 'string') {
throw new Error('key must be a string');
}
this._size = undefined;
const node = createPath(this._tree, key);
node.set(LEAF, value);
return this;
}
/**
* @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Map/size
*/
get size() {
if (this._size) {
return this._size;
}
/** @ignore */
this._size = 0;
const iter = this.entries();
while (!iter.next().done)
this._size += 1;
return this._size;
}
/**
* Updates the value at the given key using the provided function. The function
* is called with the current value at the key, and its return value is used as
* the new value to be set.
*
* ### Example:
*
* ```javascript
* // Increment the current value by one
* searchableMap.update('somekey', (currentValue) => currentValue == null ? 0 : currentValue + 1)
* ```
*
* If the value at the given key is or will be an object, it might not require
* re-assignment. In that case it is better to use `fetch()`, because it is
* faster.
*
* @param key The key to update
* @param fn The function used to compute the new value from the current one
* @return The {@link SearchableMap} itself, to allow chaining
*/
update(key, fn) {
if (typeof key !== 'string') {
throw new Error('key must be a string');
}
this._size = undefined;
const node = createPath(this._tree, key);
node.set(LEAF, fn(node.get(LEAF)));
return this;
}
/**
* Fetches the value of the given key. If the value does not exist, calls the
* given function to create a new value, which is inserted at the given key
* and subsequently returned.
*
* ### Example:
*
* ```javascript
* const map = searchableMap.fetch('somekey', () => new Map())
* map.set('foo', 'bar')
* ```
*
* @param key The key to update
* @param initial A function that creates a new value if the key does not exist
* @return The existing or new value at the given key
*/
fetch(key, initial) {
if (typeof key !== 'string') {
throw new Error('key must be a string');
}
this._size = undefined;
const node = createPath(this._tree, key);
let value = node.get(LEAF);
if (value === undefined) {
node.set(LEAF, value = initial());
}
return value;
}
/**
* @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Map/values
* @return An `Iterable` iterating through values.
*/
values() {
return new TreeIterator(this, VALUES);
}
/**
* @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Map/@@iterator
*/
[Symbol.iterator]() {
return this.entries();
}
/**
* Creates a {@link SearchableMap} from an `Iterable` of entries
*
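* ### Example:
*
* ```javascript
* // For illustration: build a map from an array of [key, value] pairs
* const map = SearchableMap.from([['hello', 1], ['world', 2]])
* map.get('world') // => 2
* ```
*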
* @param entries Entries to be inserted in the {@link SearchableMap}
* @return A new {@link SearchableMap} with the given entries
*/
static from(entries) {
const tree = new SearchableMap();
for (const [key, value] of entries) {
tree.set(key, value);
}
return tree;
}
/**
* Creates a {@link SearchableMap} from the iterable properties of a JavaScript object
*
* @param object Object of entries for the {@link SearchableMap}
* @return A new {@link SearchableMap} with the given entries
*/
static fromObject(object) {
return SearchableMap.from(Object.entries(object));
}
}
const trackDown = (tree, key, path = []) => {
if (key.length === 0 || tree == null) {
return [tree, path];
}
for (const k of tree.keys()) {
if (k !== LEAF && key.startsWith(k)) {
path.push([tree, k]); // performance: update in place
return trackDown(tree.get(k), key.slice(k.length), path);
}
}
path.push([tree, key]); // performance: update in place
return trackDown(undefined, '', path);
};
const lookup = (tree, key) => {
if (key.length === 0 || tree == null) {
return tree;
}
for (const k of tree.keys()) {
if (k !== LEAF && key.startsWith(k)) {
return lookup(tree.get(k), key.slice(k.length));
}
}
};
// Creates a path in the radix tree for the given key, and returns the deepest
// node. This function is in the hot path for indexing. It avoids unnecessary
// string operations and recursion for performance.
const createPath = (node, key) => {
const keyLength = key.length;
outer: for (let pos = 0; node && pos < keyLength;) {
for (const k of node.keys()) {
// Check whether this key is a candidate: the first characters must match.
if (k !== LEAF && key[pos] === k[0]) {
const len = Math.min(keyLength - pos, k.length);
// Advance offset to the point where key and k no longer match.
let offset = 1;
while (offset < len && key[pos + offset] === k[offset])
++offset;
const child = node.get(k);
if (offset === k.length) {
// The existing key is shorter than the key we need to create.
node = child;
}
else {
// Partial match: we need to insert an intermediate node to contain
// both the existing subtree and the new node.
const intermediate = new Map();
intermediate.set(k.slice(offset), child);
node.set(key.slice(pos, pos + offset), intermediate);
node.delete(k);
node = intermediate;
}
pos += offset;
continue outer;
}
}
// Create a new child node to contain the remaining suffix of the key.
const child = new Map();
node.set(key.slice(pos), child);
return child;
}
return node;
};
const remove = (tree, key) => {
const [node, path] = trackDown(tree, key);
if (node === undefined) {
return;
}
node.delete(LEAF);
if (node.size === 0) {
cleanup(path);
}
else if (node.size === 1) {
const [key, value] = node.entries().next().value;
merge(path, key, value);
}
};
const cleanup = (path) => {
if (path.length === 0) {
return;
}
const [node, key] = last(path);
node.delete(key);
if (node.size === 0) {
cleanup(path.slice(0, -1));
}
else if (node.size === 1) {
const [key, value] = node.entries().next().value;
if (key !== LEAF) {
merge(path.slice(0, -1), key, value);
}
}
};
const merge = (path, key, value) => {
if (path.length === 0) {
return;
}
const [node, nodeKey] = last(path);
node.set(nodeKey + key, value);
node.delete(nodeKey);
};
const last = (array) => {
return array[array.length - 1];
};
const OR = 'or';
const AND = 'and';
const AND_NOT = 'and_not';
/**
* {@link MiniSearch} is the main entrypoint class, implementing a full-text
* search engine in memory.
*
* @typeParam T The type of the documents being indexed.
*
* ### Basic example:
*
* ```javascript
* const documents = [
* {
* id: 1,
* title: 'Moby Dick',
* text: 'Call me Ishmael. Some years ago...',
* category: 'fiction'
* },
* {
* id: 2,
* title: 'Zen and the Art of Motorcycle Maintenance',
* text: 'I can see by my watch...',
* category: 'fiction'
* },
* {
* id: 3,
* title: 'Neuromancer',
* text: 'The sky above the port was...',
* category: 'fiction'
* },
* {
* id: 4,
* title: 'Zen and the Art of Archery',
* text: 'At first sight it must seem...',
* category: 'non-fiction'
* },
* // ...and more
* ]
*
* // Create a search engine that indexes the 'title' and 'text' fields for
* // full-text search. Search results will include 'title' and 'category' (plus the
* // id field, that is always stored and returned)
* const miniSearch = new MiniSearch({
* fields: ['title', 'text'],
* storeFields: ['title', 'category']
* })
*
* // Add documents to the index
* miniSearch.addAll(documents)
*
* // Search for documents:
* let results = miniSearch.search('zen art motorcycle')
* // => [
* // { id: 2, title: 'Zen and the Art of Motorcycle Maintenance', category: 'fiction', score: 2.77258 },
* // { id: 4, title: 'Zen and the Art of Archery', category: 'non-fiction', score: 1.38629 }
* // ]
* ```
*/
class MiniSearch {
/**
* @param options Configuration options
*
* ### Examples:
*
* ```javascript
* // Create a search engine that indexes the 'title' and 'text' fields of your
* // documents:
* const miniSearch = new MiniSearch({ fields: ['title', 'text'] })
* ```
*
* ### ID Field:
*
* ```javascript
* // Your documents are assumed to include a unique 'id' field, but if you want
* // to use a different field for document identification, you can set the
* // 'idField' option:
* const miniSearch = new MiniSearch({ idField: 'key', fields: ['title', 'text'] })
* ```
*
* ### Options and defaults:
*
* ```javascript
* // The full set of options (here with their default value) is:
* const miniSearch = new MiniSearch({
* // idField: field that uniquely identifies a document
* idField: 'id',
*
* // extractField: function used to get the value of a field in a document.
* // By default, it assumes the document is a flat object with field names as
* // property keys and field values as string property values, but custom logic
* // can be implemented by setting this option to a custom extractor function.
* extractField: (document, fieldName) => document[fieldName],
*
* // tokenize: function used to split fields into individual terms. By
* // default, it is also used to tokenize search queries, unless a specific
* // `tokenize` search option is supplied. When tokenizing an indexed field,
* // the field name is passed as the second argument.
* tokenize: (string, _fieldName) => string.split(SPACE_OR_PUNCTUATION),
*
* // processTerm: function used to process each tokenized term before
* // indexing. It can be used for stemming and normalization. Return a falsy
* // value in order to discard a term. By default, it is also used to process
* // search queries, unless a specific `processTerm` option is supplied as a
* // search option. When processing a term from a indexed field, the field
* // name is passed as the second argument.
* processTerm: (term, _fieldName) => term.toLowerCase(),
*
* // searchOptions: default search options, see the `search` method for
* // details
* searchOptions: undefined,
*
* // fields: document fields to be indexed. Mandatory, but not set by default
* fields: undefined
*
* // storeFields: document fields to be stored and returned as part of the
* // search results.
* storeFields: []
* })
* ```
*/
constructor(options) {
if ((options === null || options === void 0 ? void 0 : options.fields) == null) {
throw new Error('MiniSearch: option "fields" must be provided');
}
const autoVacuum = (options.autoVacuum == null || options.autoVacuum === true) ? defaultAutoVacuumOptions : options.autoVacuum;
this._options = {
...defaultOptions,
...options,
autoVacuum,
searchOptions: { ...defaultSearchOptions, ...(options.searchOptions || {}) },
autoSuggestOptions: { ...defaultAutoSuggestOptions, ...(options.autoSuggestOptions || {}) }
};
this._index = new SearchableMap();
this._documentCount = 0;
this._documentIds = new Map();
this._idToShortId = new Map();
// Fields are defined during initialization, don't change, are few in
// number, rarely need iterating over, and have string keys. Therefore in
// this case an object is a better candidate than a Map to store the mapping
// from field key to ID.
this._fieldIds = {};
this._fieldLength = new Map();
this._avgFieldLength = [];
this._nextId = 0;
this._storedFields = new Map();
this._dirtCount = 0;
this._currentVacuum = null;
this._enqueuedVacuum = null;
this._enqueuedVacuumConditions = defaultVacuumConditions;
this.addFields(this._options.fields);
}
/**
* Adds a document to the index
*
* @param document The document to be indexed
*/
add(document) {
const { extractField, tokenize, processTerm, fields, idField } = this._options;
const id = extractField(document, idField);
if (id == null) {
throw new Error(`MiniSearch: document does not have ID field "${idField}"`);
}
if (this._idToShortId.has(id)) {
throw new Error(`MiniSearch: duplicate ID ${id}`);
}
const shortDocumentId = this.addDocumentId(id);
this.saveStoredFields(shortDocumentId, document);
for (const field of fields) {
const fieldValue = extractField(document, field);
if (fieldValue == null)
continue;
const tokens = tokenize(fieldValue.toString(), field);
const fieldId = this._fieldIds[field];
const uniqueTerms = new Set(tokens).size;
this.addFieldLength(shortDocumentId, fieldId, this._documentCount - 1, uniqueTerms);
for (const term of tokens) {
const processedTerm = processTerm(term, field);
if (Array.isArray(processedTerm)) {
for (const t of processedTerm) {
this.addTerm(fieldId, shortDocumentId, t);
}
}
else if (processedTerm) {
this.addTerm(fieldId, shortDocumentId, processedTerm);
}
}
}
}
/**
* Adds all the given documents to the index
*
* @param documents An array of documents to be indexed
*/
addAll(documents) {
for (const document of documents)
this.add(document);
}
/**
* Adds all the given documents to the index asynchronously.
*
* Returns a promise that resolves (to `undefined`) when the indexing is done.
* This method is useful when indexing many documents, to avoid blocking the main
* thread. The indexing is performed asynchronously and in chunks.
*
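* ### Example:
*
* ```javascript
* // A sketch: index a large collection in chunks of 100 documents, yielding
* // to the event loop between chunks (`documents` is assumed to be an array
* // of objects with an `id` field plus the indexed fields)
* miniSearch.addAllAsync(documents, { chunkSize: 100 })
*   .then(() => console.log('indexing done'))
* ```
*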
* @param documents An array of documents to be indexed
* @param options Configuration options
* @return A promise resolving to `undefined` when the indexing is done
*/
addAllAsync(documents, options = {}) {
const { chunkSize = 10 } = options;
const acc = { chunk: [], promise: Promise.resolve() };
const { chunk, promise } = documents.reduce(({ chunk, promise }, document, i) => {
chunk.push(document);
if ((i + 1) % chunkSize === 0) {
return {
chunk: [],
promise: promise
.then(() => new Promise(resolve => setTimeout(resolve, 0)))
.then(() => this.addAll(chunk))
};
}
else {
return { chunk, promise };
}
}, acc);
return promise.then(() => this.addAll(chunk));
}
/**
* Removes the given document from the index.
*
* The document to remove must NOT have changed between indexing and removal,
* otherwise the index will be corrupted.
*
* This method requires passing the full document to be removed (not just the
* ID), and immediately removes the document from the inverted index, allowing
* memory to be released. A convenient alternative is {@link
* MiniSearch#discard}, which needs only the document ID, and has the same
* visible effect, but delays cleaning up the index until the next vacuuming.
*
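* ### Example:
*
* ```javascript
* // Illustrative only: the exact same (unchanged) document must be passed
* const doc = { id: 1, title: 'Moby Dick', text: 'Call me Ishmael. Some years ago...' }
* miniSearch.add(doc)
*
* // ...later:
* miniSearch.remove(doc)
* ```
*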
* @param document The document to be removed
*/
remove(document) {
const { tokenize, processTerm, extractField, fields, idField } = this._options;
const id = extractField(document, idField);
if (id == null) {
throw new Error(`MiniSearch: document does not have ID field "${idField}"`);
}
const shortId = this._idToShortId.get(id);
if (shortId == null) {
throw new Error(`MiniSearch: cannot remove document with ID ${id}: it is not in the index`);
}
for (const field of fields) {
const fieldValue = extractField(document, field);
if (fieldValue == null)
continue;
const tokens = tokenize(fieldValue.toString(), field);
const fieldId = this._fieldIds[field];
const uniqueTerms = new Set(tokens).size;
this.removeFieldLength(shortId, fieldId, this._documentCount, uniqueTerms);
for (const term of tokens) {
const processedTerm = processTerm(term, field);
if (Array.isArray(processedTerm)) {
for (const t of processedTerm) {
this.removeTerm(fieldId, shortId, t);
}
}
else if (processedTerm) {
this.removeTerm(fieldId, shortId, processedTerm);
}
}
}
this._storedFields.delete(shortId);
this._documentIds.delete(shortId);
this._idToShortId.delete(id);
this._fieldLength.delete(shortId);
this._documentCount -= 1;
}
/**
* Removes all the given documents from the index. If called with no arguments,
* it removes _all_ documents from the index.
*
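* ### Example:
*
* ```javascript
* // Illustrative sketch: `docA` and `docB` stand for previously added,
* // unchanged documents
* miniSearch.removeAll([docA, docB])
*
* // Remove every document from the index (more efficient than passing all)
* miniSearch.removeAll()
* ```
*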
* @param documents The documents to be removed. If this argument is omitted,
* all documents are removed. Note that, for removing all documents, it is
* more efficient to call this method with no arguments than to pass all
* documents.
*/
removeAll(documents) {
if (documents) {
for (const document of documents)
this.remove(document);
}
else if (arguments.length > 0) {
throw new Error('Expected documents to be present. Omit the argument to remove all documents.');
}
else {
this._index = new SearchableMap();
this._documentCount = 0;
this._documentIds = new Map();
this._idToShortId = new Map();
this._fieldLength = new Map();
this._avgFieldLength = [];
this._storedFields = new Map();
this._nextId = 0;
}
}
/**
* Discards the document with the given ID, so it won't appear in search results
*
* It has the same visible effect as {@link MiniSearch#remove} (both cause the
* document to stop appearing in searches), but a different effect on the
* internal data structures:
*
* - {@link MiniSearch#remove} requires passing the full document to be
* removed as argument, and removes it from the inverted index immediately.
*
* - {@link MiniSearch#discard} instead only needs the document ID, and
* works by marking the current version of the document as discarded, so it
* is immediately ignored by searches. This is faster and more convenient
* than {@link MiniSearch#remove}, but the index is not immediately
* modified. To take care of that, vacuuming is performed after a certain
* number of documents are discarded, cleaning up the index and allowing
* memory to be released.
*
* After discarding a document, it is possible to re-add a new version, and
* only the new version will appear in searches. In other words, discarding
* and re-adding a document works exactly like removing and re-adding it. The
* {@link MiniSearch.replace} method can also be used to replace a document
* with a new version.
*
* #### Details about vacuuming
*
* Repeated calls to this method would leave obsolete document references in
* the index, invisible to searches. Two mechanisms take care of cleaning up:
* clean up during search, and vacuuming.
*
* - Upon search, whenever a discarded ID is found (and ignored for the
* results), references to the discarded document are removed from the
* inverted index entries for the search terms. This ensures that subsequent
* searches for the same terms do not need to skip these obsolete references
* again.
*
* - In addition, vacuuming is performed automatically by default (see the
* `autoVacuum` field in {@link Options}) after a certain number of
* documents are discarded. Vacuuming traverses all terms in the index,
* cleaning up all references to discarded documents. Vacuuming can also be
* triggered manually by calling {@link MiniSearch#vacuum}.
*
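* ### Example:
*
* ```javascript
* // Illustrative: discard document 1, then index an updated version with the
* // same ID; only the new version will appear in search results
* miniSearch.discard(1)
* miniSearch.add({ id: 1, title: 'Moby Dick', text: 'Call me Ishmael...' })
* ```
*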
* @param id The ID of the document to be discarded
*/
discard(id) {
const shortId = this._idToShortId.get(id);
if (shortId == null) {
throw new Error(`MiniSearch: cannot discard document with ID ${id}: it is not in the index`);
}
this._idToShortId.delete(id);
this._documentIds.delete(shortId);
this._storedFields.delete(shortId);
(this._fieldLength.get(shortId) || []).forEach((fieldLength, fieldId) => {
this.removeFieldLength(shortId, fieldId, this._documentCount, fieldLength);
});
this._fieldLength.delete(shortId);
this._documentCount -= 1;
this._dirtCount += 1;
this.maybeAutoVacuum();
}
maybeAutoVacuum() {
if (this._options.autoVacuum === false) {
return;
}
const { minDirtFactor, minDirtCount, batchSize, batchWait } = this._options.autoVacuum;
this.conditionalVacuum({ batchSize, batchWait }, { minDirtCount, minDirtFactor });
}
/**
* Discards the documents with the given IDs, so they won't appear in search
* results
*
* It is equivalent to calling {@link MiniSearch#discard} for all the given
* IDs, but with the optimization of triggering at most one automatic
* vacuuming at the end.
*
* Note: to remove all documents from the index, it is faster and more
* convenient to call {@link MiniSearch.removeAll} with no argument, instead
* of passing all IDs to this method.
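*
* ### Example:
*
* ```javascript
* // Illustrative: discard two documents by ID, triggering at most one
* // automatic vacuuming at the end
* miniSearch.discardAll([2, 4])
* ```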
*/
discardAll(ids) {
const autoVacuum = this._options.autoVacuum;
try {
this._options.autoVacuum = false;
for (const id of ids) {
this.discard(id);
}
}
finally {
this._options.autoVacuum = autoVacuum;
}
this.maybeAutoVacuum();
}
/**
* It replaces an existing document with the given updated version
*
* It works by discarding the current version and adding the updated one, so
* it is functionally equivalent to calling {@link MiniSearch#discard}
* followed by {@link MiniSearch#add}. The ID of the updated document should
* be the same as the original one.
*
* Since it uses {@link MiniSearch#discard} internally, this method relies on
* vacuuming to clean up obsolete document references from the index, allowing
* memory to be released (see {@link MiniSearch#discard}).
*
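* ### Example:
*
* ```javascript
* // Illustrative: the ID must match the version already in the index
* miniSearch.replace({
*   id: 1,
*   title: 'Moby Dick',
*   text: 'Call me Ishmael. Some years ago...',
*   category: 'fiction'
* })
* ```
*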
* @param updatedDocument The updated document to replace the old version
* with
*/
replace(updatedDocument) {
const { idField, extractField } = this._options;
const id = extractField(updatedDocument, idField);
this.discard(id);
this.add(updatedDocument);
}
/**
* Triggers a manual vacuuming, cleaning up references to discarded documents
* from the inverted index
*
* Vacuuming is only useful for applications that use the {@link
* MiniSearch#discard} or {@link MiniSearch#replace} methods.
*
* By default, vacuuming is performed automatically when needed (controlled by
* the `autoVacuum` field in {@link Options}), so there is usually no need to
* call this method, unless one wants to make sure to perform vacuuming at a
* specific moment.
*
* Vacuuming traverses all terms in the inverted index in batches, and cleans
* up references to discarded documents from the posting list, allowing memory
* to be released.
*
* The method takes an optional object as argument with the following keys:
*
* - `batchSize`: the size of each batch (1000 by default)
*
* - `batchWait`: the number of milliseconds to wait between batches (10 by
* default)
*
* On large indexes, vacuuming could have a non-negligible cost: batching
* avoids blocking the thread for long, diluting this cost so that it does not
* negatively affect the application. Nonetheless, this method should only
* be called when necessary, and relying on automatic vacuuming is usually
* better.
*
* It returns a promise that resolves (to undefined) when the clean up is
* completed. If vacuuming is already ongoing at the time this method is
* called, a new one is enqueued immediately after the ongoing one, and a
* corresponding promise is returned. However, no more than one vacuuming is
* enqueued on top of the ongoing one, even if this method is called more
* times (enqueuing multiple ones would be useless).
*
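* ### Example:
*
* ```javascript
* // Illustrative values: clean up in batches of 500 terms, waiting 20 ms
* // between batches
* miniSearch.vacuum({ batchSize: 500, batchWait: 20 })
*   .then(() => console.log('vacuuming done'))
* ```
*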
* @param options Configuration options for the batch size and delay. See
* {@link VacuumOptions}.
*/
vacuum(options = {}) {
return this.conditionalVacuum(options);
}
conditionalVacuum(options, conditions) {
// If a vacuum is already ongoing, schedule another as soon as it finishes,
// unless there's already one enqueued. If one was already enqueued, do not
// enqueue another on top, but make sure that the conditions are the
// broadest.
if (this._currentVacuum) {
this._enqueuedVacuumConditions = this._enqueuedVacuumConditions && conditions;
if (this._enqueuedVacuum != null) {
return this._enqueuedVacuum;
}
this._enqueuedVacuum = this._currentVacuum.then(() => {
const conditions = this._enqueuedVacuumConditions;
this._enqueuedVacuumConditions = defaultVacuumConditions;
return this.performVacuuming(options, conditions);
});
return this._enqueuedVacuum;
}
if (this.vacuumConditionsMet(conditions) === false) {
return Promise.resolve();
}
this._currentVacuum = this.performVacuuming(options);
return this._currentVacuum;
}
async performVacuuming(options, conditions) {
const initialDirtCount = this._dirtCount;
if (this.vacuumConditionsMet(conditions)) {
const batchSize = options.batchSize || defaultVacuumOptions.batchSize;
const batchWait = options.batchWait || defaultVacuumOptions.batchWait;
let i = 1;
for (const [term, fieldsData] of this._index) {
for (const [fieldId, fieldIndex] of fieldsData) {
for (const [shortId] of fieldIndex) {
if (this._documentIds.has(shortId)) {
continue;
}
if (fieldIndex.size <= 1) {
fieldsData.delete(fieldId);
}
else {
fieldIndex.delete(shortId);
}
}
}
if (this._index.get(term).size === 0) {
this._index.delete(term);
}
if (i % batchSize === 0) {
await new Promise((resolve) => setTimeout(resolve, batchWait));
}
i += 1;
}
this._dirtCount -= initialDirtCount;
}
// Make the next lines always async, so they execute after this function returns
await null;
this._currentVacuum = this._enqueuedVacuum;
this._enqueuedVacuum = null;
}
vacuumConditionsMet(conditions) {
if (conditions == null) {
return true;
}
let { minDirtCount, minDirtFactor } = conditions;
minDirtCount = minDirtCount || defaultAutoVacuumOptions.minDirtCount;
minDirtFactor = minDirtFactor || defaultAutoVacuumOptions.minDirtFactor;
return this.dirtCount >= minDirtCount && this.dirtFactor >= minDirtFactor;
}
/**
* Is `true` if a vacuuming operation is ongoing, `false` otherwise
*/
get isVacuuming() {
return this._currentVacuum != null;
}
/**
* The number of documents discarded since the most recent vacuuming
*/
get dirtCount() {
return this._dirtCount;
}
/**
* A number between 0 and 1 giving an indication about the proportion of
* documents that are discarded, and can therefore be cleaned up by vacuuming.
* A value close to 0 means that the index is relatively clean, while a higher
* value means that the index is relatively dirty, and vacuuming could release
* memory.
*/
get dirtFactor() {
return this._dirtCount / (1 + this._documentCount + this._dirtCount);
}
/**
* Returns `true` if a document with the given ID is present in the index and
* available for search, `false` otherwise
*
* @param id The document ID
*/
has(id) {
return this._idToShortId.has(id);
}
/**
* Returns the stored fields (as configured in the `storeFields` constructor
* option) for the given document ID. Returns `undefined` if the document is
* not present in the index.
*
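* ### Example:
*
* ```javascript
* // Assuming the documents and `storeFields: ['title', 'category']` from the
* // basic example above
* miniSearch.getStoredFields(2)
* // => { title: 'Zen and the Art of Motorcycle Maintenance', category: 'fiction' }
* ```
*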
* @param id The document ID
*/
getStoredFields(id) {
const shortId = this._idToShortId.get(id);
if (shortId == null) {
return undefined;
}
return this._storedFields.get(shortId);
}
/**
* Search for documents matching the given search query.
*
* The result is a list of scored document IDs matching the query, sorted by
* descending score, and each including data about which terms were matched and
* in which fields.
*
* ### Basic usage:
*
* ```javascript
* // Search for "zen art motorcycle" with default options: terms have to match
* // exactly, and individual terms are joined with OR
* miniSearch.search('zen art motorcycle')
* // => [ { id: 2, score: 2.77258, match: { ... } }, { id: 4, score: 1.38629, match: { ... } } ]
* ```
*
* ### Restrict search to specific fields:
*
* ```javascript
* // Search only in the 'title' field
* miniSearch.search('zen', { fields: ['title'] })
* ```
*
* ### Field boosting:
*
* ```javascript
* // Boost a field
* miniSearch.search('zen', { boost: { title: 2 } })
* ```
*
* ### Prefix search:
*
* ```javascript
* // Search for "moto neuro" with prefix search (it will match documents
* // containing terms that start with "moto" or "neuro")
* miniSearch.search('moto neuro', { prefix: true })
* ```
*
* ### Fuzzy search:
*
* ```javascript
* // Search for "ismael" with fuzzy search (it will match documents containing
* // terms similar to "ismael", with a maximum edit distance of 0.2 term.length
* // (rounded to the nearest integer))
* miniSearch.search('ismael', { fuzzy: 0.2 })
* ```
*
* ### Combining strategies:
*
* ```javascript
* // Mix of exact match, prefix search, and fuzzy search
* miniSearch.search('ismael mob', {
* prefix: true,
* fuzzy: 0.2
* })
* ```
*
* ### Advanced prefix and fuzzy search:
*
* ```javascript
* // Perform fuzzy and prefix search depending on the search term. Here
* // performing prefix and fuzzy search only on terms longer than 3 characters
* miniSearch.search('ismael mob', {
* prefix: term => term.length > 3,
* fuzzy: term => term.length > 3 ? 0.2 : null
* })
* ```
*
* ### Combine with AND:
*
* ```javascript
* // Combine search terms with AND (to match only documents that contain both
* // "motorcycle" and "art")
* miniSearch.search('motorcycle art', { combineWith: 'AND' })
* ```
*
* ### Combine with AND_NOT:
*
* There is also an AND_NOT combinator, which finds documents that match the
* first term, but do not match any of the other terms. This combinator is
* rarely useful with simple queries, and is meant to be used with advanced
* query combinations (see later for more details).
*
* ### Filtering results:
*
* ```javascript
* // Filter only results in the 'fiction' category (assuming that 'category'
* // is a stored field)
* miniSearch.search('motorcycle art', {
* filter: (result) => result.category === 'fiction'
* })
* ```
*
* ### Wildcard query
*
* Searching for an empty string (assuming the default tokenizer) returns no
* results. Sometimes though, one needs to match all documents, like in a
* "wildcard" search. This is possible by passing the special value
* {@link MiniSearch.wildcard} as the query:
*
* ```javascript
* // Return search results for all documents
* miniSearch.search(MiniSearch.wildcard)
* ```
*
* Note that search options such as `filter` and `boostDocument` are still
* applied, influencing which results are returned, and their order:
*
* ```javascript
* // Return search results for all documents in the 'fiction' category
* miniSearch.search(MiniSearch.wildcard, {
* filter: (result) => result.category === 'fiction'
* })
* ```
*
* ### Advanced combination of queries:
*
* It is possible to combine different subqueries with OR, AND, and AND_NOT,
* and even with different search options, by passing a query expression
* tree object as the first argument, instead of a string.
*
* ```javascript
* // Search for documents that contain "zen" and ("motorcycle" or "archery")
* miniSearch.search({
* combineWith: 'AND',
* queries: [
* 'zen',
* {
* combineWith: 'OR',
* queries: ['motorcycle', 'archery']
* }
* ]
* })
*
* // Search for documents that contain ("apple" or "pear") but not "juice" and
* // not "tree"
* miniSearch.search({
* combineWith: 'AND_NOT',
* queries: [
* {
* combineWith: 'OR',
* queries: ['apple', 'pear']
* },
* 'juice',
* 'tree'
* ]
* })
* ```
*
* Each node in the expression tree can be either a string, or an object that
* supports all {@link SearchOptions} fields, plus a `queries` array field for
* subqueries.
*
* Note that, while this can become complicated to do by hand for complex or
* deeply nested queries, it provides a formalized expression tree API for
* external libraries that implement a parser for custom query languages.
*
* @param query Search query
* @param searchOptions Search options. Each option, if not given, defaults to the corresponding value of `searchOptions` given to the constructor, or to the library default.
*/
search(query, searchOptions = {}) {
const { searchOptions: globalSearchOptions } = this._options;
const searchOptionsWithDefaults = { ...globalSearchOptions, ...searchOptions };
const rawResults = this.executeQuery(query, searchOptions);
const results = [];
for (const [docId, { score, terms, match }] of rawResults) {
// terms are the matched query terms, which will be returned to the user
// as queryTerms. The quality is calculated based on them, as opposed to
// the matched terms in the document (which can be different due to
// prefix and fuzzy match)
const quality = terms.length || 1;
const result = {
id: this._documentIds.get(docId),
score: score * quality,
terms: Object.keys(match),
queryTerms: terms,
match
};
Object.assign(result, this._storedFields.get(docId));
if (searchOptionsWithDefaults.filter == null || searchOptionsWithDefaults.filter(result)) {
results.push(result);
}
}
// If it's a wildcard query, and no document boost is applied, skip sorting
// the results, as all results have the same score of 1
if (query === MiniSearch.wildcard && searchOptionsWithDefaults.boostDocument == null) {
return results;
}
results.sort(byScore);
return results;
}
/**
*