UNPKG

minisearch

Version:

Tiny but powerful full-text search engine for the browser and Node.js

1,384 lines (1,230 loc) 77.6 kB
import SearchableMap from './SearchableMap/SearchableMap' export type LowercaseCombinationOperator = 'or' | 'and' | 'and_not' export type CombinationOperator = LowercaseCombinationOperator | Uppercase<LowercaseCombinationOperator> | Capitalize<LowercaseCombinationOperator> const OR: LowercaseCombinationOperator = 'or' const AND: LowercaseCombinationOperator = 'and' const AND_NOT: LowercaseCombinationOperator = 'and_not' /** * Search options to customize the search behavior. */ export type SearchOptions = { /** * Names of the fields to search in. If omitted, all fields are searched. */ fields?: string[], /** * Function used to filter search results, for example on the basis of stored * fields. It takes as argument each search result and should return a boolean * to indicate if the result should be kept or not. */ filter?: (result: SearchResult) => boolean, /** * Key-value object of field names to boosting values. By default, fields are * assigned a boosting factor of 1. If one assigns to a field a boosting value * of 2, a result that matches the query in that field is assigned a score * twice as high as a result matching the query in another field, all else * being equal. */ boost?: { [fieldName: string]: number }, /** * Function to calculate a boost factor for each term. * * This function, if provided, is called for each query term (as split by * `tokenize` and processed by `processTerm`). The arguments passed to the * function are the query term, the positional index of the term in the query, * and the array of all query terms. It is expected to return a numeric boost * factor for the term. A factor lower than 1 reduces the importance of the * term, a factor greater than 1 increases it. A factor of exactly 1 is * neutral, and does not affect the term's importance. */ boostTerm?: (term: string, i: number, terms: string[]) => number, /** * Relative weights to assign to prefix search results and fuzzy search * results. Exact matches are assigned a weight of 1. 
*/ weights?: { fuzzy: number, prefix: number }, /** * Function to calculate a boost factor for documents. It takes as arguments * the document ID, and a term that matches the search in that document, and * the value of the stored fields for the document (if any). It should return * a boosting factor: a number higher than 1 increases the computed score, a * number lower than 1 decreases the score, and a falsy value skips the search * result completely. */ boostDocument?: (documentId: any, term: string, storedFields?: Record<string, unknown>) => number, /** * Controls whether to perform prefix search. It can be a simple boolean, or a * function. * * If a boolean is passed, prefix search is performed if true. * * If a function is passed, it is called upon search with a search term, the * positional index of that search term in the tokenized search query, and the * tokenized search query. The function should return a boolean to indicate * whether to perform prefix search for that search term. */ prefix?: boolean | ((term: string, index: number, terms: string[]) => boolean), /** * Controls whether to perform fuzzy search. It can be a simple boolean, or a * number, or a function. * * If a boolean is given, fuzzy search with a default fuzziness parameter is * performed if true. * * If a number higher or equal to 1 is given, fuzzy search is performed, with * a maximum edit distance (Levenshtein) equal to the number. * * If a number between 0 and 1 is given, fuzzy search is performed within a * maximum edit distance corresponding to that fraction of the term length, * approximated to the nearest integer. For example, 0.2 would mean an edit * distance of 20% of the term length, so 1 character in a 5-characters term. * The calculated fuzziness value is limited by the `maxFuzzy` option, to * prevent slowdown for very long queries. 
* * If a function is passed, the function is called upon search with a search * term, a positional index of that term in the tokenized search query, and * the tokenized search query. It should return a boolean or a number, with * the meaning documented above. */ fuzzy?: boolean | number | ((term: string, index: number, terms: string[]) => boolean | number), /** * Controls the maximum fuzziness when using a fractional fuzzy value. This is * set to 6 by default. Very high edit distances usually don't produce * meaningful results, but can excessively impact search performance. */ maxFuzzy?: number, /** * The operand to combine partial results for each term. By default it is * "OR", so results matching _any_ of the search terms are returned by a * search. If "AND" is given, only results matching _all_ the search terms are * returned by a search. */ combineWith?: CombinationOperator, /** * Function to tokenize the search query. By default, the same tokenizer used * for indexing is used also for search. * * @remarks This function is called after `extractField` extracts a truthy * value from a field. This function is then expected to split the extracted * `text` document into tokens (more commonly referred to as "terms" in this * context). The resulting split might be simple, like for example on word * boundaries, or it might be more complex, taking into account certain * encoding, or parsing needs, or even just special cases. Think about how one * might need to go about indexing the term "short-term". You would likely * want to treat this case specially, and return two terms instead, `[ * "short", "term" ]`. * * Or, you could let such a case be handled by the `processTerm` function, * which is designed to turn each token/term into whole terms or sub-terms. In * any case, the purpose of this function is to split apart the provided * `text` document into parts that can be processed by the `processTerm` * function. 
*/ tokenize?: (text: string) => string[], /** * Function to process or normalize terms in the search query. By default, the * same term processor used for indexing is used also for search. * * @remarks * During the document indexing phase, the first step is to call the * `extractField` function to fetch the requested value/field from the * document. This is then passed off to the `tokenize` function, which will * break apart each value into "terms". These terms are then individually * passed through this function to compute each term individually. A term * might for example be something like "lbs", in which case one would likely * want to return `[ "lbs", "lb", "pound", "pounds" ]`. You may also return * just a single string, or a falsy value if you would like to skip indexing * entirely for a specific term. * * Truthy return value(s) are then fed to the indexer as positive matches for * this document. In our example above, all four of the `[ "lbs", "lb", * "pound", "pounds" ]` terms would be added to the indexing engine, matching * against the current document being computed. * * *Note: Whatever values are returned from this function will receive no * further processing before being indexed. This means for example, if you * include whitespace at the beginning or end of a word, it will also be * indexed that way, with the included whitespace.* */ processTerm?: (term: string) => string | string[] | null | undefined | false /** * BM25+ algorithm parameters. Customizing these is almost never necessary, * and finetuning them requires an understanding of the BM25 scoring model. In * most cases, it is best to omit this option to use defaults, and instead use * boosting to tweak scoring for specific use cases. 
*/ bm25?: BM25Params } type SearchOptionsWithDefaults = SearchOptions & { boost: { [fieldName: string]: number }, weights: { fuzzy: number, prefix: number }, prefix: boolean | ((term: string, index: number, terms: string[]) => boolean), fuzzy: boolean | number | ((term: string, index: number, terms: string[]) => boolean | number), maxFuzzy: number, combineWith: CombinationOperator bm25: BM25Params } /** * Configuration options passed to the {@link MiniSearch} constructor * * @typeParam T The type of documents being indexed. */ export type Options<T = any> = { /** * Names of the document fields to be indexed. */ fields: string[], /** * Name of the ID field, uniquely identifying a document. */ idField?: string, /** * Names of fields to store, so that search results would include them. By * default none, so results would only contain the id field. */ storeFields?: string[], /** * Function used to extract the value of each field in documents. By default, * the documents are assumed to be plain objects with field names as keys, * but by specifying a custom `extractField` function one can completely * customize how the fields are extracted. * * The function takes as arguments the document, and the name of the field to * extract from it. It should return the field value as a string. * * @remarks * The returned string is fed into the `tokenize` function to split it up * into tokens. */ extractField?: (document: T, fieldName: string) => string, /** * Function used to split a field value into individual terms to be indexed. * The default tokenizer separates terms by space or punctuation, but a * custom tokenizer can be provided for custom logic. * * The function takes as arguments string to tokenize, and the name of the * field it comes from. It should return the terms as an array of strings. * When used for tokenizing a search query instead of a document field, the * `fieldName` is undefined. 
* * @remarks * This function is called after `extractField` extracts a truthy value from a * field. This function is then expected to split the extracted `text` document * into tokens (more commonly referred to as "terms" in this context). The resulting * split might be simple, like for example on word boundaries, or it might be more * complex, taking into account certain encoding, or parsing needs, or even just * special cases. Think about how one might need to go about indexing the term * "short-term". You would likely want to treat this case specially, and return two * terms instead, `[ "short", "term" ]`. * * Or, you could let such a case be handled by the `processTerm` function, * which is designed to turn each token/term into whole terms or sub-terms. In any * case, the purpose of this function is to split apart the provided `text` document * into parts that can be processed by the `processTerm` function. */ tokenize?: (text: string, fieldName?: string) => string[], /** * Function used to process a term before indexing or search. This can be * used for normalization (such as stemming). By default, terms are * downcased, and otherwise no other normalization is performed. * * The function takes as arguments a term to process, and the name of the * field it comes from. It should return the processed term as a string, or a * falsy value to reject the term entirely. * * It can also return an array of strings, in which case each string in the * returned array is indexed as a separate term. * * @remarks * During the document indexing phase, the first step is to call the `extractField` * function to fetch the requested value/field from the document. This is then * passed off to the `tokenizer`, which will break apart each value into "terms". * These terms are then individually passed through the `processTerm` function * to compute each term individually. 
A term might for example be something * like "lbs", in which case one would likely want to return * `[ "lbs", "lb", "pound", "pounds" ]`. You may also return a single string value, * or a falsy value if you would like to skip indexing entirely for a specific term. * * Truthy return value(s) are then fed to the indexer as positive matches for this * document. In our example above, all four of the `[ "lbs", "lb", "pound", "pounds" ]` * terms would be added to the indexing engine, matching against the current document * being computed. * * *Note: Whatever values are returned from this function will receive no further * processing before being indexed. This means for example, if you include whitespace * at the beginning or end of a word, it will also be indexed that way, with the * included whitespace.* */ processTerm?: (term: string, fieldName?: string) => string | string[] | null | undefined | false, /** * Function called to log messages. Arguments are a log level ('debug', * 'info', 'warn', or 'error'), a log message, and an optional string code * that identifies the reason for the log. * * The default implementation uses `console`, if defined. */ logger?: (level: LogLevel, message: string, code?: string) => void /** * If `true` (the default), vacuuming is performed automatically as soon as * {@link MiniSearch#discard} is called a certain number of times, cleaning up * obsolete references from the index. If `false`, no automatic vacuuming is * performed. Custom settings controlling auto vacuuming thresholds, as well * as batching behavior, can be passed as an object (see the {@link * AutoVacuumOptions} type). 
*/ autoVacuum?: boolean | AutoVacuumOptions /** * Default search options (see the {@link SearchOptions} type and the {@link * MiniSearch#search} method for details) */ searchOptions?: SearchOptions, /** * Default auto suggest options (see the {@link SearchOptions} type and the * {@link MiniSearch#autoSuggest} method for details) */ autoSuggestOptions?: SearchOptions } type OptionsWithDefaults<T = any> = Options<T> & { storeFields: string[] idField: string extractField: (document: T, fieldName: string) => string tokenize: (text: string, fieldName: string) => string[] processTerm: (term: string, fieldName: string) => string | string[] | null | undefined | false logger: (level: LogLevel, message: string, code?: string) => void autoVacuum: false | AutoVacuumOptions searchOptions: SearchOptionsWithDefaults autoSuggestOptions: SearchOptions } type LogLevel = 'debug' | 'info' | 'warn' | 'error' /** * The type of auto-suggestions */ export type Suggestion = { /** * The suggestion */ suggestion: string, /** * Suggestion as an array of terms */ terms: string[], /** * Score for the suggestion */ score: number } /** * Match information for a search result. It is a key-value object where keys * are terms that matched, and values are the list of fields that the term was * found in. */ export type MatchInfo = { [term: string]: string[] } /** * Type of the search results. Each search result indicates the document ID, the * terms that matched, the match information, the score, and all the stored * fields. */ export type SearchResult = { /** * The document ID */ id: any, /** * List of document terms that matched. For example, if a prefix search for * `"moto"` matches `"motorcycle"`, `terms` will contain `"motorcycle"`. */ terms: string[], /** * List of query terms that matched. For example, if a prefix search for * `"moto"` matches `"motorcycle"`, `queryTerms` will contain `"moto"`. 
*/ queryTerms: string[], /** * Score of the search results */ score: number, /** * Match information, see {@link MatchInfo} */ match: MatchInfo, /** * Stored fields */ [key: string]: any } /** * @ignore */ export type AsPlainObject = { documentCount: number, nextId: number, documentIds: { [shortId: string]: any } fieldIds: { [fieldName: string]: number } fieldLength: { [shortId: string]: number[] } averageFieldLength: number[], storedFields: { [shortId: string]: any } dirtCount?: number, index: [string, { [fieldId: string]: SerializedIndexEntry }][] serializationVersion: number } export type QueryCombination = SearchOptions & { queries: Query[] } /** * Wildcard query, used to match all terms */ export type Wildcard = typeof MiniSearch.wildcard /** * Search query expression, either a query string or an expression tree * combining several queries with a combination of AND or OR. */ export type Query = QueryCombination | string | Wildcard /** * Options to control vacuuming behavior. * * Vacuuming cleans up document references made obsolete by {@link * MiniSearch.discard} from the index. On large indexes, vacuuming is * potentially costly, because it has to traverse the whole inverted index. * Therefore, in order to dilute this cost so it does not negatively affects the * application, vacuuming is performed in batches, with a delay between each * batch. These options are used to configure the batch size and the delay * between batches. */ export type VacuumOptions = { /** * Size of each vacuuming batch (the number of terms in the index that will be * traversed in each batch). Defaults to 1000. */ batchSize?: number, /** * Wait time between each vacuuming batch in milliseconds. Defaults to 10. */ batchWait?: number } /** * Sets minimum thresholds for `dirtCount` and `dirtFactor` that trigger an * automatic vacuuming. 
*/ export type VacuumConditions = { /** * Minimum `dirtCount` (number of discarded documents since the last vacuuming) * under which auto vacuum is not triggered. It defaults to 20. */ minDirtCount?: number /** * Minimum `dirtFactor` (proportion of discarded documents over the total) * under which auto vacuum is not triggered. It defaults to 0.1. */ minDirtFactor?: number, } /** * Options to control auto vacuum behavior. When discarding a document with * {@link MiniSearch#discard}, a vacuuming operation is automatically started if * the `dirtCount` and `dirtFactor` are above the `minDirtCount` and * `minDirtFactor` thresholds defined by this configuration. See {@link * VacuumConditions} for details on these. * * Also, `batchSize` and `batchWait` can be specified, controlling batching * behavior (see {@link VacuumOptions}). */ export type AutoVacuumOptions = VacuumOptions & VacuumConditions type QuerySpec = { prefix: boolean, fuzzy: number | boolean, term: string, termBoost: number } type DocumentTermFreqs = Map<number, number> type FieldTermData = Map<number, DocumentTermFreqs> interface RawResultValue { // Intermediate score, before applying the final score based on number of // matched terms. score: number, // Set of all query terms that were matched. They may not be present in the // text exactly in the case of prefix/fuzzy matches. We must check for // uniqueness before adding a new term. This is much faster than using a set, // because the number of elements is relatively small. terms: string[], // All terms that were found in the content, including the fields in which // they were present. This object will be provided as part of the final search // results. match: MatchInfo, } type RawResult = Map<number, RawResultValue> /** * {@link MiniSearch} is the main entrypoint class, implementing a full-text * search engine in memory. * * @typeParam T The type of the documents being indexed. 
* * ### Basic example: * * ```javascript * const documents = [ * { * id: 1, * title: 'Moby Dick', * text: 'Call me Ishmael. Some years ago...', * category: 'fiction' * }, * { * id: 2, * title: 'Zen and the Art of Motorcycle Maintenance', * text: 'I can see by my watch...', * category: 'fiction' * }, * { * id: 3, * title: 'Neuromancer', * text: 'The sky above the port was...', * category: 'fiction' * }, * { * id: 4, * title: 'Zen and the Art of Archery', * text: 'At first sight it must seem...', * category: 'non-fiction' * }, * // ...and more * ] * * // Create a search engine that indexes the 'title' and 'text' fields for * // full-text search. Search results will include 'title' and 'category' (plus the * // id field, that is always stored and returned) * const miniSearch = new MiniSearch({ * fields: ['title', 'text'], * storeFields: ['title', 'category'] * }) * * // Add documents to the index * miniSearch.addAll(documents) * * // Search for documents: * let results = miniSearch.search('zen art motorcycle') * // => [ * // { id: 2, title: 'Zen and the Art of Motorcycle Maintenance', category: 'fiction', score: 2.77258 }, * // { id: 4, title: 'Zen and the Art of Archery', category: 'non-fiction', score: 1.38629 } * // ] * ``` */ export default class MiniSearch<T = any> { protected _options: OptionsWithDefaults<T> protected _index: SearchableMap<FieldTermData> protected _documentCount: number protected _documentIds: Map<number, any> protected _idToShortId: Map<any, number> protected _fieldIds: { [key: string]: number } protected _fieldLength: Map<number, number[]> protected _avgFieldLength: number[] protected _nextId: number protected _storedFields: Map<number, Record<string, unknown>> protected _dirtCount: number private _currentVacuum: Promise<void> | null private _enqueuedVacuum: Promise<void> | null private _enqueuedVacuumConditions: VacuumConditions | undefined /** * The special wildcard symbol that can be passed to {@link MiniSearch#search} * to match all 
documents */ static readonly wildcard: unique symbol = Symbol('*') /** * @param options Configuration options * * ### Examples: * * ```javascript * // Create a search engine that indexes the 'title' and 'text' fields of your * // documents: * const miniSearch = new MiniSearch({ fields: ['title', 'text'] }) * ``` * * ### ID Field: * * ```javascript * // Your documents are assumed to include a unique 'id' field, but if you want * // to use a different field for document identification, you can set the * // 'idField' option: * const miniSearch = new MiniSearch({ idField: 'key', fields: ['title', 'text'] }) * ``` * * ### Options and defaults: * * ```javascript * // The full set of options (here with their default value) is: * const miniSearch = new MiniSearch({ * // idField: field that uniquely identifies a document * idField: 'id', * * // extractField: function used to get the value of a field in a document. * // By default, it assumes the document is a flat object with field names as * // property keys and field values as string property values, but custom logic * // can be implemented by setting this option to a custom extractor function. * extractField: (document, fieldName) => document[fieldName], * * // tokenize: function used to split fields into individual terms. By * // default, it is also used to tokenize search queries, unless a specific * // `tokenize` search option is supplied. When tokenizing an indexed field, * // the field name is passed as the second argument. * tokenize: (string, _fieldName) => string.split(SPACE_OR_PUNCTUATION), * * // processTerm: function used to process each tokenized term before * // indexing. It can be used for stemming and normalization. Return a falsy * // value in order to discard a term. By default, it is also used to process * // search queries, unless a specific `processTerm` option is supplied as a * // search option. When processing a term from a indexed field, the field * // name is passed as the second argument. 
* processTerm: (term, _fieldName) => term.toLowerCase(), * * // searchOptions: default search options, see the `search` method for * // details * searchOptions: undefined, * * // fields: document fields to be indexed. Mandatory, but not set by default * fields: undefined * * // storeFields: document fields to be stored and returned as part of the * // search results. * storeFields: [] * }) * ``` */ constructor (options: Options<T>) { if (options?.fields == null) { throw new Error('MiniSearch: option "fields" must be provided') } const autoVacuum = (options.autoVacuum == null || options.autoVacuum === true) ? defaultAutoVacuumOptions : options.autoVacuum this._options = { ...defaultOptions, ...options, autoVacuum, searchOptions: { ...defaultSearchOptions, ...(options.searchOptions || {}) }, autoSuggestOptions: { ...defaultAutoSuggestOptions, ...(options.autoSuggestOptions || {}) } } this._index = new SearchableMap() this._documentCount = 0 this._documentIds = new Map() this._idToShortId = new Map() // Fields are defined during initialization, don't change, are few in // number, rarely need iterating over, and have string keys. Therefore in // this case an object is a better candidate than a Map to store the mapping // from field key to ID. 
this._fieldIds = {} this._fieldLength = new Map() this._avgFieldLength = [] this._nextId = 0 this._storedFields = new Map() this._dirtCount = 0 this._currentVacuum = null this._enqueuedVacuum = null this._enqueuedVacuumConditions = defaultVacuumConditions this.addFields(this._options.fields) } /** * Adds a document to the index * * @param document The document to be indexed */ add (document: T): void { const { extractField, tokenize, processTerm, fields, idField } = this._options const id = extractField(document, idField) if (id == null) { throw new Error(`MiniSearch: document does not have ID field "${idField}"`) } if (this._idToShortId.has(id)) { throw new Error(`MiniSearch: duplicate ID ${id}`) } const shortDocumentId = this.addDocumentId(id) this.saveStoredFields(shortDocumentId, document) for (const field of fields) { const fieldValue = extractField(document, field) if (fieldValue == null) continue const tokens = tokenize(fieldValue.toString(), field) const fieldId = this._fieldIds[field] const uniqueTerms = new Set(tokens).size this.addFieldLength(shortDocumentId, fieldId, this._documentCount - 1, uniqueTerms) for (const term of tokens) { const processedTerm = processTerm(term, field) if (Array.isArray(processedTerm)) { for (const t of processedTerm) { this.addTerm(fieldId, shortDocumentId, t) } } else if (processedTerm) { this.addTerm(fieldId, shortDocumentId, processedTerm) } } } } /** * Adds all the given documents to the index * * @param documents An array of documents to be indexed */ addAll (documents: readonly T[]): void { for (const document of documents) this.add(document) } /** * Adds all the given documents to the index asynchronously. * * Returns a promise that resolves (to `undefined`) when the indexing is done. * This method is useful when index many documents, to avoid blocking the main * thread. The indexing is performed asynchronously and in chunks. 
* * @param documents An array of documents to be indexed * @param options Configuration options * @return A promise resolving to `undefined` when the indexing is done */ addAllAsync (documents: readonly T[], options: { chunkSize?: number } = {}): Promise<void> { const { chunkSize = 10 } = options const acc: { chunk: T[], promise: Promise<void> } = { chunk: [], promise: Promise.resolve() } const { chunk, promise } = documents.reduce(({ chunk, promise }, document: T, i: number) => { chunk.push(document) if ((i + 1) % chunkSize === 0) { return { chunk: [], promise: promise .then(() => new Promise(resolve => setTimeout(resolve, 0))) .then(() => this.addAll(chunk)) } } else { return { chunk, promise } } }, acc) return promise.then(() => this.addAll(chunk)) } /** * Removes the given document from the index. * * The document to remove must NOT have changed between indexing and removal, * otherwise the index will be corrupted. * * This method requires passing the full document to be removed (not just the * ID), and immediately removes the document from the inverted index, allowing * memory to be released. A convenient alternative is {@link * MiniSearch#discard}, which needs only the document ID, and has the same * visible effect, but delays cleaning up the index until the next vacuuming. 
* * @param document The document to be removed */ remove (document: T): void { const { tokenize, processTerm, extractField, fields, idField } = this._options const id = extractField(document, idField) if (id == null) { throw new Error(`MiniSearch: document does not have ID field "${idField}"`) } const shortId = this._idToShortId.get(id) if (shortId == null) { throw new Error(`MiniSearch: cannot remove document with ID ${id}: it is not in the index`) } for (const field of fields) { const fieldValue = extractField(document, field) if (fieldValue == null) continue const tokens = tokenize(fieldValue.toString(), field) const fieldId = this._fieldIds[field] const uniqueTerms = new Set(tokens).size this.removeFieldLength(shortId, fieldId, this._documentCount, uniqueTerms) for (const term of tokens) { const processedTerm = processTerm(term, field) if (Array.isArray(processedTerm)) { for (const t of processedTerm) { this.removeTerm(fieldId, shortId, t) } } else if (processedTerm) { this.removeTerm(fieldId, shortId, processedTerm) } } } this._storedFields.delete(shortId) this._documentIds.delete(shortId) this._idToShortId.delete(id) this._fieldLength.delete(shortId) this._documentCount -= 1 } /** * Removes all the given documents from the index. If called with no arguments, * it removes _all_ documents from the index. * * @param documents The documents to be removed. If this argument is omitted, * all documents are removed. Note that, for removing all documents, it is * more efficient to call this method with no arguments than to pass all * documents. */ removeAll (documents?: readonly T[]): void { if (documents) { for (const document of documents) this.remove(document) } else if (arguments.length > 0) { throw new Error('Expected documents to be present. 
Omit the argument to remove all documents.') } else { this._index = new SearchableMap() this._documentCount = 0 this._documentIds = new Map() this._idToShortId = new Map() this._fieldLength = new Map() this._avgFieldLength = [] this._storedFields = new Map() this._nextId = 0 } } /** * Discards the document with the given ID, so it won't appear in search results * * It has the same visible effect of {@link MiniSearch.remove} (both cause the * document to stop appearing in searches), but a different effect on the * internal data structures: * * - {@link MiniSearch#remove} requires passing the full document to be * removed as argument, and removes it from the inverted index immediately. * * - {@link MiniSearch#discard} instead only needs the document ID, and * works by marking the current version of the document as discarded, so it * is immediately ignored by searches. This is faster and more convenient * than {@link MiniSearch#remove}, but the index is not immediately * modified. To take care of that, vacuuming is performed after a certain * number of documents are discarded, cleaning up the index and allowing * memory to be released. * * After discarding a document, it is possible to re-add a new version, and * only the new version will appear in searches. In other words, discarding * and re-adding a document works exactly like removing and re-adding it. The * {@link MiniSearch.replace} method can also be used to replace a document * with a new version. * * #### Details about vacuuming * * Repetite calls to this method would leave obsolete document references in * the index, invisible to searches. Two mechanisms take care of cleaning up: * clean up during search, and vacuuming. * * - Upon search, whenever a discarded ID is found (and ignored for the * results), references to the discarded document are removed from the * inverted index entries for the search terms. 
This ensures that subsequent * searches for the same terms do not need to skip these obsolete references * again. * * - In addition, vacuuming is performed automatically by default (see the * `autoVacuum` field in {@link Options}) after a certain number of * documents are discarded. Vacuuming traverses all terms in the index, * cleaning up all references to discarded documents. Vacuuming can also be * triggered manually by calling {@link MiniSearch#vacuum}. * * @param id The ID of the document to be discarded */ discard (id: any): void { const shortId = this._idToShortId.get(id) if (shortId == null) { throw new Error(`MiniSearch: cannot discard document with ID ${id}: it is not in the index`) } this._idToShortId.delete(id) this._documentIds.delete(shortId) this._storedFields.delete(shortId) ;(this._fieldLength.get(shortId) || []).forEach((fieldLength, fieldId) => { this.removeFieldLength(shortId, fieldId, this._documentCount, fieldLength) }) this._fieldLength.delete(shortId) this._documentCount -= 1 this._dirtCount += 1 this.maybeAutoVacuum() } private maybeAutoVacuum (): void { if (this._options.autoVacuum === false) { return } const { minDirtFactor, minDirtCount, batchSize, batchWait } = this._options.autoVacuum this.conditionalVacuum({ batchSize, batchWait }, { minDirtCount, minDirtFactor }) } /** * Discards the documents with the given IDs, so they won't appear in search * results * * It is equivalent to calling {@link MiniSearch#discard} for all the given * IDs, but with the optimization of triggering at most one automatic * vacuuming at the end. * * Note: to remove all documents from the index, it is faster and more * convenient to call {@link MiniSearch.removeAll} with no argument, instead * of passing all IDs to this method. 
*/ discardAll (ids: readonly any[]): void { const autoVacuum = this._options.autoVacuum try { this._options.autoVacuum = false for (const id of ids) { this.discard(id) } } finally { this._options.autoVacuum = autoVacuum } this.maybeAutoVacuum() } /** * It replaces an existing document with the given updated version * * It works by discarding the current version and adding the updated one, so * it is functionally equivalent to calling {@link MiniSearch#discard} * followed by {@link MiniSearch#add}. The ID of the updated document should * be the same as the original one. * * Since it uses {@link MiniSearch#discard} internally, this method relies on * vacuuming to clean up obsolete document references from the index, allowing * memory to be released (see {@link MiniSearch#discard}). * * @param updatedDocument The updated document to replace the old version * with */ replace (updatedDocument: T): void { const { idField, extractField } = this._options const id = extractField(updatedDocument, idField) this.discard(id) this.add(updatedDocument) } /** * Triggers a manual vacuuming, cleaning up references to discarded documents * from the inverted index * * Vacuuming is only useful for applications that use the {@link * MiniSearch#discard} or {@link MiniSearch#replace} methods. * * By default, vacuuming is performed automatically when needed (controlled by * the `autoVacuum` field in {@link Options}), so there is usually no need to * call this method, unless one wants to make sure to perform vacuuming at a * specific moment. * * Vacuuming traverses all terms in the inverted index in batches, and cleans * up references to discarded documents from the posting list, allowing memory * to be released. 
*
   * The method takes an optional object as argument with the following keys:
   *
   *   - `batchSize`: the size of each batch (1000 by default)
   *
   *   - `batchWait`: the number of milliseconds to wait between batches (10 by
   *     default)
   *
   * On large indexes, vacuuming could have a non-negligible cost: batching
   * avoids blocking the thread for long, diluting this cost so that it is not
   * negatively affecting the application. Nonetheless, this method should only
   * be called when necessary, and relying on automatic vacuuming is usually
   * better.
   *
   * It returns a promise that resolves (to undefined) when the clean up is
   * completed. If vacuuming is already ongoing at the time this method is
   * called, a new one is enqueued immediately after the ongoing one, and a
   * corresponding promise is returned. However, no more than one vacuuming is
   * enqueued on top of the ongoing one, even if this method is called more
   * times (enqueuing multiple ones would be useless).
   *
   * @param options Configuration options for the batch size and delay. See
   * {@link VacuumOptions}.
   */
  vacuum (options: VacuumOptions = {}): Promise<void> {
    // A manual vacuum is unconditional: no VacuumConditions are passed
    return this.conditionalVacuum(options)
  }

  private conditionalVacuum (options: VacuumOptions, conditions?: VacuumConditions): Promise<void> {
    // If a vacuum is already ongoing, schedule another as soon as it finishes,
    // unless there's already one enqueued. If one was already enqueued, do not
    // enqueue another on top, but make sure that the conditions are the
    // broadest.
    if (this._currentVacuum) {
      // `&&` keeps a falsy (unconditional) value if any pending request had no
      // conditions, so the broadest request wins.
      this._enqueuedVacuumConditions = this._enqueuedVacuumConditions && conditions
      if (this._enqueuedVacuum != null) { return this._enqueuedVacuum }

      this._enqueuedVacuum = this._currentVacuum.then(() => {
        const conditions = this._enqueuedVacuumConditions
        this._enqueuedVacuumConditions = defaultVacuumConditions
        return this.performVacuuming(options, conditions)
      })
      return this._enqueuedVacuum
    }

    if (this.vacuumConditionsMet(conditions) === false) { return Promise.resolve() }

    this._currentVacuum = this.performVacuuming(options)
    return this._currentVacuum
  }

  // Walks every term in the inverted index, deleting postings that refer to
  // documents no longer present in `_documentIds`, and yields to the event
  // loop between batches so the thread is not blocked for long.
  private async performVacuuming (options: VacuumOptions, conditions?: VacuumConditions): Promise<void> {
    // Snapshot the dirt count: documents discarded while this method is
    // suspended between batches must remain counted as dirt afterwards, hence
    // the subtraction (rather than a reset to zero) at the end.
    const initialDirtCount = this._dirtCount

    if (this.vacuumConditionsMet(conditions)) {
      const batchSize = options.batchSize || defaultVacuumOptions.batchSize
      const batchWait = options.batchWait || defaultVacuumOptions.batchWait
      let i = 1

      for (const [term, fieldsData] of this._index) {
        for (const [fieldId, fieldIndex] of fieldsData) {
          for (const [shortId] of fieldIndex) {
            // Postings of documents still in the index are kept
            if (this._documentIds.has(shortId)) { continue }

            if (fieldIndex.size <= 1) {
              // Last posting for this field: drop the whole field entry
              fieldsData.delete(fieldId)
            } else {
              fieldIndex.delete(shortId)
            }
          }
        }

        // If no field has postings left for this term, remove the term itself
        if (this._index.get(term)!.size === 0) {
          this._index.delete(term)
        }

        // Pause between batches, diluting the cost of vacuuming over time
        if (i % batchSize === 0) {
          await new Promise((resolve) => setTimeout(resolve, batchWait))
        }

        i += 1
      }

      this._dirtCount -= initialDirtCount
    }

    // Make the next lines always async, so they execute after this function returns
    await null

    // Promote the enqueued vacuum (if any) to current; conditionalVacuum
    // chained it onto the promise of this one, so it is about to run.
    this._currentVacuum = this._enqueuedVacuum
    this._enqueuedVacuum = null
  }

  // True when the given conditions (with missing fields defaulted to the
  // auto-vacuum thresholds) are satisfied by the current dirt state. Passing
  // no conditions at all means an unconditional vacuum.
  private vacuumConditionsMet (conditions?: VacuumConditions) {
    if (conditions == null) { return true }

    let { minDirtCount, minDirtFactor } = conditions
    minDirtCount = minDirtCount || defaultAutoVacuumOptions.minDirtCount
    minDirtFactor = minDirtFactor || defaultAutoVacuumOptions.minDirtFactor

    return this.dirtCount >= minDirtCount && this.dirtFactor >= minDirtFactor
  }

  /**
   * Is `true` if a vacuuming operation is ongoing,
`false` otherwise */ get isVacuuming (): boolean { return this._currentVacuum != null } /** * The number of documents discarded since the most recent vacuuming */ get dirtCount (): number { return this._dirtCount } /** * A number between 0 and 1 giving an indication about the proportion of * documents that are discarded, and can therefore be cleaned up by vacuuming. * A value close to 0 means that the index is relatively clean, while a higher * value means that the index is relatively dirty, and vacuuming could release * memory. */ get dirtFactor (): number { return this._dirtCount / (1 + this._documentCount + this._dirtCount) } /** * Returns `true` if a document with the given ID is present in the index and * available for search, `false` otherwise * * @param id The document ID */ has (id: any): boolean { return this._idToShortId.has(id) } /** * Returns the stored fields (as configured in the `storeFields` constructor * option) for the given document ID. Returns `undefined` if the document is * not present in the index. * * @param id The document ID */ getStoredFields (id: any): Record<string, unknown> | undefined { const shortId = this._idToShortId.get(id) if (shortId == null) { return undefined } return this._storedFields.get(shortId) } /** * Search for documents matching the given search query. * * The result is a list of scored document IDs matching the query, sorted by * descending score, and each including data about which terms were matched and * in which fields. * * ### Basic usage: * * ```javascript * // Search for "zen art motorcycle" with default options: terms have to match * // exactly, and individual terms are joined with OR * miniSearch.search('zen art motorcycle') * // => [ { id: 2, score: 2.77258, match: { ... } }, { id: 4, score: 1.38629, match: { ... 
} } ] * ``` * * ### Restrict search to specific fields: * * ```javascript * // Search only in the 'title' field * miniSearch.search('zen', { fields: ['title'] }) * ``` * * ### Field boosting: * * ```javascript * // Boost a field * miniSearch.search('zen', { boost: { title: 2 } }) * ``` * * ### Prefix search: * * ```javascript * // Search for "moto" with prefix search (it will match documents * // containing terms that start with "moto" or "neuro") * miniSearch.search('moto neuro', { prefix: true }) * ``` * * ### Fuzzy search: * * ```javascript * // Search for "ismael" with fuzzy search (it will match documents containing * // terms similar to "ismael", with a maximum edit distance of 0.2 term.length * // (rounded to nearest integer) * miniSearch.search('ismael', { fuzzy: 0.2 }) * ``` * * ### Combining strategies: * * ```javascript * // Mix of exact match, prefix search, and fuzzy search * miniSearch.search('ismael mob', { * prefix: true, * fuzzy: 0.2 * }) * ``` * * ### Advanced prefix and fuzzy search: * * ```javascript * // Perform fuzzy and prefix search depending on the search term. Here * // performing prefix and fuzzy search only on terms longer than 3 characters * miniSearch.search('ismael mob', { * prefix: term => term.length > 3 * fuzzy: term => term.length > 3 ? 0.2 : null * }) * ``` * * ### Combine with AND: * * ```javascript * // Combine search terms with AND (to match only documents that contain both * // "motorcycle" and "art") * miniSearch.search('motorcycle art', { combineWith: 'AND' }) * ``` * * ### Combine with AND_NOT: * * There is also an AND_NOT combinator, that finds documents that match the * first term, but do not match any of the other terms. This combinator is * rarely useful with simple queries, and is meant to be used with advanced * query combinations (see later for more details). 
* * ### Filtering results: * * ```javascript * // Filter only results in the 'fiction' category (assuming that 'category' * // is a stored field) * miniSearch.search('motorcycle art', { * filter: (result) => result.category === 'fiction' * }) * ``` * * ### Wildcard query * * Searching for an empty string (assuming the default tokenizer) returns no * results. Sometimes though, one needs to match all documents, like in a * "wildcard" search. This is possible by passing the special value * {@link MiniSearch.wildcard} as the query: * * ```javascript * // Return search results for all documents * miniSearch.search(MiniSearch.wildcard) * ``` * * Note that search options such as `filter` and `boostDocument` are still * applied, influencing which results are returned, and their order: * * ```javascript * // Return search results for all documents in the 'fiction' category * miniSearch.search(MiniSearch.wildcard, { * filter: (result) => result.category === 'fiction' * }) * ``` * * ### Advanced combination of queries: * * It is possible to combine different subqueries with OR, AND, and AND_NOT, * and even with different search options, by passing a query expression * tree object as the first argument, instead of a string. * * ```javascript * // Search for documents that contain "zen" and ("motorcycle" or "archery") * miniSearch.search({ * combineWith: 'AND', * queries: [ * 'zen', * { * combineWith: 'OR', * queries: ['motorcycle', 'archery'] * } * ] * }) * * // Search for documents that contain ("apple" or "pear") but not "juice" and * // not "tree" * miniSearch.search({ * combineWith: 'AND_NOT', * queries: [ * { * combineWith: 'OR', * queries: ['apple', 'pear'] * }, * 'juice', * 'tree' * ] * }) * ``` * * Each node in the expression tree can be either a string, or an object that * supports all {@link SearchOptions} fields, plus a `queries` array field for * subqueries. 
* * Note that, while this can become complicated to do by hand for complex or * deeply nested queries, it provides a formalized expression tree API for * external libraries that implement a parser for custom query languages. * * @param query Search query * @param searchOptions Search options. Each option, if not given, defaults to the corresponding value of `searchOptions` given to the constructor, or to the library default. */ search (query: Query, searchOptions: SearchOptions = {}): SearchResult[] { const { searchOptions: globalSearchOptions } = this._options const searchOptionsWithDefaults: SearchOptionsWithDefaults = { ...globalSearchOptions, ...searchOptions } const rawResults = this.executeQuery(query, searchOptions) const results = [] for (const [docId, { score, terms, match }] of rawResults) { // terms are the matched query terms, which will be returned to the user // as queryTerms. The quality is calculated based on them, as opposed to // the matched terms in the document (which can be different due to // prefix and fuzzy match) const quality = terms.length || 1 const result = { id: this._documentIds.get(docId), score: score * quality, terms: Object.keys(match), queryTerms: terms, match } Object.assign(result, this._storedFields.get(docId)) if (searchOptionsWithDefaults.filter == null || searchOptionsWithDefaults.filter(result)) { results.push(result) } } // If it's a wildcard query, and no document boost is applied, skip sorting // the results, as all results have the same score of 1 if (query === MiniSearch.wildcard && searchOptionsWithDefaults.boostDocument == null) { return results } results.sort(byScore) return results } /** * Provide suggestions for the given search query * * The result is a list of suggested modified search queries, derived from the * given search query, each with a relevance score, sorted by descending score. 
* * By default, it uses the same options used for search, except that by * default it performs prefix search on the last term of the query, and * combine terms with `'AND'` (requiring all query terms to match). Custom * options can be passed as a second argument. Defaults can be changed upon * calling the {@link MiniSearch} constructor, by passing a * `autoSuggestOptions` option. * * ### Basic usage: * * ```javascript * // Get suggestions for 'neuro': * miniSearch.autoSuggest('neuro') * // => [ { suggestion: 'neuromancer', terms: [ 'neuromancer' ], score: 0.46240 } ] * `