UNPKG

elasticlunr

Version:

Lightweight full-text search engine in Javascript for browser search and offline search.

596 lines (533 loc) 19.6 kB
/*!
 * elasticlunr.Index
 * Copyright (C) @YEAR Oliver Nightingale
 * Copyright (C) @YEAR Wei Song
 */

/**
 * elasticlunr.Index is the object that manages a search index. It holds one
 * inverted index per field, the document store and the token pipeline, and it
 * provides the main user facing API for the library.
 *
 * @constructor
 */
elasticlunr.Index = function () {
  this._fields = [];
  this._ref = 'id';
  this.pipeline = new elasticlunr.Pipeline;
  this.documentStore = new elasticlunr.DocumentStore;
  this.index = {};
  this.eventEmitter = new elasticlunr.EventEmitter;
  this._idfCache = {};

  // Any change to the corpus invalidates every cached idf value.
  this.on('add', 'remove', 'update', (function () {
    this._idfCache = {};
  }).bind(this));
};

/**
 * Bind a handler to events being emitted by the index.
 *
 * The handler can be bound to many events at the same time: pass any number
 * of event names followed by the handler function as the last argument.
 * All arguments are forwarded unchanged to the event emitter's `addListener`.
 *
 * @param {String} [eventName] The name(s) of events to bind the function to.
 * @param {Function} fn The handler to call when one of the events is emitted.
 * @memberOf Index
 */
elasticlunr.Index.prototype.on = function () {
  var args = Array.prototype.slice.call(arguments);
  return this.eventEmitter.addListener.apply(this.eventEmitter, args);
};

/**
 * Removes a handler from an event being emitted by the index.
 *
 * @param {String} name The name of the event to remove the function from.
 * @param {Function} fn The handler to remove.
 * @memberOf Index
 */
elasticlunr.Index.prototype.off = function (name, fn) {
  return this.eventEmitter.removeListener(name, fn);
};

/**
 * Loads a previously serialised index.
 *
 * Issues a warning if the index being imported was serialised
 * by a different version of elasticlunr.
 *
 * @param {Object} serialisedData The serialised index to load.
 * @return {elasticlunr.Index}
 * @memberOf Index
 */
elasticlunr.Index.load = function (serialisedData) {
  if (serialisedData.version !== elasticlunr.version) {
    elasticlunr.utils.warn('version mismatch: current ' +
      elasticlunr.version + ' importing ' + serialisedData.version);
  }

  // `this` is the constructor the function was called on.
  var idx = new this;

  idx._fields = serialisedData.fields;
  idx._ref = serialisedData.ref;
  idx.documentStore = elasticlunr.DocumentStore.load(serialisedData.documentStore);
  idx.pipeline = elasticlunr.Pipeline.load(serialisedData.pipeline);

  idx.index = {};
  for (var field in serialisedData.index) {
    idx.index[field] = elasticlunr.InvertedIndex.load(serialisedData.index[field]);
  }

  return idx;
};

/**
 * Adds a field to the list of fields that will be searchable within documents
 * in the index.
 *
 * The inner index is built per field: each field gets its own inverted index.
 *
 * Fields should be added before any documents are added to the index; fields
 * that are added after documents are added to the index will only apply to
 * new documents added to the index.
 *
 * @param {String} fieldName The name of the field within the document that should be indexed
 * @return {elasticlunr.Index}
 * @memberOf Index
 */
elasticlunr.Index.prototype.addField = function (fieldName) {
  this._fields.push(fieldName);
  this.index[fieldName] = new elasticlunr.InvertedIndex;
  return this;
};

/**
 * Sets the property used to uniquely identify documents added to the index,
 * by default this property is 'id'.
 *
 * This should only be changed before adding documents to the index; changing
 * the ref property without resetting the index can lead to unexpected results.
 *
 * @param {String} refName The property to use to uniquely identify the
 * documents in the index.
 * @return {elasticlunr.Index}
 * @memberOf Index
 */
elasticlunr.Index.prototype.setRef = function (refName) {
  this._ref = refName;
  return this;
};

/**
 * Sets whether the original JSON documents are saved into the
 * elasticlunr.DocumentStore.
 *
 * Note that this replaces the current document store with a new one.
 *
 * @param {Boolean} save Whether to save the original JSON documents.
 * @return {elasticlunr.Index}
 * @memberOf Index
 */
elasticlunr.Index.prototype.saveDocument = function (save) {
  this.documentStore = new elasticlunr.DocumentStore(save);
  return this;
};

/**
 * Add a JSON format document to the index.
 *
 * This is the way new documents enter the index: this function runs the
 * fields from the document through the index's pipeline and then adds the
 * resulting tokens to the per-field inverted indexes.
 *
 * An 'add' event is emitted with the document that has been added and the
 * index the document has been added to. This event can be silenced by passing
 * false as the second argument to addDoc.
 *
 * @param {Object} doc The JSON format document to add to the index.
 * @param {Boolean} emitEvent Whether or not to emit events, default true.
 * @memberOf Index
 */
elasticlunr.Index.prototype.addDoc = function (doc, emitEvent) {
  if (!doc) return;

  var shouldEmit = emitEvent === undefined ? true : emitEvent;
  var docRef = doc[this._ref];

  this.documentStore.addDoc(docRef, doc);

  this._fields.forEach(function (field) {
    var fieldTokens = this.pipeline.run(elasticlunr.tokenizer(doc[field]));
    this.documentStore.addFieldLength(docRef, field, fieldTokens.length);

    // A prototype-less map, so that tokens such as "constructor" or
    // "toString" are not mistaken for already-counted entries.
    var tokenCount = Object.create(null);
    fieldTokens.forEach(function (token) {
      if (token in tokenCount) tokenCount[token] += 1;
      else tokenCount[token] = 1;
    });

    for (var token in tokenCount) {
      // The stored term frequency is the square root of the raw count.
      var termFrequency = Math.sqrt(tokenCount[token]);
      this.index[field].addToken(token, { ref: docRef, tf: termFrequency });
    }
  }, this);

  // Invalidate cached idf values even when the event is silenced.
  this._idfCache = {};

  if (shouldEmit) this.eventEmitter.emit('add', doc, this);
};

/**
 * Removes a document from the index by doc ref.
 *
 * To make sure documents no longer show up in search results they can be
 * removed from the index using this method.
 *
 * A 'remove' event is emitted with the document that has been removed and the
 * index the document has been removed from. This event can be silenced by
 * passing false as the second argument.
 *
 * If the DocumentStore is configured not to store documents, removing a
 * document by ref is not possible and this method does nothing.
 *
 * @param {String|Integer} docRef The document ref to remove from the index.
 * @param {Boolean} emitEvent Whether to emit remove events, defaults to true
 * @memberOf Index
 */
elasticlunr.Index.prototype.removeDocByRef = function (docRef, emitEvent) {
  // Only a missing ref is rejected; 0 and '' are usable refs.
  if (docRef === undefined || docRef === null) return;
  if (this.documentStore.isDocStored() === false) return;
  if (!this.documentStore.hasDoc(docRef)) return;

  var doc = this.documentStore.getDoc(docRef);
  this.removeDoc(doc, emitEvent);
};

/**
 * Removes a document from the index.
 * This remove operation works even when the original doc is not stored in the
 * DocumentStore, because the tokens are recomputed from the passed document.
 *
 * To make sure documents no longer show up in search results they can be
 * removed from the index using this method.
 *
 * A 'remove' event is emitted with the document that has been removed and the
 * index the document has been removed from. This event can be silenced by
 * passing false as the second argument.
 *
 * @param {Object} doc The document to remove from the index.
 * @param {Boolean} emitEvent Whether to emit remove events, defaults to true
 * @memberOf Index
 */
elasticlunr.Index.prototype.removeDoc = function (doc, emitEvent) {
  if (!doc) return;

  var shouldEmit = emitEvent === undefined ? true : emitEvent;
  var docRef = doc[this._ref];

  if (!this.documentStore.hasDoc(docRef)) return;

  this.documentStore.removeDoc(docRef);

  this._fields.forEach(function (field) {
    var fieldTokens = this.pipeline.run(elasticlunr.tokenizer(doc[field]));
    fieldTokens.forEach(function (token) {
      this.index[field].removeToken(token, docRef);
    }, this);
  }, this);

  // Invalidate cached idf values even when the event is silenced.
  this._idfCache = {};

  if (shouldEmit) this.eventEmitter.emit('remove', doc, this);
};

/**
 * Updates a document in the index.
 *
 * When a document contained within the index gets updated, fields changed,
 * added or removed, to make sure it correctly matched against search queries,
 * it should be updated in the index.
 *
 * This method is just a wrapper around `removeDocByRef` and `addDoc`.
 *
 * An 'update' event is emitted with the document that has been updated and
 * the index. This event can be silenced by passing false as the second
 * argument to updateDoc. Only an update event will be fired, the 'add' and
 * 'remove' events of the underlying calls are silenced.
 *
 * @param {Object} doc The document to update in the index.
 * @param {Boolean} emitEvent Whether to emit update events, defaults to true
 * @see Index.prototype.removeDocByRef
 * @see Index.prototype.addDoc
 * @memberOf Index
 */
elasticlunr.Index.prototype.updateDoc = function (doc, emitEvent) {
  // Same "ignore missing doc" contract as addDoc and removeDoc.
  if (!doc) return;

  var shouldEmit = emitEvent === undefined ? true : emitEvent;

  this.removeDocByRef(doc[this._ref], false);
  this.addDoc(doc, false);

  if (shouldEmit) this.eventEmitter.emit('update', doc, this);
};

/**
 * Calculates the inverse document frequency for a term within the index of a
 * field. Results are cached per (field, term) until the index changes.
 *
 * @param {String} term The term to calculate the idf of.
 * @param {String} field The field to compute idf in.
 * @return {Number}
 * @private
 * @memberOf Index
 */
elasticlunr.Index.prototype.idf = function (term, field) {
  var cacheKey = "@" + field + '/' + term;
  if (Object.prototype.hasOwnProperty.call(this._idfCache, cacheKey)) {
    return this._idfCache[cacheKey];
  }

  var df = this.index[field].getDocFreq(term);
  var idf = 1 + Math.log(this.documentStore.length / (df + 1));
  this._idfCache[cacheKey] = idf;

  return idf;
};

/**
 * Get a copy of the field names of the current index instance.
 *
 * @return {Array}
 */
elasticlunr.Index.prototype.getFields = function () {
  return this._fields.slice();
};

/**
 * Searches the index using the passed query.
 * Queries should be a string, multiple words are allowed.
 *
 * The user config (if any) is serialised to JSON and handed, together with
 * the index's fields, to elasticlunr.Configuration, which yields the
 * per-field settings (`boost`, `bool`, `expand`) used for the query.
 *
 * All query tokens are passed through the same pipeline that document tokens
 * are passed through, so any language processing involved will be run on
 * every query term.
 *
 * Matching documents are returned as an array of objects, each object
 * contains the matching document ref, as set for this index, and the
 * similarity score for this document against the query. The array is sorted
 * by descending score.
 *
 * @param {String} query The query to search the index with.
 * @param {JSON} userConfig The user query config, JSON format.
 * @return {Array} Array of `{ref, score}` objects; empty for an empty query.
 * @see Index.prototype.idf
 * @see Index.prototype.fieldSearch
 * @memberOf Index
 */
elasticlunr.Index.prototype.search = function (query, userConfig) {
  if (!query) return [];

  var configStr = null;
  if (userConfig != null) {
    configStr = JSON.stringify(userConfig);
  }

  var config = new elasticlunr.Configuration(configStr, this.getFields()).get();
  var queryTokens = this.pipeline.run(elasticlunr.tokenizer(query));

  // Prototype-less accumulator so that refs such as "constructor" cannot
  // collide with inherited Object.prototype members.
  var queryResults = Object.create(null);
  var docRef;

  for (var field in config) {
    // fieldSearch may return undefined or null; for-in then iterates nothing.
    var fieldSearchResults = this.fieldSearch(queryTokens, field, config);
    var fieldBoost = config[field].boost;

    for (docRef in fieldSearchResults) {
      var boostedScore = fieldSearchResults[docRef] * fieldBoost;

      if (docRef in queryResults) {
        queryResults[docRef] += boostedScore;
      } else {
        queryResults[docRef] = boostedScore;
      }
    }
  }

  var results = [];
  for (docRef in queryResults) {
    results.push({ref: docRef, score: queryResults[docRef]});
  }

  results.sort(function (a, b) { return b.score - a.score; });
  return results;
};

/**
 * Search queryTokens in the specified field.
 *
 * @param {Array} queryTokens The query tokens to query in this field.
 * @param {String} fieldName Field to query in.
 * @param {Object} config The per-field query config (`bool`, `expand`, `boost`).
 * @return {Object} Scores keyed by doc ref. Returns undefined when the field
 * boost is 0, and null when there are no query tokens.
 */
elasticlunr.Index.prototype.fieldSearch = function (queryTokens, fieldName, config) {
  var hasOwn = Object.prototype.hasOwnProperty;
  var booleanType = config[fieldName].bool;
  var expand = config[fieldName].expand;
  var boost = config[fieldName].boost;
  var scores = null;
  var docTokens = {};

  // Do nothing if the boost is 0
  if (boost === 0) {
    return;
  }

  queryTokens.forEach(function (token) {
    var tokens = [token];
    if (expand == true) {
      tokens = this.index[fieldName].expandToken(token);
    }

    // Consider every query token in turn. If expanded, each query token
    // corresponds to a set of tokens, which is all tokens in the
    // index matching the pattern queryToken* .
    // For the set of tokens corresponding to a query token, find and score
    // all matching documents. Store those scores in queryTokenScores,
    // keyed by docRef.
    // Then, depending on the value of booleanType, combine the scores
    // for this query token with previous scores. If booleanType is OR,
    // then merge the scores by summing into the accumulated total, adding
    // new document scores as required (effectively a union operator).
    // If booleanType is AND, accumulate scores only if the document
    // has previously been scored by another query token (an intersection
    // operation).
    // Furthermore, since when booleanType is AND, additional
    // query tokens can't add new documents to the result set, use the
    // current document set to limit the processing of each new query
    // token for efficiency (i.e., incremental intersection).

    var queryTokenScores = {};
    tokens.forEach(function (key) {
      var docs = this.index[fieldName].getDocs(key);
      var idf = this.idf(key, fieldName);
      var docRef;

      if (scores && booleanType == 'AND') {
        // special case, we can rule out documents that have
        // already been filtered out because they weren't scored
        // by previous query token passes.
        var filteredDocs = {};
        for (docRef in scores) {
          // Own-property test: a ref like "toString" must not match an
          // inherited member of the docs object.
          if (hasOwn.call(docs, docRef)) {
            filteredDocs[docRef] = docs[docRef];
          }
        }
        docs = filteredDocs;
      }

      // only record appeared tokens for retrieved documents for the
      // original token, not for expanded tokens.
      // because when doing coordNorm for a retrieved document, coordNorm only
      // cares how many query tokens appear in that document.
      // so expanded tokens should not be added into docTokens; if added, this
      // will pollute the coordNorm
      if (key == token) {
        this.fieldSearchStats(docTokens, key, docs);
      }

      for (docRef in docs) {
        var tf = this.index[fieldName].getTermFrequency(key, docRef);
        var fieldLength = this.documentStore.getFieldLength(docRef, fieldName);
        var fieldLengthNorm = 1;
        if (fieldLength != 0) {
          fieldLengthNorm = 1 / Math.sqrt(fieldLength);
        }

        var penalty = 1;
        if (key != token) {
          // currently I'm not sure if this penalty is enough,
          // need to do verification
          penalty = (1 - (key.length - token.length) / key.length) * 0.15;
        }

        var score = tf * idf * fieldLengthNorm * penalty;

        if (hasOwn.call(queryTokenScores, docRef)) {
          queryTokenScores[docRef] += score;
        } else {
          queryTokenScores[docRef] = score;
        }
      }
    }, this);

    scores = this.mergeScores(scores, queryTokenScores, booleanType);
  }, this);

  scores = this.coordNorm(scores, docTokens, queryTokens.length);
  return scores;
};

/**
 * Merge the scores from one set of tokens into an accumulated score table.
 * Exact operation depends on the op parameter. If op is 'AND', then only the
 * intersection of the two score lists is retained. Otherwise, the union of
 * the two score lists is returned (accumScores is updated in place).
 * For internal use only.
 *
 * @param {Object} accumScores accumulated scores. Should be null on first call.
 * @param {Object} scores new scores to merge into accumScores.
 * @param {String} op merge operation (should be 'AND' or 'OR').
 * @return {Object} The merged scores, keyed by doc ref.
 */
elasticlunr.Index.prototype.mergeScores = function (accumScores, scores, op) {
  // First call: nothing accumulated yet, the new scores are the result.
  if (!accumScores) {
    return scores;
  }

  var hasOwn = Object.prototype.hasOwnProperty;
  var docRef;

  if (op == 'AND') {
    var intersection = {};
    for (docRef in scores) {
      if (hasOwn.call(accumScores, docRef)) {
        intersection[docRef] = accumScores[docRef] + scores[docRef];
      }
    }
    return intersection;
  }

  for (docRef in scores) {
    if (hasOwn.call(accumScores, docRef)) {
      accumScores[docRef] += scores[docRef];
    } else {
      accumScores[docRef] = scores[docRef];
    }
  }
  return accumScores;
};

/**
 * Record the occurring query token for each retrieved doc.
 * Only for internal use.
 *
 * @param {Object} docTokens a data structure that stores which tokens appear in each retrieved doc.
 * @param {String} token query token
 * @param {Object} docs the retrieved documents of the query token
 */
elasticlunr.Index.prototype.fieldSearchStats = function (docTokens, token, docs) {
  for (var doc in docs) {
    if (Object.prototype.hasOwnProperty.call(docTokens, doc)) {
      docTokens[doc].push(token);
    } else {
      docTokens[doc] = [token];
    }
  }
};

/**
 * Coord-normalise the score of each doc: every score is multiplied by the
 * fraction of query tokens recorded for that doc, so a doc that contains more
 * query tokens scores higher than a doc that contains fewer.
 * Docs without an entry in docTokens keep their score unchanged.
 *
 * Only for internal use.
 *
 * @param {Object} scores scores keyed by doc ref; updated in place.
 * @param {Object} docTokens the query tokens recorded per doc ref.
 * @param {Integer} n query token number
 * @return {Object} The same scores object that was passed in.
 */
elasticlunr.Index.prototype.coordNorm = function (scores, docTokens, n) {
  for (var doc in scores) {
    if (!Object.prototype.hasOwnProperty.call(docTokens, doc)) continue;
    var tokens = docTokens[doc].length;
    scores[doc] = scores[doc] * tokens / n;
  }

  return scores;
};

/**
 * Returns a representation of the index ready for serialisation.
 *
 * @return {Object}
 * @memberOf Index
 */
elasticlunr.Index.prototype.toJSON = function () {
  var indexJson = {};
  this._fields.forEach(function (field) {
    indexJson[field] = this.index[field].toJSON();
  }, this);

  return {
    version: elasticlunr.version,
    fields: this._fields,
    ref: this._ref,
    documentStore: this.documentStore.toJSON(),
    index: indexJson,
    pipeline: this.pipeline.toJSON()
  };
};

/**
 * Applies a plugin to the current index.
 *
 * A plugin is a function that is called with the index as its context.
 * Plugins can be used to customise or extend the behaviour of the index
 * in some way. A plugin is just a function that encapsulates the custom
 * behaviour that should be applied to the index.
 *
 * The plugin function will be called with the index as its first argument;
 * additional arguments can also be passed when calling use. The function
 * will be called with the index as its context.
 *
 * Example:
 *
 *     var myPlugin = function (idx, arg1, arg2) {
 *       // `this` is the index to be extended
 *       // apply any extensions etc here.
 *     }
 *
 *     var idx = elasticlunr(function () {
 *       this.use(myPlugin, 'arg1', 'arg2')
 *     })
 *
 * @param {Function} plugin The plugin to apply.
 * @memberOf Index
 */
elasticlunr.Index.prototype.use = function (plugin) {
  var args = Array.prototype.slice.call(arguments, 1);
  args.unshift(this);
  plugin.apply(this, args);
};