UNPKG

elasticlunr

Version:

Lightweight full-text search engine in Javascript for browser search and offline search.

weixsong.github.io

weixsong/elasticlunr.js

1,758 lines (1,572 loc) • 68.2 kB

JavaScript

/** * elasticlunr - http://weixsong.github.io * Lightweight full-text search engine in Javascript for browser search and offline search. - 0.9.5 * * Copyright (C) 2016 Oliver Nightingale * Copyright (C) 2016 Wei Song * MIT Licensed * @license */ (function(){ /*! * elasticlunr.js * Copyright (C) 2016 Oliver Nightingale * Copyright (C) 2016 Wei Song */ /** * Convenience function for instantiating a new elasticlunr index and configuring it * with the default pipeline functions and the passed config function. * * When using this convenience function a new index will be created with the * following functions already in the pipeline: * * 1. elasticlunr.trimmer - trim non-word character * 2. elasticlunr.StopWordFilter - filters out any stop words before they enter the * index * 3. elasticlunr.stemmer - stems the tokens before entering the index. * * * Example: * * var idx = elasticlunr(function () { * this.addField('id'); * this.addField('title'); * this.addField('body'); * * //this.setRef('id'); // default ref is 'id' * * this.pipeline.add(function () { * // some custom pipeline function * }); * }); * * idx.addDoc({ * id: 1, * title: 'Oracle released database 12g', * body: 'Yestaday, Oracle has released their latest database, named 12g, more robust. this product will increase Oracle profit.' * }); * * idx.addDoc({ * id: 2, * title: 'Oracle released annual profit report', * body: 'Yestaday, Oracle has released their annual profit report of 2015, total profit is 12.5 Billion.' * }); * * # simple search * idx.search('oracle database'); * * # search with query-time boosting * idx.search('oracle database', {fields: {title: {boost: 2}, body: {boost: 1}}}); * * @param {Function} config A function that will be called with the new instance * of the elasticlunr.Index as both its context and first parameter. It can be used to * customize the instance of new elasticlunr.Index. 
* @namespace * @module * @return {elasticlunr.Index} * */ var elasticlunr = function (config) { var idx = new elasticlunr.Index; idx.pipeline.add( elasticlunr.trimmer, elasticlunr.stopWordFilter, elasticlunr.stemmer ); if (config) config.call(idx, idx); return idx; }; elasticlunr.version = "0.9.5"; // only used this to make elasticlunr.js compatible with lunr-languages // this is a trick to define a global alias of elasticlunr lunr = elasticlunr; /*! * elasticlunr.utils * Copyright (C) 2016 Oliver Nightingale * Copyright (C) 2016 Wei Song */ /** * A namespace containing utils for the rest of the elasticlunr library */ elasticlunr.utils = {}; /** * Print a warning message to the console. * * @param {String} message The message to be printed. * @memberOf Utils */ elasticlunr.utils.warn = (function (global) { return function (message) { if (global.console && console.warn) { console.warn(message); } }; })(this); /** * Convert an object to string. * * In the case of `null` and `undefined` the function returns * an empty string, in all other cases the result of calling * `toString` on the passed object is returned. * * @param {object} obj The object to convert to a string. * @return {String} string representation of the passed object. * @memberOf Utils */ elasticlunr.utils.toString = function (obj) { if (obj === void 0 || obj === null) { return ""; } return obj.toString(); }; /*! * elasticlunr.EventEmitter * Copyright (C) 2016 Oliver Nightingale * Copyright (C) 2016 Wei Song */ /** * elasticlunr.EventEmitter is an event emitter for elasticlunr. * It manages adding and removing event handlers and triggering events and their handlers. * * Each event could has multiple corresponding functions, * these functions will be called as the sequence that they are added into the event. * * @constructor */ elasticlunr.EventEmitter = function () { this.events = {}; }; /** * Binds a handler function to a specific event(s). 
* * Can bind a single function to many different events in one call. * * @param {String} [eventName] The name(s) of events to bind this function to. * @param {Function} fn The function to call when an event is fired. * @memberOf EventEmitter */ elasticlunr.EventEmitter.prototype.addListener = function () { var args = Array.prototype.slice.call(arguments), fn = args.pop(), names = args; if (typeof fn !== "function") throw new TypeError ("last argument must be a function"); names.forEach(function (name) { if (!this.hasHandler(name)) this.events[name] = []; this.events[name].push(fn); }, this); }; /** * Removes a handler function from a specific event. * * @param {String} eventName The name of the event to remove this function from. * @param {Function} fn The function to remove from an event. * @memberOf EventEmitter */ elasticlunr.EventEmitter.prototype.removeListener = function (name, fn) { if (!this.hasHandler(name)) return; var fnIndex = this.events[name].indexOf(fn); if (fnIndex === -1) return; this.events[name].splice(fnIndex, 1); if (this.events[name].length == 0) delete this.events[name]; }; /** * Call all functions that bounded to the given event. * * Additional data can be passed to the event handler as arguments to `emit` * after the event name. * * @param {String} eventName The name of the event to emit. * @memberOf EventEmitter */ elasticlunr.EventEmitter.prototype.emit = function (name) { if (!this.hasHandler(name)) return; var args = Array.prototype.slice.call(arguments, 1); this.events[name].forEach(function (fn) { fn.apply(undefined, args); }, this); }; /** * Checks whether a handler has ever been stored against an event. * * @param {String} eventName The name of the event to check. * @private * @memberOf EventEmitter */ elasticlunr.EventEmitter.prototype.hasHandler = function (name) { return name in this.events; }; /*! 
* elasticlunr.tokenizer * Copyright (C) 2016 Oliver Nightingale * Copyright (C) 2016 Wei Song */ /** * A function for splitting a string into tokens. * Currently English is supported as default. * Uses `elasticlunr.tokenizer.seperator` to split strings, you could change * the value of this property to set how you want strings are split into tokens. * IMPORTANT: use elasticlunr.tokenizer.seperator carefully, if you are not familiar with * text process, then you'd better not change it. * * @module * @param {String} str The string that you want to tokenize. * @see elasticlunr.tokenizer.seperator * @return {Array} */ elasticlunr.tokenizer = function (str) { if (!arguments.length || str === null || str === undefined) return []; if (Array.isArray(str)) { var arr = str.filter(function(token) { if (token === null || token === undefined) { return false; } return true; }); arr = arr.map(function (t) { return elasticlunr.utils.toString(t).toLowerCase(); }); var out = []; arr.forEach(function(item) { var tokens = item.split(elasticlunr.tokenizer.seperator); out = out.concat(tokens); }, this); return out; } return str.toString().trim().toLowerCase().split(elasticlunr.tokenizer.seperator); }; /** * Default string seperator. */ elasticlunr.tokenizer.defaultSeperator = /[\s\-]+/; /** * The sperator used to split a string into tokens. Override this property to change the behaviour of * `elasticlunr.tokenizer` behaviour when tokenizing strings. By default this splits on whitespace and hyphens. * * @static * @see elasticlunr.tokenizer */ elasticlunr.tokenizer.seperator = elasticlunr.tokenizer.defaultSeperator; /** * Set up customized string seperator * * @param {Object} sep The customized seperator that you want to use to tokenize a string. 
*/ elasticlunr.tokenizer.setSeperator = function(sep) { if (sep !== null && sep !== undefined && typeof(sep) === 'object') { elasticlunr.tokenizer.seperator = sep; } } /** * Reset string seperator * */ elasticlunr.tokenizer.resetSeperator = function() { elasticlunr.tokenizer.seperator = elasticlunr.tokenizer.defaultSeperator; } /** * Get string seperator * */ elasticlunr.tokenizer.getSeperator = function() { return elasticlunr.tokenizer.seperator; } /*! * elasticlunr.Pipeline * Copyright (C) 2016 Oliver Nightingale * Copyright (C) 2016 Wei Song */ /** * elasticlunr.Pipelines maintain an ordered list of functions to be applied to * both documents tokens and query tokens. * * An instance of elasticlunr.Index will contain a pipeline * with a trimmer, a stop word filter, an English stemmer. Extra * functions can be added before or after either of these functions or these * default functions can be removed. * * When run the pipeline, it will call each function in turn. * * The output of the functions in the pipeline will be passed to the next function * in the pipeline. To exclude a token from entering the index the function * should return undefined, the rest of the pipeline will not be called with * this token. * * For serialisation of pipelines to work, all functions used in an instance of * a pipeline should be registered with elasticlunr.Pipeline. Registered functions can * then be loaded. If trying to load a serialised pipeline that uses functions * that are not registered an error will be thrown. * * If not planning on serialising the pipeline then registering pipeline functions * is not necessary. * * @constructor */ elasticlunr.Pipeline = function () { this._queue = []; }; elasticlunr.Pipeline.registeredFunctions = {}; /** * Register a function in the pipeline. * * Functions that are used in the pipeline should be registered if the pipeline * needs to be serialised, or a serialised pipeline needs to be loaded. 
* * Registering a function does not add it to a pipeline, functions must still be * added to instances of the pipeline for them to be used when running a pipeline. * * @param {Function} fn The function to register. * @param {String} label The label to register this function with * @memberOf Pipeline */ elasticlunr.Pipeline.registerFunction = function (fn, label) { if (label in elasticlunr.Pipeline.registeredFunctions) { elasticlunr.utils.warn('Overwriting existing registered function: ' + label); } fn.label = label; elasticlunr.Pipeline.registeredFunctions[label] = fn; }; /** * Get a registered function in the pipeline. * * @param {String} label The label of registered function. * @return {Function} * @memberOf Pipeline */ elasticlunr.Pipeline.getRegisteredFunction = function (label) { if ((label in elasticlunr.Pipeline.registeredFunctions) !== true) { return null; } return elasticlunr.Pipeline.registeredFunctions[label]; }; /** * Warns if the function is not registered as a Pipeline function. * * @param {Function} fn The function to check for. * @private * @memberOf Pipeline */ elasticlunr.Pipeline.warnIfFunctionNotRegistered = function (fn) { var isRegistered = fn.label && (fn.label in this.registeredFunctions); if (!isRegistered) { elasticlunr.utils.warn('Function is not registered with pipeline. This may cause problems when serialising the index.\n', fn); } }; /** * Loads a previously serialised pipeline. * * All functions to be loaded must already be registered with elasticlunr.Pipeline. * If any function from the serialised data has not been registered then an * error will be thrown. * * @param {Object} serialised The serialised pipeline to load. 
* @return {elasticlunr.Pipeline} * @memberOf Pipeline */ elasticlunr.Pipeline.load = function (serialised) { var pipeline = new elasticlunr.Pipeline; serialised.forEach(function (fnName) { var fn = elasticlunr.Pipeline.getRegisteredFunction(fnName); if (fn) { pipeline.add(fn); } else { throw new Error('Cannot load un-registered function: ' + fnName); } }); return pipeline; }; /** * Adds new functions to the end of the pipeline. * * Logs a warning if the function has not been registered. * * @param {Function} functions Any number of functions to add to the pipeline. * @memberOf Pipeline */ elasticlunr.Pipeline.prototype.add = function () { var fns = Array.prototype.slice.call(arguments); fns.forEach(function (fn) { elasticlunr.Pipeline.warnIfFunctionNotRegistered(fn); this._queue.push(fn); }, this); }; /** * Adds a single function after a function that already exists in the * pipeline. * * Logs a warning if the function has not been registered. * If existingFn is not found, throw an Exception. * * @param {Function} existingFn A function that already exists in the pipeline. * @param {Function} newFn The new function to add to the pipeline. * @memberOf Pipeline */ elasticlunr.Pipeline.prototype.after = function (existingFn, newFn) { elasticlunr.Pipeline.warnIfFunctionNotRegistered(newFn); var pos = this._queue.indexOf(existingFn); if (pos === -1) { throw new Error('Cannot find existingFn'); } this._queue.splice(pos + 1, 0, newFn); }; /** * Adds a single function before a function that already exists in the * pipeline. * * Logs a warning if the function has not been registered. * If existingFn is not found, throw an Exception. * * @param {Function} existingFn A function that already exists in the pipeline. * @param {Function} newFn The new function to add to the pipeline. 
* @memberOf Pipeline */ elasticlunr.Pipeline.prototype.before = function (existingFn, newFn) { elasticlunr.Pipeline.warnIfFunctionNotRegistered(newFn); var pos = this._queue.indexOf(existingFn); if (pos === -1) { throw new Error('Cannot find existingFn'); } this._queue.splice(pos, 0, newFn); }; /** * Removes a function from the pipeline. * * @param {Function} fn The function to remove from the pipeline. * @memberOf Pipeline */ elasticlunr.Pipeline.prototype.remove = function (fn) { var pos = this._queue.indexOf(fn); if (pos === -1) { return; } this._queue.splice(pos, 1); }; /** * Runs the current list of functions that registered in the pipeline against the * input tokens. * * @param {Array} tokens The tokens to run through the pipeline. * @return {Array} * @memberOf Pipeline */ elasticlunr.Pipeline.prototype.run = function (tokens) { var out = [], tokenLength = tokens.length, pipelineLength = this._queue.length; for (var i = 0; i < tokenLength; i++) { var token = tokens[i]; for (var j = 0; j < pipelineLength; j++) { token = this._queue[j](token, i, tokens); if (token === void 0 || token === null) break; }; if (token !== void 0 && token !== null) out.push(token); }; return out; }; /** * Resets the pipeline by removing any existing processors. * * @memberOf Pipeline */ elasticlunr.Pipeline.prototype.reset = function () { this._queue = []; }; /** * Get the pipeline if user want to check the pipeline. * * @memberOf Pipeline */ elasticlunr.Pipeline.prototype.get = function () { return this._queue; }; /** * Returns a representation of the pipeline ready for serialisation. * Only serialize pipeline function's name. Not storing function, so when * loading the archived JSON index file, corresponding pipeline function is * added by registered function of elasticlunr.Pipeline.registeredFunctions * * Logs a warning if the function has not been registered. 
* * @return {Array} * @memberOf Pipeline */ elasticlunr.Pipeline.prototype.toJSON = function () { return this._queue.map(function (fn) { elasticlunr.Pipeline.warnIfFunctionNotRegistered(fn); return fn.label; }); }; /*! * elasticlunr.Index * Copyright (C) 2016 Oliver Nightingale * Copyright (C) 2016 Wei Song */ /** * elasticlunr.Index is object that manages a search index. It contains the indexes * and stores all the tokens and document lookups. It also provides the main * user facing API for the library. * * @constructor */ elasticlunr.Index = function () { this._fields = []; this._ref = 'id'; this.pipeline = new elasticlunr.Pipeline; this.documentStore = new elasticlunr.DocumentStore; this.index = {}; this.eventEmitter = new elasticlunr.EventEmitter; this._idfCache = {}; this.on('add', 'remove', 'update', (function () { this._idfCache = {}; }).bind(this)); }; /** * Bind a handler to events being emitted by the index. * * The handler can be bound to many events at the same time. * * @param {String} [eventName] The name(s) of events to bind the function to. * @param {Function} fn The serialised set to load. * @memberOf Index */ elasticlunr.Index.prototype.on = function () { var args = Array.prototype.slice.call(arguments); return this.eventEmitter.addListener.apply(this.eventEmitter, args); }; /** * Removes a handler from an event being emitted by the index. * * @param {String} eventName The name of events to remove the function from. * @param {Function} fn The serialised set to load. * @memberOf Index */ elasticlunr.Index.prototype.off = function (name, fn) { return this.eventEmitter.removeListener(name, fn); }; /** * Loads a previously serialised index. * * Issues a warning if the index being imported was serialised * by a different version of elasticlunr. * * @param {Object} serialisedData The serialised set to load. 
* @return {elasticlunr.Index} * @memberOf Index */ elasticlunr.Index.load = function (serialisedData) { if (serialisedData.version !== elasticlunr.version) { elasticlunr.utils.warn('version mismatch: current ' + elasticlunr.version + ' importing ' + serialisedData.version); } var idx = new this; idx._fields = serialisedData.fields; idx._ref = serialisedData.ref; idx.documentStore = elasticlunr.DocumentStore.load(serialisedData.documentStore); idx.pipeline = elasticlunr.Pipeline.load(serialisedData.pipeline); idx.index = {}; for (var field in serialisedData.index) { idx.index[field] = elasticlunr.InvertedIndex.load(serialisedData.index[field]); } return idx; }; /** * Adds a field to the list of fields that will be searchable within documents in the index. * * Remember that inner index is build based on field, which means each field has one inverted index. * * Fields should be added before any documents are added to the index, fields * that are added after documents are added to the index will only apply to new * documents added to the index. * * @param {String} fieldName The name of the field within the document that should be indexed * @return {elasticlunr.Index} * @memberOf Index */ elasticlunr.Index.prototype.addField = function (fieldName) { this._fields.push(fieldName); this.index[fieldName] = new elasticlunr.InvertedIndex; return this; }; /** * Sets the property used to uniquely identify documents added to the index, * by default this property is 'id'. * * This should only be changed before adding documents to the index, changing * the ref property without resetting the index can lead to unexpected results. * * @param {String} refName The property to use to uniquely identify the * documents in the index. 
* @param {Boolean} emitEvent Whether to emit add events, defaults to true * @return {elasticlunr.Index} * @memberOf Index */ elasticlunr.Index.prototype.setRef = function (refName) { this._ref = refName; return this; }; /** * * Set if the JSON format original documents are save into elasticlunr.DocumentStore * * Defaultly save all the original JSON documents. * * @param {Boolean} save Whether to save the original JSON documents. * @return {elasticlunr.Index} * @memberOf Index */ elasticlunr.Index.prototype.saveDocument = function (save) { this.documentStore = new elasticlunr.DocumentStore(save); return this; }; /** * Add a JSON format document to the index. * * This is the way new documents enter the index, this function will run the * fields from the document through the index's pipeline and then add it to * the index, it will then show up in search results. * * An 'add' event is emitted with the document that has been added and the index * the document has been added to. This event can be silenced by passing false * as the second argument to add. * * @param {Object} doc The JSON format document to add to the index. * @param {Boolean} emitEvent Whether or not to emit events, default true. * @memberOf Index */ elasticlunr.Index.prototype.addDoc = function (doc, emitEvent) { if (!doc) return; var emitEvent = emitEvent === undefined ? 
true : emitEvent; var docRef = doc[this._ref]; this.documentStore.addDoc(docRef, doc); this._fields.forEach(function (field) { var fieldTokens = this.pipeline.run(elasticlunr.tokenizer(doc[field])); this.documentStore.addFieldLength(docRef, field, fieldTokens.length); var tokenCount = {}; fieldTokens.forEach(function (token) { if (token in tokenCount) tokenCount[token] += 1; else tokenCount[token] = 1; }, this); for (var token in tokenCount) { var termFrequency = tokenCount[token]; termFrequency = Math.sqrt(termFrequency); this.index[field].addToken(token, { ref: docRef, tf: termFrequency }); } }, this); if (emitEvent) this.eventEmitter.emit('add', doc, this); }; /** * Removes a document from the index by doc ref. * * To make sure documents no longer show up in search results they can be * removed from the index using this method. * * A 'remove' event is emitted with the document that has been removed and the index * the document has been removed from. This event can be silenced by passing false * as the second argument to remove. * * If user setting DocumentStore not storing the documents, then remove doc by docRef is not allowed. * * @param {String|Integer} docRef The document ref to remove from the index. * @param {Boolean} emitEvent Whether to emit remove events, defaults to true * @memberOf Index */ elasticlunr.Index.prototype.removeDocByRef = function (docRef, emitEvent) { if (!docRef) return; if (this.documentStore.isDocStored() === false) { return; } if (!this.documentStore.hasDoc(docRef)) return; var doc = this.documentStore.getDoc(docRef); this.removeDoc(doc, false); }; /** * Removes a document from the index. * This remove operation could work even the original doc is not store in the DocumentStore. * * To make sure documents no longer show up in search results they can be * removed from the index using this method. * * A 'remove' event is emitted with the document that has been removed and the index * the document has been removed from. 
This event can be silenced by passing false * as the second argument to remove. * * * @param {Object} doc The document ref to remove from the index. * @param {Boolean} emitEvent Whether to emit remove events, defaults to true * @memberOf Index */ elasticlunr.Index.prototype.removeDoc = function (doc, emitEvent) { if (!doc) return; var emitEvent = emitEvent === undefined ? true : emitEvent; var docRef = doc[this._ref]; if (!this.documentStore.hasDoc(docRef)) return; this.documentStore.removeDoc(docRef); this._fields.forEach(function (field) { var fieldTokens = this.pipeline.run(elasticlunr.tokenizer(doc[field])); fieldTokens.forEach(function (token) { this.index[field].removeToken(token, docRef); }, this); }, this); if (emitEvent) this.eventEmitter.emit('remove', doc, this); }; /** * Updates a document in the index. * * When a document contained within the index gets updated, fields changed, * added or removed, to make sure it correctly matched against search queries, * it should be updated in the index. * * This method is just a wrapper around `remove` and `add` * * An 'update' event is emitted with the document that has been updated and the index. * This event can be silenced by passing false as the second argument to update. Only * an update event will be fired, the 'add' and 'remove' events of the underlying calls * are silenced. * * @param {Object} doc The document to update in the index. * @param {Boolean} emitEvent Whether to emit update events, defaults to true * @see Index.prototype.remove * @see Index.prototype.add * @memberOf Index */ elasticlunr.Index.prototype.updateDoc = function (doc, emitEvent) { var emitEvent = emitEvent === undefined ? true : emitEvent; this.removeDocByRef(doc[this._ref], false); this.addDoc(doc, false); if (emitEvent) this.eventEmitter.emit('update', doc, this); }; /** * Calculates the inverse document frequency for a token within the index of a field. * * @param {String} token The token to calculate the idf of. 
* @param {String} field The field to compute idf. * @see Index.prototype.idf * @private * @memberOf Index */ elasticlunr.Index.prototype.idf = function (term, field) { var cacheKey = "@" + field + '/' + term; if (Object.prototype.hasOwnProperty.call(this._idfCache, cacheKey)) return this._idfCache[cacheKey]; var df = this.index[field].getDocFreq(term); var idf = 1 + Math.log(this.documentStore.length / (df + 1)); this._idfCache[cacheKey] = idf; return idf; }; /** * get fields of current index instance * * @return {Array} */ elasticlunr.Index.prototype.getFields = function () { return this._fields.slice(); }; /** * Searches the index using the passed query. * Queries should be a string, multiple words are allowed. * * If config is null, will search all fields defaultly, and lead to OR based query. * If config is specified, will search specified with query time boosting. * * All query tokens are passed through the same pipeline that document tokens * are passed through, so any language processing involved will be run on every * query term. * * Each query term is expanded, so that the term 'he' might be expanded to * 'hello' and 'help' if those terms were already included in the index. * * Matching documents are returned as an array of objects, each object contains * the matching document ref, as set for this index, and the similarity score * for this document against the query. * * @param {String} query The query to search the index with. * @param {JSON} userConfig The user query config, JSON format. 
* @return {Object} * @see Index.prototype.idf * @see Index.prototype.documentVector * @memberOf Index */ elasticlunr.Index.prototype.search = function (query, userConfig) { if (!query) return []; var configStr = null; if (userConfig != null) { configStr = JSON.stringify(userConfig); } var config = new elasticlunr.Configuration(configStr, this.getFields()).get(); var queryTokens = this.pipeline.run(elasticlunr.tokenizer(query)); var queryResults = {}; for (var field in config) { var fieldSearchResults = this.fieldSearch(queryTokens, field, config); var fieldBoost = config[field].boost; for (var docRef in fieldSearchResults) { fieldSearchResults[docRef] = fieldSearchResults[docRef] * fieldBoost; } for (var docRef in fieldSearchResults) { if (docRef in queryResults) { queryResults[docRef] += fieldSearchResults[docRef]; } else { queryResults[docRef] = fieldSearchResults[docRef]; } } } var results = []; for (var docRef in queryResults) { results.push({ref: docRef, score: queryResults[docRef]}); } results.sort(function (a, b) { return b.score - a.score; }); return results; }; /** * search queryTokens in specified field. * * @param {Array} queryTokens The query tokens to query in this field. * @param {String} field Field to query in. * @param {elasticlunr.Configuration} config The user query config, JSON format. * @return {Object} */ elasticlunr.Index.prototype.fieldSearch = function (queryTokens, fieldName, config) { var booleanType = config[fieldName].bool; var expand = config[fieldName].expand; var boost = config[fieldName].boost; var scores = null; var docTokens = {}; // Do nothing if the boost is 0 if (boost === 0) { return; } queryTokens.forEach(function (token) { var tokens = [token]; if (expand == true) { tokens = this.index[fieldName].expandToken(token); } // Consider every query token in turn. If expanded, each query token // corresponds to a set of tokens, which is all tokens in the // index matching the pattern queryToken* . 
// For the set of tokens corresponding to a query token, find and score // all matching documents. Store those scores in queryTokenScores, // keyed by docRef. // Then, depending on the value of booleanType, combine the scores // for this query token with previous scores. If booleanType is OR, // then merge the scores by summing into the accumulated total, adding // new document scores are required (effectively a union operator). // If booleanType is AND, accumulate scores only if the document // has previously been scored by another query token (an intersection // operation0. // Furthermore, since when booleanType is AND, additional // query tokens can't add new documents to the result set, use the // current document set to limit the processing of each new query // token for efficiency (i.e., incremental intersection). var queryTokenScores = {}; tokens.forEach(function (key) { var docs = this.index[fieldName].getDocs(key); var idf = this.idf(key, fieldName); if (scores && booleanType == 'AND') { // special case, we can rule out documents that have been // already been filtered out because they weren't scored // by previous query token passes. var filteredDocs = {}; for (var docRef in scores) { if (docRef in docs) { filteredDocs[docRef] = docs[docRef]; } } docs = filteredDocs; } // only record appeared token for retrieved documents for the // original token, not for expaned token. // beause for doing coordNorm for a retrieved document, coordNorm only care how many // query token appear in that document. 
// so expanded token should not be added into docTokens, if added, this will pollute the // coordNorm if (key == token) { this.fieldSearchStats(docTokens, key, docs); } for (var docRef in docs) { var tf = this.index[fieldName].getTermFrequency(key, docRef); var fieldLength = this.documentStore.getFieldLength(docRef, fieldName); var fieldLengthNorm = 1; if (fieldLength != 0) { fieldLengthNorm = 1 / Math.sqrt(fieldLength); } var penality = 1; if (key != token) { // currently I'm not sure if this penality is enough, // need to do verification penality = (1 - (key.length - token.length) / key.length) * 0.15; } var score = tf * idf * fieldLengthNorm * penality; if (docRef in queryTokenScores) { queryTokenScores[docRef] += score; } else { queryTokenScores[docRef] = score; } } }, this); scores = this.mergeScores(scores, queryTokenScores, booleanType); }, this); scores = this.coordNorm(scores, docTokens, queryTokens.length); return scores; }; /** * Merge the scores from one set of tokens into an accumulated score table. * Exact operation depends on the op parameter. If op is 'AND', then only the * intersection of the two score lists is retained. Otherwise, the union of * the two score lists is returned. For internal use only. * * @param {Object} bool accumulated scores. Should be null on first call. * @param {String} scores new scores to merge into accumScores. * @param {Object} op merge operation (should be 'AND' or 'OR'). 
* */ elasticlunr.Index.prototype.mergeScores = function (accumScores, scores, op) { if (!accumScores) { return scores; } if (op == 'AND') { var intersection = {}; for (var docRef in scores) { if (docRef in accumScores) { intersection[docRef] = accumScores[docRef] + scores[docRef]; } } return intersection; } else { for (var docRef in scores) { if (docRef in accumScores) { accumScores[docRef] += scores[docRef]; } else { accumScores[docRef] = scores[docRef]; } } return accumScores; } }; /** * Record the occuring query token of retrieved doc specified by doc field. * Only for inner user. * * @param {Object} docTokens a data structure stores which token appears in the retrieved doc. * @param {String} token query token * @param {Object} docs the retrieved documents of the query token * */ elasticlunr.Index.prototype.fieldSearchStats = function (docTokens, token, docs) { for (var doc in docs) { if (doc in docTokens) { docTokens[doc].push(token); } else { docTokens[doc] = [token]; } } }; /** * coord norm the score of a doc. * if a doc contain more query tokens, then the score will larger than the doc * contains less query tokens. * * only for inner use. * * @param {Object} results first results * @param {Object} docs field search results of a token * @param {Integer} n query token number * @return {Object} */ elasticlunr.Index.prototype.coordNorm = function (scores, docTokens, n) { for (var doc in scores) { if (!(doc in docTokens)) continue; var tokens = docTokens[doc].length; scores[doc] = scores[doc] * tokens / n; } return scores; }; /** * Returns a representation of the index ready for serialisation. 
* * @return {Object} * @memberOf Index */ elasticlunr.Index.prototype.toJSON = function () { var indexJson = {}; this._fields.forEach(function (field) { indexJson[field] = this.index[field].toJSON(); }, this); return { version: elasticlunr.version, fields: this._fields, ref: this._ref, documentStore: this.documentStore.toJSON(), index: indexJson, pipeline: this.pipeline.toJSON() }; }; /** * Applies a plugin to the current index. * * A plugin is a function that is called with the index as its context. * Plugins can be used to customise or extend the behaviour the index * in some way. A plugin is just a function, that encapsulated the custom * behaviour that should be applied to the index. * * The plugin function will be called with the index as its argument, additional * arguments can also be passed when calling use. The function will be called * with the index as its context. * * Example: * * var myPlugin = function (idx, arg1, arg2) { * // `this` is the index to be extended * // apply any extensions etc here. * } * * var idx = elasticlunr(function () { * this.use(myPlugin, 'arg1', 'arg2') * }) * * @param {Function} plugin The plugin to apply. * @memberOf Index */ elasticlunr.Index.prototype.use = function (plugin) { var args = Array.prototype.slice.call(arguments, 1); args.unshift(this); plugin.apply(this, args); }; /*! * elasticlunr.DocumentStore * Copyright (C) 2016 Wei Song */ /** * elasticlunr.DocumentStore is a simple key-value document store used for storing sets of tokens for * documents stored in index. * * elasticlunr.DocumentStore store original JSON format documents that you could build search snippet by this original JSON document. * * user could choose whether original JSON format document should be store, if no configuration then document will be stored defaultly. 
* If user care more about the index size, user could select not store JSON documents, then this will has some defects, such as user * could not use JSON document to generate snippets of search results. * * @param {Boolean} save If the original JSON document should be stored. * @constructor * @module */ elasticlunr.DocumentStore = function (save) { if (save === null || save === undefined) { this._save = true; } else { this._save = save; } this.docs = {}; this.docInfo = {}; this.length = 0; }; /** * Loads a previously serialised document store * * @param {Object} serialisedData The serialised document store to load. * @return {elasticlunr.DocumentStore} */ elasticlunr.DocumentStore.load = function (serialisedData) { var store = new this; store.length = serialisedData.length; store.docs = serialisedData.docs; store.docInfo = serialisedData.docInfo; store._save = serialisedData.save; return store; }; /** * check if current instance store the original doc * * @return {Boolean} */ elasticlunr.DocumentStore.prototype.isDocStored = function () { return this._save; }; /** * Stores the given doc in the document store against the given id. * If docRef already exist, then update doc. * * Document is store by original JSON format, then you could use original document to generate search snippets. * * @param {Integer|String} docRef The key used to store the JSON format doc. * @param {Object} doc The JSON format doc. */ elasticlunr.DocumentStore.prototype.addDoc = function (docRef, doc) { if (!this.hasDoc(docRef)) this.length++; if (this._save === true) { this.docs[docRef] = clone(doc); } else { this.docs[docRef] = null; } }; /** * Retrieves the JSON doc from the document store for a given key. * * If docRef not found, return null. * If user set not storing the documents, return null. * * @param {Integer|String} docRef The key to lookup and retrieve from the document store. 
* @return {Object} * @memberOf DocumentStore */ elasticlunr.DocumentStore.prototype.getDoc = function (docRef) { if (this.hasDoc(docRef) === false) return null; return this.docs[docRef]; }; /** * Checks whether the document store contains a key (docRef). * * @param {Integer|String} docRef The id to look up in the document store. * @return {Boolean} * @memberOf DocumentStore */ elasticlunr.DocumentStore.prototype.hasDoc = function (docRef) { return docRef in this.docs; }; /** * Removes the value for a key in the document store. * * @param {Integer|String} docRef The id to remove from the document store. * @memberOf DocumentStore */ elasticlunr.DocumentStore.prototype.removeDoc = function (docRef) { if (!this.hasDoc(docRef)) return; delete this.docs[docRef]; delete this.docInfo[docRef]; this.length--; }; /** * Add field length of a document's field tokens from pipeline results. * The field length of a document is used to do field length normalization even without the original JSON document stored. * * @param {Integer|String} docRef document's id or reference * @param {String} fieldName field name * @param {Integer} length field length */ elasticlunr.DocumentStore.prototype.addFieldLength = function (docRef, fieldName, length) { if (docRef === null || docRef === undefined) return; if (this.hasDoc(docRef) == false) return; if (!this.docInfo[docRef]) this.docInfo[docRef] = {}; this.docInfo[docRef][fieldName] = length; }; /** * Update field length of a document's field tokens from pipeline results. * The field length of a document is used to do field length normalization even without the original JSON document stored. 
* * @param {Integer|String} docRef document's id or reference * @param {String} fieldName field name * @param {Integer} length field length */ elasticlunr.DocumentStore.prototype.updateFieldLength = function (docRef, fieldName, length) { if (docRef === null || docRef === undefined) return; if (this.hasDoc(docRef) == false) return; this.addFieldLength(docRef, fieldName, length); }; /** * get field length of a document by docRef * * @param {Integer|String} docRef document id or reference * @param {String} fieldName field name * @return {Integer} field length */ elasticlunr.DocumentStore.prototype.getFieldLength = function (docRef, fieldName) { if (docRef === null || docRef === undefined) return 0; if (!(docRef in this.docs)) return 0; if (!(fieldName in this.docInfo[docRef])) return 0; return this.docInfo[docRef][fieldName]; }; /** * Returns a JSON representation of the document store used for serialisation. * * @return {Object} JSON format * @memberOf DocumentStore */ elasticlunr.DocumentStore.prototype.toJSON = function () { return { docs: this.docs, docInfo: this.docInfo, length: this.length, save: this._save }; }; /** * Cloning object * * @param {Object} object in JSON format * @return {Object} copied object */ function clone(obj) { if (null === obj || "object" !== typeof obj) return obj; var copy = obj.constructor(); for (var attr in obj) { if (obj.hasOwnProperty(attr)) copy[attr] = obj[attr]; } return copy; } /*! 
* elasticlunr.stemmer
 * Copyright (C) 2016 Oliver Nightingale
 * Copyright (C) 2016 Wei Song
 * Includes code from - http://tartarus.org/~martin/PorterStemmer/js.txt
 */

/**
 * elasticlunr.stemmer is an english language stemmer, this is a JavaScript
 * implementation of the PorterStemmer taken from http://tartarus.org/~martin
 *
 * The patterns are compiled once when this module is evaluated and reused by
 * every call. Words shorter than three characters are returned unchanged.
 *
 * @module
 * @param {String} str The string to stem
 * @return {String}
 * @see elasticlunr.Pipeline
 */
elasticlunr.stemmer = (function(){
  // Suffix replacement tables used by step 2 and step 3 respectively.
  var step2list = {
      "ational" : "ate",
      "tional" : "tion",
      "enci" : "ence",
      "anci" : "ance",
      "izer" : "ize",
      "bli" : "ble",
      "alli" : "al",
      "entli" : "ent",
      "eli" : "e",
      "ousli" : "ous",
      "ization" : "ize",
      "ation" : "ate",
      "ator" : "ate",
      "alism" : "al",
      "iveness" : "ive",
      "fulness" : "ful",
      "ousness" : "ous",
      "aliti" : "al",
      "iviti" : "ive",
      "biliti" : "ble",
      "logi" : "log"
    },

    step3list = {
      "icate" : "ic",
      "ative" : "",
      "alize" : "al",
      "iciti" : "ic",
      "ical" : "ic",
      "ful" : "",
      "ness" : ""
    },

    // Pattern fragments; "m" in the notes below is the measure used by the
    // Porter algorithm.
    c = "[^aeiou]",          // consonant
    v = "[aeiouy]",          // vowel
    C = c + "[^aeiouy]*",    // consonant sequence
    V = v + "[aeiou]*",      // vowel sequence

    mgr0 = "^(" + C + ")?" + V + C,               // [C]VC... is m>0
    meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$",  // [C]VC[V] is m=1
    mgr1 = "^(" + C + ")?" + V + C + V + C,       // [C]VCVC... is m>1
    s_v = "^(" + C + ")?" + v;                   // vowel in stem

  var re_mgr0 = new RegExp(mgr0);
  var re_mgr1 = new RegExp(mgr1);
  var re_meq1 = new RegExp(meq1);
  var re_s_v = new RegExp(s_v);

  // Step 1a patterns: "...sses"/"...ies" and a trailing "s" not preceded by "s".
  var re_1a = /^(.+?)(ss|i)es$/;
  var re2_1a = /^(.+?)([^s])s$/;
  // Step 1b patterns.
  var re_1b = /^(.+?)eed$/;
  var re2_1b = /^(.+?)(ed|ing)$/;
  var re_1b_2 = /.$/;                 // matches the last character, used to drop it
  var re2_1b_2 = /(at|bl|iz)$/;
  var re3_1b_2 = new RegExp("([^aeiouylsz])\\1$");   // doubled letter other than a vowel, y, l, s or z
  var re4_1b_2 = new RegExp("^" + C + v + "[^aeiouwxy]$");

  var re_1c = /^(.+?[^aeiou])y$/;
  var re_2 = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;

  var re_3 = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;

  var re_4 = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
  var re2_4 = /^(.+?)(s|t)(ion)$/;

  var re_5 = /^(.+?)e$/;
  var re_5_1 = /ll$/;
  var re3_5 = new RegExp("^" + C + v + "[^aeiouwxy]$");

  var porterStemmer = function porterStemmer(w) {
    var stem,
      suffix,
      firstch,
      re,
      re2,
      re3,
      re4;

    // Too short to stem.
    if (w.length < 3) { return w; }

    // A leading "y" is upper-cased for the duration of the algorithm so the
    // lower-case vowel patterns above do not match it; it is restored at the end.
    firstch = w.substr(0,1);
    if (firstch == "y") {
      w = firstch.toUpperCase() + w.substr(1);
    }

    // Step 1a
    // NOTE: the first assignment has no terminating semicolon in the source
    // and relies on automatic semicolon insertion at the line break.
    re = re_1a
    re2 = re2_1a;

    if (re.test(w)) { w = w.replace(re,"$1$2"); }
    else if (re2.test(w)) { w = w.replace(re2,"$1$2"); }

    // Step 1b
    re = re_1b;
    re2 = re2_1b;
    if (re.test(w)) {
      var fp = re.exec(w);
      re = re_mgr0;
      if (re.test(fp[1])) {
        // "...eed" with m>0 in the stem: drop the final character.
        re = re_1b_2;
        w = w.replace(re,"");
      }
    } else if (re2.test(w)) {
      var fp = re2.exec(w);
      stem = fp[1];
      re2 = re_s_v;
      if (re2.test(stem)) {
        // The stem contains a vowel: remove "ed"/"ing", then tidy the ending.
        w = stem;
        re2 = re2_1b_2;
        re3 = re3_1b_2;
        re4 = re4_1b_2;
        if (re2.test(w)) { w = w + "e"; }
        else if (re3.test(w)) { re = re_1b_2; w = w.replace(re,""); }
        else if (re4.test(w)) { w = w + "e"; }
      }
    }

    // Step 1c - replace suffix y or Y by i if preceded by a non-vowel which is not the first letter of the word (so cry -> cri, by -> by, say -> say)
    re = re_1c;
    if (re.test(w)) {
      var fp = re.exec(w);
      stem = fp[1];
      w = stem + "i";
    }

    // Step 2
    re = re_2;
    if (re.test(w)) {
      var fp = re.exec(w);
      stem = fp[1];
      suffix = fp[2];
      re = re_mgr0;
      if (re.test(stem)) {
        w = stem + step2list[suffix];
      }
    }

    // Step 3
    re = re_3;
    if (re.test(w)) {
      var fp = re.exec(w);
      stem = fp[1];
      suffix = fp[2];
      re = re_mgr0;
      if (re.test(stem)) {
        w = stem + step3list[suffix];
      }
    }

    // Step 4
    re = re_4;
    re2 = re2_4;
    if (re.test(w)) {
      var fp = re.exec(w);
      stem = fp[1];
      re = re_mgr1;
      if (re.test(stem)) {
        w = stem;
      }
    } else if (re2.test(w)) {
      var fp = re2.exec(w);
      // For "...sion"/"...tion" the s or t stays part of the stem.
      stem = fp[1] + fp[2];
      re2 = re_mgr1;
      if (re2.test(stem)) {
        w = stem;
      }
    }

    // Step 5
    re = re_5;
    if (re.test(w)) {
      var fp = re.exec(w);
      stem = fp[1];
      re = re_mgr1;
      re2 = re_meq1;
      re3 = re3_5;
      if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) {
        w = stem;
      }
    }

    // A final double "l" loses one "l" when m>1.
    re = re_5_1;
    re2 = re_mgr1;
    if (re.test(w) && re2.test(w)) {
      re = re_1b_2;
      w = w.replace(re,"");
    }

    // and turn initial Y back to y

    if (firstch == "y") {
      w = firstch.toLowerCase() + w.substr(1);
    }

    return w;
  };

  return porterStemmer;
})();

// Register the stemmer with the pipeline under the label 'stemmer'.
elasticlunr.Pipeline.registerFunction(elasticlunr.stemmer, 'stemmer');

/*!
 * elasticlunr.stopWordFilter
 * Copyright (C) 2016 Oliver Nightingale
 * Copyright (C) 2016 Wei Song
 */

/**
 * elasticlunr.stopWordFilter is an English language stop words filter, any words
 * contained in the stop word list will not be passed through the filter.
 *
 * This is intended to be used in the Pipeline. If the token does not pass the
 * filter then undefined will be returned.
 * The stop words are kept in a dictionary object (elasticlunr.stopWordFilter.stopWords),
 * so each check is a single property lookup; a token is filtered out only when
 * its entry is strictly `true`. Empty/falsy tokens are filtered out as well.
 *
 * @module
 * @param {String} token The token to pass through the filter
 * @return {String}
 * @see elasticlunr.Pipeline
 */
elasticlunr.stopWordFilter = function (token) {
  if (token && elasticlunr.stopWordFilter.stopWords[token] !== true) {
    return token;
  }
};

/**
 * Remove predefined stop words
 * if user want to use customized stop words, user could use this function to delete
 * all predefined stopwords.
* * @return {null} */ elasticlunr.clearStopWords = function () { elasticlunr.stopWordFilter.stopWords = {}; }; /** * Add customized stop words * user could use this function to add customized stop words * * @params {Array} words customized stop words * @return {null} */ elasticlunr.addStopWords = function (words) { if (words == null || Array.isArray(words) === false) return; words.forEach(function (word) { elasticlunr.stopWordFilter.stopWords[word] = true; }, this); }; /** * Reset to default stop words * user could use this function to restore default stop words * * @return {null} */ elasticlunr.resetStopWords = function () { elasticlunr.stopWordFilter.stopWords = elasticlunr.defaultStopWords; }; elasticlunr.defaultStopWords = { "": true, "a": true, "able": true, "about": true, "across": true, "after": true, "all": true, "almost": true, "also": true, "am": true, "among": true, "an": true, "and": true, "any": true, "are": true, "as": true, "at": true, "be": true, "because": true, "been": true, "but": true, "by": true, "can": true, "cannot": true, "could": true, "dear": true, "did": true, "do": true, "does": true, "either": true, "else": true, "ever": true, "every": true, "for": true, "from": true, "get": true, "got": true, "had": true, "has": true, "have": true, "he": true, "her": true, "hers": true, "him": true, "his": true, "how": true, "however": true, "i": true, "if": true, "in": true, "into": true, "is": true, "it": true, "its": true, "just": true, "least": true, "let": true, "like": true, "likely": true, "may": true, "me": true, "might": true, "most": true, "must": true, "my": true, "neither": true, "no": true, "nor": true, "not": true, "of": true, "off": true, "often": true, "on": true, "only": true, "or": true, "other": true, "our": true, "own": true, "rather": true, "said": true, "say": true, "says": true, "she": true, "should": true, "since": true, "so": true, "some": true, "than": true, "that": true, "the": true, "their": true, "them": true, "then": true, 
"there": true, "these": true, "they": true, "this": true, "tis": true, "to": true, "too": true, "twas": true, "us": true, "wants": true, "was": true, "we": true, "were": true, "what": true, "when": true, "where": true, "which": true, "while": true, "who": true, "whom": true, "why": true, "will": true, "with": true, "would": true, "yet": true, "you": true, "your": true }; elasticlunr.stopWordFilter.stopWords = elasticlunr.defaultStopWords; elasticlunr.Pipe