// elasticlunr
// Version:
// Lightweight full-text search engine in Javascript for browser search and offline search.
// 87 lines (75 loc) • 2.37 kB
// JavaScript
/*!
* elasticlunr.tokenizer
* Copyright (C) @YEAR Oliver Nightingale
* Copyright (C) @YEAR Wei Song
*/
/**
 * A function for splitting a string into tokens.
 * Currently English is supported as default.
 *
 * Uses `elasticlunr.tokenizer.seperator` (note: the property name is spelled
 * "seperator") to split strings; change the value of that property to control
 * how strings are split into tokens.
 * IMPORTANT: change `elasticlunr.tokenizer.seperator` carefully; if you are not
 * familiar with text processing, you had better leave it alone.
 *
 * Behaviour by input:
 * - no argument, `null` or `undefined`: returns an empty array;
 * - an array: `null`/`undefined` elements are skipped, every other element is
 *   converted with `elasticlunr.utils.toString`, trimmed, lower-cased and
 *   split, and all resulting tokens are returned in one flat array;
 * - anything else: converted with its own `toString()`, trimmed, lower-cased
 *   and split.
 *
 * @module
 * @param {String|Array} str The string (or array of values) that you want to tokenize.
 * @see elasticlunr.tokenizer.seperator
 * @return {Array} The lower-cased tokens.
 */
elasticlunr.tokenizer = function (str) {
  if (!arguments.length || str === null || str === undefined) return [];

  var separator = elasticlunr.tokenizer.seperator;

  if (Array.isArray(str)) {
    var out = [];

    str.forEach(function (item) {
      if (item === null || item === undefined) return;

      // Trim each element exactly like the plain-string path below does, so
      // that surrounding whitespace never produces empty leading/trailing tokens.
      var tokens = elasticlunr.utils.toString(item).trim().toLowerCase().split(separator);

      // Append in place instead of re-allocating the result with concat().
      for (var i = 0; i < tokens.length; i++) {
        out.push(tokens[i]);
      }
    });

    return out;
  }

  return str.toString().trim().toLowerCase().split(separator);
};
/**
 * Default string separator: a regular expression that matches one or more
 * consecutive whitespace or hyphen characters.
 *
 * @static
 */
elasticlunr.tokenizer.defaultSeperator = /[\s\-]+/;
/**
 * The separator used to split a string into tokens. Override this property to change the
 * behaviour of `elasticlunr.tokenizer` when tokenizing strings. It is initialised to
 * `elasticlunr.tokenizer.defaultSeperator`, so by default this splits on whitespace and hyphens.
 *
 * @static
 * @see elasticlunr.tokenizer
 */
elasticlunr.tokenizer.seperator = elasticlunr.tokenizer.defaultSeperator;
/**
 * Install a customized string separator for the tokenizer.
 *
 * The value is stored in `elasticlunr.tokenizer.seperator` only when it is a
 * non-null value whose `typeof` is 'object' (a regular expression, for
 * example); any other value is ignored and the current separator is kept.
 *
 * @param {Object} sep The customized separator that you want to use to tokenize a string.
 */
elasticlunr.tokenizer.setSeperator = function (sep) {
  // `typeof undefined` is 'undefined', so only null needs an explicit check.
  var isUsable = typeof sep === 'object' && sep !== null;

  if (!isUsable) {
    return;
  }

  elasticlunr.tokenizer.seperator = sep;
};
/**
 * Reset the string separator: assigns `elasticlunr.tokenizer.defaultSeperator`
 * back to `elasticlunr.tokenizer.seperator`.
 */
elasticlunr.tokenizer.resetSeperator = function () {
  var tokenizer = elasticlunr.tokenizer;

  tokenizer.seperator = tokenizer.defaultSeperator;
};
/**
 * Get the string separator currently stored in `elasticlunr.tokenizer.seperator`.
 *
 * @return {Object} The current value of `elasticlunr.tokenizer.seperator`.
 */
elasticlunr.tokenizer.getSeperator = function () {
  var current = elasticlunr.tokenizer.seperator;

  return current;
};