@tanishiking/aho-corasick
Version:
TypeScript implementation of the Aho-Corasick algorithm for efficient string matching
145 lines (144 loc) • 5.96 kB
JavaScript
;
/**
 * Shallow object-merge helper: copies own enumerable properties of every
 * source argument onto the first argument and returns that first argument.
 *
 * Reuses an `__assign` already present on `this` when there is one. Otherwise
 * the implementation is chosen on the first call (native `Object.assign` when
 * available, a manual copy loop if not) and cached by rebinding `__assign`.
 */
var __assign = (this && this.__assign) || function () {
    var nativeAssign = Object.assign;
    __assign = nativeAssign ? nativeAssign : function (target) {
        var argCount = arguments.length;
        // Index 0 is the target itself; sources start at index 1.
        for (var index = 1; index < argCount; index += 1) {
            var source = arguments[index];
            for (var key in source) {
                // Skip properties inherited through the prototype chain.
                if (Object.prototype.hasOwnProperty.call(source, key)) {
                    target[key] = source[key];
                }
            }
        }
        return target;
    };
    return __assign.apply(this, arguments);
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.Emit = exports.defaultConfig = exports.Trie = void 0;
var trie_config_1 = require("./trie-config");
Object.defineProperty(exports, "defaultConfig", { enumerable: true, get: function () { return trie_config_1.defaultConfig; } });
var state_1 = require("./state");
var emit_1 = require("./emit");
Object.defineProperty(exports, "Emit", { enumerable: true, get: function () { return emit_1.Emit; } });
var utils_1 = require("./utils");
var interval_tree_1 = require("./interval/interval-tree");
/**
* Aho-Corasick implementation based on http://cr.yp.to/bib/1975/aho.pdf
* Port of
* robert-bor/aho-corasick https://github.com/robert-bor/aho-corasick and
* hankcs/aho-corasick https://github.com/hankcs/aho-corasick
*/
var Trie = /** @class */ (function () {
    /**
     * Builds a keyword trie.
     *
     * @param keywords - Optional list of keywords to register. `undefined`,
     *   `null` and an empty list all produce an empty trie; empty-string
     *   keywords are skipped (see addKeyword).
     * @param options - Optional overrides merged on top of `defaultConfig`.
     */
    function Trie(keywords, options) {
        var _this = this;
        // Failure links are built lazily, on the first parseText() call.
        this.failureStateConstructed = false;
        this.rootState = new state_1.State(0);
        // Always merge into a fresh object so the instance never aliases the
        // shared, exported defaultConfig (a missing `options` contributes nothing).
        this.options = __assign(__assign({}, trie_config_1.defaultConfig), options);
        if (keywords !== undefined && keywords !== null && keywords.length > 0) {
            keywords.forEach(function (keyword) {
                _this.addKeyword(keyword);
            });
        }
    }
    /**
     * Registers one keyword: walks (creating as needed) one state per element
     * of `stringToArray(keyword)` starting from the root, then records the
     * keyword as an emit of the final state. Empty keywords are ignored.
     *
     * NOTE(review): the keyword is inserted as-is, whereas parseText lowercases
     * each text character when `caseInsensitive` is set. Presumably keywords
     * are expected to be lowercase in that mode - confirm with the callers.
     *
     * @param keyword - Keyword to add.
     */
    Trie.prototype.addKeyword = function (keyword) {
        if (keyword.length === 0) {
            return;
        }
        var currentState = this.rootState;
        utils_1.stringToArray(keyword).forEach(function (char) {
            currentState = currentState.addState(char);
        });
        currentState.addEmits([keyword]);
    };
    /**
     * Find keywords from given text.
     *
     * Match positions (`start` / `end` passed to Emit) are indices into
     * `stringToArray(text)`, not necessarily UTF-16 offsets into `text`.
     *
     * @param text - The text to search for keywords.
     * @returns The emits found, after the optional whole-word and overlap
     *   filters selected by the options.
     */
    Trie.prototype.parseText = function (text) {
        var _this = this;
        this.checkForConstructedFailureStates();
        var pos = 0;
        var currentState = this.rootState;
        var collectedEmits = [];
        utils_1.stringToArray(text).forEach(function (originalChar) {
            var char = _this.options.caseInsensitive ? originalChar.toLowerCase() : originalChar;
            currentState = _this.getState(currentState, char);
            var emits = _this.toEmits(pos, currentState);
            collectedEmits.push.apply(collectedEmits, emits);
            pos++;
        });
        // Filter out partial words.
        var emits = this.options.onlyWholeWords ? this.removePartialMatches(text, collectedEmits) : collectedEmits;
        // Filter out overlaps, bigger size has larger priority
        // (the selection itself is delegated to IntervalTree.removeOverlaps).
        var filteredOverlaps = !this.options.allowOverlaps ? new interval_tree_1.IntervalTree(emits).removeOverlaps(emits) : emits;
        return filteredOverlaps;
    };
    /**
     * Jump to the next state, using both goto and failure.
     *
     * @param currentState - Current state.
     * @param char - Accepted character.
     * @returns Jumped state.
     */
    Trie.prototype.getState = function (currentState, char) {
        var state = currentState;
        var newCurrentState = currentState.nextState(char);
        // Fall back along failure links until some state accepts `char`.
        // NOTE(review): termination relies on the root state's nextState()
        // never returning null (the root's failure link is the root itself) -
        // confirm in State.
        while (newCurrentState === null) {
            state = state.failure;
            newCurrentState = state.nextState(char);
        }
        return newCurrentState;
    };
    /**
     * Builds the failure links if they have not been built yet.
     */
    Trie.prototype.checkForConstructedFailureStates = function () {
        if (!this.failureStateConstructed) {
            this.constructFailureStates();
        }
    };
    /**
     * Breadth-first pass over the trie that assigns every state's `failure`
     * link and merges the failure target's emits into the state, then marks
     * the links as constructed.
     */
    Trie.prototype.constructFailureStates = function () {
        var _this = this;
        var queue = [];
        this.rootState.failure = this.rootState;
        this.rootState.getStates().forEach(function (depthOneState) {
            depthOneState.failure = _this.rootState;
            queue.push(depthOneState);
        });
        // Links every child of `parentState` and enqueues it for its own turn.
        var linkChildren = function (parentState) {
            parentState.getTransitions().forEach(function (transition) {
                // This can't be null: `transition` comes from parentState itself.
                var targetState = parentState.nextState(transition);
                queue.push(targetState);
                var traceFailureState = parentState.failure;
                while (traceFailureState.nextState(transition) === null) {
                    traceFailureState = traceFailureState.failure;
                }
                // cannot be null because traceFailure.nextState(transition) !== null here.
                var newFailureState = traceFailureState.nextState(transition);
                targetState.failure = newFailureState;
                targetState.addEmits(newFailureState.emits);
            });
        };
        // Consume the queue with a cursor instead of Array#shift(), which
        // re-indexes the whole array on every dequeue.
        for (var head = 0; head < queue.length; head++) {
            linkChildren(queue[head]);
        }
        this.failureStateConstructed = true;
    };
    /**
     * Keeps only the emits that are not directly preceded or followed by an
     * alphanumeric character (as judged by `isAlphaNumeric`).
     *
     * @param searchText - The text that was searched.
     * @param emits - Emits whose positions index into `stringToArray(searchText)`.
     * @returns The emits that sit on word boundaries.
     */
    Trie.prototype.removePartialMatches = function (searchText, emits) {
        // Emit positions are produced by parseText as indices into
        // stringToArray(text), so neighbours must be looked up in that same
        // array; charAt()/length count UTF-16 code units and can disagree.
        var chars = utils_1.stringToArray(searchText);
        var lastIndex = chars.length - 1;
        return emits.filter(function (emit) {
            return ((emit.start === 0 || !utils_1.isAlphaNumeric(chars[emit.start - 1])) &&
                (emit.end === lastIndex || !utils_1.isAlphaNumeric(chars[emit.end + 1])));
        });
    };
    /**
     * Converts the keywords emitted by `currentState` into Emit objects for a
     * match ending at index `end`.
     *
     * @param end - Index (into the split text) of the last matched element.
     * @param currentState - State reached after consuming that element.
     * @returns One Emit per keyword, with `start` derived from the keyword's
     *   split length.
     */
    Trie.prototype.toEmits = function (end, currentState) {
        var emits = currentState.emits;
        return emits.map(function (emit) {
            return new emit_1.Emit(end - utils_1.stringToArray(emit).length + 1, end, emit);
        });
    };
    return Trie;
}());
// Publish the Trie class on the CommonJS exports object.
exports.Trie = Trie;