UNPKG

@tanishiking/aho-corasick

Version:

TypeScript implementation of the Aho-Corasick algorithm for efficient string matching

145 lines (144 loc) 5.96 kB
"use strict";
var __assign = (this && this.__assign) || function () {
    __assign = Object.assign || function (t) {
        for (var s, i = 1, n = arguments.length; i < n; i++) {
            s = arguments[i];
            for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p))
                t[p] = s[p];
        }
        return t;
    };
    return __assign.apply(this, arguments);
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.Emit = exports.defaultConfig = exports.Trie = void 0;
var trie_config_1 = require("./trie-config");
Object.defineProperty(exports, "defaultConfig", { enumerable: true, get: function () { return trie_config_1.defaultConfig; } });
var state_1 = require("./state");
var emit_1 = require("./emit");
Object.defineProperty(exports, "Emit", { enumerable: true, get: function () { return emit_1.Emit; } });
var utils_1 = require("./utils");
var interval_tree_1 = require("./interval/interval-tree");
/**
 * Aho-Corasick implementation based on http://cr.yp.to/bib/1975/aho.pdf
 * Port of
 * robert-bor/aho-corasick https://github.com/robert-bor/aho-corasick and
 * hankcs/aho-corasick https://github.com/hankcs/aho-corasick
 */
var Trie = /** @class */ (function () {
    /**
     * Create a trie, optionally pre-populated with keywords.
     *
     * @param keywords - Optional array of keywords; each one is passed to addKeyword.
     * @param options - Optional settings. When given, they are merged over
     *   defaultConfig; otherwise the shared defaultConfig object itself is used.
     */
    function Trie(keywords, options) {
        var _this = this;
        // Failure links are built lazily, on the first parseText call.
        this.failureStateConstructed = false;
        this.rootState = new state_1.State(0);
        if (typeof keywords !== 'undefined' && keywords.length > 0) {
            keywords.forEach(function (keyword) {
                _this.addKeyword(keyword);
            });
        }
        if (options) {
            this.options = __assign(__assign({}, trie_config_1.defaultConfig), options);
        }
        else {
            this.options = trie_config_1.defaultConfig;
        }
    }
    /**
     * Insert a keyword into the trie. Empty keywords are ignored.
     *
     * The keyword is split with stringToArray and one state is added per
     * element; the final state records the keyword as an emit.
     *
     * NOTE(review): failureStateConstructed is never reset here, so keywords
     * added after the first parseText call are not wired into the failure
     * links. Resetting the flag would re-run constructFailureStates over the
     * whole trie -- confirm that State.addEmits de-duplicates before doing so.
     *
     * @param keyword - The keyword to register.
     */
    Trie.prototype.addKeyword = function (keyword) {
        if (keyword.length === 0)
            return;
        var currentState = this.rootState;
        utils_1.stringToArray(keyword).forEach(function (char) {
            currentState = currentState.addState(char);
        });
        currentState.addEmits([keyword]);
    };
    /**
     * Find keywords from given text.
     *
     * Emit positions are indices into stringToArray(text), with an inclusive
     * end index.
     *
     * @param text - The text to search for keywords.
     * @returns The emits found, after applying the onlyWholeWords and
     *   allowOverlaps options.
     */
    Trie.prototype.parseText = function (text) {
        var _this = this;
        this.checkForConstructedFailureStates();
        var pos = 0;
        var currentState = this.rootState;
        var collectedEmits = [];
        utils_1.stringToArray(text).forEach(function (originalChar) {
            // Only the scanned text is lower-cased here; keywords are stored as given.
            var char = _this.options.caseInsensitive
                ? originalChar.toLowerCase()
                : originalChar;
            currentState = _this.getState(currentState, char);
            var emits = _this.toEmits(pos, currentState);
            collectedEmits.push.apply(collectedEmits, emits);
            pos++;
        });
        // Filter out partial words.
        var emits = this.options.onlyWholeWords
            ? this.removePartialMatches(text, collectedEmits)
            : collectedEmits;
        // Filter out overlaps, bigger size has larger priority.
        var filteredOverlaps = !this.options.allowOverlaps
            ? new interval_tree_1.IntervalTree(emits).removeOverlaps(emits)
            : emits;
        return filteredOverlaps;
    };
    /**
     * Jump to the next state, using both goto and failure.
     *
     * Follows failure links from currentState until some state has a
     * transition for char. Presumably the root state's nextState never
     * returns null, which is what terminates the loop -- verify against State.
     *
     * @param currentState - Current state.
     * @param char - Accepted character.
     * @returns Jumped state.
     */
    Trie.prototype.getState = function (currentState, char) {
        var state = currentState;
        var newCurrentState = currentState.nextState(char);
        while (newCurrentState === null) {
            state = state.failure;
            newCurrentState = state.nextState(char);
        }
        return newCurrentState;
    };
    /**
     * Build the failure links if they have not been built yet.
     */
    Trie.prototype.checkForConstructedFailureStates = function () {
        if (!this.failureStateConstructed) {
            this.constructFailureStates();
        }
    };
    /**
     * Compute the failure link of every state with a breadth-first walk and
     * copy each failure target's emits onto the state that points at it.
     */
    Trie.prototype.constructFailureStates = function () {
        var _this = this;
        var queue = [];
        this.rootState.failure = this.rootState;
        // Depth-one states always fail back to the root.
        this.rootState.getStates().forEach(function (depthOneState) {
            depthOneState.failure = _this.rootState;
            queue.push(depthOneState);
        });
        var linkChildren = function (currentState) {
            currentState.getTransitions().forEach(function (transition) {
                // This can't be null: transition comes from currentState itself.
                var targetState = currentState.nextState(transition);
                queue.push(targetState);
                var traceFailureState = currentState.failure;
                while (traceFailureState.nextState(transition) === null) {
                    traceFailureState = traceFailureState.failure;
                }
                // Cannot be null because the loop above only exits once
                // traceFailureState.nextState(transition) !== null.
                var newFailureState = traceFailureState.nextState(transition);
                targetState.failure = newFailureState;
                targetState.addEmits(newFailureState.emits);
            });
        };
        // A head index replaces Array#shift so each dequeue is O(1); the
        // queue keeps growing while children are appended by linkChildren.
        for (var head = 0; head < queue.length; head++) {
            linkChildren(queue[head]);
        }
        this.failureStateConstructed = true;
    };
    /**
     * Keep only the emits that are not directly preceded or followed by an
     * alphanumeric character.
     *
     * Emit positions index stringToArray(searchText), so the neighbouring
     * characters are looked up in that same array rather than with
     * String#charAt, which indexes UTF-16 code units and can disagree with
     * emit positions when the text contains characters outside the BMP.
     *
     * @param searchText - The text that was searched.
     * @param emits - Emits collected from searchText.
     * @returns The emits that sit on word boundaries at both ends.
     */
    Trie.prototype.removePartialMatches = function (searchText, emits) {
        var chars = utils_1.stringToArray(searchText);
        var size = chars.length;
        // Mirrors String#charAt: out-of-range indices yield an empty string.
        var charAt = function (index) {
            return index >= 0 && index < size ? chars[index] : '';
        };
        return emits.filter(function (emit) {
            var startsOnBoundary = emit.start === 0 ||
                !utils_1.isAlphaNumeric(charAt(emit.start - 1));
            if (!startsOnBoundary) {
                return false;
            }
            return (emit.end + 1 === size ||
                !utils_1.isAlphaNumeric(charAt(emit.end + 1)));
        });
    };
    /**
     * Convert the keywords emitted by a state into Emit objects.
     *
     * @param end - Inclusive end position of the match.
     * @param currentState - State whose emits are converted.
     * @returns One Emit per keyword, with start derived from the keyword's
     *   stringToArray length.
     */
    Trie.prototype.toEmits = function (end, currentState) {
        var emits = currentState.emits;
        return emits.map(function (emit) {
            return new emit_1.Emit(end - utils_1.stringToArray(emit).length + 1, end, emit);
        });
    };
    return Trie;
}());
exports.Trie = Trie;