@tanishiking/aho-corasick
Version:
TypeScript implementation of the Aho-Corasick algorithm for efficient string matching
290 lines (289 loc) • 11 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
var trie_1 = require("./trie");
describe('Trie', function () {
describe('parseText', function () {
// A text that is exactly the keyword must produce a single emit covering the
// whole text; `end` is the index of the keyword's last character.
test('keyword and text are the same', function () {
    var expected = { start: 0, end: 2, keyword: 'abc' };
    var found = new trie_1.Trie(['abc']).parseText('abc');
    expect(found).toHaveLength(1);
    expect(found[0]).toEqual(expected);
});
// The text carries one leading space before the keyword, so the only emit is
// expected at start 1 / end 3.
// The title previously read "test is longer than keyword"; the scenario is a
// *text* that is longer than the keyword.
test('text is longer than keyword', function () {
    var trie = new trie_1.Trie(['abc']);
    var emits = trie.parseText(' abc');
    expect(emits).toHaveLength(1);
    expect(emits[0]).toEqual({
        start: 1,
        end: 3,
        keyword: 'abc',
    });
});
// Three keywords are registered, but the text equals only 'bcd', so exactly
// one emit is expected.
test('various keywords one match', function () {
    var keywords = ['abc', 'bcd', 'cde'];
    var matcher = new trie_1.Trie(keywords);
    var found = matcher.parseText('bcd');
    var expected = { start: 0, end: 2, keyword: 'bcd' };
    expect(found).toHaveLength(1);
    expect(found[0]).toEqual(expected);
});
// Overlapping keywords inside 'ushers'. Emits are checked in the order the
// trie returns them: 'she' and 'he' both end at index 3, 'hers' ends at 5.
test('ushers test', function () {
    var matcher = new trie_1.Trie(['hers', 'his', 'she', 'he']);
    var found = matcher.parseText('ushers');
    var expected = [
        { start: 1, end: 3, keyword: 'she' },
        { start: 2, end: 3, keyword: 'he' },
        { start: 2, end: 5, keyword: 'hers' },
    ];
    expect(found).toHaveLength(expected.length);
    expected.forEach(function (want, idx) {
        expect(found[idx]).toEqual(want);
    });
});
// TODO pressure test
// The text contains several proper prefixes of the keyword ('h', 'he', 'her')
// before the full keyword; only the complete 'hers' may be emitted.
test('misleading test', function () {
    var found = new trie_1.Trie(['hers']).parseText('h he her hers');
    var expected = { start: 9, end: 12, keyword: 'hers' };
    expect(found).toHaveLength(1);
    expect(found[0]).toEqual(expected);
});
// Four ingredient keywords, each occurring once in the sentence. The emits
// are sorted by start offset (in place) before being compared, so the test
// does not depend on the order in which the trie reports them.
test('recipes', function () {
    var ingredients = ['veal', 'cauliflower', 'broccoli', 'tomatoes'];
    var sentence = '2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli';
    var found = new trie_1.Trie(ingredients).parseText(sentence);
    expect(found).toHaveLength(4);
    function byStart(left, right) {
        return left.start - right.start;
    }
    var ordered = found.sort(byStart);
    var expected = [
        { start: 2, end: 12, keyword: 'cauliflower' },
        { start: 18, end: 25, keyword: 'tomatoes' },
        { start: 40, end: 43, keyword: 'veal' },
        { start: 51, end: 58, keyword: 'broccoli' },
    ];
    expected.forEach(function (want, idx) {
        expect(ordered[idx]).toEqual(want);
    });
});
// A short keyword ('he') that is also a repeated fragment of a long keyword
// ('hehehehe'). No options are passed and overlapping emits are expected:
// every occurrence of both keywords in 'hehehehehe' must be reported.
test('long and short overlapping', function () {
    var trie = new trie_1.Trie(['he', 'hehehehe']);
    var emits = trie.parseText('hehehehehe');
    // 'he' occurs five times and 'hehehehe' twice. Pin the total so that
    // spurious or duplicated emits fail the test instead of being ignored.
    expect(emits).toHaveLength(7);
    // Order by start and then by end, so that emits sharing a start offset
    // are ordered deterministically instead of relying on the trie's emit
    // order combined with sort stability.
    var sorted = emits.sort(function (a, b) {
        return (a.start - b.start) || (a.end - b.end);
    });
    var expected = [
        { start: 0, end: 1, keyword: 'he' },
        { start: 0, end: 7, keyword: 'hehehehe' },
        { start: 2, end: 3, keyword: 'he' },
        { start: 2, end: 9, keyword: 'hehehehe' },
        { start: 4, end: 5, keyword: 'he' },
        { start: 6, end: 7, keyword: 'he' },
        { start: 8, end: 9, keyword: 'he' },
    ];
    expected.forEach(function (want, idx) {
        expect(sorted[idx]).toEqual(want);
    });
});
// With `allowOverlaps: false` only two emits are expected for 'ababcbab':
// the long 'ababc' at the front and the trailing 'ab'.
test('non-overlapping', function () {
    var options = { allowOverlaps: false };
    var matcher = new trie_1.Trie(['ab', 'cba', 'ababc'], options);
    var found = matcher.parseText('ababcbab');
    var expected = [
        { start: 0, end: 4, keyword: 'ababc' },
        { start: 6, end: 7, keyword: 'ab' },
    ];
    expect(found).toHaveLength(expected.length);
    expected.forEach(function (want, idx) {
        expect(found[idx]).toEqual(want);
    });
});
// Many mutually overlapping keywords over the single word 'Turning'; with
// `allowOverlaps: false` only two emits are expected to remain. Only the
// count is asserted here, not which keywords survive.
test('start of Churchill speech', function () {
    var matcher = new trie_1.Trie(
        ['T', 'u', 'ur', 'r', 'urn', 'ni', 'i', 'in', 'n', 'urning'],
        { allowOverlaps: false }
    );
    var found = matcher.parseText('Turning');
    expect(found).toHaveLength(2);
});
// Case-insensitive matching combined with whole-word matching. Each emit is
// expected to carry the keyword as it was registered (lower case), not the
// mixed-case spelling found in the text.
test('bug5InGithubReportedByXCurry', function () {
    var options = { caseInsensitive: true, onlyWholeWords: true };
    var matcher = new trie_1.Trie(['turning', 'once', 'again', 'börkü'], options);
    var found = matcher.parseText('TurninG OnCe AgAiN BÖRKÜ');
    var expected = [
        { start: 0, end: 6, keyword: 'turning' },
        { start: 8, end: 11, keyword: 'once' },
        { start: 13, end: 17, keyword: 'again' },
        { start: 19, end: 23, keyword: 'börkü' },
    ];
    expect(found).toHaveLength(expected.length);
    expected.forEach(function (want, idx) {
        expect(found[idx]).toEqual(want);
    });
});
// Case-insensitive matching only (no whole-word restriction). Each emit is
// expected to carry the keyword as it was registered (lower case), including
// the non-ASCII keyword 'börkü' matched against 'BÖRKÜ'.
test('case-insensitive', function () {
    var matcher = new trie_1.Trie(['turning', 'once', 'again', 'börkü'], { caseInsensitive: true });
    var found = matcher.parseText('TurninG OnCe AgAiN BÖRKÜ');
    var expected = [
        { start: 0, end: 6, keyword: 'turning' },
        { start: 8, end: 11, keyword: 'once' },
        { start: 13, end: 17, keyword: 'again' },
        { start: 19, end: 23, keyword: 'börkü' },
    ];
    expect(found).toHaveLength(expected.length);
    expected.forEach(function (want, idx) {
        expect(found[idx]).toEqual(want);
    });
});
// With `onlyWholeWords: true`, 'sugar' embedded in 'sugarcane' or 'canesugar'
// must not be emitted; only the stand-alone word at offsets 20-24 counts.
test('partial match', function () {
    var matcher = new trie_1.Trie(['sugar'], { onlyWholeWords: true });
    var found = matcher.parseText('sugarcane sugarcane sugar canesugar');
    var expected = { start: 20, end: 24, keyword: 'sugar' };
    expect(found).toHaveLength(1);
    expect(found[0]).toEqual(expected);
});
// Test offered by dwyerk, https://github.com/robert-bor/aho-corasick/issues/8
// Regression test for upstream issue 8: a non-ASCII letter ('İ') ahead of the
// match must not shift the reported offsets under case-insensitive matching.
// NOTE(review): in JavaScript, lower-casing 'İ' yields two UTF-16 code units,
// which is presumably the hazard for the matcher here — confirm against the
// trie implementation.
test('unicode string: issue 8', function () {
    var text = 'LİKE THIS';
    var matcher = new trie_1.Trie(['this'], { caseInsensitive: true });
    // Sanity check on the fixture itself: 'THIS' occupies indices 5..8 of the
    // original, un-lowercased text.
    expect(text.substring(5, 9)).toBe('THIS');
    var found = matcher.parseText(text);
    var expected = { start: 5, end: 8, keyword: 'this' };
    expect(found).toHaveLength(1);
    expect(found[0]).toEqual(expected);
});
// The keyword is a single character that occupies two UTF-16 code units in a
// JavaScript string (the substring check below pins that), so the expected
// emit spans indices 0..1.
test('unicode string', function () {
    var text = '𩸽 LOVE';
    var matcher = new trie_1.Trie(['𩸽']);
    // Sanity check on the fixture: the first character takes two code units.
    expect(text.substring(0, 2)).toBe('𩸽');
    var found = matcher.parseText(text);
    var expected = { start: 0, end: 1, keyword: '𩸽' };
    expect(found).toHaveLength(1);
    expect(found[0]).toEqual(expected);
});
// An ASCII keyword that follows a character occupying two UTF-16 code units:
// the emit offsets are expected in code units, matching `String#substring`
// indices (3..6 for 'LOVE').
test('contains unicode string', function () {
    var text = '𩸽 LOVE';
    var matcher = new trie_1.Trie(['LOVE']);
    // Sanity check on the fixture: 'LOVE' starts at code-unit index 3.
    expect(text.substring(3, 7)).toBe('LOVE');
    var found = matcher.parseText(text);
    var expected = { start: 3, end: 6, keyword: 'LOVE' };
    expect(found).toHaveLength(1);
    expect(found[0]).toEqual(expected);
});
});
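// NOTE: everything from here to the end of this suite is commented-out Java
// (JUnit-style tokenize tests, apparently carried over from the upstream
// robert-bor/aho-corasick project); none of it runs as part of this suite.
// describe('tokenize', () => {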
// public void tokenizeFullSentence()
// {
// Trie trie = new Trie();
// trie.addKeyword("Alpha");
// trie.addKeyword("Beta");
// trie.addKeyword("Gamma");
// Collection<Token> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");
// assertEquals(7, tokens.start());
// Iterator<Token> tokensIt = tokens.iterator();
// assertEquals("Hear: ", tokensIt.next().getFragment());
// assertEquals("Alpha", tokensIt.next().getFragment());
// assertEquals(" team first, ", tokensIt.next().getFragment());
// assertEquals("Beta", tokensIt.next().getFragment());
// assertEquals(" from the rear, ", tokensIt.next().getFragment());
// assertEquals("Gamma", tokensIt.next().getFragment());
// assertEquals(" in reserve", tokensIt.next().getFragment());
// }
// })
// @Test
// public void tokenizeTokensInSequence()
// {
// Trie trie = new Trie();
// trie.addKeyword("Alpha");
// trie.addKeyword("Beta");
// trie.addKeyword("Gamma");
// Collection<Token> tokens = trie.tokenize("Alpha Beta Gamma");
// assertEquals(5, tokens.start());
// }
// // Test offered by XCurry, https://github.com/robert-bor/aho-corasick/issues/7
// @Test
// public void zeroLengthTestBug7InGithubReportedByXCurry()
// {
// Trie trie = new Trie().removeOverlaps().onlyWholeWords().caseInsensitive();
// trie.addKeyword("");
// trie.tokenize("Try a natural lip and subtle bronzer to keep all the focus on those big bright eyes with NARS Eyeshadow Duo in Rated R And the winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic Peel Kit ($25 amazon.com) won most-appealing peel.");
// }
});