UNPKG

@tanishiking/aho-corasick

Version:

TypeScript implementation of the Aho-Corasick algorithm for efficient string matching

290 lines (289 loc) 11 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); var trie_1 = require("./trie"); describe('Trie', function () { describe('parseText', function () { test('keyword and text are the same', function () { var trie = new trie_1.Trie(['abc']); var emits = trie.parseText('abc'); expect(emits).toHaveLength(1); expect(emits[0]).toEqual({ end: 2, start: 0, keyword: 'abc', }); }); test('test is longer than keyword', function () { var trie = new trie_1.Trie(['abc']); var emits = trie.parseText(' abc'); expect(emits).toHaveLength(1); expect(emits[0]).toEqual({ end: 3, start: 1, keyword: 'abc', }); }); test('various keywords one match', function () { var trie = new trie_1.Trie(['abc', 'bcd', 'cde']); var emits = trie.parseText('bcd'); expect(emits).toHaveLength(1); expect(emits[0]).toEqual({ end: 2, start: 0, keyword: 'bcd', }); }); test('ushers test', function () { var trie = new trie_1.Trie(['hers', 'his', 'she', 'he']); var emits = trie.parseText('ushers'); expect(emits).toHaveLength(3); // she @ 3, he @ 3, hers @ 5 expect(emits[0]).toEqual({ start: 1, end: 3, keyword: 'she', }); expect(emits[1]).toEqual({ start: 2, end: 3, keyword: 'he', }); expect(emits[2]).toEqual({ start: 2, end: 5, keyword: 'hers', }); }); // TODO pressure test test('misleading test', function () { var trie = new trie_1.Trie(['hers']); var emits = trie.parseText('h he her hers'); expect(emits).toHaveLength(1); expect(emits[0]).toEqual({ start: 9, end: 12, keyword: 'hers', }); }); test('recipes', function () { var trie = new trie_1.Trie(['veal', 'cauliflower', 'broccoli', 'tomatoes']); var emits = trie.parseText('2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli'); expect(emits).toHaveLength(4); var sorted = emits.sort(function (a, b) { return a.start - b.start; }); expect(sorted[0]).toEqual({ start: 2, end: 12, keyword: 'cauliflower', }); expect(sorted[1]).toEqual({ start: 18, end: 25, keyword: 'tomatoes', }); expect(sorted[2]).toEqual({ start: 40, end: 43, keyword: 'veal', }); expect(sorted[3]).toEqual({ start: 51, end: 58, keyword: 'broccoli', }); }); test('long and short overlapping', function () { var trie = new trie_1.Trie(['he', 'hehehehe']); var emits = trie.parseText('hehehehehe'); var sorted = emits.sort(function (a, b) { return a.start - b.start; }); expect(sorted[0]).toEqual({ start: 0, end: 1, keyword: 'he', }); expect(sorted[1]).toEqual({ start: 0, end: 7, keyword: 'hehehehe', }); expect(sorted[2]).toEqual({ start: 2, end: 3, keyword: 'he', }); expect(sorted[3]).toEqual({ start: 2, end: 9, keyword: 'hehehehe', }); expect(sorted[4]).toEqual({ start: 4, end: 5, keyword: 'he', }); expect(sorted[5]).toEqual({ start: 6, end: 7, keyword: 'he', }); expect(sorted[6]).toEqual({ start: 8, end: 9, keyword: 'he', }); }); test('non-overlapping', function () { var trie = new trie_1.Trie(['ab', 'cba', 'ababc'], { allowOverlaps: false }); var emits = trie.parseText('ababcbab'); expect(emits).toHaveLength(2); expect(emits[0]).toEqual({ start: 0, end: 4, keyword: 'ababc', }); expect(emits[1]).toEqual({ start: 6, end: 7, keyword: 'ab', }); }); test('start of Churchill speech', function () { var keywords = ['T', 'u', 'ur', 'r', 'urn', 'ni', 'i', 'in', 'n', 'urning']; var trie = new trie_1.Trie(keywords, { allowOverlaps: false }); var emits = trie.parseText('Turning'); expect(emits).toHaveLength(2); }); test('bug5InGithubReportedByXCurry', function () { var trie = new trie_1.Trie(['turning', 'once', 'again', 'börkü'], { caseInsensitive: true, onlyWholeWords: true }); var emits = trie.parseText('TurninG OnCe AgAiN BÖRKÜ'); expect(emits).toHaveLength(4); expect(emits[0]).toEqual({ start: 0, end: 6, keyword: 'turning', }); expect(emits[1]).toEqual({ start: 8, end: 11, keyword: 'once', }); expect(emits[2]).toEqual({ start: 13, end: 17, keyword: 'again', }); expect(emits[3]).toEqual({ start: 19, end: 23, keyword: 'börkü', }); }); test('case-insensitive', function () { var trie = new trie_1.Trie(['turning', 'once', 'again', 'börkü'], { caseInsensitive: true }); var emits = trie.parseText('TurninG OnCe AgAiN BÖRKÜ'); expect(emits).toHaveLength(4); expect(emits[0]).toEqual({ start: 0, end: 6, keyword: 'turning', }); expect(emits[1]).toEqual({ start: 8, end: 11, keyword: 'once', }); expect(emits[2]).toEqual({ start: 13, end: 17, keyword: 'again', }); expect(emits[3]).toEqual({ start: 19, end: 23, keyword: 'börkü', }); }); test('partial match', function () { var trie = new trie_1.Trie(['sugar'], { onlyWholeWords: true }); var emits = trie.parseText('sugarcane sugarcane sugar canesugar'); expect(emits).toHaveLength(1); expect(emits[0]).toEqual({ start: 20, end: 24, keyword: 'sugar', }); }); // Test offered by dwyerk, https://github.com/robert-bor/aho-corasick/issues/8 test('unicode string: issue 8', function () { var target = 'LİKE THIS'; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char var trie = new trie_1.Trie(['this'], { caseInsensitive: true }); expect(target.substring(5, 9)).toBe('THIS'); // Java does it the right way var emits = trie.parseText(target); expect(emits).toHaveLength(1); expect(emits[0]).toEqual({ start: 5, end: 8, keyword: 'this', }); }); test('unicode string', function () { var target = '𩸽 LOVE'; // The first characrer ('𩸽') is Unicode var trie = new trie_1.Trie(['𩸽']); expect(target.substring(0, 2)).toBe('𩸽'); var emits = trie.parseText(target); expect(emits).toHaveLength(1); expect(emits[0]).toEqual({ start: 0, end: 1, keyword: '𩸽', }); }); test('contains unicode string', function () { var target = '𩸽 LOVE'; // The first characrer ('𩸽') is Unicode var trie = new trie_1.Trie(['LOVE']); expect(target.substring(3, 7)).toBe('LOVE'); var emits = trie.parseText(target); expect(emits).toHaveLength(1); expect(emits[0]).toEqual({ start: 3, end: 6, keyword: 'LOVE', }); }); }); // describe('tokenize', () => { // public void tokenizeFullSentence() // { // Trie trie = new Trie(); // trie.addKeyword("Alpha"); // trie.addKeyword("Beta"); // trie.addKeyword("Gamma"); // Collection<Token> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve"); // assertEquals(7, tokens.start()); // Iterator<Token> tokensIt = tokens.iterator(); // assertEquals("Hear: ", tokensIt.next().getFragment()); // assertEquals("Alpha", tokensIt.next().getFragment()); // assertEquals(" team first, ", tokensIt.next().getFragment()); // assertEquals("Beta", tokensIt.next().getFragment()); // assertEquals(" from the rear, ", tokensIt.next().getFragment()); // assertEquals("Gamma", tokensIt.next().getFragment()); // assertEquals(" in reserve", tokensIt.next().getFragment()); // } // }) // @Test // public void tokenizeTokensInSequence() // { // Trie trie = new Trie(); // trie.addKeyword("Alpha"); // trie.addKeyword("Beta"); // trie.addKeyword("Gamma"); // Collection<Token> tokens = trie.tokenize("Alpha Beta Gamma"); // assertEquals(5, tokens.start()); // } // // Test offered by XCurry, https://github.com/robert-bor/aho-corasick/issues/7 // @Test // public void zeroLengthTestBug7InGithubReportedByXCurry() // { // Trie trie = new Trie().removeOverlaps().onlyWholeWords().caseInsensitive(); // trie.addKeyword(""); // trie.tokenize("Try a natural lip and subtle bronzer to keep all the focus on those big bright eyes with NARS Eyeshadow Duo in Rated R And the winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic Peel Kit ($25 amazon.com) won most-appealing peel."); // } });