UNPKG

parsey

Version:

A parser for context-free grammars

39 lines (34 loc) 1.11 kB
/**
 * Provides a function for tokenizing a sentence given some grammar
 *
 * @module lib/tokenizer
 */

'use strict';

/**
 * Matches every character that carries a special meaning inside a regular
 * expression, so string terminals can be escaped and matched literally.
 */
const REGEX_SPECIAL_CHARS = /[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g;

/**
 * Converts a terminal symbol into regular-expression source text.
 *
 * @param {string|RegExp} terminal - Terminal symbol taken from a rule
 * @return {string} Pattern source: the escaped literal for strings, or the
 *   `source` of a RegExp (its flags are not carried over)
 */
function toPatternSource(terminal) {
  if (typeof terminal === 'string') {
    return terminal.replace(REGEX_SPECIAL_CHARS, '\\$&');
  }

  return terminal.source;
}

/**
 * Tokenizes a sentence given some grammar by finding all terminal symbols
 * (strings and RegExps) within the grammar and splitting the sentence by
 * each of those symbols. The matched terminals themselves are kept as
 * tokens; every token is trimmed and empty tokens are dropped.
 *
 * RegExp terminals may contain their own capture groups: only the text
 * matched by the whole terminal is emitted as a token.
 *
 * Note: when the grammar contains no terminals the delimiter pattern matches
 * the empty string, so the sentence is split into its individual
 * non-whitespace characters.
 *
 * @function
 * @param {string} sent - Sentence or string to be split/tokenized
 * @param {Rule[]|CFG} grammar - Set of Rules that define a language
 * @return {string[]} Tokens/the sentence, split by each terminal character
 *   found within the grammar
 */
function tokenize(sent, grammar) {
  const terminals = grammar.reduce(
    (found, rule) => found.concat(
      rule.filter((sym) => typeof sym === 'string' || sym instanceof RegExp)
    ),
    []
  );

  // The same terminal usually appears in many rules; one alternative each
  // is enough and keeps the combined pattern small.
  const alternation = [...new Set(terminals.map(toPatternSource))].join('|');
  const delims = new RegExp(`(${alternation})`);

  // `split` splices EVERY capture group of the separator into its result,
  // not only the outer one. Groups that belong to RegExp terminals would
  // show up as duplicate tokens, or as `undefined` when they did not take
  // part in the match. Appending an empty alternative makes the pattern
  // match '' unconditionally, so the length of that match is
  // 1 + <number of groups>, which is exactly the period of the split output:
  //   [text, whole delimiter, inner group, ..., text, whole delimiter, ...]
  const stride = new RegExp(`(${alternation})|`).exec('').length;

  return sent
    .split(delims)
    // Slot 0 is the text between delimiters, slot 1 the whole delimiter;
    // everything after that within a period is an inner capture group.
    .filter((piece, index) => index % stride < 2)
    .map((piece) => piece.trim())
    .filter((piece) => piece !== '');
}

// Export through CommonJS. The guard only matters when this code is evaluated
// somewhere without a `module` object (e.g. inlined into an ES module).
if (typeof module !== 'undefined') {
  module.exports = tokenize;
}