UNPKG

parsey

Version:

A parser for context-free grammars

github.com/patgrasso/parsey

patgrasso/parsey

372 lines (328 loc) • 11.3 kB

JavaScript

/** * Provides functions for parsing sentences with a pre-constructed grammar. * The parsing algorithm is an implementation of the earley top-down chart * parser. The parse tree is constructed from the chart using depth-first search * * @module lib/parser */ /*eslint no-console:0*/ 'use strict'; const rules = require('./rules'); const tokenize = require('./tokenizer'); /** * Tokenizes, then parses, the input string with the given grammar. The result * is a parse tree represented with plain objects. See * {@link module:lib/parser.dfs|dfs()} for an example of the structure of a * parse tree * * @function parse * @memberof module:lib/parser * @param {string} sent - Input string to parse * @param {Rule[]|CFG} grammar - Set of Rules that define a language * @param {Function} [tokenizer=#tokenize} - Function that accepts a string and * a grammar (optionally) and splits the input string into tokens, each * representing a symbol in the language * @return {object} Root node of the parse tree */ function parse(sent, grammar, tokenizer) { let tokens = (tokenizer || tokenize)(sent, grammar || rules.rules); let states = earley(tokens, grammar || rules.rules); return dfs(states, tokens); } /** * Parses the input tokens using the earley top-down chart parsing algorithm * to product a set of states, each containing a list of earley items * * @function earley * @memberof module:lib/parser * @param {string[]} tokens - Sequence of symbols to be parsed * @param {Rule[]|CFG} grammar - Set of rules that define a language * @return {state[]} Set of 'states', each of which contains a list of earley * items. Each earley item looks something like this: * <pre><code> * { * name: [string], * rule: [Rule], * position: [number], * origin: [number] * } * </code></pre> * An earley item represents a completed parse of some individual rule. The * position should be equivalent to rule.length, and the origin, despite its * name, describes the state at which parse finished. * * This means that an earley item <i>should</i> exist in state 0 with an * origin equivalent to the number of tokens passed in to indicate that the * entire input was parsed successfully for some rule */ function earley(tokens, grammar) { let states = Array.apply(null, Array(tokens.length + 1)).map(() => []); var i, j; let rulePairs = grammar.map((rule) => ({ name : rule.lhs.name, rule : rule, position: 0, origin : 0 })); [].push.apply(states[0], rulePairs); for (i = 0; i <= tokens.length; i += 1) { for (j = 0; j < states[i].length; j += 1) { predict(tokens, states, i, j, grammar); scan(tokens, states, i, j); complete(tokens, states, i, j); } } return swap(removeUnfinishedItems(states)); } /** * Prediction stage in the earley algorithm * {@link http://loup-vaillant.fr/tutorials/earley-parsing/recogniser} * * This also avoids adding duplicate rules to a state, a pitfall caused by * left-recursive grammars * * @function predict * @param {string[]} tokens - Input tokens being parsed * @param {state[]} states - Set of lists of earley items * @param {number} i - Index of the earley state to be processed * @param {number} j - Index of the earley item to be processed within the state * @param {Rule[]|CFG} grammar */ function predict(tokens, states, i, j, grammar) { let curr = states[i][j]; // prediction if (curr.rule[curr.position] instanceof rules.Sym) { grammar.forEach((rule) => { let stateHasItem = states[i].filter((earleyItem) => { return earleyItem.rule === rule && curr.position === 0;//earleyItem.position; }).length > 0; if (!stateHasItem) { states[i].push({ name : rule.lhs.name, rule : rule, position: 0, origin : i }); } }); } } /** * Scanning stage in the earley algorithm * {@link http://loup-vaillant.fr/tutorials/earley-parsing/recogniser} * * @function scan * @param {string[]} tokens - Input tokens being parsed * @param {state[]} states - Set of lists of earley items * @param {number} i - Index of the earley state to be processed * @param {number} j - Index of the earley item to be processed within the state */ function scan(tokens, states, i, j) { let newItem , curr = states[i][j]; // scan if (curr.rule[curr.position] instanceof RegExp) { // regex matches token if (curr.rule[curr.position].test(tokens[i]) && i < states.length) { newItem = Object.assign({}, curr); newItem.position += 1; states[i + 1].push(newItem); } } if (typeof curr.rule[curr.position] === 'string') { // string equals token if (curr.rule[curr.position] === tokens[i] && i < states.length) { newItem = Object.assign({}, curr); newItem.position += 1; states[i + 1].push(newItem); } } } /** * Completion stage in the earley algorithm * {@link http://loup-vaillant.fr/tutorials/earley-parsing/recogniser} * * @function complete * @param {string[]} tokens - Input tokens being parsed * @param {state[]} states - Set of lists of earley items * @param {number} i - Index of the earley state to be processed * @param {number} j - Index of the earley item to be processed within the state */ function complete(tokens, states, i, j) { let newItem , curr = states[i][j]; // completion (check first because the position may be out of bounds) if (curr.position >= curr.rule.length) { states[curr.origin].forEach((earleyItem) => { if (earleyItem.rule[earleyItem.position] === curr.rule.lhs) { let stateHasItem = states[i].filter((ei) => { return ei.rule === earleyItem.rule && ei.position === earleyItem.position + 1 && ei.origin === earleyItem.origin; }).length > 0; if (stateHasItem) { return; } newItem = Object.assign({}, earleyItem); newItem.position += 1; states[i].push(newItem); } }); } } /** * Removes earley items from each state that failed to completely parse through. * In other words, removes earley items whose position is less than the length * of its rule * * @function removeUnfinishedItems * @param {state[]} states - Set of lists of earley items * @return {state[]} Set of lists of completed earley items */ function removeUnfinishedItems(states) { return states.map((state) => state.filter((earleyItem) => { return earleyItem.position >= earleyItem.rule.length; })); } /** * Places earley items in the states in which they originated, as opposed to the * states in which they finished parsing, and set their `origin` properties to * the state in which they finished. * * This allows a depth-first search of the chart to move forwards through the * graph, which is more intuitive than having to move backwards * * @function swap * @param {state[]} states - Set of lists of earley items * @return {state[]} Set of lists of earley items, but each item now exists in * the state at which it originated, and the <code>origin</code> property of * each item points to the state at which the parse completed */ function swap(states) { let newStates = Array.apply(null, Array(states.length)).map(() => []); states.forEach((state, i) => { state.forEach((earleyItem) => { newStates[earleyItem.origin].push(earleyItem); earleyItem.origin = i; }); }); return newStates; } /** * Performs a depth-first search on the chart generated by {@link #earley()} in * order to construct a parse tree, an example of which is shown below * * @example * { * item: <Rule sum -> [factor, '+', factor]>, * children: [ * { // first symbol - 'factor' * item: <Rule factor -> [/\d+/]>, * children: [ * '2' * ] * }, * '+', // second symbol * { // third symbol - another 'factor' * item: <Rule factor -> [/\d+/]>, * children: [ * '3' * ] * } * ] * } * * @function dfs * @memberof module:lib/parser * @param {state[]} states - Set of lists of earley items * @param {string[]} tokens - Input tokens to be parsed * @return {object} Root node of the parse tree */ function dfs(states, tokens) { let root = states[0].reduce((best, curr) => { if (best == null || curr.origin > best.origin) { return curr; } return best; }, null); if (root == null) { throw new SyntaxError(`Parsing error near '${tokens[0]}' `); } if (root.origin !== tokens.length) { throw new SyntaxError(`Parsing error near '${tokens[root.origin]}' `); } return { item : root.rule, children: dfsHelper(states, root, 0, 0, tokens) }; } /** * Recursive function that explores a specific earley item, constructs the parse * tree for it, then sends it up the chimney! * * @function dfsHelper * @param {state[]} states - Set of lists of earley items * @param {earleyItem} root - Current earley item being explored, a tree for * which is to be constructed * @param {number} state - Current state/index of our current position in the * list of tokens * @param {number} depth - Index/position in the root's rule (RHS). In other * words, index of the next symbol to match or explore * @param {string[]} tokens - List of input tokens * @return {null|object[]} Null if the search provided NO results for the * given node, or a list of tree nodes, which are the respective parse trees * of each of the root rule's RHS symbols */ function dfsHelper(states, root, state, depth, tokens) { var edges; // Base case: we finished the root rule if (state === root.origin && depth === root.rule.length) { return []; } // If the current production symbol is a terminal if (root.rule[depth] instanceof RegExp) { if (root.rule[depth].test(tokens[state])) { let subMatch = dfsHelper(states, root, state + 1, depth + 1, tokens); if (subMatch) { return [tokens[state]].concat(subMatch); } } return null; } else if (typeof root.rule[depth] === 'string') { if (root.rule[depth] === tokens[state]) { let subMatch = dfsHelper(states, root, state + 1, depth + 1, tokens); if (subMatch) { return [tokens[state]].concat(subMatch); } } return null; } // Otherwise, it must be a non-terminal edges = states[state] .filter((item) => item.rule.lhs === root.rule[depth]) .map((item) => { let subMatch = dfsHelper(states, root, item.origin, depth + 1, tokens); if (subMatch) { return [{ item : item.rule, children: dfsHelper(states, item, state, 0, tokens) }].concat(subMatch); } return null; }) .filter((list) => list); if (edges.length > 1) { let diffs = edges.filter( (tree) => JSON.stringify(tree) !== JSON.stringify(edges[0]) ); if (diffs.length > 0) { //console.log('Ambiguity\n' + JSON.stringify(edges, null, 2)); console.log('Ambiguous rules'); } } return edges[0]; } module.exports.parse = parse; module.exports.earley = earley; module.exports.dfs = dfs;