UNPKG

parsing

Version:

JSON grammar-based parser

github.com/asmblah/parsing

asmblah/parsing

812 lines (700 loc) • 32.5 kB

JavaScript

/*
 * Parsing - JSON grammar-based parser
 * Copyright (c) Dan Phillimore (asmblah)
 * http://asmblah.github.com/parsing/
 *
 * Released under the MIT license
 * https://github.com/asmblah/parsing/raw/master/MIT-LICENSE.txt
 */

'use strict';

var _ = require('microdash'),
    countNewlines = require('./countNewlines'),
    findLastNewlineFrom = require('./findLastNewlineFrom'),
    hasOwn = {}.hasOwnProperty,
    AbortException = require('./Exception/Abort'),
    Component = require('./Component'),
    Exception = require('./Exception/Exception'),
    ParseException = require('./Exception/Parse'),
    Rule = require('./Rule');

/**
 * Builds a parser from a JSON grammar spec: creates a Rule object for every rule
 * in the grammar (and for every overriding rule given in `options.rules`), then
 * compiles each rule's component spec into a tree of Component objects.
 *
 * @param {Object} grammarSpec Grammar definition. Reads `rules`, `start`, `ignore`, `bounds`,
 *                             `ErrorHandler` and `State` from it
 * @param {*} stderr Passed through untouched to the grammar's ErrorHandler constructor
 * @param {Object=} options Optional. Reads `context`, `rules` (rule overrides)
 *                          and `captureAllBounds` from it
 * @constructor
 */
function Parser(grammarSpec, stderr, options) {
    var context;

    options = options || {};
    context = options.context || {};

    this.errorHandler = null;
    this.furthestIgnoreMatch = null;
    this.furthestIgnoreMatchOffset = -1;
    this.furthestMatch = null;
    this.furthestMatchOffset = -1;
    this.grammarSpec = grammarSpec;
    this.matchCaches = [];
    this.options = options;
    this.rules = null;
    this.state = null;
    this.stderr = stderr;

    (function (parser) {
        // Ensure the regex is anchored to the start of the string so it matches the very next characters
        // NOTE(review): a source such as "^a|b" starts with "^" and so is left as-is,
        //               although its second alternative is not anchored - confirm whether
        //               grammars are expected to avoid that form
        function anchorRegex(regex) {
            if (regex.source.charAt(0) !== '^') {
                // The flags are whatever follows the final "/" of the regex's string form
                regex = new RegExp('^(?:' + regex.source + ')', regex.toString().match(/[^\/]*$/)[0]);
            }

            return regex;
        }

        // Speed up repeated match tests in complex grammars by caching component matches
        function createMatchCache() {
            // Use an array for the match caches, as it can be cleared easily by just zeroing .length
            // and the indexes into the cache are numeric anyway (input string offsets)
            var matchCache = [];

            parser.matchCaches.push(matchCache);

            return matchCache;
        }

        /**
         * Shared implementation of the "+" and "*" qualifiers: matches the component
         * as many times as possible consecutively and aggregates the results.
         *
         * A match that consumes nothing (no skipped whitespace and no text) would be
         * returned again at the same offset forever, so repetition stops on one.
         * Such a match is only recorded when it is the first, so that "one or more"
         * of a component that matches emptily still succeeds.
         *
         * @param {string} text
         * @param {number} offset
         * @param {number} line
         * @param {number} lineOffset
         * @param {Object} component Component to match repeatedly
         * @param {Object} options
         * @returns {Object} Aggregate match - `components` is empty when nothing matched
         */
        function matchRepeatedly(text, offset, line, lineOffset, component, options) {
            var advance,
                componentMatch,
                firstLine = null,
                firstLineOffset = null,
                lines = 0,
                lastLineOffset = lineOffset,
                matches = [],
                textLength = 0,
                textOffset = null;

            while (
                (
                    componentMatch = component.match(
                        text,
                        offset + (textOffset || 0) + textLength,
                        line + lines,
                        lastLineOffset,
                        options
                    )
                ) !== null
            ) {
                advance = componentMatch.textOffset + componentMatch.textLength;

                if (advance === 0 && matches.length > 0) {
                    // No progress was made: matching again would loop forever
                    break;
                }

                lines += componentMatch.lines;
                lastLineOffset = componentMatch.lastLineOffset;
                textLength += componentMatch.textLength;
                matches.push(componentMatch.components);

                if (firstLine === null) {
                    firstLine = componentMatch.firstLine;
                    firstLineOffset = componentMatch.firstLineOffset;
                }

                if (textOffset === null) {
                    // Leading offset of the first repetition becomes the offset of the whole match
                    textOffset = componentMatch.textOffset;
                } else {
                    // Offsets of later repetitions sit inside the match, so count towards its length
                    textLength += componentMatch.textOffset;
                }

                if (advance === 0) {
                    // First match was zero-width: record it, but do not try again
                    break;
                }
            }

            if (firstLine === null) {
                firstLine = 0;
                firstLineOffset = 0;
            }

            return {
                components: matches,
                firstLine: firstLine,
                firstLineOffset: firstLineOffset,
                lines: lines,
                lastLine: line + lines,
                lastLineOffset: lastLineOffset,
                textLength: textLength,
                textOffset: textOffset || 0
            };
        }

        // Each qualifier is called as (text, offset, line, lineOffset, arg, args, options)
        // and returns a match object or null when there is no match
        var qualifiers = {
                // Like "(...)" grouping - 'arg' is an array of components that must all match
                'allOf': function (text, offset, line, lineOffset, arg, args, options) {
                    var firstLine = null,
                        firstLineOffset = null,
                        lines = 0,
                        lastLineOffset = lineOffset,
                        matches = [],
                        textLength = 0,
                        textOffset = null;

                    _.each(arg, function (component) {
                        var componentMatch = component.match(
                            text,
                            offset + (textOffset || 0) + textLength,
                            line + lines,
                            lastLineOffset,
                            options
                        );

                        if (componentMatch === null) {
                            // One failed component fails the whole group: stop iterating
                            matches = null;
                            return false;
                        }

                        matches.push(componentMatch.components);

                        if (componentMatch.isEmpty) {
                            // Empty matches are possible when an "optionally" qualifier is used,
                            // which must be treated specially as they should not fail the parent match
                            return;
                        }

                        if (firstLine === null) {
                            firstLine = componentMatch.firstLine;
                            firstLineOffset = componentMatch.firstLineOffset;
                        }

                        lines += componentMatch.lines;
                        lastLineOffset = componentMatch.lastLineOffset;
                        textLength += componentMatch.textLength;

                        if (textOffset === null) {
                            textOffset = componentMatch.textOffset;
                        } else {
                            textLength += componentMatch.textOffset;
                        }
                    });

                    if (firstLine === null) {
                        firstLine = 0;
                        firstLineOffset = 0;
                    }

                    return matches ? {
                        components: matches,
                        firstLine: firstLine,
                        firstLineOffset: firstLineOffset,
                        lines: lines,
                        lastLine: line + lines,
                        lastLineOffset: lastLineOffset,
                        textLength: textLength,
                        textOffset: textOffset || 0
                    } : null;
                },

                // Like "|" (alternation) - 'arg' is an array of components, one of which must match
                'oneOf': function (text, offset, line, lineOffset, arg, args, options) {
                    var match = null;

                    _.each(arg, function (component) {
                        var componentMatch = component.match(text, offset, line, lineOffset, options);

                        if (componentMatch !== null) {
                            // First alternative to match wins
                            match = componentMatch;
                            return false;
                        }
                    });

                    return match;
                },

                // Like "+" - 'arg' is a component, which must match one or more times consecutively
                'oneOrMoreOf': function (text, offset, line, lineOffset, arg, args, options) {
                    var match = matchRepeatedly(text, offset, line, lineOffset, arg, options);

                    return match.components.length > 0 ? match : null;
                },

                // Like "?" - 'arg' is a component which may or may not match
                'optionally': function (text, offset, line, lineOffset, arg, args, options) {
                    var match = arg.match(text, offset, line, lineOffset, options);

                    if (match) {
                        if (args.wrapInArray) {
                            return {
                                components: [match.components],
                                firstLine: match.firstLine,
                                firstLineOffset: match.firstLineOffset,
                                lines: match.lines,
                                lastLine: match.lastLine,
                                lastLineOffset: match.lastLineOffset,
                                textLength: match.textLength,
                                textOffset: match.textOffset
                            };
                        }

                        return match;
                    }

                    // No match: succeed anyway with a zero-length match flagged as empty
                    return {
                        isEmpty: true,
                        components: args.wrapInArray ? [] : '',
                        firstLine: line,
                        firstLineOffset: lineOffset,
                        lines: 0,
                        lastLine: line,
                        lastLineOffset: lineOffset,
                        textLength: 0,
                        textOffset: 0
                    };
                },

                // Refers to another rule - when 'args.text' is given, the text matched
                // by the rule must also be exactly that string
                'rule': function (text, offset, line, lineOffset, arg, args, options) {
                    var expectedText = hasOwn.call(args, 'text') ? args.text : null,
                        match = arg.match(text, offset, line, lineOffset, options),
                        matchedStart;

                    if (match === null) {
                        return null;
                    }

                    if (expectedText === null) {
                        return match;
                    }

                    matchedStart = offset + match.textOffset;

                    return text.slice(matchedStart, matchedStart + match.textLength) === expectedText ?
                        match :
                        null;
                },

                // Matches a regex, constant string, another rule or calls a callback for a dynamic match
                'what': function (text, offset, line, lineOffset, arg, args, options) {
                    var captureIndex,
                        lines,
                        match,
                        result,
                        whitespaceLength = 0,
                        whitespaceLines = 0,
                        whitespaceLastLineOffset = lineOffset;

                    // Consumes any leading text matched by the grammar's ignore rule,
                    // unless disabled via the match options or this component's args
                    function skipWhitespace() {
                        var ignoreMatch;

                        if (parser.ignoreRule && options.ignoreWhitespace !== false && args.ignoreWhitespace !== false) {
                            while (
                                (
                                    ignoreMatch = parser.ignoreRule.match(
                                        text,
                                        offset + whitespaceLength,
                                        line + whitespaceLines,
                                        whitespaceLastLineOffset,
                                        // Prevent infinite recursion of whitespace skipper
                                        {ignoreWhitespace: false}
                                    )
                                )
                            ) {
                                if (ignoreMatch.textLength === 0) {
                                    // A zero-length ignore match would be returned again forever
                                    break;
                                }

                                whitespaceLines += ignoreMatch.lines;
                                whitespaceLastLineOffset = ignoreMatch.lastLineOffset;
                                whitespaceLength += ignoreMatch.textLength;
                            }
                        }
                    }

                    // Applies each {pattern, replacement} pair of 'args.replace' in order
                    function replace(string) {
                        if (args.replace) {
                            _.each(args.replace, function (data) {
                                string = string.replace(data.pattern, data.replacement);
                            });
                        }

                        return string;
                    }

                    if (_.isString(arg)) {
                        skipWhitespace();

                        if (text.slice(offset + whitespaceLength, offset + whitespaceLength + arg.length) === arg) {
                            lines = countNewlines(arg);

                            return {
                                components: arg,
                                // First line should be the first line _after_ any skipped leading whitespace
                                firstLine: line + whitespaceLines,
                                firstLineOffset: whitespaceLastLineOffset,
                                // Lines should be the total no. of lines _including_ whitespace
                                lines: whitespaceLines + lines,
                                lastLine: line + whitespaceLines + lines,
                                lastLineOffset: findLastNewlineFrom(text, offset + whitespaceLength + arg.length - 1),
                                textLength: arg.length,
                                textOffset: whitespaceLength
                            };
                        }
                    } else if (arg instanceof RegExp) {
                        skipWhitespace();

                        // TODO: Optimise so we don't need to take this substring -
                        //       perhaps use regex sticky flag and set .lastIndex where supported?
                        match = text.slice(offset + whitespaceLength).match(arg);

                        if (match) {
                            captureIndex = args.captureIndex || 0;
                            lines = countNewlines(match[0]);

                            return {
                                components: replace(match[captureIndex]),
                                // First line should be the first line _after_ any skipped leading whitespace
                                firstLine: line + whitespaceLines,
                                firstLineOffset: whitespaceLastLineOffset,
                                // Lines should be the total no. of lines _including_ whitespace
                                lines: whitespaceLines + lines,
                                lastLine: line + whitespaceLines + lines,
                                // NB: All regexes are anchored, so we can rely on the last newline position
                                //     in the matched substring like this
                                lastLineOffset: findLastNewlineFrom(text, offset + whitespaceLength + match[0].length - 1),
                                // Always return the entire match length even though we may have only captured part of it
                                textLength: match[0].length,
                                textOffset: whitespaceLength
                            };
                        }
                    } else if (arg instanceof Component) {
                        // Nested component does its own whitespace skipping
                        result = arg.match(text, offset, line, lineOffset, options);

                        if (_.isString(result)) {
                            result = replace(result);
                        } else if (result && _.isString(result.components)) {
                            result.components = replace(result.components);
                        }

                        return result;
                    } else if (_.isFunction(arg)) {
                        // Used by eg. the special <BOF> and <EOF> rules
                        skipWhitespace();

                        return arg(
                            text,
                            offset,
                            whitespaceLength,
                            whitespaceLines,
                            line + whitespaceLines,
                            whitespaceLastLineOffset,
                            options
                        );
                    } else {
                        throw new Exception('Parser "what" qualifier :: Invalid argument "' + arg + '"');
                    }

                    // String or regex terminal did not match
                    return null;
                },

                // Like "*" - 'arg' is a component, which may match any number of times consecutively
                'zeroOrMoreOf': function (text, offset, line, lineOffset, arg, args, options) {
                    return matchRepeatedly(text, offset, line, lineOffset, arg, options);
                }
            },
            originalRules = {},
            rules = {};

        // Special BeginningOfFile rule
        rules['<BOF>'] = new Rule(parser, context, createMatchCache(), '<BOF>', null, null);
        rules['<BOF>'].setComponent(new Component(parser, context, 'what', qualifiers.what, function (
            text,
            offset,
            textOffset,
            textOffsetLines,
            firstLine,
            firstLineOffset
        ) {
            return offset === 0 ? {
                components: '',
                // First line should be the first line _after_ any skipped leading whitespace
                firstLine: firstLine,
                firstLineOffset: firstLineOffset,
                // Lines should be the total no. of lines _including_ whitespace
                lines: textOffsetLines,
                lastLine: firstLine,
                lastLineOffset: findLastNewlineFrom(text, offset + textOffset),
                textLength: 0,
                textOffset: textOffset
            } : null;
        }, {}, null));

        // Special EndOfFile rule
        rules['<EOF>'] = new Rule(parser, context, createMatchCache(), '<EOF>', null, null);
        rules['<EOF>'].setComponent(new Component(parser, context, 'what', qualifiers.what, function (
            text,
            offset,
            textOffset,
            textOffsetLines,
            firstLine,
            firstLineOffset
        ) {
            return offset + textOffset === text.length ? {
                components: '',
                // First line should be the first line _after_ any skipped leading whitespace
                firstLine: firstLine,
                firstLineOffset: firstLineOffset,
                // Lines should be the total no. of lines _including_ whitespace
                lines: textOffsetLines,
                lastLine: firstLine,
                lastLineOffset: findLastNewlineFrom(text, offset + textOffset),
                textLength: 0,
                textOffset: textOffset
            } : null;
        }, {}, null));

        // Go through and create objects for all rules in this grammar first so we can set up circular references
        function createRule(ruleSpec, name) {
            return new Rule(
                parser,
                context,
                createMatchCache(),
                name,
                ruleSpec.captureAs || null,
                ruleSpec.ifNoMatch || null,
                ruleSpec.processor || null,
                ruleSpec.options || null
            );
        }

        _.each(grammarSpec.rules, function (ruleSpec, name) {
            var rule = createRule(ruleSpec, name);

            // Store 'original' rules here too, as rules may be overridden by options
            originalRules[name] = rule;
            rules[name] = rule;
        });

        _.each(options.rules || {}, function (ruleSpec, name) {
            // Create custom rule objects
            rules[name] = createRule(ruleSpec, name);
        });

        // Compiles a rule's spec into a Component tree and attaches it to `rules[ruleName]`.
        // References from the rule to itself are redirected to `selfReferencingRuleMap[ruleName]`
        function defineRule(ruleSpec, ruleName, rules, selfReferencingRuleMap) {
            function createComponent(componentSpec) {
                var arg,
                    args = {},
                    name = null,
                    qualifierName = null;

                // Component is a group
                if (_.isArray(componentSpec)) {
                    qualifierName = 'allOf';
                    arg = [];
                    _.each(componentSpec, function (componentSpec, index) {
                        arg[index] = createComponent(componentSpec);
                    });
                // Component is the name of another rule
                } else if (_.isString(componentSpec)) {
                    qualifierName = 'rule';
                    arg = rules[componentSpec];

                    if (!arg) {
                        throw new Exception('Parser :: Invalid component - no rule with name "' + componentSpec + '" exists');
                    }
                // Component is a regex terminal
                } else if (componentSpec instanceof RegExp) {
                    componentSpec = anchorRegex(componentSpec);

                    qualifierName = 'what';
                    arg = componentSpec;
                } else if (_.isPlainObject(componentSpec)) {
                    _.each(qualifiers, function (qualifier, name) {
                        var value;

                        if (hasOwn.call(componentSpec, name)) {
                            value = componentSpec[name];
                            qualifierName = name;

                            if (qualifierName === 'oneOf') {
                                arg = [];
                                _.each(value, function (value, index) {
                                    arg[index] = createComponent(value);
                                });
                            } else if (qualifierName === 'optionally') {
                                arg = createComponent(value);
                            } else {
                                arg = (value instanceof RegExp) ? anchorRegex(value) : createComponent(value);
                            }

                            // Qualifier found, stop searching
                            return false;
                        }
                    });

                    if (!qualifierName) {
                        // No qualifier key present: the only valid form left is the
                        // single-key shorthand {ruleName: expectedText}
                        if (Object.keys(componentSpec).length !== 1) {
                            throw new Exception('Parser :: Invalid component - no valid qualifier referenced by spec: ' + JSON.stringify(componentSpec));
                        }

                        (function () {
                            var name = Object.keys(componentSpec)[0];

                            qualifierName = 'rule';
                            arg = rules[name];

                            if (!arg) {
                                throw new Exception('Parser :: Invalid component - no rule with name "' + name + '" exists');
                            }

                            args.text = componentSpec[name];
                        }());
                    }

                    // Pull all arguments out of component spec, excluding the qualifier itself and name (if specified)
                    _.each(componentSpec, function (value, name) {
                        if (name !== qualifierName && name !== 'name') {
                            args[name] = value;
                        }
                    });

                    // Get component name if specified
                    if (hasOwn.call(componentSpec, 'name')) {
                        name = componentSpec.name;
                    }
                } else {
                    throw new Exception('Parser :: Invalid componentSpec "' + componentSpec + '" specified');
                }

                // Custom rule refers to the original in grammar spec
                if (
                    qualifierName === 'rule' &&
                    arg.name === ruleName &&
                    hasOwn.call(selfReferencingRuleMap, ruleName)
                ) {
                    arg = selfReferencingRuleMap[ruleName];
                }

                if (!qualifiers[qualifierName]) {
                    throw new Exception('Parser :: Invalid component - qualifier name "' + qualifierName + '" is invalid');
                }

                return new Component(
                    parser,
                    context,
                    qualifierName,
                    qualifiers[qualifierName],
                    arg,
                    args,
                    name,
                    parser.options.captureAllBounds ? grammarSpec.bounds || 'bounds' : null
                );
            }

            // A rule spec is either {components: ...} or the component spec itself
            rules[ruleName].setComponent(createComponent(ruleSpec.components || ruleSpec));
        }

        _.each(grammarSpec.rules, function (ruleSpec, ruleName) {
            if (hasOwn.call(options.rules || {}, ruleName)) {
                // Rule has been overridden: initialise its rule object in `originalRules`,
                // as any references of an overridden rule to itself will actually refer
                // back to the original rule and not the one built from the new, overridden spec.
                defineRule(ruleSpec, ruleName, originalRules, rules);
                return;
            }

            defineRule(ruleSpec, ruleName, rules, rules);
        });

        _.each(options.rules || {}, function (ruleSpec, ruleName) {
            defineRule(ruleSpec, ruleName, rules, originalRules);
        });

        parser.rules = rules;
        parser.ignoreRule = rules[grammarSpec.ignore] || null;
        parser.startRule = rules[grammarSpec.start];
    }(this));
}

_.extend(Parser.prototype, {
    /**
     * Clears the match cache for all rules of the loaded grammar
     */
    clearMatchCache: function () {
        _.each(this.matchCaches, function (matchCache) {
            matchCache.length = 0;
        });
    },

    /**
     * Fetches the grammar's error handler, creating it on first use
     * from `grammarSpec.ErrorHandler` when the grammar defines one
     *
     * @returns {Object|null} Null when the grammar defines no ErrorHandler
     */
    getErrorHandler: function () {
        var parser = this;

        if (!parser.errorHandler && parser.grammarSpec.ErrorHandler) {
            parser.errorHandler = new parser.grammarSpec.ErrorHandler(parser.stderr, parser.getState());
        }

        return parser.errorHandler;
    },

    /**
     * Fetches the 0-based offset into the input string at the end
     * of the furthest match into the string
     *
     * @returns {number} -1 when no match has been logged
     */
    getFurthestMatchEnd: function () {
        var parser = this;

        if (parser.furthestIgnoreMatchOffset > parser.furthestMatchOffset) {
            return parser.furthestIgnoreMatchOffset + parser.furthestIgnoreMatch.textLength;
        }

        return parser.furthestMatchOffset + (parser.furthestMatch ? parser.furthestMatch.textLength : 0);
    },

    /**
     * Fetches the 0-based offset into the input string at the start
     * of the furthest match into the string
     *
     * @returns {number} -1 when no match has been logged
     */
    getFurthestMatchStart: function () {
        var parser = this;

        if (parser.furthestIgnoreMatchOffset > parser.furthestMatchOffset) {
            return parser.furthestIgnoreMatchOffset;
        }

        return parser.furthestMatchOffset;
    },

    /**
     * Fetches the grammar's state object, creating it on first use
     * from `grammarSpec.State` when the grammar defines one
     *
     * @returns {Object|null} Null when the grammar defines no State
     */
    getState: function () {
        var parser = this;

        if (!parser.state && parser.grammarSpec.State) {
            parser.state = new parser.grammarSpec.State();
        }

        return parser.state;
    },

    /**
     * Records the given non-empty ignore-rule match as the furthest one
     * if it starts at or beyond the furthest recorded so far
     *
     * @param {Object} match
     * @param {number} offset
     */
    logFurthestIgnoreMatch: function (match, offset) {
        var parser = this;

        if (offset >= parser.furthestIgnoreMatchOffset && match.textLength > 0) {
            parser.furthestIgnoreMatch = match;
            parser.furthestIgnoreMatchOffset = offset;
        }
    },

    /**
     * Records the given non-empty match as the furthest one
     * if it starts at or beyond the furthest recorded so far
     *
     * @param {Object} match
     * @param {number} offset
     */
    logFurthestMatch: function (match, offset) {
        var parser = this;

        if (offset >= parser.furthestMatchOffset && match.textLength > 0) {
            parser.furthestMatch = match;
            parser.furthestMatchOffset = offset;
        }
    },

    /**
     * Parses the given input text using the loaded grammar, optionally from a given start/entry rule
     *
     * When the input cannot be matched in full, a ParseException is built: it is passed to
     * the grammar's error handler when there is one (whose result is returned), otherwise thrown.
     *
     * @param {string} text
     * @param {Object} options
     * @param {string=} startRule
     * @return {Object}
     * @throws {Exception} When the start rule does not exist
     * @throws {ParseException} When parsing fails and the grammar defines no error handler
     */
    parse: function (text, options, startRule) {
        var parser = this,
            error,
            errorHandler = parser.getErrorHandler(),
            furthestMatchEnd,
            rule = startRule ? parser.rules[startRule] : parser.startRule,
            match,
            matchEnd = 0,
            matchLine,
            matchLastLineOffset,
            matchStart,
            message,
            whitespaceAdvance,
            whitespaceMatch;

        parser.clearMatchCache();
        parser.furthestIgnoreMatch = null;
        parser.furthestIgnoreMatchOffset = -1;
        parser.furthestMatch = null;
        parser.furthestMatchOffset = -1;

        if (!rule) {
            // Fail with a clear message rather than a TypeError on `rule.match`
            throw new Exception(
                'Parser.parse() :: Invalid start rule "' + (startRule || parser.grammarSpec.start) + '"'
            );
        }

        try {
            match = rule.match(text, 0, 0, 0, options);

            if (match) {
                matchLine = match.lines;
                matchLastLineOffset = match.textOffset + match.lastLineOffset;
                matchEnd = match.textOffset + match.textLength;

                // Skip any trailing whitespace if the grammar specifies it
                if (parser.ignoreRule) {
                    while (
                        (
                            whitespaceMatch = parser.ignoreRule.match(
                                text,
                                matchEnd,
                                matchLine,
                                matchLastLineOffset,
                                // Prevent infinite recursion of whitespace skipper
                                {ignoreWhitespace: false}
                            )
                        )
                    ) {
                        whitespaceAdvance = whitespaceMatch.textOffset + whitespaceMatch.textLength;

                        if (whitespaceAdvance === 0) {
                            // A zero-length ignore match would be returned again forever
                            break;
                        }

                        matchLine += whitespaceMatch.lines;
                        // Track the position after the whitespace just skipped
                        // (taken from the whitespace match, not the main match)
                        matchLastLineOffset = whitespaceMatch.lastLineOffset;
                        matchEnd += whitespaceAdvance;
                    }
                }
            }
        } catch (caughtError) {
            if (!(caughtError instanceof AbortException)) {
                throw caughtError;
            }

            // The custom ErrorHandler returned a result rather than throwing, so return it from here
            return caughtError.getResult();
        }

        if (match === null || matchEnd < text.length) {
            // Determine the furthest offset the parser managed to parse to
            furthestMatchEnd = parser.getFurthestMatchEnd();

            if (furthestMatchEnd === -1) {
                matchStart = -1;
                message = 'No match';
            } else {
                matchStart = match ? match.textOffset : parser.getFurthestMatchStart();

                if (furthestMatchEnd === text.length) {
                    message = 'Unexpected end of file';
                } else {
                    message = 'Unexpected "' + text.charAt(furthestMatchEnd) + '"';
                }
            }

            error = new ParseException(
                'Parser.parse() :: ' + message,
                text,
                matchStart,
                furthestMatchEnd,
                {}
            );

            if (!errorHandler) {
                throw error;
            }

            return errorHandler.handle(error);
        }

        return match.components;
    }
});

module.exports = Parser;