UNPKG

chevrotain

Version:

Chevrotain is a high performance fault tolerant javascript parsing DSL for building recursive decent parsers

github.com/SAP/chevrotain

587 lines • 30 kB

JavaScript

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); var lexer_1 = require("./lexer"); var utils_1 = require("../utils/utils"); var tokens_1 = require("./tokens"); var LexerDefinitionErrorType; (function (LexerDefinitionErrorType) { LexerDefinitionErrorType[LexerDefinitionErrorType["MISSING_PATTERN"] = 0] = "MISSING_PATTERN"; LexerDefinitionErrorType[LexerDefinitionErrorType["INVALID_PATTERN"] = 1] = "INVALID_PATTERN"; LexerDefinitionErrorType[LexerDefinitionErrorType["EOI_ANCHOR_FOUND"] = 2] = "EOI_ANCHOR_FOUND"; LexerDefinitionErrorType[LexerDefinitionErrorType["UNSUPPORTED_FLAGS_FOUND"] = 3] = "UNSUPPORTED_FLAGS_FOUND"; LexerDefinitionErrorType[LexerDefinitionErrorType["DUPLICATE_PATTERNS_FOUND"] = 4] = "DUPLICATE_PATTERNS_FOUND"; LexerDefinitionErrorType[LexerDefinitionErrorType["INVALID_GROUP_TYPE_FOUND"] = 5] = "INVALID_GROUP_TYPE_FOUND"; LexerDefinitionErrorType[LexerDefinitionErrorType["PUSH_MODE_DOES_NOT_EXIST"] = 6] = "PUSH_MODE_DOES_NOT_EXIST"; LexerDefinitionErrorType[LexerDefinitionErrorType["MULTI_MODE_LEXER_WITHOUT_DEFAULT_MODE"] = 7] = "MULTI_MODE_LEXER_WITHOUT_DEFAULT_MODE"; LexerDefinitionErrorType[LexerDefinitionErrorType["MULTI_MODE_LEXER_WITHOUT_MODES_PROPERTY"] = 8] = "MULTI_MODE_LEXER_WITHOUT_MODES_PROPERTY"; LexerDefinitionErrorType[LexerDefinitionErrorType["MULTI_MODE_LEXER_DEFAULT_MODE_VALUE_DOES_NOT_EXIST"] = 9] = "MULTI_MODE_LEXER_DEFAULT_MODE_VALUE_DOES_NOT_EXIST"; LexerDefinitionErrorType[LexerDefinitionErrorType["LEXER_DEFINITION_CANNOT_CONTAIN_UNDEFINED"] = 10] = "LEXER_DEFINITION_CANNOT_CONTAIN_UNDEFINED"; LexerDefinitionErrorType[LexerDefinitionErrorType["SOI_ANCHOR_FOUND"] = 11] = "SOI_ANCHOR_FOUND"; LexerDefinitionErrorType[LexerDefinitionErrorType["EMPTY_MATCH_PATTERN"] = 12] = "EMPTY_MATCH_PATTERN"; LexerDefinitionErrorType[LexerDefinitionErrorType["NO_LINE_BREAKS_FLAGS"] = 13] = "NO_LINE_BREAKS_FLAGS"; LexerDefinitionErrorType[LexerDefinitionErrorType["UNREACHABLE_PATTERN"] = 14] = "UNREACHABLE_PATTERN"; })(LexerDefinitionErrorType = exports.LexerDefinitionErrorType || (exports.LexerDefinitionErrorType = {})); var DEFAULT_LEXER_CONFIG = { deferDefinitionErrorsHandling: false, positionTracking: "full", lineTerminatorsPattern: /\n|\r\n?/g }; Object.freeze(DEFAULT_LEXER_CONFIG); var Lexer = /** @class */ (function () { /** * @param {SingleModeLexerDefinition | IMultiModeLexerDefinition} lexerDefinition - * Structure composed of constructor functions for the Tokens types this lexer will support. * * In the case of {SingleModeLexerDefinition} the structure is simply an array of TokenTypes. * In the case of {IMultiModeLexerDefinition} the structure is an object with two properties: * 1. a "modes" property where each value is an array of TokenTypes. * 2. a "defaultMode" property specifying the initial lexer mode. * * for example: * { * "modes" : { * "modeX" : [Token1, Token2] * "modeY" : [Token3, Token4] * } * * "defaultMode" : "modeY" * } * * A lexer with {MultiModesDefinition} is simply multiple Lexers where only one (mode) can be active at the same time. * This is useful for lexing languages where there are different lexing rules depending on context. * * The current lexing mode is selected via a "mode stack". * The last (peek) value in the stack will be the current mode of the lexer. * * Each Token Type can define that it will cause the Lexer to (after consuming an "instance" of the Token): * 1. PUSH_MODE : push a new mode to the "mode stack" * 2. POP_MODE : pop the last mode from the "mode stack" * * Examples: * export class Attribute { * static PATTERN = ... * static PUSH_MODE = "modeY" * } * * export class EndAttribute { * static PATTERN = ... * static POP_MODE = true * } * * The TokenTypes must be in one of these forms: * * 1. With a PATTERN property that has a RegExp value for tokens to match: * example: -->class Integer { static PATTERN = /[1-9]\d }<-- * * 2. With a PATTERN property that has the value of the var Lexer.NA defined above. * This is a convenience form used to avoid matching Token classes that only act as categories. * example: -->class Keyword { static PATTERN = NA }<-- * * * The following RegExp patterns are not supported: * a. '$' for match at end of input * b. /b global flag * c. /m multi-line flag * * The Lexer will identify the first pattern that matches, Therefor the order of Token Constructors may be significant. * For example when one pattern may match a prefix of another pattern. * * Note that there are situations in which we may wish to order the longer pattern after the shorter one. * For example: keywords vs Identifiers. * 'do'(/do/) and 'donald'(/w+) * * * If the Identifier pattern appears before the 'do' pattern, both 'do' and 'donald' * will be lexed as an Identifier. * * * If the 'do' pattern appears before the Identifier pattern 'do' will be lexed correctly as a keyword. * however 'donald' will be lexed as TWO separate tokens: keyword 'do' and identifier 'nald'. * * To resolve this problem, add a static property on the keyword's constructor named: LONGER_ALT * example: * * export class Identifier extends Keyword { static PATTERN = /[_a-zA-Z][_a-zA-Z0-9]/ } * export class Keyword Token { * static PATTERN = Lexer.NA * static LONGER_ALT = Identifier * } * export class Do extends Keyword { static PATTERN = /do/ } * export class While extends Keyword { static PATTERN = /while/ } * export class Return extends Keyword { static PATTERN = /return/ } * * The lexer will then also attempt to match a (longer) Identifier each time a keyword is matched. * * * @param {ILexerConfig} [config=DEFAULT_LEXER_CONFIG] - * The Lexer's configuration @see {ILexerConfig} for details. */ function Lexer(lexerDefinition, config) { if (config === void 0) { config = DEFAULT_LEXER_CONFIG; } var _this = this; this.lexerDefinition = lexerDefinition; this.lexerDefinitionErrors = []; this.patternIdxToConfig = {}; this.modes = []; this.emptyGroups = {}; this.config = undefined; this.trackStartLines = true; this.trackEndLines = true; this.hasCustom = false; if (typeof config === "boolean") { throw Error("The second argument to the Lexer constructor is now an ILexerConfig Object.\n" + "a boolean 2nd argument is no longer supported"); } // todo: defaults func? this.config = utils_1.merge(DEFAULT_LEXER_CONFIG, config); if (this.config.lineTerminatorsPattern === DEFAULT_LEXER_CONFIG.lineTerminatorsPattern) { // optimized built-in implementation for the defaults definition of lineTerminators this.config.lineTerminatorsPattern = lexer_1.LineTerminatorOptimizedTester; } this.trackStartLines = /full|onlyStart/i.test(this.config.positionTracking); this.trackEndLines = /full/i.test(this.config.positionTracking); var hasOnlySingleMode = true; var actualDefinition; // Convert SingleModeLexerDefinition into a IMultiModeLexerDefinition. if (utils_1.isArray(lexerDefinition)) { actualDefinition = { modes: {} }; actualDefinition.modes[lexer_1.DEFAULT_MODE] = utils_1.cloneArr(lexerDefinition); actualDefinition[lexer_1.DEFAULT_MODE] = lexer_1.DEFAULT_MODE; } else { // no conversion needed, input should already be a IMultiModeLexerDefinition hasOnlySingleMode = false; actualDefinition = utils_1.cloneObj(lexerDefinition); } this.lexerDefinitionErrors = this.lexerDefinitionErrors.concat(lexer_1.performRuntimeChecks(actualDefinition, this.trackStartLines)); // for extra robustness to avoid throwing an none informative error message actualDefinition.modes = actualDefinition.modes ? actualDefinition.modes : {}; // an error of undefined TokenTypes will be detected in "performRuntimeChecks" above. // this transformation is to increase robustness in the case of partially invalid lexer definition. utils_1.forEach(actualDefinition.modes, function (currModeValue, currModeName) { actualDefinition.modes[currModeName] = utils_1.reject(currModeValue, function (currTokType) { return utils_1.isUndefined(currTokType); }); }); var allModeNames = utils_1.keys(actualDefinition.modes); utils_1.forEach(actualDefinition.modes, function (currModDef, currModName) { _this.modes.push(currModName); _this.lexerDefinitionErrors = _this.lexerDefinitionErrors.concat(lexer_1.validatePatterns(currModDef, allModeNames)); // If definition errors were encountered, the analysis phase may fail unexpectedly/ // Considering a lexer with definition errors may never be used, there is no point // to performing the analysis anyhow... if (utils_1.isEmpty(_this.lexerDefinitionErrors)) { tokens_1.augmentTokenTypes(currModDef); var currAnalyzeResult = lexer_1.analyzeTokenTypes(currModDef); _this.patternIdxToConfig[currModName] = currAnalyzeResult.patternIdxToConfig; _this.emptyGroups = utils_1.merge(_this.emptyGroups, currAnalyzeResult.emptyGroups); _this.hasCustom = currAnalyzeResult.hasCustom || _this.hasCustom; } }); this.defaultMode = actualDefinition.defaultMode; if (!utils_1.isEmpty(this.lexerDefinitionErrors) && !this.config.deferDefinitionErrorsHandling) { var allErrMessages = utils_1.map(this.lexerDefinitionErrors, function (error) { return error.message; }); var allErrMessagesString = allErrMessages.join("-----------------------\n"); throw new Error("Errors detected in definition of Lexer:\n" + allErrMessagesString); } // Choose the relevant internal implementations for this specific parser. // These implementations should be in-lined by the JavaScript engine // to provide optimal performance in each scenario. if (lexer_1.SUPPORT_STICKY) { this.chopInput = utils_1.IDENTITY; this.match = this.matchWithTest; } else { this.updateLastIndex = utils_1.NOOP; this.match = this.matchWithExec; } if (hasOnlySingleMode) { this.handleModes = utils_1.NOOP; } if (this.trackStartLines === false) { this.computeNewColumn = utils_1.IDENTITY; } if (this.trackEndLines === false) { this.updateTokenEndLineColumnLocation = utils_1.NOOP; } if (/full/i.test(this.config.positionTracking)) { this.createTokenInstance = this.createFullToken; } else if (/onlyStart/i.test(this.config.positionTracking)) { this.createTokenInstance = this.createStartOnlyToken; } else if (/onlyOffset/i.test(this.config.positionTracking)) { this.createTokenInstance = this.createOffsetOnlyToken; } else { throw Error("Invalid <positionTracking> config option: \"" + this.config.positionTracking + "\""); } if (this.hasCustom) { this.addToken = this.addTokenUsingPush; } else { this.addToken = this.addTokenUsingMemberAccess; } } /** * Will lex(Tokenize) a string. * Note that this can be called repeatedly on different strings as this method * does not modify the state of the Lexer. * * @param {string} text - The string to lex * @param {string} [initialMode] - The initial Lexer Mode to start with, by default this will be the first mode in the lexer's * definition. If the lexer has no explicit modes it will be the implicit single 'default_mode' mode. * * @returns {ILexingResult} */ Lexer.prototype.tokenize = function (text, initialMode) { if (initialMode === void 0) { initialMode = this.defaultMode; } if (!utils_1.isEmpty(this.lexerDefinitionErrors)) { var allErrMessages = utils_1.map(this.lexerDefinitionErrors, function (error) { return error.message; }); var allErrMessagesString = allErrMessages.join("-----------------------\n"); throw new Error("Unable to Tokenize because Errors detected in definition of Lexer:\n" + allErrMessagesString); } var lexResult = this.tokenizeInternal(text, initialMode); return lexResult; }; // There is quite a bit of duplication between this and "tokenizeInternalLazy" // This is intentional due to performance considerations. Lexer.prototype.tokenizeInternal = function (text, initialMode) { var _this = this; var i, j, matchAltImage, longerAltIdx, matchedImage, imageLength, group, tokType, newToken, errLength, droppedChar, msg, match; var orgText = text; var orgLength = orgText.length; var offset = 0; var matchedTokensIndex = 0; // initializing the tokensArray to the "guessed" size. // guessing too little will still reduce the number of array re-sizes on pushes. // guessing too large (Tested by guessing x4 too large) may cost a bit more of memory // but would still have a faster runtime by avoiding (All but one) array resizing. var guessedNumberOfTokens = this.hasCustom ? 0 // will break custom token pattern APIs the matchedTokens array will contain undefined elements. : Math.floor(text.length / 10); var matchedTokens = new Array(guessedNumberOfTokens); var errors = []; var line = this.trackStartLines ? 1 : undefined; var column = this.trackStartLines ? 1 : undefined; var groups = lexer_1.cloneEmptyGroups(this.emptyGroups); var trackLines = this.trackStartLines; var lineTerminatorPattern = this.config.lineTerminatorsPattern; var currModePatternsLength = 0; var patternIdxToConfig = []; var modeStack = []; var pop_mode = function (popToken) { // TODO: perhaps avoid this error in the edge case there is no more input? if (modeStack.length === 1 && // if we have both a POP_MODE and a PUSH_MODE this is in-fact a "transition" // So no error should occur. popToken.tokenType.PUSH_MODE === undefined) { // if we try to pop the last mode there lexer will no longer have ANY mode. // thus the pop is ignored, an error will be created and the lexer will continue parsing in the previous mode. var msg_1 = "Unable to pop Lexer Mode after encountering Token ->" + popToken.image + "<- The Mode Stack is empty"; errors.push({ offset: popToken.startOffset, line: popToken.startLine !== undefined ? popToken.startLine : undefined, column: popToken.startColumn !== undefined ? popToken.startColumn : undefined, length: popToken.image.length, message: msg_1 }); } else { modeStack.pop(); var newMode = utils_1.last(modeStack); patternIdxToConfig = _this.patternIdxToConfig[newMode]; currModePatternsLength = patternIdxToConfig.length; } }; function push_mode(newMode) { modeStack.push(newMode); patternIdxToConfig = this.patternIdxToConfig[newMode]; currModePatternsLength = patternIdxToConfig.length; } // this pattern seems to avoid a V8 de-optimization, although that de-optimization does not // seem to matter performance wise. push_mode.call(this, initialMode); var currConfig; while (offset < orgLength) { matchedImage = null; for (i = 0; i < currModePatternsLength; i++) { currConfig = patternIdxToConfig[i]; var currPattern = currConfig.pattern; // manually in-lined because > 600 chars won't be in-lined in V8 var singleCharCode = currConfig.short; if (singleCharCode !== false) { if (orgText.charCodeAt(offset) === singleCharCode) { // single character string matchedImage = currPattern; } } else if (currConfig.isCustom === true) { match = currPattern.exec(orgText, offset, matchedTokens, groups); matchedImage = match !== null ? match[0] : match; } else { this.updateLastIndex(currPattern, offset); matchedImage = this.match(currPattern, text, offset); } if (matchedImage !== null) { // even though this pattern matched we must try a another longer alternative. // this can be used to prioritize keywords over identifiers longerAltIdx = currConfig.longerAlt; if (longerAltIdx !== undefined) { // TODO: micro optimize, avoid extra prop access // by saving/linking longerAlt on the original config? var longerAltConfig = patternIdxToConfig[longerAltIdx]; var longerAltPattern = longerAltConfig.pattern; // single Char can never be a longer alt so no need to test it. // manually in-lined because > 600 chars won't be in-lined in V8 if (longerAltConfig.isCustom === true) { match = longerAltPattern.exec(orgText, offset, matchedTokens, groups); matchAltImage = match !== null ? match[0] : match; } else { this.updateLastIndex(longerAltPattern, offset); matchAltImage = this.match(longerAltPattern, text, offset); } if (matchAltImage && matchAltImage.length > matchedImage.length) { matchedImage = matchAltImage; currConfig = longerAltConfig; } } break; } } // successful match if (matchedImage !== null) { // matchedImage = match[0] imageLength = matchedImage.length; group = currConfig.group; if (group !== undefined) { tokType = currConfig.tokenTypeIdx; // TODO: "offset + imageLength" and the new column may be computed twice in case of "full" location information inside // createFullToken method newToken = this.createTokenInstance(matchedImage, offset, tokType, currConfig.tokenType, line, column, imageLength); if (group === false) { matchedTokensIndex = this.addToken(matchedTokens, matchedTokensIndex, newToken); } else { groups[group].push(newToken); } } text = this.chopInput(text, imageLength); offset = offset + imageLength; // TODO: with newlines the column may be assigned twice column = this.computeNewColumn(column, imageLength); if (trackLines === true && currConfig.canLineTerminator === true) { var numOfLTsInMatch = 0; var foundTerminator = void 0; var lastLTEndOffset = void 0; lineTerminatorPattern.lastIndex = 0; do { foundTerminator = lineTerminatorPattern.test(matchedImage); if (foundTerminator === true) { lastLTEndOffset = lineTerminatorPattern.lastIndex - 1; numOfLTsInMatch++; } } while (foundTerminator); if (numOfLTsInMatch !== 0) { line = line + numOfLTsInMatch; column = imageLength - lastLTEndOffset; this.updateTokenEndLineColumnLocation(newToken, group, lastLTEndOffset, numOfLTsInMatch, line, column, imageLength); } } // will be NOOP if no modes present this.handleModes(i, currConfig, pop_mode, push_mode, newToken); } else { // error recovery, drop characters until we identify a valid token's start point var errorStartOffset = offset; var errorLine = line; var errorColumn = column; var foundResyncPoint = false; while (!foundResyncPoint && offset < orgLength) { // drop chars until we succeed in matching something droppedChar = orgText.charCodeAt(offset); // Identity Func (when sticky flag is enabled) text = this.chopInput(text, 1); offset++; for (j = 0; j < currModePatternsLength; j++) { var currConfig_1 = patternIdxToConfig[j]; var currPattern = currConfig_1.pattern; // manually in-lined because > 600 chars won't be in-lined in V8 var singleCharCode = currConfig_1.short; if (singleCharCode !== false) { if (orgText.charCodeAt(offset) === singleCharCode) { // single character string foundResyncPoint = true; } } else if (currConfig_1.isCustom === true) { foundResyncPoint = currPattern.exec(orgText, offset, matchedTokens, groups) !== null; } else { this.updateLastIndex(currPattern, offset); foundResyncPoint = currPattern.exec(text) !== null; } if (foundResyncPoint === true) { break; } } } errLength = offset - errorStartOffset; // at this point we either re-synced or reached the end of the input text msg = "unexpected character: ->" + orgText.charAt(errorStartOffset) + "<- at offset: " + errorStartOffset + "," + (" skipped " + (offset - errorStartOffset) + " characters."); errors.push({ offset: errorStartOffset, line: errorLine, column: errorColumn, length: errLength, message: msg }); } } // if we do have custom patterns which push directly into the if (!this.hasCustom) { // if we guessed a too large size for the tokens array this will shrink it to the right size. matchedTokens.length = matchedTokensIndex; } return { tokens: matchedTokens, groups: groups, errors: errors }; }; Lexer.prototype.handleModes = function (i, config, pop_mode, push_mode, newToken) { if (config.pop === true) { // need to save the PUSH_MODE property as if the mode is popped // patternIdxToPopMode is updated to reflect the new mode after popping the stack var pushMode = config.push; pop_mode(newToken); if (pushMode !== undefined) { push_mode.call(this, pushMode); } } else if (config.push !== undefined) { push_mode.call(this, config.push); } }; Lexer.prototype.chopInput = function (text, length) { return text.substring(length); }; Lexer.prototype.updateLastIndex = function (regExp, newLastIndex) { regExp.lastIndex = newLastIndex; }; // TODO: decrease this under 600 characters? inspect stripping comments option in TSC compiler Lexer.prototype.updateTokenEndLineColumnLocation = function (newToken, group, lastLTIdx, numOfLTsInMatch, line, column, imageLength) { var lastCharIsLT, fixForEndingInLT; if (group !== undefined) { // a none skipped multi line Token, need to update endLine/endColumn lastCharIsLT = lastLTIdx === imageLength - 1; fixForEndingInLT = lastCharIsLT ? -1 : 0; if (!(numOfLTsInMatch === 1 && lastCharIsLT === true)) { // if a token ends in a LT that last LT only affects the line numbering of following Tokens newToken.endLine = line + fixForEndingInLT; // the last LT in a token does not affect the endColumn either as the [columnStart ... columnEnd) // inclusive to exclusive range. newToken.endColumn = column - 1 + -fixForEndingInLT; } // else single LT in the last character of a token, no need to modify the endLine/EndColumn } }; Lexer.prototype.computeNewColumn = function (oldColumn, imageLength) { return oldColumn + imageLength; }; // Place holder, will be replaced by the correct variant according to the locationTracking option at runtime. /* istanbul ignore next - place holder */ Lexer.prototype.createTokenInstance = function () { var args = []; for (var _i = 0; _i < arguments.length; _i++) { args[_i] = arguments[_i]; } return null; }; Lexer.prototype.createOffsetOnlyToken = function (image, startOffset, tokenTypeIdx, tokenType) { return { image: image, startOffset: startOffset, tokenTypeIdx: tokenTypeIdx, tokenType: tokenType }; }; Lexer.prototype.createStartOnlyToken = function (image, startOffset, tokenTypeIdx, tokenType, startLine, startColumn) { return { image: image, startOffset: startOffset, startLine: startLine, startColumn: startColumn, tokenTypeIdx: tokenTypeIdx, tokenType: tokenType }; }; Lexer.prototype.createFullToken = function (image, startOffset, tokenTypeIdx, tokenType, startLine, startColumn, imageLength) { return { image: image, startOffset: startOffset, endOffset: startOffset + imageLength - 1, startLine: startLine, endLine: startLine, startColumn: startColumn, endColumn: startColumn + imageLength - 1, tokenTypeIdx: tokenTypeIdx, tokenType: tokenType }; }; // Place holder, will be replaced by the correct variant according to the locationTracking option at runtime. /* istanbul ignore next - place holder */ Lexer.prototype.addToken = function (tokenVector, index, tokenToAdd) { return 666; }; Lexer.prototype.addTokenUsingPush = function (tokenVector, index, tokenToAdd) { tokenVector.push(tokenToAdd); return index; }; Lexer.prototype.addTokenUsingMemberAccess = function (tokenVector, index, tokenToAdd) { tokenVector[index] = tokenToAdd; index++; return index; }; /* istanbul ignore next - place holder to be replaced with chosen alternative at runtime */ Lexer.prototype.match = function (pattern, text, offset) { return null; }; Lexer.prototype.matchWithTest = function (pattern, text, offset) { var found = pattern.test(text); if (found === true) { return text.substring(offset, pattern.lastIndex); } return null; }; Lexer.prototype.matchWithExec = function (pattern, text) { var regExpArray = pattern.exec(text); return regExpArray !== null ? regExpArray[0] : regExpArray; }; Lexer.SKIPPED = "This marks a skipped Token pattern, this means each token identified by it will" + "be consumed and then thrown into oblivion, this can be used to for example to completely ignore whitespace."; Lexer.NA = /NOT_APPLICABLE/; return Lexer; }()); exports.Lexer = Lexer; //# sourceMappingURL=lexer_public.js.map