chevrotain
Version:
Chevrotain is a high performance fault tolerant javascript parsing DSL for building recursive decent parsers
913 lines • 40 kB
JavaScript
var __extends = (this && this.__extends) || (function () {
var extendStatics = function (d, b) {
extendStatics = Object.setPrototypeOf ||
({ __proto__: [] } instanceof Array && function (d, b) { d.__proto__ = b; }) ||
function (d, b) { for (var p in b) if (Object.prototype.hasOwnProperty.call(b, p)) d[p] = b[p]; };
return extendStatics(d, b);
};
return function (d, b) {
extendStatics(d, b);
function __() { this.constructor = d; }
d.prototype = b === null ? Object.create(b) : (__.prototype = b.prototype, new __());
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.charCodeToOptimizedIndex = exports.minOptimizationVal = exports.buildLineBreakIssueMessage = exports.LineTerminatorOptimizedTester = exports.isShortPattern = exports.isCustomPattern = exports.cloneEmptyGroups = exports.performWarningRuntimeChecks = exports.performRuntimeChecks = exports.addStickyFlag = exports.addStartOfInput = exports.findUnreachablePatterns = exports.findModesThatDoNotExist = exports.findInvalidGroupType = exports.findDuplicatePatterns = exports.findUnsupportedFlags = exports.findStartOfInputAnchor = exports.findEmptyMatchRegExps = exports.findEndOfInputAnchor = exports.findInvalidPatterns = exports.findMissingPatterns = exports.validatePatterns = exports.analyzeTokenTypes = exports.enableSticky = exports.disableSticky = exports.SUPPORT_STICKY = exports.MODES = exports.DEFAULT_MODE = void 0;
var regexp_to_ast_1 = require("regexp-to-ast");
var lexer_public_1 = require("./lexer_public");
var utils_1 = require("../utils/utils");
var reg_exp_1 = require("./reg_exp");
var reg_exp_parser_1 = require("./reg_exp_parser");
var PATTERN = "PATTERN";
exports.DEFAULT_MODE = "defaultMode";
exports.MODES = "modes";
exports.SUPPORT_STICKY = typeof new RegExp("(?:)").sticky === "boolean";
function disableSticky() {
exports.SUPPORT_STICKY = false;
}
exports.disableSticky = disableSticky;
function enableSticky() {
exports.SUPPORT_STICKY = true;
}
exports.enableSticky = enableSticky;
function analyzeTokenTypes(tokenTypes, options) {
options = utils_1.defaults(options, {
useSticky: exports.SUPPORT_STICKY,
debug: false,
safeMode: false,
positionTracking: "full",
lineTerminatorCharacters: ["\r", "\n"],
tracer: function (msg, action) { return action(); }
});
var tracer = options.tracer;
tracer("initCharCodeToOptimizedIndexMap", function () {
initCharCodeToOptimizedIndexMap();
});
var onlyRelevantTypes;
tracer("Reject Lexer.NA", function () {
onlyRelevantTypes = utils_1.reject(tokenTypes, function (currType) {
return currType[PATTERN] === lexer_public_1.Lexer.NA;
});
});
var hasCustom = false;
var allTransformedPatterns;
tracer("Transform Patterns", function () {
hasCustom = false;
allTransformedPatterns = utils_1.map(onlyRelevantTypes, function (currType) {
var currPattern = currType[PATTERN];
/* istanbul ignore else */
if (utils_1.isRegExp(currPattern)) {
var regExpSource = currPattern.source;
if (regExpSource.length === 1 &&
// only these regExp meta characters which can appear in a length one regExp
regExpSource !== "^" &&
regExpSource !== "$" &&
regExpSource !== "." &&
!currPattern.ignoreCase) {
return regExpSource;
}
else if (regExpSource.length === 2 &&
regExpSource[0] === "\\" &&
// not a meta character
!utils_1.contains([
"d",
"D",
"s",
"S",
"t",
"r",
"n",
"t",
"0",
"c",
"b",
"B",
"f",
"v",
"w",
"W"
], regExpSource[1])) {
// escaped meta Characters: /\+/ /\[/
// or redundant escaping: /\a/
// without the escaping "\"
return regExpSource[1];
}
else {
return options.useSticky
? addStickyFlag(currPattern)
: addStartOfInput(currPattern);
}
}
else if (utils_1.isFunction(currPattern)) {
hasCustom = true;
// CustomPatternMatcherFunc - custom patterns do not require any transformations, only wrapping in a RegExp Like object
return { exec: currPattern };
}
else if (utils_1.has(currPattern, "exec")) {
hasCustom = true;
// ICustomPattern
return currPattern;
}
else if (typeof currPattern === "string") {
if (currPattern.length === 1) {
return currPattern;
}
else {
var escapedRegExpString = currPattern.replace(/[\\^$.*+?()[\]{}|]/g, "\\$&");
var wrappedRegExp = new RegExp(escapedRegExpString);
return options.useSticky
? addStickyFlag(wrappedRegExp)
: addStartOfInput(wrappedRegExp);
}
}
else {
throw Error("non exhaustive match");
}
});
});
var patternIdxToType;
var patternIdxToGroup;
var patternIdxToLongerAltIdx;
var patternIdxToPushMode;
var patternIdxToPopMode;
tracer("misc mapping", function () {
patternIdxToType = utils_1.map(onlyRelevantTypes, function (currType) { return currType.tokenTypeIdx; });
patternIdxToGroup = utils_1.map(onlyRelevantTypes, function (clazz) {
var groupName = clazz.GROUP;
/* istanbul ignore next */
if (groupName === lexer_public_1.Lexer.SKIPPED) {
return undefined;
}
else if (utils_1.isString(groupName)) {
return groupName;
}
else if (utils_1.isUndefined(groupName)) {
return false;
}
else {
throw Error("non exhaustive match");
}
});
patternIdxToLongerAltIdx = utils_1.map(onlyRelevantTypes, function (clazz) {
var longerAltType = clazz.LONGER_ALT;
if (longerAltType) {
var longerAltIdx = utils_1.indexOf(onlyRelevantTypes, longerAltType);
return longerAltIdx;
}
});
patternIdxToPushMode = utils_1.map(onlyRelevantTypes, function (clazz) { return clazz.PUSH_MODE; });
patternIdxToPopMode = utils_1.map(onlyRelevantTypes, function (clazz) {
return utils_1.has(clazz, "POP_MODE");
});
});
var patternIdxToCanLineTerminator;
tracer("Line Terminator Handling", function () {
var lineTerminatorCharCodes = getCharCodes(options.lineTerminatorCharacters);
patternIdxToCanLineTerminator = utils_1.map(onlyRelevantTypes, function (tokType) { return false; });
if (options.positionTracking !== "onlyOffset") {
patternIdxToCanLineTerminator = utils_1.map(onlyRelevantTypes, function (tokType) {
if (utils_1.has(tokType, "LINE_BREAKS")) {
return tokType.LINE_BREAKS;
}
else {
if (checkLineBreaksIssues(tokType, lineTerminatorCharCodes) === false) {
return reg_exp_1.canMatchCharCode(lineTerminatorCharCodes, tokType.PATTERN);
}
}
});
}
});
var patternIdxToIsCustom;
var patternIdxToShort;
var emptyGroups;
var patternIdxToConfig;
tracer("Misc Mapping #2", function () {
patternIdxToIsCustom = utils_1.map(onlyRelevantTypes, isCustomPattern);
patternIdxToShort = utils_1.map(allTransformedPatterns, isShortPattern);
emptyGroups = utils_1.reduce(onlyRelevantTypes, function (acc, clazz) {
var groupName = clazz.GROUP;
if (utils_1.isString(groupName) && !(groupName === lexer_public_1.Lexer.SKIPPED)) {
acc[groupName] = [];
}
return acc;
}, {});
patternIdxToConfig = utils_1.map(allTransformedPatterns, function (x, idx) {
return {
pattern: allTransformedPatterns[idx],
longerAlt: patternIdxToLongerAltIdx[idx],
canLineTerminator: patternIdxToCanLineTerminator[idx],
isCustom: patternIdxToIsCustom[idx],
short: patternIdxToShort[idx],
group: patternIdxToGroup[idx],
push: patternIdxToPushMode[idx],
pop: patternIdxToPopMode[idx],
tokenTypeIdx: patternIdxToType[idx],
tokenType: onlyRelevantTypes[idx]
};
});
});
var canBeOptimized = true;
var charCodeToPatternIdxToConfig = [];
if (!options.safeMode) {
tracer("First Char Optimization", function () {
charCodeToPatternIdxToConfig = utils_1.reduce(onlyRelevantTypes, function (result, currTokType, idx) {
if (typeof currTokType.PATTERN === "string") {
var charCode = currTokType.PATTERN.charCodeAt(0);
var optimizedIdx = charCodeToOptimizedIndex(charCode);
addToMapOfArrays(result, optimizedIdx, patternIdxToConfig[idx]);
}
else if (utils_1.isArray(currTokType.START_CHARS_HINT)) {
var lastOptimizedIdx_1;
utils_1.forEach(currTokType.START_CHARS_HINT, function (charOrInt) {
var charCode = typeof charOrInt === "string"
? charOrInt.charCodeAt(0)
: charOrInt;
var currOptimizedIdx = charCodeToOptimizedIndex(charCode);
// Avoid adding the config multiple times
/* istanbul ignore else */
// - Difficult to check this scenario effects as it is only a performance
// optimization that does not change correctness
if (lastOptimizedIdx_1 !== currOptimizedIdx) {
lastOptimizedIdx_1 = currOptimizedIdx;
addToMapOfArrays(result, currOptimizedIdx, patternIdxToConfig[idx]);
}
});
}
else if (utils_1.isRegExp(currTokType.PATTERN)) {
if (currTokType.PATTERN.unicode) {
canBeOptimized = false;
if (options.ensureOptimizations) {
utils_1.PRINT_ERROR("" + reg_exp_1.failedOptimizationPrefixMsg +
("\tUnable to analyze < " + currTokType.PATTERN.toString() + " > pattern.\n") +
"\tThe regexp unicode flag is not currently supported by the regexp-to-ast library.\n" +
"\tThis will disable the lexer's first char optimizations.\n" +
"\tFor details See: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#UNICODE_OPTIMIZE");
}
}
else {
var optimizedCodes = reg_exp_1.getOptimizedStartCodesIndices(currTokType.PATTERN, options.ensureOptimizations);
/* istanbul ignore if */
// start code will only be empty given an empty regExp or failure of regexp-to-ast library
// the first should be a different validation and the second cannot be tested.
if (utils_1.isEmpty(optimizedCodes)) {
// we cannot understand what codes may start possible matches
// The optimization correctness requires knowing start codes for ALL patterns.
// Not actually sure this is an error, no debug message
canBeOptimized = false;
}
utils_1.forEach(optimizedCodes, function (code) {
addToMapOfArrays(result, code, patternIdxToConfig[idx]);
});
}
}
else {
if (options.ensureOptimizations) {
utils_1.PRINT_ERROR("" + reg_exp_1.failedOptimizationPrefixMsg +
("\tTokenType: <" + currTokType.name + "> is using a custom token pattern without providing <start_chars_hint> parameter.\n") +
"\tThis will disable the lexer's first char optimizations.\n" +
"\tFor details See: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#CUSTOM_OPTIMIZE");
}
canBeOptimized = false;
}
return result;
}, []);
});
}
tracer("ArrayPacking", function () {
charCodeToPatternIdxToConfig = utils_1.packArray(charCodeToPatternIdxToConfig);
});
return {
emptyGroups: emptyGroups,
patternIdxToConfig: patternIdxToConfig,
charCodeToPatternIdxToConfig: charCodeToPatternIdxToConfig,
hasCustom: hasCustom,
canBeOptimized: canBeOptimized
};
}
exports.analyzeTokenTypes = analyzeTokenTypes;
function validatePatterns(tokenTypes, validModesNames) {
var errors = [];
var missingResult = findMissingPatterns(tokenTypes);
errors = errors.concat(missingResult.errors);
var invalidResult = findInvalidPatterns(missingResult.valid);
var validTokenTypes = invalidResult.valid;
errors = errors.concat(invalidResult.errors);
errors = errors.concat(validateRegExpPattern(validTokenTypes));
errors = errors.concat(findInvalidGroupType(validTokenTypes));
errors = errors.concat(findModesThatDoNotExist(validTokenTypes, validModesNames));
errors = errors.concat(findUnreachablePatterns(validTokenTypes));
return errors;
}
exports.validatePatterns = validatePatterns;
function validateRegExpPattern(tokenTypes) {
var errors = [];
var withRegExpPatterns = utils_1.filter(tokenTypes, function (currTokType) {
return utils_1.isRegExp(currTokType[PATTERN]);
});
errors = errors.concat(findEndOfInputAnchor(withRegExpPatterns));
errors = errors.concat(findStartOfInputAnchor(withRegExpPatterns));
errors = errors.concat(findUnsupportedFlags(withRegExpPatterns));
errors = errors.concat(findDuplicatePatterns(withRegExpPatterns));
errors = errors.concat(findEmptyMatchRegExps(withRegExpPatterns));
return errors;
}
function findMissingPatterns(tokenTypes) {
var tokenTypesWithMissingPattern = utils_1.filter(tokenTypes, function (currType) {
return !utils_1.has(currType, PATTERN);
});
var errors = utils_1.map(tokenTypesWithMissingPattern, function (currType) {
return {
message: "Token Type: ->" +
currType.name +
"<- missing static 'PATTERN' property",
type: lexer_public_1.LexerDefinitionErrorType.MISSING_PATTERN,
tokenTypes: [currType]
};
});
var valid = utils_1.difference(tokenTypes, tokenTypesWithMissingPattern);
return { errors: errors, valid: valid };
}
exports.findMissingPatterns = findMissingPatterns;
function findInvalidPatterns(tokenTypes) {
var tokenTypesWithInvalidPattern = utils_1.filter(tokenTypes, function (currType) {
var pattern = currType[PATTERN];
return (!utils_1.isRegExp(pattern) &&
!utils_1.isFunction(pattern) &&
!utils_1.has(pattern, "exec") &&
!utils_1.isString(pattern));
});
var errors = utils_1.map(tokenTypesWithInvalidPattern, function (currType) {
return {
message: "Token Type: ->" +
currType.name +
"<- static 'PATTERN' can only be a RegExp, a" +
" Function matching the {CustomPatternMatcherFunc} type or an Object matching the {ICustomPattern} interface.",
type: lexer_public_1.LexerDefinitionErrorType.INVALID_PATTERN,
tokenTypes: [currType]
};
});
var valid = utils_1.difference(tokenTypes, tokenTypesWithInvalidPattern);
return { errors: errors, valid: valid };
}
exports.findInvalidPatterns = findInvalidPatterns;
var end_of_input = /[^\\][\$]/;
function findEndOfInputAnchor(tokenTypes) {
var EndAnchorFinder = /** @class */ (function (_super) {
__extends(EndAnchorFinder, _super);
function EndAnchorFinder() {
var _this = _super !== null && _super.apply(this, arguments) || this;
_this.found = false;
return _this;
}
EndAnchorFinder.prototype.visitEndAnchor = function (node) {
this.found = true;
};
return EndAnchorFinder;
}(regexp_to_ast_1.BaseRegExpVisitor));
var invalidRegex = utils_1.filter(tokenTypes, function (currType) {
var pattern = currType[PATTERN];
try {
var regexpAst = reg_exp_parser_1.getRegExpAst(pattern);
var endAnchorVisitor = new EndAnchorFinder();
endAnchorVisitor.visit(regexpAst);
return endAnchorVisitor.found;
}
catch (e) {
// old behavior in case of runtime exceptions with regexp-to-ast.
/* istanbul ignore next - cannot ensure an error in regexp-to-ast*/
return end_of_input.test(pattern.source);
}
});
var errors = utils_1.map(invalidRegex, function (currType) {
return {
message: "Unexpected RegExp Anchor Error:\n" +
"\tToken Type: ->" +
currType.name +
"<- static 'PATTERN' cannot contain end of input anchor '$'\n" +
"\tSee chevrotain.io/docs/guide/resolving_lexer_errors.html#ANCHORS" +
"\tfor details.",
type: lexer_public_1.LexerDefinitionErrorType.EOI_ANCHOR_FOUND,
tokenTypes: [currType]
};
});
return errors;
}
exports.findEndOfInputAnchor = findEndOfInputAnchor;
function findEmptyMatchRegExps(tokenTypes) {
var matchesEmptyString = utils_1.filter(tokenTypes, function (currType) {
var pattern = currType[PATTERN];
return pattern.test("");
});
var errors = utils_1.map(matchesEmptyString, function (currType) {
return {
message: "Token Type: ->" +
currType.name +
"<- static 'PATTERN' must not match an empty string",
type: lexer_public_1.LexerDefinitionErrorType.EMPTY_MATCH_PATTERN,
tokenTypes: [currType]
};
});
return errors;
}
exports.findEmptyMatchRegExps = findEmptyMatchRegExps;
var start_of_input = /[^\\[][\^]|^\^/;
function findStartOfInputAnchor(tokenTypes) {
var StartAnchorFinder = /** @class */ (function (_super) {
__extends(StartAnchorFinder, _super);
function StartAnchorFinder() {
var _this = _super !== null && _super.apply(this, arguments) || this;
_this.found = false;
return _this;
}
StartAnchorFinder.prototype.visitStartAnchor = function (node) {
this.found = true;
};
return StartAnchorFinder;
}(regexp_to_ast_1.BaseRegExpVisitor));
var invalidRegex = utils_1.filter(tokenTypes, function (currType) {
var pattern = currType[PATTERN];
try {
var regexpAst = reg_exp_parser_1.getRegExpAst(pattern);
var startAnchorVisitor = new StartAnchorFinder();
startAnchorVisitor.visit(regexpAst);
return startAnchorVisitor.found;
}
catch (e) {
// old behavior in case of runtime exceptions with regexp-to-ast.
/* istanbul ignore next - cannot ensure an error in regexp-to-ast*/
return start_of_input.test(pattern.source);
}
});
var errors = utils_1.map(invalidRegex, function (currType) {
return {
message: "Unexpected RegExp Anchor Error:\n" +
"\tToken Type: ->" +
currType.name +
"<- static 'PATTERN' cannot contain start of input anchor '^'\n" +
"\tSee https://chevrotain.io/docs/guide/resolving_lexer_errors.html#ANCHORS" +
"\tfor details.",
type: lexer_public_1.LexerDefinitionErrorType.SOI_ANCHOR_FOUND,
tokenTypes: [currType]
};
});
return errors;
}
exports.findStartOfInputAnchor = findStartOfInputAnchor;
function findUnsupportedFlags(tokenTypes) {
var invalidFlags = utils_1.filter(tokenTypes, function (currType) {
var pattern = currType[PATTERN];
return pattern instanceof RegExp && (pattern.multiline || pattern.global);
});
var errors = utils_1.map(invalidFlags, function (currType) {
return {
message: "Token Type: ->" +
currType.name +
"<- static 'PATTERN' may NOT contain global('g') or multiline('m')",
type: lexer_public_1.LexerDefinitionErrorType.UNSUPPORTED_FLAGS_FOUND,
tokenTypes: [currType]
};
});
return errors;
}
exports.findUnsupportedFlags = findUnsupportedFlags;
// This can only test for identical duplicate RegExps, not semantically equivalent ones.
function findDuplicatePatterns(tokenTypes) {
var found = [];
var identicalPatterns = utils_1.map(tokenTypes, function (outerType) {
return utils_1.reduce(tokenTypes, function (result, innerType) {
if (outerType.PATTERN.source === innerType.PATTERN.source &&
!utils_1.contains(found, innerType) &&
innerType.PATTERN !== lexer_public_1.Lexer.NA) {
// this avoids duplicates in the result, each Token Type may only appear in one "set"
// in essence we are creating Equivalence classes on equality relation.
found.push(innerType);
result.push(innerType);
return result;
}
return result;
}, []);
});
identicalPatterns = utils_1.compact(identicalPatterns);
var duplicatePatterns = utils_1.filter(identicalPatterns, function (currIdenticalSet) {
return currIdenticalSet.length > 1;
});
var errors = utils_1.map(duplicatePatterns, function (setOfIdentical) {
var tokenTypeNames = utils_1.map(setOfIdentical, function (currType) {
return currType.name;
});
var dupPatternSrc = utils_1.first(setOfIdentical).PATTERN;
return {
message: "The same RegExp pattern ->" + dupPatternSrc + "<-" +
("has been used in all of the following Token Types: " + tokenTypeNames.join(", ") + " <-"),
type: lexer_public_1.LexerDefinitionErrorType.DUPLICATE_PATTERNS_FOUND,
tokenTypes: setOfIdentical
};
});
return errors;
}
exports.findDuplicatePatterns = findDuplicatePatterns;
function findInvalidGroupType(tokenTypes) {
var invalidTypes = utils_1.filter(tokenTypes, function (clazz) {
if (!utils_1.has(clazz, "GROUP")) {
return false;
}
var group = clazz.GROUP;
return group !== lexer_public_1.Lexer.SKIPPED && group !== lexer_public_1.Lexer.NA && !utils_1.isString(group);
});
var errors = utils_1.map(invalidTypes, function (currType) {
return {
message: "Token Type: ->" +
currType.name +
"<- static 'GROUP' can only be Lexer.SKIPPED/Lexer.NA/A String",
type: lexer_public_1.LexerDefinitionErrorType.INVALID_GROUP_TYPE_FOUND,
tokenTypes: [currType]
};
});
return errors;
}
exports.findInvalidGroupType = findInvalidGroupType;
function findModesThatDoNotExist(tokenTypes, validModes) {
var invalidModes = utils_1.filter(tokenTypes, function (clazz) {
return (clazz.PUSH_MODE !== undefined && !utils_1.contains(validModes, clazz.PUSH_MODE));
});
var errors = utils_1.map(invalidModes, function (tokType) {
var msg = "Token Type: ->" + tokType.name + "<- static 'PUSH_MODE' value cannot refer to a Lexer Mode ->" + tokType.PUSH_MODE + "<-" +
"which does not exist";
return {
message: msg,
type: lexer_public_1.LexerDefinitionErrorType.PUSH_MODE_DOES_NOT_EXIST,
tokenTypes: [tokType]
};
});
return errors;
}
exports.findModesThatDoNotExist = findModesThatDoNotExist;
function findUnreachablePatterns(tokenTypes) {
var errors = [];
var canBeTested = utils_1.reduce(tokenTypes, function (result, tokType, idx) {
var pattern = tokType.PATTERN;
if (pattern === lexer_public_1.Lexer.NA) {
return result;
}
// a more comprehensive validation for all forms of regExps would require
// deeper regExp analysis capabilities
if (utils_1.isString(pattern)) {
result.push({ str: pattern, idx: idx, tokenType: tokType });
}
else if (utils_1.isRegExp(pattern) && noMetaChar(pattern)) {
result.push({ str: pattern.source, idx: idx, tokenType: tokType });
}
return result;
}, []);
utils_1.forEach(tokenTypes, function (tokType, testIdx) {
utils_1.forEach(canBeTested, function (_a) {
var str = _a.str, idx = _a.idx, tokenType = _a.tokenType;
if (testIdx < idx && testTokenType(str, tokType.PATTERN)) {
var msg = "Token: ->" + tokenType.name + "<- can never be matched.\n" +
("Because it appears AFTER the Token Type ->" + tokType.name + "<-") +
"in the lexer's definition.\n" +
"See https://chevrotain.io/docs/guide/resolving_lexer_errors.html#UNREACHABLE";
errors.push({
message: msg,
type: lexer_public_1.LexerDefinitionErrorType.UNREACHABLE_PATTERN,
tokenTypes: [tokType, tokenType]
});
}
});
});
return errors;
}
exports.findUnreachablePatterns = findUnreachablePatterns;
function testTokenType(str, pattern) {
/* istanbul ignore else */
if (utils_1.isRegExp(pattern)) {
var regExpArray = pattern.exec(str);
return regExpArray !== null && regExpArray.index === 0;
}
else if (utils_1.isFunction(pattern)) {
// maintain the API of custom patterns
return pattern(str, 0, [], {});
}
else if (utils_1.has(pattern, "exec")) {
// maintain the API of custom patterns
return pattern.exec(str, 0, [], {});
}
else if (typeof pattern === "string") {
return pattern === str;
}
else {
throw Error("non exhaustive match");
}
}
function noMetaChar(regExp) {
//https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp
var metaChars = [
".",
"\\",
"[",
"]",
"|",
"^",
"$",
"(",
")",
"?",
"*",
"+",
"{"
];
return (utils_1.find(metaChars, function (char) { return regExp.source.indexOf(char) !== -1; }) === undefined);
}
function addStartOfInput(pattern) {
var flags = pattern.ignoreCase ? "i" : "";
// always wrapping in a none capturing group preceded by '^' to make sure matching can only work on start of input.
// duplicate/redundant start of input markers have no meaning (/^^^^A/ === /^A/)
return new RegExp("^(?:" + pattern.source + ")", flags);
}
exports.addStartOfInput = addStartOfInput;
function addStickyFlag(pattern) {
var flags = pattern.ignoreCase ? "iy" : "y";
// always wrapping in a none capturing group preceded by '^' to make sure matching can only work on start of input.
// duplicate/redundant start of input markers have no meaning (/^^^^A/ === /^A/)
return new RegExp("" + pattern.source, flags);
}
exports.addStickyFlag = addStickyFlag;
function performRuntimeChecks(lexerDefinition, trackLines, lineTerminatorCharacters) {
var errors = [];
// some run time checks to help the end users.
if (!utils_1.has(lexerDefinition, exports.DEFAULT_MODE)) {
errors.push({
message: "A MultiMode Lexer cannot be initialized without a <" +
exports.DEFAULT_MODE +
"> property in its definition\n",
type: lexer_public_1.LexerDefinitionErrorType.MULTI_MODE_LEXER_WITHOUT_DEFAULT_MODE
});
}
if (!utils_1.has(lexerDefinition, exports.MODES)) {
errors.push({
message: "A MultiMode Lexer cannot be initialized without a <" +
exports.MODES +
"> property in its definition\n",
type: lexer_public_1.LexerDefinitionErrorType.MULTI_MODE_LEXER_WITHOUT_MODES_PROPERTY
});
}
if (utils_1.has(lexerDefinition, exports.MODES) &&
utils_1.has(lexerDefinition, exports.DEFAULT_MODE) &&
!utils_1.has(lexerDefinition.modes, lexerDefinition.defaultMode)) {
errors.push({
message: "A MultiMode Lexer cannot be initialized with a " + exports.DEFAULT_MODE + ": <" + lexerDefinition.defaultMode + ">" +
"which does not exist\n",
type: lexer_public_1.LexerDefinitionErrorType.MULTI_MODE_LEXER_DEFAULT_MODE_VALUE_DOES_NOT_EXIST
});
}
if (utils_1.has(lexerDefinition, exports.MODES)) {
utils_1.forEach(lexerDefinition.modes, function (currModeValue, currModeName) {
utils_1.forEach(currModeValue, function (currTokType, currIdx) {
if (utils_1.isUndefined(currTokType)) {
errors.push({
message: "A Lexer cannot be initialized using an undefined Token Type. Mode:" +
("<" + currModeName + "> at index: <" + currIdx + ">\n"),
type: lexer_public_1.LexerDefinitionErrorType.LEXER_DEFINITION_CANNOT_CONTAIN_UNDEFINED
});
}
});
});
}
return errors;
}
exports.performRuntimeChecks = performRuntimeChecks;
function performWarningRuntimeChecks(lexerDefinition, trackLines, lineTerminatorCharacters) {
var warnings = [];
var hasAnyLineBreak = false;
var allTokenTypes = utils_1.compact(utils_1.flatten(utils_1.mapValues(lexerDefinition.modes, function (tokTypes) { return tokTypes; })));
var concreteTokenTypes = utils_1.reject(allTokenTypes, function (currType) { return currType[PATTERN] === lexer_public_1.Lexer.NA; });
var terminatorCharCodes = getCharCodes(lineTerminatorCharacters);
if (trackLines) {
utils_1.forEach(concreteTokenTypes, function (tokType) {
var currIssue = checkLineBreaksIssues(tokType, terminatorCharCodes);
if (currIssue !== false) {
var message = buildLineBreakIssueMessage(tokType, currIssue);
var warningDescriptor = {
message: message,
type: currIssue.issue,
tokenType: tokType
};
warnings.push(warningDescriptor);
}
else {
// we don't want to attempt to scan if the user explicitly specified the line_breaks option.
if (utils_1.has(tokType, "LINE_BREAKS")) {
if (tokType.LINE_BREAKS === true) {
hasAnyLineBreak = true;
}
}
else {
if (reg_exp_1.canMatchCharCode(terminatorCharCodes, tokType.PATTERN)) {
hasAnyLineBreak = true;
}
}
}
});
}
if (trackLines && !hasAnyLineBreak) {
warnings.push({
message: "Warning: No LINE_BREAKS Found.\n" +
"\tThis Lexer has been defined to track line and column information,\n" +
"\tBut none of the Token Types can be identified as matching a line terminator.\n" +
"\tSee https://chevrotain.io/docs/guide/resolving_lexer_errors.html#LINE_BREAKS \n" +
"\tfor details.",
type: lexer_public_1.LexerDefinitionErrorType.NO_LINE_BREAKS_FLAGS
});
}
return warnings;
}
exports.performWarningRuntimeChecks = performWarningRuntimeChecks;
function cloneEmptyGroups(emptyGroups) {
var clonedResult = {};
var groupKeys = utils_1.keys(emptyGroups);
utils_1.forEach(groupKeys, function (currKey) {
var currGroupValue = emptyGroups[currKey];
/* istanbul ignore else */
if (utils_1.isArray(currGroupValue)) {
clonedResult[currKey] = [];
}
else {
throw Error("non exhaustive match");
}
});
return clonedResult;
}
exports.cloneEmptyGroups = cloneEmptyGroups;
// TODO: refactor to avoid duplication
function isCustomPattern(tokenType) {
var pattern = tokenType.PATTERN;
/* istanbul ignore else */
if (utils_1.isRegExp(pattern)) {
return false;
}
else if (utils_1.isFunction(pattern)) {
// CustomPatternMatcherFunc - custom patterns do not require any transformations, only wrapping in a RegExp Like object
return true;
}
else if (utils_1.has(pattern, "exec")) {
// ICustomPattern
return true;
}
else if (utils_1.isString(pattern)) {
return false;
}
else {
throw Error("non exhaustive match");
}
}
exports.isCustomPattern = isCustomPattern;
function isShortPattern(pattern) {
if (utils_1.isString(pattern) && pattern.length === 1) {
return pattern.charCodeAt(0);
}
else {
return false;
}
}
exports.isShortPattern = isShortPattern;
/**
* Faster than using a RegExp for default newline detection during lexing.
*/
exports.LineTerminatorOptimizedTester = {
// implements /\n|\r\n?/g.test
test: function (text) {
var len = text.length;
for (var i = this.lastIndex; i < len; i++) {
var c = text.charCodeAt(i);
if (c === 10) {
this.lastIndex = i + 1;
return true;
}
else if (c === 13) {
if (text.charCodeAt(i + 1) === 10) {
this.lastIndex = i + 2;
}
else {
this.lastIndex = i + 1;
}
return true;
}
}
return false;
},
lastIndex: 0
};
function checkLineBreaksIssues(tokType, lineTerminatorCharCodes) {
if (utils_1.has(tokType, "LINE_BREAKS")) {
// if the user explicitly declared the line_breaks option we will respect their choice
// and assume it is correct.
return false;
}
else {
/* istanbul ignore else */
if (utils_1.isRegExp(tokType.PATTERN)) {
try {
reg_exp_1.canMatchCharCode(lineTerminatorCharCodes, tokType.PATTERN);
}
catch (e) {
/* istanbul ignore next - to test this we would have to mock <canMatchCharCode> to throw an error */
return {
issue: lexer_public_1.LexerDefinitionErrorType.IDENTIFY_TERMINATOR,
errMsg: e.message
};
}
return false;
}
else if (utils_1.isString(tokType.PATTERN)) {
// string literal patterns can always be analyzed to detect line terminator usage
return false;
}
else if (isCustomPattern(tokType)) {
// custom token types
return { issue: lexer_public_1.LexerDefinitionErrorType.CUSTOM_LINE_BREAK };
}
else {
throw Error("non exhaustive match");
}
}
}
function buildLineBreakIssueMessage(tokType, details) {
/* istanbul ignore else */
if (details.issue === lexer_public_1.LexerDefinitionErrorType.IDENTIFY_TERMINATOR) {
return ("Warning: unable to identify line terminator usage in pattern.\n" +
("\tThe problem is in the <" + tokType.name + "> Token Type\n") +
("\t Root cause: " + details.errMsg + ".\n") +
"\tFor details See: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#IDENTIFY_TERMINATOR");
}
else if (details.issue === lexer_public_1.LexerDefinitionErrorType.CUSTOM_LINE_BREAK) {
return ("Warning: A Custom Token Pattern should specify the <line_breaks> option.\n" +
("\tThe problem is in the <" + tokType.name + "> Token Type\n") +
"\tFor details See: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#CUSTOM_LINE_BREAK");
}
else {
throw Error("non exhaustive match");
}
}
exports.buildLineBreakIssueMessage = buildLineBreakIssueMessage;
function getCharCodes(charsOrCodes) {
var charCodes = utils_1.map(charsOrCodes, function (numOrString) {
if (utils_1.isString(numOrString) && numOrString.length > 0) {
return numOrString.charCodeAt(0);
}
else {
return numOrString;
}
});
return charCodes;
}
function addToMapOfArrays(map, key, value) {
if (map[key] === undefined) {
map[key] = [value];
}
else {
map[key].push(value);
}
}
exports.minOptimizationVal = 256;
/**
* We ae mapping charCode above ASCI (256) into buckets each in the size of 256.
* This is because ASCI are the most common start chars so each one of those will get its own
* possible token configs vector.
*
* Tokens starting with charCodes "above" ASCI are uncommon, so we can "afford"
* to place these into buckets of possible token configs, What we gain from
* this is avoiding the case of creating an optimization 'charCodeToPatternIdxToConfig'
* which would contain 10,000+ arrays of small size (e.g unicode Identifiers scenario).
* Our 'charCodeToPatternIdxToConfig' max size will now be:
* 256 + (2^16 / 2^8) - 1 === 511
*
* note the hack for fast division integer part extraction
* See: https://stackoverflow.com/a/4228528
*/
function charCodeToOptimizedIndex(charCode) {
return charCode < exports.minOptimizationVal
? charCode
: charCodeToOptimizedIdxMap[charCode];
}
exports.charCodeToOptimizedIndex = charCodeToOptimizedIndex;
/**
* This is a compromise between cold start / hot running performance
* Creating this array takes ~3ms on a modern machine,
* But if we perform the computation at runtime as needed the CSS Lexer benchmark
* performance degrades by ~10%
*
* TODO: Perhaps it should be lazy initialized only if a charCode > 255 is used.
*/
var charCodeToOptimizedIdxMap = [];
function initCharCodeToOptimizedIndexMap() {
if (utils_1.isEmpty(charCodeToOptimizedIdxMap)) {
charCodeToOptimizedIdxMap = new Array(65536);
for (var i = 0; i < 65536; i++) {
/* tslint:disable */
charCodeToOptimizedIdxMap[i] = i > 255 ? 255 + ~~(i / 255) : i;
/* tslint:enable */
}
}
}
//# sourceMappingURL=lexer.js.map
;