chevrotain
Version:
Chevrotain is a high performance fault tolerant javascript parsing DSL for building recursive decent parsers
261 lines • 12.2 kB
JavaScript
var __extends = (this && this.__extends) || (function () {
var extendStatics = function (d, b) {
extendStatics = Object.setPrototypeOf ||
({ __proto__: [] } instanceof Array && function (d, b) { d.__proto__ = b; }) ||
function (d, b) { for (var p in b) if (Object.prototype.hasOwnProperty.call(b, p)) d[p] = b[p]; };
return extendStatics(d, b);
};
return function (d, b) {
extendStatics(d, b);
function __() { this.constructor = d; }
d.prototype = b === null ? Object.create(b) : (__.prototype = b.prototype, new __());
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.canMatchCharCode = exports.firstCharOptimizedIndices = exports.getOptimizedStartCodesIndices = exports.failedOptimizationPrefixMsg = void 0;
var regexp_to_ast_1 = require("regexp-to-ast");
var utils_1 = require("../utils/utils");
var reg_exp_parser_1 = require("./reg_exp_parser");
var lexer_1 = require("./lexer");
var complementErrorMessage = "Complement Sets are not supported for first char optimization";
exports.failedOptimizationPrefixMsg = 'Unable to use "first char" lexer optimizations:\n';
function getOptimizedStartCodesIndices(regExp, ensureOptimizations) {
if (ensureOptimizations === void 0) { ensureOptimizations = false; }
try {
var ast = reg_exp_parser_1.getRegExpAst(regExp);
var firstChars = firstCharOptimizedIndices(ast.value, {}, ast.flags.ignoreCase);
return firstChars;
}
catch (e) {
/* istanbul ignore next */
// Testing this relies on the regexp-to-ast library having a bug... */
// TODO: only the else branch needs to be ignored, try to fix with newer prettier / tsc
if (e.message === complementErrorMessage) {
if (ensureOptimizations) {
utils_1.PRINT_WARNING("" + exports.failedOptimizationPrefixMsg +
("\tUnable to optimize: < " + regExp.toString() + " >\n") +
"\tComplement Sets cannot be automatically optimized.\n" +
"\tThis will disable the lexer's first char optimizations.\n" +
"\tSee: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#COMPLEMENT for details.");
}
}
else {
var msgSuffix = "";
if (ensureOptimizations) {
msgSuffix =
"\n\tThis will disable the lexer's first char optimizations.\n" +
"\tSee: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#REGEXP_PARSING for details.";
}
utils_1.PRINT_ERROR(exports.failedOptimizationPrefixMsg + "\n" +
("\tFailed parsing: < " + regExp.toString() + " >\n") +
("\tUsing the regexp-to-ast library version: " + regexp_to_ast_1.VERSION + "\n") +
"\tPlease open an issue at: https://github.com/bd82/regexp-to-ast/issues" +
msgSuffix);
}
}
return [];
}
exports.getOptimizedStartCodesIndices = getOptimizedStartCodesIndices;
function firstCharOptimizedIndices(ast, result, ignoreCase) {
switch (ast.type) {
case "Disjunction":
for (var i = 0; i < ast.value.length; i++) {
firstCharOptimizedIndices(ast.value[i], result, ignoreCase);
}
break;
case "Alternative":
var terms = ast.value;
for (var i = 0; i < terms.length; i++) {
var term = terms[i];
// skip terms that cannot effect the first char results
switch (term.type) {
case "EndAnchor":
// A group back reference cannot affect potential starting char.
// because if a back reference is the first production than automatically
// the group being referenced has had to come BEFORE so its codes have already been added
case "GroupBackReference":
// assertions do not affect potential starting codes
case "Lookahead":
case "NegativeLookahead":
case "StartAnchor":
case "WordBoundary":
case "NonWordBoundary":
continue;
}
var atom = term;
switch (atom.type) {
case "Character":
addOptimizedIdxToResult(atom.value, result, ignoreCase);
break;
case "Set":
if (atom.complement === true) {
throw Error(complementErrorMessage);
}
utils_1.forEach(atom.value, function (code) {
if (typeof code === "number") {
addOptimizedIdxToResult(code, result, ignoreCase);
}
else {
// range
var range = code;
// cannot optimize when ignoreCase is
if (ignoreCase === true) {
for (var rangeCode = range.from; rangeCode <= range.to; rangeCode++) {
addOptimizedIdxToResult(rangeCode, result, ignoreCase);
}
}
// Optimization (2 orders of magnitude less work for very large ranges)
else {
// handle unoptimized values
for (var rangeCode = range.from; rangeCode <= range.to && rangeCode < lexer_1.minOptimizationVal; rangeCode++) {
addOptimizedIdxToResult(rangeCode, result, ignoreCase);
}
// Less common charCode where we optimize for faster init time, by using larger "buckets"
if (range.to >= lexer_1.minOptimizationVal) {
var minUnOptVal = range.from >= lexer_1.minOptimizationVal
? range.from
: lexer_1.minOptimizationVal;
var maxUnOptVal = range.to;
var minOptIdx = lexer_1.charCodeToOptimizedIndex(minUnOptVal);
var maxOptIdx = lexer_1.charCodeToOptimizedIndex(maxUnOptVal);
for (var currOptIdx = minOptIdx; currOptIdx <= maxOptIdx; currOptIdx++) {
result[currOptIdx] = currOptIdx;
}
}
}
}
});
break;
case "Group":
firstCharOptimizedIndices(atom.value, result, ignoreCase);
break;
/* istanbul ignore next */
default:
throw Error("Non Exhaustive Match");
}
// reached a mandatory production, no more **start** codes can be found on this alternative
var isOptionalQuantifier = atom.quantifier !== undefined && atom.quantifier.atLeast === 0;
if (
// A group may be optional due to empty contents /(?:)/
// or if everything inside it is optional /((a)?)/
(atom.type === "Group" && isWholeOptional(atom) === false) ||
// If this term is not a group it may only be optional if it has an optional quantifier
(atom.type !== "Group" && isOptionalQuantifier === false)) {
break;
}
}
break;
/* istanbul ignore next */
default:
throw Error("non exhaustive match!");
}
// console.log(Object.keys(result).length)
return utils_1.values(result);
}
exports.firstCharOptimizedIndices = firstCharOptimizedIndices;
function addOptimizedIdxToResult(code, result, ignoreCase) {
var optimizedCharIdx = lexer_1.charCodeToOptimizedIndex(code);
result[optimizedCharIdx] = optimizedCharIdx;
if (ignoreCase === true) {
handleIgnoreCase(code, result);
}
}
function handleIgnoreCase(code, result) {
var char = String.fromCharCode(code);
var upperChar = char.toUpperCase();
/* istanbul ignore else */
if (upperChar !== char) {
var optimizedCharIdx = lexer_1.charCodeToOptimizedIndex(upperChar.charCodeAt(0));
result[optimizedCharIdx] = optimizedCharIdx;
}
else {
var lowerChar = char.toLowerCase();
if (lowerChar !== char) {
var optimizedCharIdx = lexer_1.charCodeToOptimizedIndex(lowerChar.charCodeAt(0));
result[optimizedCharIdx] = optimizedCharIdx;
}
}
}
function findCode(setNode, targetCharCodes) {
return utils_1.find(setNode.value, function (codeOrRange) {
if (typeof codeOrRange === "number") {
return utils_1.contains(targetCharCodes, codeOrRange);
}
else {
// range
var range_1 = codeOrRange;
return (utils_1.find(targetCharCodes, function (targetCode) { return range_1.from <= targetCode && targetCode <= range_1.to; }) !== undefined);
}
});
}
function isWholeOptional(ast) {
if (ast.quantifier && ast.quantifier.atLeast === 0) {
return true;
}
if (!ast.value) {
return false;
}
return utils_1.isArray(ast.value)
? utils_1.every(ast.value, isWholeOptional)
: isWholeOptional(ast.value);
}
var CharCodeFinder = /** @class */ (function (_super) {
__extends(CharCodeFinder, _super);
function CharCodeFinder(targetCharCodes) {
var _this = _super.call(this) || this;
_this.targetCharCodes = targetCharCodes;
_this.found = false;
return _this;
}
CharCodeFinder.prototype.visitChildren = function (node) {
// No need to keep looking...
if (this.found === true) {
return;
}
// switch lookaheads as they do not actually consume any characters thus
// finding a charCode at lookahead context does not mean that regexp can actually contain it in a match.
switch (node.type) {
case "Lookahead":
this.visitLookahead(node);
return;
case "NegativeLookahead":
this.visitNegativeLookahead(node);
return;
}
_super.prototype.visitChildren.call(this, node);
};
CharCodeFinder.prototype.visitCharacter = function (node) {
if (utils_1.contains(this.targetCharCodes, node.value)) {
this.found = true;
}
};
CharCodeFinder.prototype.visitSet = function (node) {
if (node.complement) {
if (findCode(node, this.targetCharCodes) === undefined) {
this.found = true;
}
}
else {
if (findCode(node, this.targetCharCodes) !== undefined) {
this.found = true;
}
}
};
return CharCodeFinder;
}(regexp_to_ast_1.BaseRegExpVisitor));
function canMatchCharCode(charCodes, pattern) {
if (pattern instanceof RegExp) {
var ast = reg_exp_parser_1.getRegExpAst(pattern);
var charCodeFinder = new CharCodeFinder(charCodes);
charCodeFinder.visit(ast);
return charCodeFinder.found;
}
else {
return (utils_1.find(pattern, function (char) {
return utils_1.contains(charCodes, char.charCodeAt(0));
}) !== undefined);
}
}
exports.canMatchCharCode = canMatchCharCode;
//# sourceMappingURL=reg_exp.js.map
;