UNPKG

chevrotain

Version:

Chevrotain is a high-performance, fault-tolerant JavaScript parsing DSL for building recursive descent parsers

261 lines 12.2 kB
"use strict";
// TypeScript-emitted helper that wires up prototype based inheritance (used by CharCodeFinder below).
var __extends = (this && this.__extends) || (function () {
    var extendStatics = function (d, b) {
        extendStatics = Object.setPrototypeOf ||
            ({ __proto__: [] } instanceof Array && function (d, b) { d.__proto__ = b; }) ||
            function (d, b) { for (var p in b) if (Object.prototype.hasOwnProperty.call(b, p)) d[p] = b[p]; };
        return extendStatics(d, b);
    };
    return function (d, b) {
        extendStatics(d, b);
        function __() { this.constructor = d; }
        d.prototype = b === null ? Object.create(b) : (__.prototype = b.prototype, new __());
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.canMatchCharCode = exports.firstCharOptimizedIndices = exports.getOptimizedStartCodesIndices = exports.failedOptimizationPrefixMsg = void 0;
var regexp_to_ast_1 = require("regexp-to-ast");
var utils_1 = require("../utils/utils");
var reg_exp_parser_1 = require("./reg_exp_parser");
var lexer_1 = require("./lexer");
// Message of the error thrown by firstCharOptimizedIndices on a complement set ([^...]).
// getOptimizedStartCodesIndices compares against it to pick the right diagnostic.
var complementErrorMessage = "Complement Sets are not supported for first char optimization";
exports.failedOptimizationPrefixMsg = 'Unable to use "first char" lexer optimizations:\n';
/**
 * Computes the "optimized" start code indices of a regular expression, i.e. the
 * indices (as produced by lexer_1.charCodeToOptimizedIndex) of the char codes
 * a match of the pattern may begin with.
 *
 * Never throws: any failure while parsing or analyzing the pattern is caught and
 * an empty array is returned instead.
 *
 * @param {RegExp} regExp - The pattern to analyze.
 * @param {boolean} [ensureOptimizations=false] - When true, a warning is printed if the
 *   pattern contains a complement set, and the error printed for any other failure
 *   is extended with a note that the optimization will be disabled.
 * @returns {Array} The values collected by firstCharOptimizedIndices, or [] on failure.
 */
function getOptimizedStartCodesIndices(regExp, ensureOptimizations) {
    if (ensureOptimizations === void 0) { ensureOptimizations = false; }
    try {
        var ast = reg_exp_parser_1.getRegExpAst(regExp);
        var firstChars = firstCharOptimizedIndices(ast.value, {}, ast.flags.ignoreCase);
        return firstChars;
    }
    catch (e) {
        /* istanbul ignore next */
        // Testing this relies on the regexp-to-ast library having a bug...
        // TODO: only the else branch needs to be ignored, try to fix with newer prettier / tsc
        if (e.message === complementErrorMessage) {
            // Complement sets are a known limitation: stay silent unless the caller
            // explicitly asked for the optimizations to be ensured.
            if (ensureOptimizations) {
                utils_1.PRINT_WARNING("" + exports.failedOptimizationPrefixMsg +
                    ("\tUnable to optimize: < " + regExp.toString() + " >\n") +
                    "\tComplement Sets cannot be automatically optimized.\n" +
                    "\tThis will disable the lexer's first char optimizations.\n" +
                    "\tSee: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#COMPLEMENT for details.");
            }
        }
        else {
            // Any other failure is always reported as an error.
            var msgSuffix = "";
            if (ensureOptimizations) {
                msgSuffix =
                    "\n\tThis will disable the lexer's first char optimizations.\n" +
                        "\tSee: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#REGEXP_PARSING for details.";
            }
            utils_1.PRINT_ERROR(exports.failedOptimizationPrefixMsg + "\n" +
                ("\tFailed parsing: < " + regExp.toString() + " >\n") +
                ("\tUsing the regexp-to-ast library version: " + regexp_to_ast_1.VERSION + "\n") +
                "\tPlease open an issue at: https://github.com/bd82/regexp-to-ast/issues" +
                msgSuffix);
        }
    }
    return [];
}
exports.getOptimizedStartCodesIndices = getOptimizedStartCodesIndices;
/**
 * Recursively collects into `result` the optimized indices of every char code
 * that may start a match of the given regexp AST node.
 *
 * @param {Object} ast - A "Disjunction" or "Alternative" node (any other type throws).
 * @param {Object} result - Accumulator mapping optimizedIdx -> optimizedIdx; mutated in place.
 * @param {boolean} ignoreCase - Whether the pattern carries the ignore-case flag; when true the
 *   other-case variant of each code is recorded as well.
 * @returns {Array} utils_1.values(result), i.e. the values accumulated so far.
 * @throws {Error} With message `complementErrorMessage` when a complement set is encountered.
 */
function firstCharOptimizedIndices(ast, result, ignoreCase) {
    switch (ast.type) {
        case "Disjunction":
            // Every alternative may contribute its own start codes.
            for (var i = 0; i < ast.value.length; i++) {
                firstCharOptimizedIndices(ast.value[i], result, ignoreCase);
            }
            break;
        case "Alternative":
            var terms = ast.value;
            for (var i = 0; i < terms.length; i++) {
                var term = terms[i];
                // skip terms that cannot affect the first char results
                switch (term.type) {
                    case "EndAnchor":
                    // A group back reference cannot affect potential starting char.
                    // because if a back reference is the first production then automatically
                    // the group being referenced has had to come BEFORE so its codes have already been added
                    case "GroupBackReference":
                    // assertions do not affect potential starting codes
                    case "Lookahead":
                    case "NegativeLookahead":
                    case "StartAnchor":
                    case "WordBoundary":
                    case "NonWordBoundary":
                        // `continue` targets the enclosing `for` loop: move on to the next term.
                        continue;
                }
                var atom = term;
                switch (atom.type) {
                    case "Character":
                        addOptimizedIdxToResult(atom.value, result, ignoreCase);
                        break;
                    case "Set":
                        if (atom.complement === true) {
                            throw Error(complementErrorMessage);
                        }
                        utils_1.forEach(atom.value, function (code) {
                            if (typeof code === "number") {
                                addOptimizedIdxToResult(code, result, ignoreCase);
                            }
                            else {
                                // range
                                var range = code;
                                // cannot use the bucket optimization when ignoreCase is enabled:
                                // every code of the range is added (with its other-case variant) one by one.
                                if (ignoreCase === true) {
                                    for (var rangeCode = range.from; rangeCode <= range.to; rangeCode++) {
                                        addOptimizedIdxToResult(rangeCode, result, ignoreCase);
                                    }
                                }
                                // Optimization (2 orders of magnitude less work for very large ranges)
                                else {
                                    // handle unoptimized values
                                    for (var rangeCode = range.from; rangeCode <= range.to && rangeCode < lexer_1.minOptimizationVal; rangeCode++) {
                                        addOptimizedIdxToResult(rangeCode, result, ignoreCase);
                                    }
                                    // Less common charCode where we optimize for faster init time, by using larger "buckets"
                                    if (range.to >= lexer_1.minOptimizationVal) {
                                        var minUnOptVal = range.from >= lexer_1.minOptimizationVal
                                            ? range.from
                                            : lexer_1.minOptimizationVal;
                                        var maxUnOptVal = range.to;
                                        var minOptIdx = lexer_1.charCodeToOptimizedIndex(minUnOptVal);
                                        var maxOptIdx = lexer_1.charCodeToOptimizedIndex(maxUnOptVal);
                                        // Iterate over the optimized indices instead of over every char code.
                                        for (var currOptIdx = minOptIdx; currOptIdx <= maxOptIdx; currOptIdx++) {
                                            result[currOptIdx] = currOptIdx;
                                        }
                                    }
                                }
                            }
                        });
                        break;
                    case "Group":
                        firstCharOptimizedIndices(atom.value, result, ignoreCase);
                        break;
                    /* istanbul ignore next */
                    default:
                        throw Error("Non Exhaustive Match");
                }
                // reached a mandatory production, no more **start** codes can be found on this alternative
                var isOptionalQuantifier = atom.quantifier !== undefined && atom.quantifier.atLeast === 0;
                if (
                // A group may be optional due to empty contents /(?:)/
                // or if everything inside it is optional /((a)?)/
                (atom.type === "Group" && isWholeOptional(atom) === false) ||
                    // If this term is not a group it may only be optional if it has an optional quantifier
                    (atom.type !== "Group" && isOptionalQuantifier === false)) {
                    // Exits the `for` loop over the terms of this alternative.
                    break;
                }
            }
            break;
        /* istanbul ignore next */
        default:
            throw Error("non exhaustive match!");
    }
    // console.log(Object.keys(result).length)
    return utils_1.values(result);
}
exports.firstCharOptimizedIndices = firstCharOptimizedIndices;
/**
 * Records the optimized index of `code` in `result`; with `ignoreCase` the
 * other-case variant of the character is recorded too.
 *
 * @param {number} code - A char code.
 * @param {Object} result - Accumulator mapping optimizedIdx -> optimizedIdx; mutated in place.
 * @param {boolean} ignoreCase - Whether to also record the other-case variant.
 */
function addOptimizedIdxToResult(code, result, ignoreCase) {
    var optimizedCharIdx = lexer_1.charCodeToOptimizedIndex(code);
    result[optimizedCharIdx] = optimizedCharIdx;
    if (ignoreCase === true) {
        handleIgnoreCase(code, result);
    }
}
/**
 * Records the optimized index of the other-case variant of `code`:
 * the upper case form when it differs from the character, otherwise the
 * lower case form when that one differs. Characters without case are ignored.
 * Only the first UTF-16 code unit of the case-mapped string is considered.
 *
 * @param {number} code - A char code.
 * @param {Object} result - Accumulator mapping optimizedIdx -> optimizedIdx; mutated in place.
 */
function handleIgnoreCase(code, result) {
    var char = String.fromCharCode(code);
    var upperChar = char.toUpperCase();
    /* istanbul ignore else */
    if (upperChar !== char) {
        var optimizedCharIdx = lexer_1.charCodeToOptimizedIndex(upperChar.charCodeAt(0));
        result[optimizedCharIdx] = optimizedCharIdx;
    }
    else {
        var lowerChar = char.toLowerCase();
        if (lowerChar !== char) {
            var optimizedCharIdx = lexer_1.charCodeToOptimizedIndex(lowerChar.charCodeAt(0));
            result[optimizedCharIdx] = optimizedCharIdx;
        }
    }
}
/**
 * Finds the first member of a "Set" node (a single char code or a {from, to} range)
 * that covers at least one of the target char codes.
 *
 * @param {Object} setNode - A "Set" AST node; its `value` lists char codes and/or ranges.
 * @param {number[]} targetCharCodes - The char codes being searched for.
 * @returns {(number|Object|undefined)} The result of utils_1.find over the set members
 *   (callers in this file only compare it against `undefined`).
 */
function findCode(setNode, targetCharCodes) {
    return utils_1.find(setNode.value, function (codeOrRange) {
        if (typeof codeOrRange === "number") {
            return utils_1.contains(targetCharCodes, codeOrRange);
        }
        else {
            // range
            var range_1 = codeOrRange;
            return (utils_1.find(targetCharCodes, function (targetCode) {
                return range_1.from <= targetCode && targetCode <= range_1.to;
            }) !== undefined);
        }
    });
}
/**
 * Checks whether an AST node can match without consuming any input.
 *
 * - A node carrying a quantifier with `atLeast === 0` is optional.
 * - A node without a (truthy) `value` is not optional.
 * - A "Disjunction" is optional when ANY of its alternatives is optional.
 * - Any other node with an array `value` (e.g. the terms of an "Alternative") is optional
 *   only when EVERY item is optional; a non-array `value` is checked recursively.
 *
 * @param {Object} ast - The AST node (or nested value) to inspect.
 * @returns {boolean} true when the node may match the empty string.
 */
function isWholeOptional(ast) {
    if (ast.quantifier && ast.quantifier.atLeast === 0) {
        return true;
    }
    if (!ast.value) {
        return false;
    }
    if (ast.type === "Disjunction") {
        // One optional alternative is enough for the whole disjunction to match nothing,
        // e.g. the group in /(a|b?)c/ may be skipped so "c" is a valid start char.
        // Requiring every alternative to be optional would drop such start codes.
        return utils_1.find(ast.value, isWholeOptional) !== undefined;
    }
    return utils_1.isArray(ast.value)
        ? utils_1.every(ast.value, isWholeOptional)
        : isWholeOptional(ast.value);
}
/**
 * Regexp AST visitor that sets its `found` flag once a visited "Character" or "Set"
 * node can match one of the target char codes.
 */
var CharCodeFinder = /** @class */ (function (_super) {
    __extends(CharCodeFinder, _super);
    /**
     * @param {number[]} targetCharCodes - The char codes to look for.
     */
    function CharCodeFinder(targetCharCodes) {
        var _this = _super.call(this) || this;
        _this.targetCharCodes = targetCharCodes;
        _this.found = false;
        return _this;
    }
    CharCodeFinder.prototype.visitChildren = function (node) {
        // No need to keep looking...
        if (this.found === true) {
            return;
        }
        // switch lookaheads as they do not actually consume any characters thus
        // finding a charCode at lookahead context does not mean that regexp can actually contain it in a match.
        switch (node.type) {
            case "Lookahead":
                this.visitLookahead(node);
                return;
            case "NegativeLookahead":
                this.visitNegativeLookahead(node);
                return;
        }
        _super.prototype.visitChildren.call(this, node);
    };
    CharCodeFinder.prototype.visitCharacter = function (node) {
        if (utils_1.contains(this.targetCharCodes, node.value)) {
            this.found = true;
        }
    };
    CharCodeFinder.prototype.visitSet = function (node) {
        if (node.complement) {
            // Complement set: flagged as found when none of the listed members covers a target code.
            if (findCode(node, this.targetCharCodes) === undefined) {
                this.found = true;
            }
        }
        else {
            if (findCode(node, this.targetCharCodes) !== undefined) {
                this.found = true;
            }
        }
    };
    return CharCodeFinder;
}(regexp_to_ast_1.BaseRegExpVisitor));
/**
 * Checks whether a pattern may match any of the given char codes.
 *
 * @param {number[]} charCodes - The char codes to look for.
 * @param {(RegExp|string)} pattern - For a RegExp its AST is searched with CharCodeFinder;
 *   anything else is iterated with utils_1.find and each item's first char code is tested.
 * @returns {boolean} true when one of the char codes may appear in a match.
 */
function canMatchCharCode(charCodes, pattern) {
    if (pattern instanceof RegExp) {
        var ast = reg_exp_parser_1.getRegExpAst(pattern);
        var charCodeFinder = new CharCodeFinder(charCodes);
        charCodeFinder.visit(ast);
        return charCodeFinder.found;
    }
    else {
        return (utils_1.find(pattern, function (char) {
            return utils_1.contains(charCodes, char.charCodeAt(0));
        }) !== undefined);
    }
}
exports.canMatchCharCode = canMatchCharCode;
//# sourceMappingURL=reg_exp.js.map