UNPKG

chevrotain

Version:

Chevrotain is a high performance fault tolerant javascript parsing DSL for building recursive decent parsers

chevrotain.io/docs/

Chevrotain/chevrotain

255 lines • 11.6 kB

JavaScript

var __extends = (this && this.__extends) || (function () { var extendStatics = function (d, b) { extendStatics = Object.setPrototypeOf || ({ __proto__: [] } instanceof Array && function (d, b) { d.__proto__ = b; }) || function (d, b) { for (var p in b) if (Object.prototype.hasOwnProperty.call(b, p)) d[p] = b[p]; }; return extendStatics(d, b); }; return function (d, b) { extendStatics(d, b); function __() { this.constructor = d; } d.prototype = b === null ? Object.create(b) : (__.prototype = b.prototype, new __()); }; })(); import { VERSION, BaseRegExpVisitor } from "regexp-to-ast"; import { forEach, contains, PRINT_ERROR, PRINT_WARNING, find, isArray, every, values } from "../utils/utils"; import { getRegExpAst } from "./reg_exp_parser"; import { charCodeToOptimizedIndex, minOptimizationVal } from "./lexer"; var complementErrorMessage = "Complement Sets are not supported for first char optimization"; export var failedOptimizationPrefixMsg = 'Unable to use "first char" lexer optimizations:\n'; export function getOptimizedStartCodesIndices(regExp, ensureOptimizations) { if (ensureOptimizations === void 0) { ensureOptimizations = false; } try { var ast = getRegExpAst(regExp); var firstChars = firstCharOptimizedIndices(ast.value, {}, ast.flags.ignoreCase); return firstChars; } catch (e) { /* istanbul ignore next */ // Testing this relies on the regexp-to-ast library having a bug... */ // TODO: only the else branch needs to be ignored, try to fix with newer prettier / tsc if (e.message === complementErrorMessage) { if (ensureOptimizations) { PRINT_WARNING("" + failedOptimizationPrefixMsg + ("\tUnable to optimize: < " + regExp.toString() + " >\n") + "\tComplement Sets cannot be automatically optimized.\n" + "\tThis will disable the lexer's first char optimizations.\n" + "\tSee: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#COMPLEMENT for details."); } } else { var msgSuffix = ""; if (ensureOptimizations) { msgSuffix = "\n\tThis will disable the lexer's first char optimizations.\n" + "\tSee: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#REGEXP_PARSING for details."; } PRINT_ERROR(failedOptimizationPrefixMsg + "\n" + ("\tFailed parsing: < " + regExp.toString() + " >\n") + ("\tUsing the regexp-to-ast library version: " + VERSION + "\n") + "\tPlease open an issue at: https://github.com/bd82/regexp-to-ast/issues" + msgSuffix); } } return []; } export function firstCharOptimizedIndices(ast, result, ignoreCase) { switch (ast.type) { case "Disjunction": for (var i = 0; i < ast.value.length; i++) { firstCharOptimizedIndices(ast.value[i], result, ignoreCase); } break; case "Alternative": var terms = ast.value; for (var i = 0; i < terms.length; i++) { var term = terms[i]; // skip terms that cannot effect the first char results switch (term.type) { case "EndAnchor": // A group back reference cannot affect potential starting char. // because if a back reference is the first production than automatically // the group being referenced has had to come BEFORE so its codes have already been added case "GroupBackReference": // assertions do not affect potential starting codes case "Lookahead": case "NegativeLookahead": case "StartAnchor": case "WordBoundary": case "NonWordBoundary": continue; } var atom = term; switch (atom.type) { case "Character": addOptimizedIdxToResult(atom.value, result, ignoreCase); break; case "Set": if (atom.complement === true) { throw Error(complementErrorMessage); } forEach(atom.value, function (code) { if (typeof code === "number") { addOptimizedIdxToResult(code, result, ignoreCase); } else { // range var range = code; // cannot optimize when ignoreCase is if (ignoreCase === true) { for (var rangeCode = range.from; rangeCode <= range.to; rangeCode++) { addOptimizedIdxToResult(rangeCode, result, ignoreCase); } } // Optimization (2 orders of magnitude less work for very large ranges) else { // handle unoptimized values for (var rangeCode = range.from; rangeCode <= range.to && rangeCode < minOptimizationVal; rangeCode++) { addOptimizedIdxToResult(rangeCode, result, ignoreCase); } // Less common charCode where we optimize for faster init time, by using larger "buckets" if (range.to >= minOptimizationVal) { var minUnOptVal = range.from >= minOptimizationVal ? range.from : minOptimizationVal; var maxUnOptVal = range.to; var minOptIdx = charCodeToOptimizedIndex(minUnOptVal); var maxOptIdx = charCodeToOptimizedIndex(maxUnOptVal); for (var currOptIdx = minOptIdx; currOptIdx <= maxOptIdx; currOptIdx++) { result[currOptIdx] = currOptIdx; } } } } }); break; case "Group": firstCharOptimizedIndices(atom.value, result, ignoreCase); break; /* istanbul ignore next */ default: throw Error("Non Exhaustive Match"); } // reached a mandatory production, no more **start** codes can be found on this alternative var isOptionalQuantifier = atom.quantifier !== undefined && atom.quantifier.atLeast === 0; if ( // A group may be optional due to empty contents /(?:)/ // or if everything inside it is optional /((a)?)/ (atom.type === "Group" && isWholeOptional(atom) === false) || // If this term is not a group it may only be optional if it has an optional quantifier (atom.type !== "Group" && isOptionalQuantifier === false)) { break; } } break; /* istanbul ignore next */ default: throw Error("non exhaustive match!"); } // console.log(Object.keys(result).length) return values(result); } function addOptimizedIdxToResult(code, result, ignoreCase) { var optimizedCharIdx = charCodeToOptimizedIndex(code); result[optimizedCharIdx] = optimizedCharIdx; if (ignoreCase === true) { handleIgnoreCase(code, result); } } function handleIgnoreCase(code, result) { var char = String.fromCharCode(code); var upperChar = char.toUpperCase(); /* istanbul ignore else */ if (upperChar !== char) { var optimizedCharIdx = charCodeToOptimizedIndex(upperChar.charCodeAt(0)); result[optimizedCharIdx] = optimizedCharIdx; } else { var lowerChar = char.toLowerCase(); if (lowerChar !== char) { var optimizedCharIdx = charCodeToOptimizedIndex(lowerChar.charCodeAt(0)); result[optimizedCharIdx] = optimizedCharIdx; } } } function findCode(setNode, targetCharCodes) { return find(setNode.value, function (codeOrRange) { if (typeof codeOrRange === "number") { return contains(targetCharCodes, codeOrRange); } else { // range var range_1 = codeOrRange; return (find(targetCharCodes, function (targetCode) { return range_1.from <= targetCode && targetCode <= range_1.to; }) !== undefined); } }); } function isWholeOptional(ast) { if (ast.quantifier && ast.quantifier.atLeast === 0) { return true; } if (!ast.value) { return false; } return isArray(ast.value) ? every(ast.value, isWholeOptional) : isWholeOptional(ast.value); } var CharCodeFinder = /** @class */ (function (_super) { __extends(CharCodeFinder, _super); function CharCodeFinder(targetCharCodes) { var _this = _super.call(this) || this; _this.targetCharCodes = targetCharCodes; _this.found = false; return _this; } CharCodeFinder.prototype.visitChildren = function (node) { // No need to keep looking... if (this.found === true) { return; } // switch lookaheads as they do not actually consume any characters thus // finding a charCode at lookahead context does not mean that regexp can actually contain it in a match. switch (node.type) { case "Lookahead": this.visitLookahead(node); return; case "NegativeLookahead": this.visitNegativeLookahead(node); return; } _super.prototype.visitChildren.call(this, node); }; CharCodeFinder.prototype.visitCharacter = function (node) { if (contains(this.targetCharCodes, node.value)) { this.found = true; } }; CharCodeFinder.prototype.visitSet = function (node) { if (node.complement) { if (findCode(node, this.targetCharCodes) === undefined) { this.found = true; } } else { if (findCode(node, this.targetCharCodes) !== undefined) { this.found = true; } } }; return CharCodeFinder; }(BaseRegExpVisitor)); export function canMatchCharCode(charCodes, pattern) { if (pattern instanceof RegExp) { var ast = getRegExpAst(pattern); var charCodeFinder = new CharCodeFinder(charCodes); charCodeFinder.visit(ast); return charCodeFinder.found; } else { return (find(pattern, function (char) { return contains(charCodes, char.charCodeAt(0)); }) !== undefined); } } //# sourceMappingURL=reg_exp.js.map