chevrotain
Version:
Chevrotain is a high performance fault tolerant javascript parsing DSL for building recursive decent parsers
316 lines (290 loc) • 9.87 kB
text/typescript
import {
Alternative,
Atom,
BaseRegExpVisitor,
Character,
Disjunction,
Group,
Set,
} from "@chevrotain/regexp-to-ast";
import { every, find, forEach, includes, isArray, values } from "lodash-es";
import { PRINT_ERROR, PRINT_WARNING } from "@chevrotain/utils";
import { ASTNode, getRegExpAst } from "./reg_exp_parser.js";
import { charCodeToOptimizedIndex, minOptimizationVal } from "./lexer.js";
const complementErrorMessage =
"Complement Sets are not supported for first char optimization";
export const failedOptimizationPrefixMsg =
'Unable to use "first char" lexer optimizations:\n';
export function getOptimizedStartCodesIndices(
regExp: RegExp,
ensureOptimizations = false,
): number[] {
try {
const ast = getRegExpAst(regExp);
const firstChars = firstCharOptimizedIndices(
ast.value,
{},
ast.flags.ignoreCase,
);
return firstChars;
} catch (e) {
/* istanbul ignore next */
// Testing this relies on the regexp-to-ast library having a bug... */
// TODO: only the else branch needs to be ignored, try to fix with newer prettier / tsc
if (e.message === complementErrorMessage) {
if (ensureOptimizations) {
PRINT_WARNING(
`${failedOptimizationPrefixMsg}` +
`\tUnable to optimize: < ${regExp.toString()} >\n` +
"\tComplement Sets cannot be automatically optimized.\n" +
"\tThis will disable the lexer's first char optimizations.\n" +
"\tSee: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#COMPLEMENT for details.",
);
}
} else {
let msgSuffix = "";
if (ensureOptimizations) {
msgSuffix =
"\n\tThis will disable the lexer's first char optimizations.\n" +
"\tSee: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#REGEXP_PARSING for details.";
}
PRINT_ERROR(
`${failedOptimizationPrefixMsg}\n` +
`\tFailed parsing: < ${regExp.toString()} >\n` +
`\tUsing the @chevrotain/regexp-to-ast library\n` +
"\tPlease open an issue at: https://github.com/chevrotain/chevrotain/issues" +
msgSuffix,
);
}
}
return [];
}
export function firstCharOptimizedIndices(
ast: ASTNode,
result: { [charCode: number]: number },
ignoreCase: boolean,
): number[] {
switch (ast.type) {
case "Disjunction":
for (let i = 0; i < ast.value.length; i++) {
firstCharOptimizedIndices(ast.value[i], result, ignoreCase);
}
break;
case "Alternative":
const terms = ast.value;
for (let i = 0; i < terms.length; i++) {
const term = terms[i];
// skip terms that cannot effect the first char results
switch (term.type) {
case "EndAnchor":
// A group back reference cannot affect potential starting char.
// because if a back reference is the first production than automatically
// the group being referenced has had to come BEFORE so its codes have already been added
case "GroupBackReference":
// assertions do not affect potential starting codes
case "Lookahead":
case "NegativeLookahead":
case "StartAnchor":
case "WordBoundary":
case "NonWordBoundary":
continue;
}
const atom = term;
switch (atom.type) {
case "Character":
addOptimizedIdxToResult(atom.value, result, ignoreCase);
break;
case "Set":
if (atom.complement === true) {
throw Error(complementErrorMessage);
}
forEach(atom.value, (code) => {
if (typeof code === "number") {
addOptimizedIdxToResult(code, result, ignoreCase);
} else {
// range
const range = code as any;
// cannot optimize when ignoreCase is
if (ignoreCase === true) {
for (
let rangeCode = range.from;
rangeCode <= range.to;
rangeCode++
) {
addOptimizedIdxToResult(rangeCode, result, ignoreCase);
}
}
// Optimization (2 orders of magnitude less work for very large ranges)
else {
// handle unoptimized values
for (
let rangeCode = range.from;
rangeCode <= range.to && rangeCode < minOptimizationVal;
rangeCode++
) {
addOptimizedIdxToResult(rangeCode, result, ignoreCase);
}
// Less common charCode where we optimize for faster init time, by using larger "buckets"
if (range.to >= minOptimizationVal) {
const minUnOptVal =
range.from >= minOptimizationVal
? range.from
: minOptimizationVal;
const maxUnOptVal = range.to;
const minOptIdx = charCodeToOptimizedIndex(minUnOptVal);
const maxOptIdx = charCodeToOptimizedIndex(maxUnOptVal);
for (
let currOptIdx = minOptIdx;
currOptIdx <= maxOptIdx;
currOptIdx++
) {
result[currOptIdx] = currOptIdx;
}
}
}
}
});
break;
case "Group":
firstCharOptimizedIndices(atom.value, result, ignoreCase);
break;
/* istanbul ignore next */
default:
throw Error("Non Exhaustive Match");
}
// reached a mandatory production, no more **start** codes can be found on this alternative
const isOptionalQuantifier =
atom.quantifier !== undefined && atom.quantifier.atLeast === 0;
if (
// A group may be optional due to empty contents /(?:)/
// or if everything inside it is optional /((a)?)/
(atom.type === "Group" && isWholeOptional(atom) === false) ||
// If this term is not a group it may only be optional if it has an optional quantifier
(atom.type !== "Group" && isOptionalQuantifier === false)
) {
break;
}
}
break;
/* istanbul ignore next */
default:
throw Error("non exhaustive match!");
}
// console.log(Object.keys(result).length)
return values(result);
}
function addOptimizedIdxToResult(
code: number,
result: { [charCode: number]: number },
ignoreCase: boolean,
) {
const optimizedCharIdx = charCodeToOptimizedIndex(code);
result[optimizedCharIdx] = optimizedCharIdx;
if (ignoreCase === true) {
handleIgnoreCase(code, result);
}
}
function handleIgnoreCase(
code: number,
result: { [charCode: number]: number },
) {
const char = String.fromCharCode(code);
const upperChar = char.toUpperCase();
/* istanbul ignore else */
if (upperChar !== char) {
const optimizedCharIdx = charCodeToOptimizedIndex(upperChar.charCodeAt(0));
result[optimizedCharIdx] = optimizedCharIdx;
} else {
const lowerChar = char.toLowerCase();
if (lowerChar !== char) {
const optimizedCharIdx = charCodeToOptimizedIndex(
lowerChar.charCodeAt(0),
);
result[optimizedCharIdx] = optimizedCharIdx;
}
}
}
function findCode(setNode: Set, targetCharCodes: number[]) {
return find(setNode.value, (codeOrRange) => {
if (typeof codeOrRange === "number") {
return includes(targetCharCodes, codeOrRange);
} else {
// range
const range = <any>codeOrRange;
return (
find(
targetCharCodes,
(targetCode) => range.from <= targetCode && targetCode <= range.to,
) !== undefined
);
}
});
}
function isWholeOptional(ast: any): boolean {
const quantifier = (ast as Atom).quantifier;
if (quantifier && quantifier.atLeast === 0) {
return true;
}
if (!ast.value) {
return false;
}
return isArray(ast.value)
? every(ast.value, isWholeOptional)
: isWholeOptional(ast.value);
}
class CharCodeFinder extends BaseRegExpVisitor {
found: boolean = false;
constructor(private targetCharCodes: number[]) {
super();
}
visitChildren(node: ASTNode) {
// No need to keep looking...
if (this.found === true) {
return;
}
// switch lookaheads as they do not actually consume any characters thus
// finding a charCode at lookahead context does not mean that regexp can actually contain it in a match.
switch (node.type) {
case "Lookahead":
this.visitLookahead(node);
return;
case "NegativeLookahead":
this.visitNegativeLookahead(node);
return;
}
super.visitChildren(node);
}
visitCharacter(node: Character) {
if (includes(this.targetCharCodes, node.value)) {
this.found = true;
}
}
visitSet(node: Set) {
if (node.complement) {
if (findCode(node, this.targetCharCodes) === undefined) {
this.found = true;
}
} else {
if (findCode(node, this.targetCharCodes) !== undefined) {
this.found = true;
}
}
}
}
export function canMatchCharCode(
charCodes: number[],
pattern: RegExp | string,
) {
if (pattern instanceof RegExp) {
const ast = getRegExpAst(pattern);
const charCodeFinder = new CharCodeFinder(charCodes);
charCodeFinder.visit(ast);
return charCodeFinder.found;
} else {
return (
find(<any>pattern, (char) => {
return includes(charCodes, (<string>char).charCodeAt(0));
}) !== undefined
);
}
}