chevrotain

import { Alternative, Atom, BaseRegExpVisitor, Character, Disjunction, Group, Set, } from "@chevrotain/regexp-to-ast"; import { every, find, forEach, includes, isArray, values } from "lodash-es"; import { PRINT_ERROR, PRINT_WARNING } from "@chevrotain/utils"; import { ASTNode, getRegExpAst } from "./reg_exp_parser.js"; import { charCodeToOptimizedIndex, minOptimizationVal } from "./lexer.js"; const complementErrorMessage = "Complement Sets are not supported for first char optimization"; export const failedOptimizationPrefixMsg = 'Unable to use "first char" lexer optimizations:\n'; export function getOptimizedStartCodesIndices( regExp: RegExp, ensureOptimizations = false, ): number[] { try { const ast = getRegExpAst(regExp); const firstChars = firstCharOptimizedIndices( ast.value, {}, ast.flags.ignoreCase, ); return firstChars; } catch (e) { /* istanbul ignore next */ // Testing this relies on the regexp-to-ast library having a bug... */ // TODO: only the else branch needs to be ignored, try to fix with newer prettier / tsc if (e.message === complementErrorMessage) { if (ensureOptimizations) { PRINT_WARNING( `${failedOptimizationPrefixMsg}` + `\tUnable to optimize: < ${regExp.toString()} >\n` + "\tComplement Sets cannot be automatically optimized.\n" + "\tThis will disable the lexer's first char optimizations.\n" + "\tSee: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#COMPLEMENT for details.", ); } } else { let msgSuffix = ""; if (ensureOptimizations) { msgSuffix = "\n\tThis will disable the lexer's first char optimizations.\n" + "\tSee: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#REGEXP_PARSING for details."; } PRINT_ERROR( `${failedOptimizationPrefixMsg}\n` + `\tFailed parsing: < ${regExp.toString()} >\n` + `\tUsing the @chevrotain/regexp-to-ast library\n` + "\tPlease open an issue at: https://github.com/chevrotain/chevrotain/issues" + msgSuffix, ); } } return []; } export function firstCharOptimizedIndices( ast: ASTNode, result: { [charCode: number]: number }, ignoreCase: boolean, ): number[] { switch (ast.type) { case "Disjunction": for (let i = 0; i < ast.value.length; i++) { firstCharOptimizedIndices(ast.value[i], result, ignoreCase); } break; case "Alternative": const terms = ast.value; for (let i = 0; i < terms.length; i++) { const term = terms[i]; // skip terms that cannot effect the first char results switch (term.type) { case "EndAnchor": // A group back reference cannot affect potential starting char. // because if a back reference is the first production than automatically // the group being referenced has had to come BEFORE so its codes have already been added case "GroupBackReference": // assertions do not affect potential starting codes case "Lookahead": case "NegativeLookahead": case "StartAnchor": case "WordBoundary": case "NonWordBoundary": continue; } const atom = term; switch (atom.type) { case "Character": addOptimizedIdxToResult(atom.value, result, ignoreCase); break; case "Set": if (atom.complement === true) { throw Error(complementErrorMessage); } forEach(atom.value, (code) => { if (typeof code === "number") { addOptimizedIdxToResult(code, result, ignoreCase); } else { // range const range = code as any; // cannot optimize when ignoreCase is if (ignoreCase === true) { for ( let rangeCode = range.from; rangeCode <= range.to; rangeCode++ ) { addOptimizedIdxToResult(rangeCode, result, ignoreCase); } } // Optimization (2 orders of magnitude less work for very large ranges) else { // handle unoptimized values for ( let rangeCode = range.from; rangeCode <= range.to && rangeCode < minOptimizationVal; rangeCode++ ) { addOptimizedIdxToResult(rangeCode, result, ignoreCase); } // Less common charCode where we optimize for faster init time, by using larger "buckets" if (range.to >= minOptimizationVal) { const minUnOptVal = range.from >= minOptimizationVal ? range.from : minOptimizationVal; const maxUnOptVal = range.to; const minOptIdx = charCodeToOptimizedIndex(minUnOptVal); const maxOptIdx = charCodeToOptimizedIndex(maxUnOptVal); for ( let currOptIdx = minOptIdx; currOptIdx <= maxOptIdx; currOptIdx++ ) { result[currOptIdx] = currOptIdx; } } } } }); break; case "Group": firstCharOptimizedIndices(atom.value, result, ignoreCase); break; /* istanbul ignore next */ default: throw Error("Non Exhaustive Match"); } // reached a mandatory production, no more **start** codes can be found on this alternative const isOptionalQuantifier = atom.quantifier !== undefined && atom.quantifier.atLeast === 0; if ( // A group may be optional due to empty contents /(?:)/ // or if everything inside it is optional /((a)?)/ (atom.type === "Group" && isWholeOptional(atom) === false) || // If this term is not a group it may only be optional if it has an optional quantifier (atom.type !== "Group" && isOptionalQuantifier === false) ) { break; } } break; /* istanbul ignore next */ default: throw Error("non exhaustive match!"); } // console.log(Object.keys(result).length) return values(result); } function addOptimizedIdxToResult( code: number, result: { [charCode: number]: number }, ignoreCase: boolean, ) { const optimizedCharIdx = charCodeToOptimizedIndex(code); result[optimizedCharIdx] = optimizedCharIdx; if (ignoreCase === true) { handleIgnoreCase(code, result); } } function handleIgnoreCase( code: number, result: { [charCode: number]: number }, ) { const char = String.fromCharCode(code); const upperChar = char.toUpperCase(); /* istanbul ignore else */ if (upperChar !== char) { const optimizedCharIdx = charCodeToOptimizedIndex(upperChar.charCodeAt(0)); result[optimizedCharIdx] = optimizedCharIdx; } else { const lowerChar = char.toLowerCase(); if (lowerChar !== char) { const optimizedCharIdx = charCodeToOptimizedIndex( lowerChar.charCodeAt(0), ); result[optimizedCharIdx] = optimizedCharIdx; } } } function findCode(setNode: Set, targetCharCodes: number[]) { return find(setNode.value, (codeOrRange) => { if (typeof codeOrRange === "number") { return includes(targetCharCodes, codeOrRange); } else { // range const range = <any>codeOrRange; return ( find( targetCharCodes, (targetCode) => range.from <= targetCode && targetCode <= range.to, ) !== undefined ); } }); } function isWholeOptional(ast: any): boolean { const quantifier = (ast as Atom).quantifier; if (quantifier && quantifier.atLeast === 0) { return true; } if (!ast.value) { return false; } return isArray(ast.value) ? every(ast.value, isWholeOptional) : isWholeOptional(ast.value); } class CharCodeFinder extends BaseRegExpVisitor { found: boolean = false; constructor(private targetCharCodes: number[]) { super(); } visitChildren(node: ASTNode) { // No need to keep looking... if (this.found === true) { return; } // switch lookaheads as they do not actually consume any characters thus // finding a charCode at lookahead context does not mean that regexp can actually contain it in a match. switch (node.type) { case "Lookahead": this.visitLookahead(node); return; case "NegativeLookahead": this.visitNegativeLookahead(node); return; } super.visitChildren(node); } visitCharacter(node: Character) { if (includes(this.targetCharCodes, node.value)) { this.found = true; } } visitSet(node: Set) { if (node.complement) { if (findCode(node, this.targetCharCodes) === undefined) { this.found = true; } } else { if (findCode(node, this.targetCharCodes) !== undefined) { this.found = true; } } } } export function canMatchCharCode( charCodes: number[], pattern: RegExp | string, ) { if (pattern instanceof RegExp) { const ast = getRegExpAst(pattern); const charCodeFinder = new CharCodeFinder(charCodes); charCodeFinder.visit(ast); return charCodeFinder.found; } else { return ( find(<any>pattern, (char) => { return includes(charCodes, (<string>char).charCodeAt(0)); }) !== undefined ); } }