chevrotain

import { VERSION, BaseRegExpVisitor } from "regexp-to-ast"
import {
  flatten,
  map,
  forEach,
  contains,
  PRINT_ERROR,
  PRINT_WARNING,
  find,
  isArray,
  every,
  values
} from "../utils/utils"
import { getRegExpAst } from "./reg_exp_parser"
import { charCodeToOptimizedIndex, minOptimizationVal } from "./lexer"

// Message of the error thrown (and recognized in the catch below) when a
// complement set such as /[^a]/ is encountered during first char analysis.
const complementErrorMessage =
  "Complement Sets are not supported for first char optimization"
export const failedOptimizationPrefixMsg =
  'Unable to use "first char" lexer optimizations:\n'

/**
 * Computes the optimized indices of every char code that a match of `regExp`
 * may start with.
 *
 * @param regExp - The pattern to analyze.
 * @param ensureOptimizations - When true, a warning is printed for complement
 *   sets and extra guidance is appended to the error printed on a parsing failure.
 * @returns The optimized start indices, or an empty array when the analysis
 *   failed (complement set encountered, or the regExp could not be parsed).
 */
export function getOptimizedStartCodesIndices(
  regExp: RegExp,
  ensureOptimizations = false
): number[] {
  try {
    const ast = getRegExpAst(regExp)
    const firstChars = firstCharOptimizedIndices(
      ast.value,
      {},
      ast.flags.ignoreCase
    )
    return firstChars
  } catch (e) {
    /* istanbul ignore next */
    // Testing this relies on the regexp-to-ast library having a bug...
    // TODO: only the else branch needs to be ignored, try to fix with newer prettier / tsc
    if (e.message === complementErrorMessage) {
      if (ensureOptimizations) {
        PRINT_WARNING(
          `${failedOptimizationPrefixMsg}` +
            `\tUnable to optimize: < ${regExp.toString()} >\n` +
            "\tComplement Sets cannot be automatically optimized.\n" +
            "\tThis will disable the lexer's first char optimizations.\n" +
            "\tSee: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#COMPLEMENT for details."
        )
      }
    } else {
      let msgSuffix = ""
      if (ensureOptimizations) {
        msgSuffix =
          "\n\tThis will disable the lexer's first char optimizations.\n" +
          "\tSee: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#REGEXP_PARSING for details."
      }
      PRINT_ERROR(
        `${failedOptimizationPrefixMsg}\n` +
          `\tFailed parsing: < ${regExp.toString()} >\n` +
          `\tUsing the regexp-to-ast library version: ${VERSION}\n` +
          "\tPlease open an issue at: https://github.com/bd82/regexp-to-ast/issues" +
          msgSuffix
      )
    }
  }

  return []
}

/**
 * Walks a regexp-to-ast node and records, in `result`, the optimized index of
 * every char code that may appear as the first character of a match.
 *
 * @param ast - A "Disjunction" or "Alternative" node.
 * @param result - Accumulator keyed (and valued) by optimized index; it is
 *   mutated in place and shared across the recursive calls.
 * @param ignoreCase - Whether the regExp has the ignoreCase flag; when true the
 *   other-case variant of each char code is recorded as well.
 * @returns The values currently stored in `result`.
 * @throws Error when a complement set is encountered, or on an unknown node type.
 */
export function firstCharOptimizedIndices(ast, result, ignoreCase): number[] {
  switch (ast.type) {
    case "Disjunction":
      for (let i = 0; i < ast.value.length; i++) {
        firstCharOptimizedIndices(ast.value[i], result, ignoreCase)
      }
      break
    case "Alternative": {
      const terms = ast.value
      for (let i = 0; i < terms.length; i++) {
        const term = terms[i]

        // skip terms that cannot affect the first char results
        switch (term.type) {
          case "EndAnchor":
          // A group back reference cannot affect potential starting char.
          // because if a back reference is the first production than automatically
          // the group being referenced has had to come BEFORE so its codes have already been added
          case "GroupBackReference":
          // assertions do not affect potential starting codes
          case "Lookahead":
          case "NegativeLookahead":
          case "StartAnchor":
          case "WordBoundary":
          case "NonWordBoundary":
            continue
        }

        const atom = term
        switch (atom.type) {
          case "Character":
            addOptimizedIdxToResult(atom.value, result, ignoreCase)
            break
          case "Set":
            if (atom.complement === true) {
              throw Error(complementErrorMessage)
            }
            forEach(atom.value, (code) => {
              if (typeof code === "number") {
                addOptimizedIdxToResult(code, result, ignoreCase)
              } else {
                // range
                const range = code
                // The bucket shortcut below cannot be used when ignoreCase is enabled:
                // every code of the range is expanded so that its other-case
                // variant is recorded too.
                if (ignoreCase === true) {
                  for (
                    let rangeCode = range.from;
                    rangeCode <= range.to;
                    rangeCode++
                  ) {
                    addOptimizedIdxToResult(rangeCode, result, ignoreCase)
                  }
                }
                // Optimization (2 orders of magnitude less work for very large ranges)
                else {
                  // handle unoptimized values
                  for (
                    let rangeCode = range.from;
                    rangeCode <= range.to && rangeCode < minOptimizationVal;
                    rangeCode++
                  ) {
                    addOptimizedIdxToResult(rangeCode, result, ignoreCase)
                  }

                  // Less common charCode where we optimize for faster init time, by using larger "buckets"
                  if (range.to >= minOptimizationVal) {
                    const minUnOptVal =
                      range.from >= minOptimizationVal
                        ? range.from
                        : minOptimizationVal
                    const maxUnOptVal = range.to
                    const minOptIdx = charCodeToOptimizedIndex(minUnOptVal)
                    const maxOptIdx = charCodeToOptimizedIndex(maxUnOptVal)

                    // record every optimized index between the two range ends (inclusive)
                    for (
                      let currOptIdx = minOptIdx;
                      currOptIdx <= maxOptIdx;
                      currOptIdx++
                    ) {
                      result[currOptIdx] = currOptIdx
                    }
                  }
                }
              }
            })
            break
          case "Group":
            firstCharOptimizedIndices(atom.value, result, ignoreCase)
            break
          /* istanbul ignore next */
          default:
            throw Error("Non Exhaustive Match")
        }

        // reached a mandatory production, no more **start** codes can be found on this alternative
        const isOptionalQuantifier =
          atom.quantifier !== undefined && atom.quantifier.atLeast === 0
        if (
          // A group may be optional due to empty contents /(?:)/
          // or if everything inside it is optional /((a)?)/
          (atom.type === "Group" && isWholeOptional(atom) === false) ||
          // If this term is not a group it may only be optional if it has an optional quantifier
          (atom.type !== "Group" && isOptionalQuantifier === false)
        ) {
          // exits the loop over the terms of this alternative
          break
        }
      }
      break
    }
    /* istanbul ignore next */
    default:
      throw Error("non exhaustive match!")
  }

  // console.log(Object.keys(result).length)
  return values(result)
}

/**
 * Records the optimized index of `code` in `result` and, when `ignoreCase` is
 * true, the optimized index of its other-case variant as well.
 */
function addOptimizedIdxToResult(
  code: number,
  result: number[],
  ignoreCase: boolean
) {
  const optimizedCharIdx = charCodeToOptimizedIndex(code)
  result[optimizedCharIdx] = optimizedCharIdx

  if (ignoreCase === true) {
    handleIgnoreCase(code, result)
  }
}

/**
 * Records the optimized index of the other-case variant of `code`:
 * the upper case variant when it differs from the char itself, otherwise the
 * lower case variant when that one differs. Nothing is recorded for chars
 * without a case variant.
 */
function handleIgnoreCase(code: number, result: number[]) {
  const char = String.fromCharCode(code)
  const upperChar = char.toUpperCase()
  /* istanbul ignore else */
  if (upperChar !== char) {
    const optimizedCharIdx = charCodeToOptimizedIndex(upperChar.charCodeAt(0))
    result[optimizedCharIdx] = optimizedCharIdx
  } else {
    const lowerChar = char.toLowerCase()
    if (lowerChar !== char) {
      const optimizedCharIdx = charCodeToOptimizedIndex(
        lowerChar.charCodeAt(0)
      )
      result[optimizedCharIdx] = optimizedCharIdx
    }
  }
}

/**
 * Finds the first entry (single char code or range) of a "Set" node that
 * covers at least one of `targetCharCodes`.
 *
 * @returns The matching set entry, or undefined when no entry covers any target.
 */
function findCode(setNode, targetCharCodes) {
  return find(setNode.value, (codeOrRange) => {
    if (typeof codeOrRange === "number") {
      return contains(targetCharCodes, codeOrRange)
    } else {
      // range
      const range = <any>codeOrRange
      return (
        find(
          targetCharCodes,
          (targetCode) => range.from <= targetCode && targetCode <= range.to
        ) !== undefined
      )
    }
  })
}

/**
 * Checks whether a single char code is listed in a "Set" node, either
 * directly or through one of the set's ranges.
 */
function isCodeInSet(setNode, code: number): boolean {
  return setNode.value.some((codeOrRange) => {
    if (typeof codeOrRange === "number") {
      return codeOrRange === code
    }
    // range
    const range = <any>codeOrRange
    return range.from <= code && code <= range.to
  })
}

/**
 * Checks whether a node can match without consuming any input, in which case
 * the terms following it may still contribute first char codes.
 *
 * - A node with a quantifier allowing zero repetitions is optional.
 * - Zero-width assertions never consume input, so they are optional.
 * - A Disjunction is optional when ANY of its alternatives is optional,
 *   e.g. the group in /(a|b?)c/ can match empty through `b?`.
 * - Any other node holding an array (e.g. an Alternative's terms) is optional
 *   only when ALL of its items are; a single nested value is inspected recursively.
 */
function isWholeOptional(ast): boolean {
  if (ast.quantifier && ast.quantifier.atLeast === 0) {
    return true
  }

  switch (ast.type) {
    case "StartAnchor":
    case "EndAnchor":
    case "WordBoundary":
    case "NonWordBoundary":
    case "Lookahead":
    case "NegativeLookahead":
      return true
    case "Disjunction":
      return ast.value.some((alternative) => isWholeOptional(alternative))
  }

  if (!ast.value) {
    return false
  }

  return isArray(ast.value)
    ? every(ast.value, isWholeOptional)
    : isWholeOptional(ast.value)
}

/**
 * Regexp AST visitor which sets `found` once it encounters a Character or Set
 * that can match one of the target char codes.
 */
class CharCodeFinder extends BaseRegExpVisitor {
  found: boolean = false

  constructor(private targetCharCodes: number[]) {
    super()
  }

  visitChildren(node) {
    // No need to keep looking...
    if (this.found === true) {
      return
    }

    // skip the contents of lookaheads as they do not actually consume any characters thus
    // finding a charCode at lookahead context does not mean that regexp can actually contain it in a match.
    switch (node.type) {
      case "Lookahead":
        this.visitLookahead(node)
        return
      case "NegativeLookahead":
        this.visitNegativeLookahead(node)
        return
    }

    super.visitChildren(node)
  }

  visitCharacter(node) {
    if (contains(this.targetCharCodes, node.value)) {
      this.found = true
    }
  }

  visitSet(node) {
    if (node.complement) {
      // A complement set matches every code that is NOT listed in it, so it can
      // match a target as soon as at least one target code is not excluded,
      // e.g. /[^\n]/ still matches "\r".
      if (this.targetCharCodes.some((code) => !isCodeInSet(node, code))) {
        this.found = true
      }
    } else {
      if (findCode(node, this.targetCharCodes) !== undefined) {
        this.found = true
      }
    }
  }
}

/**
 * Checks whether `pattern` can match any of the given char codes.
 *
 * @param charCodes - The char codes to look for.
 * @param pattern - A RegExp (analyzed through its AST, lookahead contents are
 *   ignored) or a plain string (each of its characters is compared directly).
 * @returns true when at least one of `charCodes` can be part of a match.
 */
export function canMatchCharCode(
  charCodes: number[],
  pattern: RegExp | string
): boolean {
  if (pattern instanceof RegExp) {
    const ast = getRegExpAst(pattern)
    const charCodeFinder = new CharCodeFinder(charCodes)
    charCodeFinder.visit(ast)
    return charCodeFinder.found
  } else {
    return (
      find(<any>pattern, (char) => {
        return contains(charCodes, (<string>char).charCodeAt(0))
      }) !== undefined
    )
  }
}