chevrotain
Version:
Chevrotain is a high performance fault tolerant javascript parsing DSL for building recursive decent parsers
308 lines (282 loc) • 9.48 kB
text/typescript
import { VERSION, BaseRegExpVisitor } from "regexp-to-ast"
import {
flatten,
map,
forEach,
contains,
PRINT_ERROR,
PRINT_WARNING,
find,
isArray,
every,
values
} from "../utils/utils"
import { getRegExpAst } from "./reg_exp_parser"
import { charCodeToOptimizedIndex, minOptimizationVal } from "./lexer"
const complementErrorMessage =
"Complement Sets are not supported for first char optimization"
export const failedOptimizationPrefixMsg =
'Unable to use "first char" lexer optimizations:\n'
export function getOptimizedStartCodesIndices(
regExp: RegExp,
ensureOptimizations = false
): number[] {
try {
const ast = getRegExpAst(regExp)
const firstChars = firstCharOptimizedIndices(
ast.value,
{},
ast.flags.ignoreCase
)
return firstChars
} catch (e) {
/* istanbul ignore next */
// Testing this relies on the regexp-to-ast library having a bug... */
// TODO: only the else branch needs to be ignored, try to fix with newer prettier / tsc
if (e.message === complementErrorMessage) {
if (ensureOptimizations) {
PRINT_WARNING(
`${failedOptimizationPrefixMsg}` +
`\tUnable to optimize: < ${regExp.toString()} >\n` +
"\tComplement Sets cannot be automatically optimized.\n" +
"\tThis will disable the lexer's first char optimizations.\n" +
"\tSee: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#COMPLEMENT for details."
)
}
} else {
let msgSuffix = ""
if (ensureOptimizations) {
msgSuffix =
"\n\tThis will disable the lexer's first char optimizations.\n" +
"\tSee: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#REGEXP_PARSING for details."
}
PRINT_ERROR(
`${failedOptimizationPrefixMsg}\n` +
`\tFailed parsing: < ${regExp.toString()} >\n` +
`\tUsing the regexp-to-ast library version: ${VERSION}\n` +
"\tPlease open an issue at: https://github.com/bd82/regexp-to-ast/issues" +
msgSuffix
)
}
}
return []
}
export function firstCharOptimizedIndices(ast, result, ignoreCase): number[] {
switch (ast.type) {
case "Disjunction":
for (let i = 0; i < ast.value.length; i++) {
firstCharOptimizedIndices(ast.value[i], result, ignoreCase)
}
break
case "Alternative":
const terms = ast.value
for (let i = 0; i < terms.length; i++) {
const term = terms[i]
// skip terms that cannot effect the first char results
switch (term.type) {
case "EndAnchor":
// A group back reference cannot affect potential starting char.
// because if a back reference is the first production than automatically
// the group being referenced has had to come BEFORE so its codes have already been added
case "GroupBackReference":
// assertions do not affect potential starting codes
case "Lookahead":
case "NegativeLookahead":
case "StartAnchor":
case "WordBoundary":
case "NonWordBoundary":
continue
}
const atom = term
switch (atom.type) {
case "Character":
addOptimizedIdxToResult(atom.value, result, ignoreCase)
break
case "Set":
if (atom.complement === true) {
throw Error(complementErrorMessage)
}
forEach(atom.value, (code) => {
if (typeof code === "number") {
addOptimizedIdxToResult(code, result, ignoreCase)
} else {
// range
const range = code
// cannot optimize when ignoreCase is
if (ignoreCase === true) {
for (
let rangeCode = range.from;
rangeCode <= range.to;
rangeCode++
) {
addOptimizedIdxToResult(rangeCode, result, ignoreCase)
}
}
// Optimization (2 orders of magnitude less work for very large ranges)
else {
// handle unoptimized values
for (
let rangeCode = range.from;
rangeCode <= range.to && rangeCode < minOptimizationVal;
rangeCode++
) {
addOptimizedIdxToResult(rangeCode, result, ignoreCase)
}
// Less common charCode where we optimize for faster init time, by using larger "buckets"
if (range.to >= minOptimizationVal) {
const minUnOptVal =
range.from >= minOptimizationVal
? range.from
: minOptimizationVal
const maxUnOptVal = range.to
const minOptIdx = charCodeToOptimizedIndex(minUnOptVal)
const maxOptIdx = charCodeToOptimizedIndex(maxUnOptVal)
for (
let currOptIdx = minOptIdx;
currOptIdx <= maxOptIdx;
currOptIdx++
) {
result[currOptIdx] = currOptIdx
}
}
}
}
})
break
case "Group":
firstCharOptimizedIndices(atom.value, result, ignoreCase)
break
/* istanbul ignore next */
default:
throw Error("Non Exhaustive Match")
}
// reached a mandatory production, no more **start** codes can be found on this alternative
const isOptionalQuantifier =
atom.quantifier !== undefined && atom.quantifier.atLeast === 0
if (
// A group may be optional due to empty contents /(?:)/
// or if everything inside it is optional /((a)?)/
(atom.type === "Group" && isWholeOptional(atom) === false) ||
// If this term is not a group it may only be optional if it has an optional quantifier
(atom.type !== "Group" && isOptionalQuantifier === false)
) {
break
}
}
break
/* istanbul ignore next */
default:
throw Error("non exhaustive match!")
}
// console.log(Object.keys(result).length)
return values(result)
}
function addOptimizedIdxToResult(
code: number,
result: number[],
ignoreCase: boolean
) {
const optimizedCharIdx = charCodeToOptimizedIndex(code)
result[optimizedCharIdx] = optimizedCharIdx
if (ignoreCase === true) {
handleIgnoreCase(code, result)
}
}
function handleIgnoreCase(code: number, result: number[]) {
const char = String.fromCharCode(code)
const upperChar = char.toUpperCase()
/* istanbul ignore else */
if (upperChar !== char) {
const optimizedCharIdx = charCodeToOptimizedIndex(upperChar.charCodeAt(0))
result[optimizedCharIdx] = optimizedCharIdx
} else {
const lowerChar = char.toLowerCase()
if (lowerChar !== char) {
const optimizedCharIdx = charCodeToOptimizedIndex(lowerChar.charCodeAt(0))
result[optimizedCharIdx] = optimizedCharIdx
}
}
}
function findCode(setNode, targetCharCodes) {
return find(setNode.value, (codeOrRange) => {
if (typeof codeOrRange === "number") {
return contains(targetCharCodes, codeOrRange)
} else {
// range
const range = <any>codeOrRange
return (
find(
targetCharCodes,
(targetCode) => range.from <= targetCode && targetCode <= range.to
) !== undefined
)
}
})
}
function isWholeOptional(ast) {
if (ast.quantifier && ast.quantifier.atLeast === 0) {
return true
}
if (!ast.value) {
return false
}
return isArray(ast.value)
? every(ast.value, isWholeOptional)
: isWholeOptional(ast.value)
}
class CharCodeFinder extends BaseRegExpVisitor {
found: boolean = false
constructor(private targetCharCodes: number[]) {
super()
}
visitChildren(node) {
// No need to keep looking...
if (this.found === true) {
return
}
// switch lookaheads as they do not actually consume any characters thus
// finding a charCode at lookahead context does not mean that regexp can actually contain it in a match.
switch (node.type) {
case "Lookahead":
this.visitLookahead(node)
return
case "NegativeLookahead":
this.visitNegativeLookahead(node)
return
}
super.visitChildren(node)
}
visitCharacter(node) {
if (contains(this.targetCharCodes, node.value)) {
this.found = true
}
}
visitSet(node) {
if (node.complement) {
if (findCode(node, this.targetCharCodes) === undefined) {
this.found = true
}
} else {
if (findCode(node, this.targetCharCodes) !== undefined) {
this.found = true
}
}
}
}
export function canMatchCharCode(
charCodes: number[],
pattern: RegExp | string
) {
if (pattern instanceof RegExp) {
const ast = getRegExpAst(pattern)
const charCodeFinder = new CharCodeFinder(charCodes)
charCodeFinder.visit(ast)
return charCodeFinder.found
} else {
return (
find(<any>pattern, (char) => {
return contains(charCodes, (<string>char).charCodeAt(0))
}) !== undefined
)
}
}