chevrotain
Version:
Chevrotain is a high performance fault tolerant javascript parsing DSL for building recursive decent parsers
740 lines (680 loc) • 21.5 kB
text/typescript
import { every, flatten, forEach, has, isEmpty, map, reduce } from "lodash-es";
import { possiblePathsFrom } from "./interpreter.js";
import { RestWalker } from "./rest.js";
import { Predicate, TokenMatcher } from "../parser/parser.js";
import {
tokenStructuredMatcher,
tokenStructuredMatcherNoCategories,
} from "../../scan/tokens.js";
import {
Alternation,
Alternative as AlternativeGAST,
GAstVisitor,
Option,
Repetition,
RepetitionMandatory,
RepetitionMandatoryWithSeparator,
RepetitionWithSeparator,
} from "@chevrotain/gast";
import {
BaseParser,
IOrAlt,
IProduction,
IProductionWithOccurrence,
LookaheadProductionType,
LookaheadSequence,
Rule,
TokenType,
} from "@chevrotain/types";
export enum PROD_TYPE {
OPTION,
REPETITION,
REPETITION_MANDATORY,
REPETITION_MANDATORY_WITH_SEPARATOR,
REPETITION_WITH_SEPARATOR,
ALTERNATION,
}
export function getProdType(
prod: IProduction | LookaheadProductionType,
): PROD_TYPE {
/* istanbul ignore else */
if (prod instanceof Option || prod === "Option") {
return PROD_TYPE.OPTION;
} else if (prod instanceof Repetition || prod === "Repetition") {
return PROD_TYPE.REPETITION;
} else if (
prod instanceof RepetitionMandatory ||
prod === "RepetitionMandatory"
) {
return PROD_TYPE.REPETITION_MANDATORY;
} else if (
prod instanceof RepetitionMandatoryWithSeparator ||
prod === "RepetitionMandatoryWithSeparator"
) {
return PROD_TYPE.REPETITION_MANDATORY_WITH_SEPARATOR;
} else if (
prod instanceof RepetitionWithSeparator ||
prod === "RepetitionWithSeparator"
) {
return PROD_TYPE.REPETITION_WITH_SEPARATOR;
} else if (prod instanceof Alternation || prod === "Alternation") {
return PROD_TYPE.ALTERNATION;
} else {
throw Error("non exhaustive match");
}
}
export function getLookaheadPaths(options: {
occurrence: number;
rule: Rule;
prodType: LookaheadProductionType;
maxLookahead: number;
}): LookaheadSequence[] {
const { occurrence, rule, prodType, maxLookahead } = options;
const type = getProdType(prodType);
if (type === PROD_TYPE.ALTERNATION) {
return getLookaheadPathsForOr(occurrence, rule, maxLookahead);
} else {
return getLookaheadPathsForOptionalProd(
occurrence,
rule,
type,
maxLookahead,
);
}
}
export function buildLookaheadFuncForOr(
occurrence: number,
ruleGrammar: Rule,
maxLookahead: number,
hasPredicates: boolean,
dynamicTokensEnabled: boolean,
laFuncBuilder: Function,
): (orAlts?: IOrAlt<any>[]) => number | undefined {
const lookAheadPaths = getLookaheadPathsForOr(
occurrence,
ruleGrammar,
maxLookahead,
);
const tokenMatcher = areTokenCategoriesNotUsed(lookAheadPaths)
? tokenStructuredMatcherNoCategories
: tokenStructuredMatcher;
return laFuncBuilder(
lookAheadPaths,
hasPredicates,
tokenMatcher,
dynamicTokensEnabled,
);
}
/**
* When dealing with an Optional production (OPTION/MANY/2nd iteration of AT_LEAST_ONE/...) we need to compare
* the lookahead "inside" the production and the lookahead immediately "after" it in the same top level rule (context free).
*
* Example: given a production:
* ABC(DE)?DF
*
* The optional '(DE)?' should only be entered if we see 'DE'. a single Token 'D' is not sufficient to distinguish between the two
* alternatives.
*
* @returns A Lookahead function which will return true IFF the parser should parse the Optional production.
*/
export function buildLookaheadFuncForOptionalProd(
occurrence: number,
ruleGrammar: Rule,
k: number,
dynamicTokensEnabled: boolean,
prodType: PROD_TYPE,
lookaheadBuilder: (
lookAheadSequence: LookaheadSequence,
tokenMatcher: TokenMatcher,
dynamicTokensEnabled: boolean,
) => () => boolean,
): () => boolean {
const lookAheadPaths = getLookaheadPathsForOptionalProd(
occurrence,
ruleGrammar,
prodType,
k,
);
const tokenMatcher = areTokenCategoriesNotUsed(lookAheadPaths)
? tokenStructuredMatcherNoCategories
: tokenStructuredMatcher;
return lookaheadBuilder(
lookAheadPaths[0],
tokenMatcher,
dynamicTokensEnabled,
);
}
export type Alternative = TokenType[][];
export function buildAlternativesLookAheadFunc(
alts: LookaheadSequence[],
hasPredicates: boolean,
tokenMatcher: TokenMatcher,
dynamicTokensEnabled: boolean,
): (orAlts: IOrAlt<any>[]) => number | undefined {
const numOfAlts = alts.length;
const areAllOneTokenLookahead = every(alts, (currAlt) => {
return every(currAlt, (currPath) => {
return currPath.length === 1;
});
});
// This version takes into account the predicates as well.
if (hasPredicates) {
/**
* @returns {number} - The chosen alternative index
*/
return function (
this: BaseParser,
orAlts: IOrAlt<any>[],
): number | undefined {
// unfortunately the predicates must be extracted every single time
// as they cannot be cached due to references to parameters(vars) which are no longer valid.
// note that in the common case of no predicates, no cpu time will be wasted on this (see else block)
const predicates: (Predicate | undefined)[] = map(
orAlts,
(currAlt) => currAlt.GATE,
);
for (let t = 0; t < numOfAlts; t++) {
const currAlt = alts[t];
const currNumOfPaths = currAlt.length;
const currPredicate = predicates[t];
if (currPredicate !== undefined && currPredicate.call(this) === false) {
// if the predicate does not match there is no point in checking the paths
continue;
}
nextPath: for (let j = 0; j < currNumOfPaths; j++) {
const currPath = currAlt[j];
const currPathLength = currPath.length;
for (let i = 0; i < currPathLength; i++) {
const nextToken = this.LA(i + 1);
if (tokenMatcher(nextToken, currPath[i]) === false) {
// mismatch in current path
// try the next pth
continue nextPath;
}
}
// found a full path that matches.
// this will also work for an empty ALT as the loop will be skipped
return t;
}
// none of the paths for the current alternative matched
// try the next alternative
}
// none of the alternatives could be matched
return undefined;
};
} else if (areAllOneTokenLookahead && !dynamicTokensEnabled) {
// optimized (common) case of all the lookaheads paths requiring only
// a single token lookahead. These Optimizations cannot work if dynamically defined Tokens are used.
const singleTokenAlts = map(alts, (currAlt) => {
return flatten(currAlt);
});
const choiceToAlt = reduce(
singleTokenAlts,
(result, currAlt, idx) => {
forEach(currAlt, (currTokType) => {
if (!has(result, currTokType.tokenTypeIdx!)) {
result[currTokType.tokenTypeIdx!] = idx;
}
forEach(currTokType.categoryMatches!, (currExtendingType) => {
if (!has(result, currExtendingType)) {
result[currExtendingType] = idx;
}
});
});
return result;
},
{} as Record<number, number>,
);
/**
* @returns {number} - The chosen alternative index
*/
return function (this: BaseParser): number {
const nextToken = this.LA(1);
return choiceToAlt[nextToken.tokenTypeIdx];
};
} else {
// optimized lookahead without needing to check the predicates at all.
// this causes code duplication which is intentional to improve performance.
/**
* @returns {number} - The chosen alternative index
*/
return function (this: BaseParser): number | undefined {
for (let t = 0; t < numOfAlts; t++) {
const currAlt = alts[t];
const currNumOfPaths = currAlt.length;
nextPath: for (let j = 0; j < currNumOfPaths; j++) {
const currPath = currAlt[j];
const currPathLength = currPath.length;
for (let i = 0; i < currPathLength; i++) {
const nextToken = this.LA(i + 1);
if (tokenMatcher(nextToken, currPath[i]) === false) {
// mismatch in current path
// try the next pth
continue nextPath;
}
}
// found a full path that matches.
// this will also work for an empty ALT as the loop will be skipped
return t;
}
// none of the paths for the current alternative matched
// try the next alternative
}
// none of the alternatives could be matched
return undefined;
};
}
}
export function buildSingleAlternativeLookaheadFunction(
alt: LookaheadSequence,
tokenMatcher: TokenMatcher,
dynamicTokensEnabled: boolean,
): () => boolean {
const areAllOneTokenLookahead = every(alt, (currPath) => {
return currPath.length === 1;
});
const numOfPaths = alt.length;
// optimized (common) case of all the lookaheads paths requiring only
// a single token lookahead.
if (areAllOneTokenLookahead && !dynamicTokensEnabled) {
const singleTokensTypes = flatten(alt);
if (
singleTokensTypes.length === 1 &&
isEmpty((<any>singleTokensTypes[0]).categoryMatches)
) {
const expectedTokenType = singleTokensTypes[0];
const expectedTokenUniqueKey = (<any>expectedTokenType).tokenTypeIdx;
return function (this: BaseParser): boolean {
return this.LA(1).tokenTypeIdx === expectedTokenUniqueKey;
};
} else {
const choiceToAlt = reduce(
singleTokensTypes,
(result, currTokType, idx) => {
result[currTokType.tokenTypeIdx!] = true;
forEach(currTokType.categoryMatches!, (currExtendingType) => {
result[currExtendingType] = true;
});
return result;
},
[] as boolean[],
);
return function (this: BaseParser): boolean {
const nextToken = this.LA(1);
return choiceToAlt[nextToken.tokenTypeIdx] === true;
};
}
} else {
return function (this: BaseParser): boolean {
nextPath: for (let j = 0; j < numOfPaths; j++) {
const currPath = alt[j];
const currPathLength = currPath.length;
for (let i = 0; i < currPathLength; i++) {
const nextToken = this.LA(i + 1);
if (tokenMatcher(nextToken, currPath[i]) === false) {
// mismatch in current path
// try the next pth
continue nextPath;
}
}
// found a full path that matches.
return true;
}
// none of the paths matched
return false;
};
}
}
class RestDefinitionFinderWalker extends RestWalker {
private restDef: IProduction[];
constructor(
private topProd: Rule,
private targetOccurrence: number,
private targetProdType: PROD_TYPE,
) {
super();
}
startWalking(): IProduction[] {
this.walk(this.topProd);
return this.restDef;
}
private checkIsTarget(
node: IProductionWithOccurrence,
expectedProdType: PROD_TYPE,
currRest: IProduction[],
prevRest: IProduction[],
): boolean {
if (
node.idx === this.targetOccurrence &&
this.targetProdType === expectedProdType
) {
this.restDef = currRest.concat(prevRest);
return true;
}
// performance optimization, do not iterate over the entire Grammar ast after we have found the target
return false;
}
walkOption(
optionProd: Option,
currRest: IProduction[],
prevRest: IProduction[],
): void {
if (!this.checkIsTarget(optionProd, PROD_TYPE.OPTION, currRest, prevRest)) {
super.walkOption(optionProd, currRest, prevRest);
}
}
walkAtLeastOne(
atLeastOneProd: RepetitionMandatory,
currRest: IProduction[],
prevRest: IProduction[],
): void {
if (
!this.checkIsTarget(
atLeastOneProd,
PROD_TYPE.REPETITION_MANDATORY,
currRest,
prevRest,
)
) {
super.walkOption(atLeastOneProd, currRest, prevRest);
}
}
walkAtLeastOneSep(
atLeastOneSepProd: RepetitionMandatoryWithSeparator,
currRest: IProduction[],
prevRest: IProduction[],
): void {
if (
!this.checkIsTarget(
atLeastOneSepProd,
PROD_TYPE.REPETITION_MANDATORY_WITH_SEPARATOR,
currRest,
prevRest,
)
) {
super.walkOption(atLeastOneSepProd, currRest, prevRest);
}
}
walkMany(
manyProd: Repetition,
currRest: IProduction[],
prevRest: IProduction[],
): void {
if (
!this.checkIsTarget(manyProd, PROD_TYPE.REPETITION, currRest, prevRest)
) {
super.walkOption(manyProd, currRest, prevRest);
}
}
walkManySep(
manySepProd: RepetitionWithSeparator,
currRest: IProduction[],
prevRest: IProduction[],
): void {
if (
!this.checkIsTarget(
manySepProd,
PROD_TYPE.REPETITION_WITH_SEPARATOR,
currRest,
prevRest,
)
) {
super.walkOption(manySepProd, currRest, prevRest);
}
}
}
/**
* Returns the definition of a target production in a top level level rule.
*/
class InsideDefinitionFinderVisitor extends GAstVisitor {
public result: IProduction[] = [];
constructor(
private targetOccurrence: number,
private targetProdType: PROD_TYPE,
private targetRef?: any,
) {
super();
}
private checkIsTarget(
node: { definition: IProduction[] } & IProductionWithOccurrence,
expectedProdName: PROD_TYPE,
): void {
if (
node.idx === this.targetOccurrence &&
this.targetProdType === expectedProdName &&
(this.targetRef === undefined || node === this.targetRef)
) {
this.result = node.definition;
}
}
public visitOption(node: Option): void {
this.checkIsTarget(node, PROD_TYPE.OPTION);
}
public visitRepetition(node: Repetition): void {
this.checkIsTarget(node, PROD_TYPE.REPETITION);
}
public visitRepetitionMandatory(node: RepetitionMandatory): void {
this.checkIsTarget(node, PROD_TYPE.REPETITION_MANDATORY);
}
public visitRepetitionMandatoryWithSeparator(
node: RepetitionMandatoryWithSeparator,
): void {
this.checkIsTarget(node, PROD_TYPE.REPETITION_MANDATORY_WITH_SEPARATOR);
}
public visitRepetitionWithSeparator(node: RepetitionWithSeparator): void {
this.checkIsTarget(node, PROD_TYPE.REPETITION_WITH_SEPARATOR);
}
public visitAlternation(node: Alternation): void {
this.checkIsTarget(node, PROD_TYPE.ALTERNATION);
}
}
function initializeArrayOfArrays(size: number): any[][] {
const result = new Array(size);
for (let i = 0; i < size; i++) {
result[i] = [];
}
return result;
}
/**
* A sort of hash function between a Path in the grammar and a string.
* Note that this returns multiple "hashes" to support the scenario of token categories.
* - A single path with categories may match multiple **actual** paths.
*/
function pathToHashKeys(path: TokenType[]): string[] {
let keys = [""];
for (let i = 0; i < path.length; i++) {
const tokType = path[i];
const longerKeys = [];
for (let j = 0; j < keys.length; j++) {
const currShorterKey = keys[j];
longerKeys.push(currShorterKey + "_" + tokType.tokenTypeIdx);
for (let t = 0; t < tokType.categoryMatches!.length; t++) {
const categoriesKeySuffix = "_" + tokType.categoryMatches![t];
longerKeys.push(currShorterKey + categoriesKeySuffix);
}
}
keys = longerKeys;
}
return keys;
}
/**
* Imperative style due to being called from a hot spot
*/
function isUniquePrefixHash(
altKnownPathsKeys: Record<string, boolean>[],
searchPathKeys: string[],
idx: number,
): boolean {
for (
let currAltIdx = 0;
currAltIdx < altKnownPathsKeys.length;
currAltIdx++
) {
// We only want to test vs the other alternatives
if (currAltIdx === idx) {
continue;
}
const otherAltKnownPathsKeys = altKnownPathsKeys[currAltIdx];
for (let searchIdx = 0; searchIdx < searchPathKeys.length; searchIdx++) {
const searchKey = searchPathKeys[searchIdx];
if (otherAltKnownPathsKeys[searchKey] === true) {
return false;
}
}
}
// None of the SearchPathKeys were found in any of the other alternatives
return true;
}
export function lookAheadSequenceFromAlternatives(
altsDefs: IProduction[],
k: number,
): LookaheadSequence[] {
const partialAlts = map(altsDefs, (currAlt) =>
possiblePathsFrom([currAlt], 1),
);
const finalResult = initializeArrayOfArrays(partialAlts.length);
const altsHashes = map(partialAlts, (currAltPaths) => {
const dict: { [key: string]: boolean } = {};
forEach(currAltPaths, (item) => {
const keys = pathToHashKeys(item.partialPath);
forEach(keys, (currKey) => {
dict[currKey] = true;
});
});
return dict;
});
let newData = partialAlts;
// maxLookahead loop
for (let pathLength = 1; pathLength <= k; pathLength++) {
const currDataset = newData;
newData = initializeArrayOfArrays(currDataset.length);
// alternatives loop
for (let altIdx = 0; altIdx < currDataset.length; altIdx++) {
const currAltPathsAndSuffixes = currDataset[altIdx];
// paths in current alternative loop
for (
let currPathIdx = 0;
currPathIdx < currAltPathsAndSuffixes.length;
currPathIdx++
) {
const currPathPrefix = currAltPathsAndSuffixes[currPathIdx].partialPath;
const suffixDef = currAltPathsAndSuffixes[currPathIdx].suffixDef;
const prefixKeys = pathToHashKeys(currPathPrefix);
const isUnique = isUniquePrefixHash(altsHashes, prefixKeys, altIdx);
// End of the line for this path.
if (isUnique || isEmpty(suffixDef) || currPathPrefix.length === k) {
const currAltResult = finalResult[altIdx];
// TODO: Can we implement a containsPath using Maps/Dictionaries?
if (containsPath(currAltResult, currPathPrefix) === false) {
currAltResult.push(currPathPrefix);
// Update all new keys for the current path.
for (let j = 0; j < prefixKeys.length; j++) {
const currKey = prefixKeys[j];
altsHashes[altIdx][currKey] = true;
}
}
}
// Expand longer paths
else {
const newPartialPathsAndSuffixes = possiblePathsFrom(
suffixDef,
pathLength + 1,
currPathPrefix,
);
newData[altIdx] = newData[altIdx].concat(newPartialPathsAndSuffixes);
// Update keys for new known paths
forEach(newPartialPathsAndSuffixes, (item) => {
const prefixKeys = pathToHashKeys(item.partialPath);
forEach(prefixKeys, (key) => {
altsHashes[altIdx][key] = true;
});
});
}
}
}
}
return finalResult;
}
export function getLookaheadPathsForOr(
occurrence: number,
ruleGrammar: Rule,
k: number,
orProd?: Alternation,
): LookaheadSequence[] {
const visitor = new InsideDefinitionFinderVisitor(
occurrence,
PROD_TYPE.ALTERNATION,
orProd,
);
ruleGrammar.accept(visitor);
return lookAheadSequenceFromAlternatives(visitor.result, k);
}
export function getLookaheadPathsForOptionalProd(
occurrence: number,
ruleGrammar: Rule,
prodType: PROD_TYPE,
k: number,
): LookaheadSequence[] {
const insideDefVisitor = new InsideDefinitionFinderVisitor(
occurrence,
prodType,
);
ruleGrammar.accept(insideDefVisitor);
const insideDef = insideDefVisitor.result;
const afterDefWalker = new RestDefinitionFinderWalker(
ruleGrammar,
occurrence,
prodType,
);
const afterDef = afterDefWalker.startWalking();
const insideFlat = new AlternativeGAST({ definition: insideDef });
const afterFlat = new AlternativeGAST({ definition: afterDef });
return lookAheadSequenceFromAlternatives([insideFlat, afterFlat], k);
}
export function containsPath(
alternative: Alternative,
searchPath: TokenType[],
): boolean {
compareOtherPath: for (let i = 0; i < alternative.length; i++) {
const otherPath = alternative[i];
if (otherPath.length !== searchPath.length) {
continue;
}
for (let j = 0; j < otherPath.length; j++) {
const searchTok = searchPath[j];
const otherTok = otherPath[j];
const matchingTokens =
searchTok === otherTok ||
otherTok.categoryMatchesMap![searchTok.tokenTypeIdx!] !== undefined;
if (matchingTokens === false) {
continue compareOtherPath;
}
}
return true;
}
return false;
}
export function isStrictPrefixOfPath(
prefix: TokenType[],
other: TokenType[],
): boolean {
return (
prefix.length < other.length &&
every(prefix, (tokType, idx) => {
const otherTokType = other[idx];
return (
tokType === otherTokType ||
otherTokType.categoryMatchesMap![tokType.tokenTypeIdx!]
);
})
);
}
export function areTokenCategoriesNotUsed(
lookAheadPaths: LookaheadSequence[],
): boolean {
return every(lookAheadPaths, (singleAltPaths) =>
every(singleAltPaths, (singlePath) =>
every(singlePath, (token) => isEmpty(token.categoryMatches!)),
),
);
}