UNPKG

chevrotain

Version:

Chevrotain is a high performance fault tolerant JavaScript parsing DSL for building recursive descent parsers

494 lines (439 loc) 15.6 kB
import {
  createTokenInstance,
  EOF,
  tokenMatcher,
} from "../../../scan/tokens_public.js";
import {
  AbstractNextTerminalAfterProductionWalker,
  IFirstAfterRepetition,
  NextAfterTokenWalker,
} from "../../grammar/interpreter.js";
import {
  IParserConfig,
  IToken,
  ITokenGrammarPath,
  TokenType,
} from "@chevrotain/types";
import { MismatchedTokenException } from "../../exceptions_public.js";
import { IN } from "../../constants.js";
import { MixedInParser } from "./parser_traits.js";
import { DEFAULT_PARSER_CONFIG } from "../parser.js";

// Sentinel follow-key used when the rule stack is at its root;
// compared by identity in getFollowSetFromFollowKey.
export const EOF_FOLLOW_KEY: any = {};

// Identifies the "follow set" of a rule occurrence: which rule we are in,
// at which occurrence index, called from which parent rule.
export interface IFollowKey {
  ruleName: string;
  idxInCallingRule: number;
  inRule: string;
}

export const IN_RULE_RECOVERY_EXCEPTION = "InRuleRecoveryException";

// Thrown by tryInRuleRecovery when neither single-token insertion nor
// single-token deletion can repair the input.
export class InRuleRecoveryException extends Error {
  constructor(message: string) {
    super(message);
    this.name = IN_RULE_RECOVERY_EXCEPTION;
  }
}

/**
 * This trait is responsible for the error recovery and fault tolerant logic.
 *
 * It is mixed into the parser (see MixedInParser / `this` parameters on the
 * methods below) and implements two complementary strategies:
 * - "in rule" recovery: single token insertion or deletion, and
 * - "re-sync" recovery: skipping tokens until a safe follow token is found.
 */
export class Recoverable {
  recoveryEnabled: boolean;
  // cache: lookahead-key -> first terminal after a repetition (computed lazily
  // in the standalone attemptInRepetitionRecovery function below)
  firstAfterRepMap: Record<string, IFirstAfterRepetition>;
  // follow sets keyed by "<ruleName><idx><IN><parentRule>" strings
  resyncFollows: Record<string, TokenType[]>;

  initRecoverable(config: IParserConfig) {
    this.firstAfterRepMap = {};
    this.resyncFollows = {};
    this.recoveryEnabled = Object.hasOwn(config, "recoveryEnabled")
      ? (config.recoveryEnabled as boolean) // assumes end user provides the correct config value/type
      : DEFAULT_PARSER_CONFIG.recoveryEnabled;

    // performance optimization, NOOP will be inlined which
    // effectively means that this optional feature does not exist
    // when not used.
    if (this.recoveryEnabled) {
      this.attemptInRepetitionRecovery = attemptInRepetitionRecovery;
    }
  }

  /**
   * Creates a synthetic token used for single-token-insertion recovery.
   * All position fields are NaN because the token never appeared in the
   * actual input; it is flagged via `isInsertedInRecovery`.
   */
  public getTokenToInsert(tokType: TokenType): IToken {
    const tokToInsert = createTokenInstance(
      tokType,
      "",
      NaN,
      NaN,
      NaN,
      NaN,
      NaN,
      NaN,
    );
    tokToInsert.isInsertedInRecovery = true;
    return tokToInsert;
  }

  // NOTE(review): looks like an override hook for end users to veto insertion
  // of specific token types — default permits everything. Confirm against docs.
  public canTokenTypeBeInsertedInRecovery(tokType: TokenType): boolean {
    return true;
  }

  // NOTE(review): same as above, but for single-token deletion.
  public canTokenTypeBeDeletedInRecovery(tokType: TokenType): boolean {
    return true;
  }

  /**
   * Attempts re-sync recovery inside a repetition (MANY/AT_LEAST_ONE style)
   * by skipping tokens until either the expected token, a point where the
   * repetition can resume, or a rule-level re-sync token is reached.
   * If no closer re-sync point is found, the lexer state is restored and
   * the pending error is left to surface in the next parsing step.
   */
  tryInRepetitionRecovery(
    this: MixedInParser,
    grammarRule: Function,
    grammarRuleArgs: any[],
    lookAheadFunc: () => boolean,
    expectedTokType: TokenType,
  ): void {
    // TODO: can the resyncTokenType be cached?
    const reSyncTokType = this.findReSyncTokenType();
    const savedLexerState = this.exportLexerState();
    const resyncedTokens: IToken[] = [];
    let passedResyncPoint = false;

    const nextTokenWithoutResync = this.LA_FAST(1);
    let currToken = this.LA_FAST(1);

    const generateErrorMessage = () => {
      const previousToken = this.LA(0);
      // we are preemptively re-syncing before an error has been detected, therefore we must reproduce
      // the error that would have been thrown
      const msg = this.errorMessageProvider.buildMismatchTokenMessage({
        expected: expectedTokType,
        actual: nextTokenWithoutResync,
        previous: previousToken,
        ruleName: this.getCurrRuleFullName(),
      });
      const error = new MismatchedTokenException(
        msg,
        nextTokenWithoutResync,
        this.LA(0),
      );
      // the first token here will be the original cause of the error, this is not part of the resyncedTokens property.
      error.resyncedTokens = resyncedTokens.slice(0, -1);
      this.SAVE_ERROR(error);
    };

    while (!passedResyncPoint) {
      // re-synced to a point where we can safely exit the repetition
      if (this.tokenMatcher(currToken, expectedTokType)) {
        generateErrorMessage();
        return; // must return here to avoid reverting the inputIdx
      } else if (lookAheadFunc.call(this)) {
        // we skipped enough tokens so we can resync right back into another iteration of the repetition grammar rule
        generateErrorMessage();
        // recursive invocation in order to support multiple re-syncs in the same top level repetition grammar rule
        grammarRule.apply(this, grammarRuleArgs);
        return; // must return here to avoid reverting the inputIdx
      } else if (this.tokenMatcher(currToken, reSyncTokType)) {
        passedResyncPoint = true;
      } else {
        currToken = this.SKIP_TOKEN();
        this.addToResyncTokens(currToken, resyncedTokens);
      }
    }

    // we were unable to find a CLOSER point to resync inside the Repetition, reset the state.
    // The parsing exception we were trying to prevent will happen in the NEXT parsing step. it may be handled by
    // "between rules" resync recovery later in the flow.
    this.importLexerState(savedLexerState);
  }

  /**
   * Gate for in-repetition recovery: returns false when recovery is
   * unnecessary (next token matches), unsafe (stuck MANY, backtracking),
   * or when the cheaper in-rule recovery would succeed instead.
   */
  shouldInRepetitionRecoveryBeTried(
    this: MixedInParser,
    expectTokAfterLastMatch: TokenType,
    nextTokIdx: number,
    notStuck: boolean | undefined,
  ): boolean {
    // Edge case of arriving from a MANY repetition which is stuck
    // Attempting recovery in this case could cause an infinite loop
    if (notStuck === false) {
      return false;
    }

    // no need to recover, next token is what we expect...
    if (this.tokenMatcher(this.LA_FAST(1), expectTokAfterLastMatch)) {
      return false;
    }

    // error recovery is disabled during backtracking as it can make the parser ignore a valid grammar path
    // and prefer some backtracking path that includes recovered errors.
    if (this.isBackTracking()) {
      return false;
    }

    // if we can perform inRule recovery (single token insertion or deletion) we always prefer that recovery algorithm
    // because if it works, it makes the least amount of changes to the input stream (greedy algorithm)
    //noinspection RedundantIfStatementJS
    if (
      this.canPerformInRuleRecovery(
        expectTokAfterLastMatch,
        this.getFollowsForInRuleRecovery(expectTokAfterLastMatch, nextTokIdx),
      )
    ) {
      return false;
    }

    return true;
  }

  // TODO: should this be a member method or a utility? it does not have any state or usage of 'this'...
  // TODO: should this be more explicitly part of the public API?
  getNextPossibleTokenTypes(
    this: MixedInParser,
    grammarPath: ITokenGrammarPath,
  ): TokenType[] {
    const topRuleName = grammarPath.ruleStack[0]!;
    const gastProductions = this.getGAstProductions();
    const topProduction = gastProductions[topRuleName];
    const nextPossibleTokenTypes = new NextAfterTokenWalker(
      topProduction,
      grammarPath,
    ).startWalking();
    return nextPossibleTokenTypes;
  }

  // Error Recovery functionality
  getFollowsForInRuleRecovery(
    this: MixedInParser,
    tokType: TokenType,
    tokIdxInRule: number,
  ): TokenType[] {
    const grammarPath = this.getCurrentGrammarPath(tokType, tokIdxInRule);
    const follows = this.getNextPossibleTokenTypes(grammarPath);
    return follows;
  }

  /**
   * Performs in-rule recovery: returns a synthetic token (insertion) or the
   * next real token (deletion, after skipping and consuming the mismatch).
   * @throws InRuleRecoveryException when neither strategy is applicable.
   */
  tryInRuleRecovery(
    this: MixedInParser,
    expectedTokType: TokenType,
    follows: TokenType[],
  ): IToken {
    if (this.canRecoverWithSingleTokenInsertion(expectedTokType, follows)) {
      const tokToInsert = this.getTokenToInsert(expectedTokType);
      return tokToInsert;
    }

    if (this.canRecoverWithSingleTokenDeletion(expectedTokType)) {
      const nextTok = this.SKIP_TOKEN();
      this.consumeToken();
      return nextTok;
    }

    throw new InRuleRecoveryException("sad sad panda");
  }

  canPerformInRuleRecovery(
    this: MixedInParser,
    expectedToken: TokenType,
    follows: TokenType[],
  ): boolean {
    return (
      this.canRecoverWithSingleTokenInsertion(expectedToken, follows) ||
      this.canRecoverWithSingleTokenDeletion(expectedToken)
    );
  }

  /**
   * Single token insertion is possible when the mismatched token is one of
   * the tokens that may legally FOLLOW the expected (missing) token.
   */
  canRecoverWithSingleTokenInsertion(
    this: MixedInParser,
    expectedTokType: TokenType,
    follows: TokenType[],
  ): boolean {
    if (!this.canTokenTypeBeInsertedInRecovery(expectedTokType)) {
      return false;
    }

    // must know the possible following tokens to perform single token insertion
    if (follows.length === 0) {
      return false;
    }

    const mismatchedTok = this.LA_FAST(1);
    const isMisMatchedTokInFollows =
      follows.find((possibleFollowsTokType: TokenType) => {
        return this.tokenMatcher(mismatchedTok, possibleFollowsTokType);
      }) !== undefined;

    return isMisMatchedTokInFollows;
  }

  /**
   * Single token deletion is possible when the token AFTER the mismatched
   * one is exactly what was expected.
   */
  canRecoverWithSingleTokenDeletion(
    this: MixedInParser,
    expectedTokType: TokenType,
  ): boolean {
    if (!this.canTokenTypeBeDeletedInRecovery(expectedTokType)) {
      return false;
    }

    const isNextTokenWhatIsExpected = this.tokenMatcher(
      // not using LA_FAST because LA(2) might be un-safe with maxLookahead=1
      // in some edge cases (?)
      this.LA(2),
      expectedTokType,
    );
    return isNextTokenWhatIsExpected;
  }

  isInCurrentRuleReSyncSet(
    this: MixedInParser,
    tokenTypeIdx: TokenType,
  ): boolean {
    const followKey = this.getCurrFollowKey();
    const currentRuleReSyncSet = this.getFollowSetFromFollowKey(followKey);
    return currentRuleReSyncSet.includes(tokenTypeIdx);
  }

  /**
   * Scans forward through the input for the first token type that appears in
   * the flattened follow set of the current rule stack.
   */
  findReSyncTokenType(this: MixedInParser): TokenType {
    const allPossibleReSyncTokTypes = this.flattenFollowSet();
    // this loop will always terminate as EOF is always in the follow stack and also always (virtually) in the input
    let nextToken = this.LA_FAST(1);
    let k = 2;
    while (true) {
      const foundMatch = allPossibleReSyncTokTypes.find((resyncTokType) => {
        const canMatch = tokenMatcher(nextToken, resyncTokType);
        return canMatch;
      });
      if (foundMatch !== undefined) {
        return foundMatch;
      }
      nextToken = this.LA(k);
      k++;
    }
  }

  getCurrFollowKey(this: MixedInParser): IFollowKey {
    // the length is at least one as we always add the ruleName to the stack before invoking the rule.
    if (this.RULE_STACK_IDX === 0) {
      return EOF_FOLLOW_KEY;
    }
    const currRuleShortName = this.currRuleShortName;
    const currRuleIdx = this.getLastExplicitRuleOccurrenceIndex();
    const prevRuleShortName = this.getPreviousExplicitRuleShortName();

    return {
      ruleName: this.shortRuleNameToFullName(currRuleShortName),
      idxInCallingRule: currRuleIdx,
      inRule: this.shortRuleNameToFullName(prevRuleShortName),
    };
  }

  /**
   * Builds one follow-key per frame of the rule stack; index 0 is always the
   * EOF_FOLLOW_KEY sentinel (the root has no calling rule).
   */
  buildFullFollowKeyStack(this: MixedInParser): IFollowKey[] {
    const explicitRuleStack = this.RULE_STACK;
    const explicitOccurrenceStack = this.RULE_OCCURRENCE_STACK;
    const len = this.RULE_STACK_IDX + 1;
    const result: IFollowKey[] = new Array(len);
    for (let idx = 0; idx < len; idx++) {
      if (idx === 0) {
        result[idx] = EOF_FOLLOW_KEY;
      } else {
        result[idx] = {
          ruleName: this.shortRuleNameToFullName(explicitRuleStack[idx]),
          idxInCallingRule: explicitOccurrenceStack[idx],
          inRule: this.shortRuleNameToFullName(explicitRuleStack[idx - 1]),
        };
      }
    }
    return result;
  }

  flattenFollowSet(this: MixedInParser): TokenType[] {
    const followStack = this.buildFullFollowKeyStack().map((currKey) => {
      return this.getFollowSetFromFollowKey(currKey);
    });
    return <any>followStack.flat();
  }

  getFollowSetFromFollowKey(
    this: MixedInParser,
    followKey: IFollowKey,
  ): TokenType[] {
    if (followKey === EOF_FOLLOW_KEY) {
      return [EOF];
    }

    // key format mirrors the one used to populate `resyncFollows`
    const followName =
      followKey.ruleName + followKey.idxInCallingRule + IN + followKey.inRule;

    return this.resyncFollows[followName];
  }

  // It does not make any sense to include a virtual EOF token in the list of resynced tokens
  // as EOF does not really exist and thus does not contain any useful information (line/column numbers)
  addToResyncTokens(
    this: MixedInParser,
    token: IToken,
    resyncTokens: IToken[],
  ): IToken[] {
    if (!this.tokenMatcher(token, EOF)) {
      resyncTokens.push(token);
    }
    return resyncTokens;
  }

  /**
   * Skips tokens until `tokType` is reached; returns the tokens skipped,
   * excluding the final one (the re-sync point itself is not an error token).
   */
  reSyncTo(this: MixedInParser, tokType: TokenType): IToken[] {
    const resyncedTokens: IToken[] = [];
    let nextTok = this.LA_FAST(1);
    while (this.tokenMatcher(nextTok, tokType) === false) {
      nextTok = this.SKIP_TOKEN();
      this.addToResyncTokens(nextTok, resyncedTokens);
    }
    // the last token is not part of the error.
    return resyncedTokens.slice(0, -1);
  }

  attemptInRepetitionRecovery(
    this: MixedInParser,
    prodFunc: Function,
    args: any[],
    lookaheadFunc: () => boolean,
    dslMethodIdx: number,
    prodOccurrence: number,
    nextToksWalker: typeof AbstractNextTerminalAfterProductionWalker,
    notStuck?: boolean,
  ): void {
    // by default this is a NO-OP
    // The actual implementation is with the function(not method) below
    // (installed in initRecoverable when recoveryEnabled is true)
  }

  getCurrentGrammarPath(
    this: MixedInParser,
    tokType: TokenType,
    tokIdxInRule: number,
  ): ITokenGrammarPath {
    const pathRuleStack: string[] = this.getHumanReadableRuleStack();
    const pathOccurrenceStack: number[] = this.RULE_OCCURRENCE_STACK.slice(
      0,
      this.RULE_OCCURRENCE_STACK_IDX + 1,
    );
    const grammarPath: any = {
      ruleStack: pathRuleStack,
      occurrenceStack: pathOccurrenceStack,
      lastTok: tokType,
      lastTokOccurrence: tokIdxInRule,
    };

    return grammarPath;
  }

  getHumanReadableRuleStack(this: MixedInParser): string[] {
    const len = this.RULE_STACK_IDX + 1;
    const result: string[] = new Array(len);
    for (let i = 0; i < len; i++) {
      result[i] = this.shortRuleNameToFullName(this.RULE_STACK[i]);
    }
    return result;
  }
}

/**
 * Real implementation of in-repetition recovery; assigned over the no-op
 * method of the same name in `initRecoverable` when recovery is enabled,
 * so disabled parsers pay no cost.
 */
export function attemptInRepetitionRecovery(
  this: MixedInParser,
  prodFunc: Function,
  args: any[],
  lookaheadFunc: () => boolean,
  dslMethodIdx: number,
  prodOccurrence: number,
  nextToksWalker: typeof AbstractNextTerminalAfterProductionWalker,
  notStuck?: boolean,
): void {
  const key = this.getKeyForAutomaticLookahead(dslMethodIdx, prodOccurrence);
  let firstAfterRepInfo = this.firstAfterRepMap[key];
  // lazily compute (and cache) the first terminal expected after this repetition
  if (firstAfterRepInfo === undefined) {
    const currRuleName = this.getCurrRuleFullName();
    const ruleGrammar = this.getGAstProductions()[currRuleName];
    const walker: AbstractNextTerminalAfterProductionWalker =
      new nextToksWalker(ruleGrammar, prodOccurrence);
    firstAfterRepInfo = walker.startWalking();
    this.firstAfterRepMap[key] = firstAfterRepInfo;
  }

  let expectTokAfterLastMatch = firstAfterRepInfo.token;
  let nextTokIdx = firstAfterRepInfo.occurrence;
  const isEndOfRule = firstAfterRepInfo.isEndOfRule;

  // special edge case of a TOP most repetition after which the input should END.
  // this will force an attempt for inRule recovery in that scenario.
  if (
    this.RULE_STACK_IDX === 0 &&
    isEndOfRule &&
    expectTokAfterLastMatch === undefined
  ) {
    expectTokAfterLastMatch = EOF;
    nextTokIdx = 1;
  }

  // We don't have anything to re-sync to...
  // this condition was extracted from `shouldInRepetitionRecoveryBeTried` to act as a type-guard
  if (expectTokAfterLastMatch === undefined || nextTokIdx === undefined) {
    return;
  }

  if (
    this.shouldInRepetitionRecoveryBeTried(
      expectTokAfterLastMatch,
      nextTokIdx,
      notStuck,
    )
  ) {
    // TODO: performance optimization: instead of passing the original args here, we modify
    // the args param (or create a new one) and make sure the lookahead func is explicitly provided
    // to avoid searching the cache for it once more.
    this.tryInRepetitionRecovery(
      prodFunc,
      args,
      lookaheadFunc,
      expectTokAfterLastMatch,
    );
  }
}