UNPKG

@sap/cds-compiler

Version:

CDS (Core Data Services) compiler and backends

647 lines (587 loc) • 26 kB

JavaScript

// Error strategy with special handling for (non-reserved) keywords
//
// If a language has non-reserved keywords, any such keyword can be used at
// places where just an identifier is expected.  For doing so, we define a rule
//   ident : Identifier | NONRESERVED_1 | ... NONRESERVED_n ;
//
// Now consider another rule:
//   expected : RESERVED_j | NONRESERVED_k | ident ;
// If parsing fails at this place, you expect to see a message like
//   Mismatched input '?', expecting RESERVED_j, NONRESERVED_k, or Identifier
// With ANTLR's default error strategy, you unfortunately also see all other
// n-1 non-reserved keywords after "expecting"...
//
// The error strategy provided by this file gives you the expected message.
// The example above shows that it is not enough to just remove all
// non-reserved keywords from the expected-set.  The error strategy also allows
// you to match reserved keywords as identifiers at certain places (when there
// are no alternatives).
//
// For using this error strategy, the grammar for the parser/lexer must have a
// lexer rule `Number`, then rules for unreserved keywords, and finally a rule
// `Identifier`.  No tokens (which are used in parser rules) may be defined
// after that, and no other rules may be defined in between those rules.
//
// This file is actually very ANTLR4 specific and should be checked against
// future versions of the ANTLR4-js runtime.  There is no need to look at this
// file if you just want to understand the rest of this compiler project.
'use strict';

const antlr4 = require('antlr4');
const Antlr4LL1Analyzer = require('antlr4/src/antlr4/LL1Analyzer');
const { DefaultErrorStrategy } = require('antlr4/src/antlr4/error/ErrorStrategy');
const { InputMismatchException } = require('antlr4/src/antlr4/error/Errors');
const {
  predictionContextFromRuleContext: predictionContext,
} = require('antlr4/src/antlr4/PredictionContext');
const { ATNState } = require('antlr4/src/antlr4/atn/ATNState');
const { IntervalSet, Interval } = require('antlr4/src/antlr4/IntervalSet');

const { CompilerAssertion } = require('../base/error');

const keywordRegexp = /^[a-zA-Z]+$/; // we don't have keywords with underscore

// Token types of `;` and `}`; looked up lazily (from `recognizer.literalNames`)
// by consumeUntil() and then cached for the lifetime of the module.
let SEMI = null;
let RBRACE = null;

// Class which adapts ANTLR4s standard error strategy: do something special
// with (non-reserved) keywords.
//
// An instance of this class should be set as property `_errHandler` to the
// parser (prototype).
class KeywordErrorStrategy extends DefaultErrorStrategy {
  constructor( ...args ) {
    super( ...args );
    // Keep the base-class implementations reachable: the functions mixed into
    // the prototype below are plain functions and cannot use `super` themselves.
    this._super = {
      recoverInline: super.recoverInline,
      getExpectedTokens: super.getExpectedTokens,
    };
  }
}

// TODO: Use actual methods
Object.assign( KeywordErrorStrategy.prototype, {
  sync,
  singleTokenDeletion,
  reportNoViableAlternative,
  reportInputMismatch,
  reportUnwantedToken,
  reportMissingToken,
  reportIgnoredWith,
  // getErrorRecoverySet,
  consumeUntil,
  consumeAndMarkUntil,
  recoverInline,
  getMissingSymbol,
  getExpectedTokensForMessage,
  getTokenDisplay,
});

// Attempt to recover from problems in subrules, except if rule has defined a
// local variable `_sync` with value 'nop'
// TODO: consider performance - see #8800
// See DefaultErrorStrategy#sync
//
// Called as method of the error strategy (`this`) with the parser `recognizer`.
// Returns nothing; throws an `InputMismatchException` if no recovery is possible.
function sync( recognizer ) {
  // If already recovering, don't try to sync
  if (this.inErrorRecoveryMode(recognizer))
    return;

  const token = recognizer.getCurrentToken();
  if (!token)
    return;

  const s = recognizer._interp.atn.states[recognizer.state];
  // try cheaper subset first; might get lucky. seems to shave a wee bit off
  const nextTokens = recognizer.atn.nextTokens(s);
  // console.log('SYNC:', recognizer._ctx._sync, s.stateType, token.text,
  //   intervalSetToArray( recognizer, nextTokens ))
  if (nextTokens.contains(token.type)) { // we are sure the token matches
    // NOTE(review): SEMI is only initialized in consumeUntil(); until that has
    // run once it is still `null` here - confirm that this is intended.
    if (token.text === '}' && recognizer.$nextTokensToken !== token &&
        nextTokens.contains(SEMI)) {
      // if the '}' could be matched alternative to ';', we had an opt ';' (rule requiredSemi)
      recognizer.$nextTokensToken = token;
      recognizer.$nextTokensState = recognizer.state;
      recognizer.$nextTokensContext = recognizer._ctx;
    }
    return;
  }

  if (nextTokens.contains(antlr4.Token.EPSILON)) {
    // when exiting a (innermost) rule, remember the state to make
    // getExpectedTokensForMessage() calculate the full "expected set"
    if (recognizer.$nextTokensToken !== token) {
      // console.log('SET:',token.type,recognizer.state,recognizer.$nextTokensToken &&
      //   recognizer.$nextTokensToken.type)
      recognizer.$nextTokensToken = token;
      recognizer.$nextTokensState = recognizer.state;
      recognizer.$nextTokensContext = recognizer._ctx;
    }
    return;
  }

  // Expected token is identifier, current is (reserved) KEYWORD:
  // TODO: do not use this if "close enough" (1 char diff or prefix)
  //       to a keyword in nextTokens
  //
  // NOTE: it is important to do this only if EPSILON is not in `nextTokens`,
  // which means that we cannot bring the better special syntax-unexpected-reserved
  // in all cases.  Reason: high performance impact of the alternative,
  // i.e. calling method Parser#isExpectedToken() = invoking the ATN
  // interpreter to see behind EPSILON.
  const identType = recognizer.constructor.Identifier;
  if (keywordRegexp.test( token.text ) && nextTokens.contains( identType )) {
    recognizer.message( 'syntax-unexpected-reserved-word', token,
                        { code: token.text, delimited: token.text } );
    // TODO: attach tokens like for 'syntax-unexpected-token'
    token.type = identType; // make next ANTLR decision assume identifier
    return;
  }

  if (recognizer._ctx._sync === 'nop')
    return;

  switch (s.stateType) {
    case ATNState.BLOCK_START: // 3
    case ATNState.STAR_BLOCK_START: // 5
    case ATNState.PLUS_BLOCK_START: // 4
    case ATNState.STAR_LOOP_ENTRY: // 10
      // report error and recover if possible
      if ( token.text !== '}' && // do not just delete a '}'
           this.singleTokenDeletion(recognizer) !== null) { // also calls reportUnwantedToken
        return;
      }
      else if (recognizer._ctx._sync === 'recover') {
        this.reportInputMismatch( recognizer, new InputMismatchException(recognizer) );
        this.consumeUntil( recognizer, nextTokens );
        return;
      }
      // TODO: at least with STAR_LOOP_ENTRY, we might want to do s/th similar as
      // with LOOP_BACK (syncing to “expected tokens” -> the separator)
      throw new InputMismatchException(recognizer);

    case ATNState.PLUS_LOOP_BACK: // 11
    case ATNState.STAR_LOOP_BACK: { // 9
      // TODO: do not delete a '}', ')', ',', ';'
      this.reportUnwantedToken(recognizer);
      const expecting = new IntervalSet();
      expecting.addSet(recognizer.getExpectedTokens());
      // First try some ',' insertion (TODO does not work yet).  The helper uses
      // `this` (the error strategy), so it must be invoked with `this` bound:
      if (trySeparatorInsertion.call( this, recognizer, expecting, "','" ))
        return;
      // We then try syncing only to the loop-cont (`,`) / loop-end (`}`) token set,
      // but only for the current or next line (and not consuming `;`s):
      const prevToken = recognizer.getTokenStream().LT(-1);
      if (token.line <= prevToken.line + 1 && // in same or next line
          this.consumeAndMarkUntil( recognizer, expecting, true ))
        break;
      // console.log(token.text,JSON.stringify(intervalSetToArray(recognizer,expecting)))
      // If that fails, we also sync to all tokens which are in the follow set of
      // the current rule and all outer rules
      const whatFollowsLoopIterationOrRule
        = expecting.addSet(this.getErrorRecoverySet(recognizer));
      this.consumeUntil(recognizer, whatFollowsLoopIterationOrRule);
      // console.log(JSON.stringify(intervalSetToArray(recognizer,expecting)))
      if (recognizer._ctx._sync === 'recover' || // in start rule: no exception
          nextTokens.contains( recognizer.getTokenStream().LA(1) ))
        return;
      throw new InputMismatchException(recognizer);
    }
    default:
    // do nothing if we can't identify the exact kind of ATN state
  }
}

// Try to recover by pretending that the separator with literal name
// `separatorName` (e.g. "','") is missing before the current token.
//
// Must be called with `this` bound to the error strategy.  Returns `false` if
// no insertion is attempted, otherwise the conjured-up separator token
// (see getMissingSymbol()).
function trySeparatorInsertion( recognizer, expecting, separatorName ) {
  // Remark: this function does not really work, because it is based on
  // singleTokenInsertion, which also does not really work… (see below).
  // But we might improve it in the future…
  const separator = recognizer.literalNames.indexOf( separatorName );
  if (!expecting.contains( separator ))
    return false;

  const currentSymbolType = recognizer.getTokenStream().LA(1);
  // if current token is consistent with what could come after current
  // ATN state, then we know we're missing a token; error recovery
  // is free to conjure up and insert the missing token
  const { atn } = recognizer._interp;
  const currentState = atn.states[recognizer.state];
  const transition = separatorTransition( currentState.transitions, separator );
  if (!transition) // state without outgoing transitions: nothing to look at
    return false;
  const next = transition.target;
  // While this is an improvement to the default ANTLR code for
  // singleTokenInsertion(), it still does not help, as we navigate along an
  // epsilon transition, i.e. we still see ',', etc
  const expectingAtLL2 = atn.nextTokens(next, recognizer._ctx);
  if (!expectingAtLL2.contains(currentSymbolType))
    return false;

  this.reportMissingToken(recognizer);
  return this.getMissingSymbol( recognizer, separator );
}

// Return the first transition in `transitions` which matches token type
// `separator`; if there is none, return the first transition (which is
// `undefined` for an empty array).
function separatorTransition( transitions, separator ) {
  for (const tr of transitions) {
    if (tr.matches( separator ))
      return tr;
  }
  return transitions[0];
}

// Try to recover by deleting (= consuming) the current token if the token
// after it is expected.  Returns the token which is current after the deletion,
// or `null` if no deletion has been done.  A '}' is never deleted.
function singleTokenDeletion( recognizer ) {
  const token = recognizer.getCurrentToken();
  if (!token || token.text === '}')
    return null;

  const nextTokenType = recognizer.getTokenStream().LA(2);
  const { Number: num } = recognizer.constructor;
  if (nextTokenType > num && // next token is Id|Unreserved|IllegalToken
      token.type <= num)     // current token is not
    return null;

  const expecting = this.getExpectedTokens(recognizer);
  if (!expecting.contains(nextTokenType))
    return null;

  this.reportUnwantedToken(recognizer);
  recognizer.consume(); // simply delete extra token
  // we want to return the token we're actually matching
  const matchedSymbol = recognizer.getCurrentToken();
  this.reportMatch( recognizer ); // we know current token is correct
  return matchedSymbol;
}

// singleTokenInsertion called by recoverInline (called by match / in else),
// calls reportMissingToken

// Report `NoViableAltException e` signalled by parser `recognizer`
function reportNoViableAlternative( recognizer, e ) {
  // console.log('NOV:',this.getTokenErrorDisplay(e.startToken),
  //   this.getTokenErrorDisplay(e.offendingToken))
  if (e.startToken === e.offendingToken) { // mismatch at LA(1)
    this.reportInputMismatch( recognizer, e );
  }
  else {
    // third argument: the dead-end configs, or `true` if there are none
    this.reportInputMismatch( recognizer, e, !e.deadEndConfigs || e.deadEndConfigs.configs );
    do {
      // console.log('CONSUME-NOVIA:',this.getTokenErrorDisplay(recognizer.getCurrentToken()));
      recognizer.consume();
    } while (recognizer.getCurrentToken() !== e.offendingToken);
    // this.lastErrorIndex = e.startToken.tokenIndex; // avoid another consume()
  }
}

// Report `InputMismatchException e`
// signalled by parser `recognizer`
//
// `deadEnds` is passed on to getExpectedTokensForMessage(); if it is `true`,
// the "expecting" set cannot be computed and a message without it is issued.
function reportInputMismatch( recognizer, e, deadEnds ) {
  const { offendingToken } = e;
  const expecting = (deadEnds === true) // true: cannot compute expecting
    ? false
    : this.getExpectedTokensForMessage( recognizer, offendingToken, deadEnds );
  const offending = this.getTokenDisplay( offendingToken, recognizer );
  offendingToken.$isSkipped = 'offending';

  const withExpecting = Boolean( expecting && expecting.length );
  // The fallback should not really happen anymore... -> no messageId !
  const err = (withExpecting)
    ? recognizer.error( 'syntax-unexpected-token', offendingToken, { offending, expecting } )
    : recognizer.error( null, offendingToken, { offending }, 'Mismatched $(OFFENDING)' );
  if (withExpecting)
    err.expectedTokens = expecting;

  if (recognizer.avoidErrorListeners) // with --trace-parser or --trace-parser-ambig
    return;
  recognizer.notifyErrorListeners( err.message, offendingToken, err );
}

// Report unwanted token when the parser `recognizer` tries to recover/sync.
// `expecting` is optional; if not provided, it is computed for the current token.
function reportUnwantedToken( recognizer, expecting ) {
  if (this.inErrorRecoveryMode(recognizer))
    return;
  this.beginErrorCondition(recognizer);

  const current = recognizer.getCurrentToken();
  current.$isSkipped = 'offending';
  const expected = expecting ?? this.getExpectedTokensForMessage( recognizer, current );
  const offending = this.getTokenDisplay( current, recognizer );
  // Just text variant, no other message id!  Would depend on ANTLR-internals
  const textParams = { '#': 'unwanted', offending, expecting: expected };
  const err = recognizer.error( 'syntax-unexpected-token', current, textParams );
  err.expectedTokens = expected;
  // TODO: remove next token?

  if (recognizer.avoidErrorListeners) // with --trace-parser or --trace-parser-ambig
    return;
  recognizer.notifyErrorListeners( err.message, current, err );
}

// Report missing token when the parser `recognizer` tries to recover/sync
function reportMissingToken( recognizer ) {
  if (this.inErrorRecoveryMode(recognizer))
    return;
  this.beginErrorCondition(recognizer);

  const current = recognizer.getCurrentToken();
  current.$isSkipped = 'offending';
  const expecting = this.getExpectedTokensForMessage( recognizer, current );
  const offending = this.getTokenDisplay( current, recognizer );
  // TODO: if non-reserved keyword will not been parsed as keyword, use Identifier for offending
  // Hopefully not too ANTLR-specific, so extra message id is ok:
  const textParams = { offending, expecting };
  const err = recognizer.error( 'syntax-missing-token', current, textParams,
                                'Missing $(EXPECTING) before $(OFFENDING)' );
  err.expectedTokens = expecting;

  if (recognizer.avoidErrorListeners) // with --trace-parser or --trace-parser-ambig
    return;
  recognizer.notifyErrorListeners( err.message, current, err );
}

// Issue warning 'syntax-unexpected-semicolon' for token `t`: the previous
// keyword `with` is ignored.  Advances the parser state along the first
// transition of the current ATN state before computing the expected tokens.
function reportIgnoredWith( recognizer, t ) {
  const { states } = recognizer._interp.atn;
  const { target } = states[recognizer.state].transitions[0];
  recognizer.state = target.stateNumber; // previous match() does not set the state

  const expecting = this.getExpectedTokensForMessage( recognizer, t );
  const textParams = { offending: "';'", expecting, keyword: 'with' };
  const warning = recognizer.warning(
    'syntax-unexpected-semicolon', t, textParams,
    // eslint-disable-next-line @stylistic/js/max-len
    'Unexpected $(OFFENDING), expecting $(EXPECTING) - ignored previous $(KEYWORD)'
  );
  warning.expectedTokens = expecting;
}

// Consume tokens until one in `set` is current.  If `;` is not in `set`, also
// stop at `;` and `}` (but not at an Identifier), and consume a final `;`/`}`
// (the `}` only if it is not in `set`) and a directly following `;`.
function consumeUntil( recognizer, set ) {
  // TODO: add trace
  SEMI ??= recognizer.literalNames.indexOf( "';'" );
  RBRACE ??= recognizer.literalNames.indexOf( "'}'" );
  // let s=this.getTokenDisplay( recognizer.getCurrentToken(), recognizer );

  // No special handling if the grammar has no ';' or '}', or if ';' is in the
  // sync set anyway - do not check for RBRACE here!
  if (SEMI < 1 || RBRACE < 1 || set.contains(SEMI)) {
    this.consumeAndMarkUntil( recognizer, set );
    // console.log('CONSUMED-ORIG:',s,this.getTokenDisplay( recognizer.getCurrentToken(),
    //   recognizer ),recognizer.getCurrentToken().line,intervalSetToArray( recognizer, set ));
    return;
  }

  // DO NOT modify input param `set`, as the set might be cached in the ATN
  const stopSet = new IntervalSet();
  stopSet.addSet( set );
  stopSet.removeOne( recognizer.constructor.Identifier );
  stopSet.addOne( SEMI );
  // I am not that sure whether to add RBRACE...
  stopSet.addOne( RBRACE );
  this.consumeAndMarkUntil( recognizer, stopSet );

  const stream = recognizer.getTokenStream();
  const stoppedAt = stream.LA(1);
  const isExtraBrace = stoppedAt === RBRACE && !set.contains(RBRACE);
  if (stoppedAt === SEMI || isExtraBrace) {
    recognizer.consume();
    this.reportMatch(recognizer); // we know current token is correct
  }
  // if matched '}', also try to match next ';' (also matches double ';')
  if (recognizer.getTokenStream().LA(1) === SEMI) {
    recognizer.consume();
    this.reportMatch(recognizer); // we know current token is correct
  }
  // console.log('CONSUMED:',s,this.getTokenDisplay( recognizer.getCurrentToken(),
  //   recognizer ),recognizer.getCurrentToken().line);
  // throw new CompilerAssertion('Sync')
}

// Consume tokens until EOF or a token whose type is in `set` is current; mark
// each consumed token with `$isSkipped` (an existing truthy mark is kept).
// With `onlyInSameLine`, give up (return `false`) at a `;`, a `}` or a token
// which is not on the line of the first token.  Returns `true` otherwise.
function consumeAndMarkUntil( recognizer, set, onlyInSameLine ) {
  const stream = recognizer.getTokenStream();
  const first = stream.LT(1);
  const startLine = first.line;

  for (let tok = first;
    tok.type !== antlr4.Token.EOF && !set.contains( tok.type );
    tok = stream.LT(1)) {
    const leavesLine = tok.line !== startLine || tok.text === ';' || tok.text === '}';
    if (onlyInSameLine && leavesLine)
      return false; // early exit
    tok.$isSkipped ||= true;
    recognizer.consume();
  }
  return true;
}

// As the `match` function of the parser `recognizer` does not allow to check
// against a set of token types, the generated parser code checks against that
// set itself and calls this function if not successful.
// We now also allow keywords if the Identifier is expected.
// Called by match() and in generated parser in "else part" before consume()
// for ( TOKEN1 | TOKEN2 )
//
// If an Identifier is expected and the current token looks like a keyword
// (letters only), report 'syntax-unexpected-reserved-word', consume the token
// and return it.  Otherwise, delegate to the base-class `recoverInline`.
function recoverInline( recognizer ) {
  const identType = recognizer.constructor.Identifier;
  if (!identType || !recognizer.isExpectedToken( identType ))
    return this._super.recoverInline.call( this, recognizer );

  const token = recognizer.getCurrentToken();
  // TODO: do not delete `)`, `}`,
  // TODO: overwrite singleTokenDeletion do not delete parens etc for identifier
  // or non-reserved keywords
  if (!keywordRegexp.test( token.text ))
    return this._super.recoverInline.call( this, recognizer );

  // TODO: attach `Identifier` as valid name to message?
  recognizer.message( 'syntax-unexpected-reserved-word', token,
                      { code: token.text, delimited: token.text } );
  this.reportMatch(recognizer); // we know current token is correct
  recognizer.consume();
  return token;
}

// Conjure up a missing token during error recovery in parser `recognizer`.  If
// an identifier is expected, create one.
// Think about: we might want to prefer one of '}]);,'.
//
// `expectedTokenType` is optional; if not provided (nullish), any element of
// the expected-tokens set is used - this requires `this` to be the error
// strategy.  The created token has an empty text and the line/column of the
// current token.
function getMissingSymbol( recognizer, expectedTokenType ) {
  expectedTokenType ??= this.getExpectedTokens(recognizer).first(); // get any element
  const current = recognizer.getCurrentToken();
  return recognizer.getTokenFactory().create(
    current.source, // do s/th special if EOF like in DefaultErrorStrategy ?
    expectedTokenType, '', antlr4.Token.DEFAULT_CHANNEL, // empty string as token text
    -1, -1, current.line, current.column
  );
}

// Return the sorted array of token names for the token types in the interval
// set `expected`, as to be used in messages.
//
// - Generic keyword tokens are replaced by the names in `recognizer.$genericKeywords`.
// - Helper tokens do not appear.
// - With truthy `excludesForNextToken`, the first element of
//   `recognizer.$adaptExpectedExcludes` is used as exclude list if it is an array.
function intervalSetToArray( recognizer, expected, excludesForNextToken ) {
  // similar to `IntervalSet#toTokenString`
  let names = [];
  const pc = recognizer.constructor;
  // An empty set can have `intervals: null` instead of an empty array (as in
  // the ANTLR4-js IntervalSet); treat that as "no names".
  for (const v of expected.intervals ?? []) {
    for (let j = v.start; j < v.stop; j++) {
      // a generic keyword as such does not appear in messages, only its replacements,
      // which are function name and argument position dependent:
      if (j === pc.GenericExpr) {
        names.push( ...recognizer.$genericKeywords.expr );
      }
      else if (j === pc.GenericSeparator) {
        names.push( ...recognizer.$genericKeywords.separator );
      }
      else if (j === pc.GenericIntro) {
        names.push( ...recognizer.$genericKeywords.introMsg );
      }
      else if (j === pc.SemicolonTopLevel) {
        // We only insert a semicolon (i.e. make it optional) after a closing brace.
        // If the previous token is not `}`, don't propose these keywords, as ';' is required.
        if (recognizer._input.LA(-1) === recognizer._input.BRACE_CLOSE) {
          const name = recognizer.topLevelKeywords.map(i => expected
            .elementName(recognizer.literalNames, recognizer.symbolicNames, i));
          names.push(...name);
          if (recognizer._ctx.outer?.kind !== 'source') {
            if (names.includes('<EOF>'))
              names.splice(names.indexOf('<EOF>'), 1);
          }
        }
      }
      // other expected tokens usually appear in messages, except the helper tokens
      // which are used to solve ambiguities via the parser method setLocalToken():
      else if (j !== pc.HelperToken1 && j !== pc.HelperToken2) {
        names.push( expected.elementName(recognizer.literalNames, recognizer.symbolicNames, j ) );
      }
    }
  }
  // The parser method excludeExpected() additionally removes some tokens from the message:
  if (recognizer.$adaptExpectedToken &&
      recognizer.$nextTokensToken === recognizer.$adaptExpectedToken) {
    const excludes = (excludesForNextToken && Array.isArray(recognizer.$adaptExpectedExcludes[0]))
      ? recognizer.$adaptExpectedExcludes[0]
      : recognizer.$adaptExpectedExcludes;
    names = names.filter( n => !excludes.includes( n ) );
  }
  else if (names.includes("';'")) {
    names = names.filter( n => n !== "'}'" );
  }
  else if (names.includes("'?'")) {
    names = names.filter( n => n !== "'?'" );
  }
  // A compare function must return 0 for equal elements (the same name can be
  // pushed more than once), otherwise the sort order is implementation-defined.
  names.sort( (a, b) => {
    const precA = tokenPrecedence(a);
    const precB = tokenPrecedence(b);
    if (precA === precB)
      return 0;
    return (precA < precB) ? -1 : 1;
  } );
  return names;
}

// Used for sorting in messages: sort class by the second character of the
// token name (the first one is the quote of a literal name)
const token1sort = {
  // 0: Identifier, Number, ...
  // 1: separators:
  ',': 1, '.': 1, ':': 1, ';': 1,
  // 2: parentheses:
  '(': 2, ')': 2, '[': 2, ']': 2, '{': 2, '}': 2,
  // 3: special:
  '!': 3, '#': 3, $: 3, '?': 3, '@': 3,
  // 4: operators:
  '*': 4, '+': 4, '-': 4, '/': 4, '<': 4, '=': 4, '>': 4, '|': 4,
  // 8: KEYWORD
  // 9: <EOF>
};

// Return the sort key for token name `name`: the name prefixed by the digit of
// its sort class, see `token1sort`.
function tokenPrecedence( name ) {
  if (name.length < 2 || name === '<EOF>')
    return `9${ name }`;
  const prec = token1sort[name.charAt(1)];
  if (prec)
    return `${ prec }${ name }`;
  // second char before 'a' (e.g. upper-case letter): class 8, otherwise class 0
  return (name.charAt(1) < 'a' ? '8' : '0') + name;
}

// Return the text used in messages for `token`: its literal name if it has one,
// otherwise its symbolic name; '<EOF>' for no token, EOF and EPSILON.
function getTokenDisplay( token, recognizer ) {
  if (!token)
    return '<EOF>';
  const t = token.type;
  if (t === antlr4.Token.EOF || t === antlr4.Token.EPSILON ) {
    return '<EOF>';
  }
  else if (t === recognizer.constructor.DOTbeforeBRACE) {
    // the display text depends on the token which follows
    if (recognizer.getTokenStream().LT(2).text === '{')
      return "'.{'";
    return "'.*'";
  }
  return recognizer.literalNames[t] || recognizer.symbolicNames[t];
}

// Return the names of the token types which the parser had expected.  Do not
// include non-reserved keywords if not mentioned explicitly (i.e. other than
// from rule `ident`).
//
// We actually define something like a corrected version of function
// `LL1Analyzer.prototype.getDecisionLookahead`.  We cannot just redefine
// `getExpectedTokens`, because that function is also used to decide whether
// to consume in `DefaultErrorStrategy#singleTokenDeletion`.
// Compute the names of the expected tokens for a message about `offendingToken`.
//
// - `recognizer`: the parser
// - `offendingToken`: the token the message is about; if it is the token
//   remembered in `recognizer.$nextTokensToken`, the remembered state/context
//   is used instead of the current one
// - `deadEnds`: optional iterable of objects with property `state`; if
//   provided, the expected set is collected from those states
//
// Returns an array of token names (via intervalSetToArray()), which is empty
// if the parser state is negative.  Throws a `CompilerAssertion` if the state
// number is not a valid index into the ATN states.
function getExpectedTokensForMessage( recognizer, offendingToken, deadEnds ) {
  const { atn } = recognizer._interp;
  if (recognizer.state < 0)
    return [];
  if (recognizer.state >= atn.states.length) {
    throw new CompilerAssertion( `Invalid state number ${ recognizer.state } for ${ this.getTokenErrorDisplay( offendingToken ) }`);
  }
  const identType = recognizer.constructor.Identifier;
  const hideAltsType = recognizer.constructor.HideAlternatives;
  const beforeUnreserved = recognizer.constructor.Number;
  // Without the token types `Number` and `Identifier` or with no token type in
  // between them, use the expected-set of the base class as it is.
  if (!identType || !beforeUnreserved || beforeUnreserved + 2 > identType)
    return intervalSetToArray( recognizer, this._super.getExpectedTokens.call( this, recognizer ) );

  const ll1 = new Antlr4LL1Analyzer(atn);
  const expected = new IntervalSet();
  // Replace the adding methods on this one set object by the filtering
  // functions defined below; the original methods are kept for delegation.
  const origAddInterval = expected.addInterval;
  const origAddSet = expected.addSet;
  expected.addInterval = addInterval;
  expected.addSet = addSet;
  const lookBusy = new antlr4.Utils.Set();
  const calledRules = new antlr4.Utils.BitSet();
  if (deadEnds) {
    // "No viable alternative" by adaptivePredict() not on first token
    for (const trans of deadEnds) {
      ll1._LOOK( trans.state, null, predictionContext( atn, recognizer._ctx ),
                 expected, lookBusy, calledRules, true, true );
    }
    return intervalSetToArray( recognizer, expected, true );
  }
  else if (offendingToken && recognizer.$nextTokensContext &&
           offendingToken === recognizer.$nextTokensToken) {
    // Before exiting a rule, we had a state (via sync()) with a bigger
    // "expecting set" for the same token
    ll1._LOOK( atn.states[recognizer.$nextTokensState], null,
               predictionContext( atn, recognizer.$nextTokensContext ),
               expected, lookBusy, calledRules, true, true );
  }
  else {
    // Use current state to compute "expecting"
    ll1._LOOK( atn.states[recognizer.state], null,
               predictionContext( atn, recognizer._ctx ),
               expected, lookBusy, calledRules, true, true );
  }
  // console.log(state, recognizer.$nextTokensState,
  //   expected.toString(recognizer.literalNames, recognizer.symbolicNames));
  return intervalSetToArray( recognizer, expected );

  // The functions below are hoisted function declarations, i.e. they are
  // available before the `return` statements above.

  // Add the set `other` to the IntervalSet `this`, except if `other` contains
  // the token type `HideAlternatives`: such a set is ignored completely.
  function addSet( other ) {
    if (!other.contains( hideAltsType ))
      origAddSet.call( this, other );
  }

  // Add an interval `v` to the IntervalSet `this`.  If `v` contains the token
  // type `Identifier`, do not add non-reserved keywords in `v`.
  // (`v.stop` is handled as exclusive upper bound here.)
  function addInterval( v ) {
    if (v.stop <= identType) {
      // interval ends before `Identifier`: add as it is
      origAddInterval.call(this, v);
    }
    else if (v.start >= identType) {
      // interval starts at or after `Identifier`
      if (v.stop === identType + 1 || !recognizer.tokenRewrite) {
        origAddInterval.call(this, v);
      }
      else {
        // add each type on its own, replaced by the entry in
        // `recognizer.tokenRewrite` (indexed relative to `Identifier`) if truthy
        for (let j = v.start; j < v.stop; j++)
          addRange( this, recognizer.tokenRewrite[j - identType] || j );
      }
    }
    else {
      // interval starts before and includes `Identifier`: only add the part up
      // to (and including) `Number`, and `Identifier` itself
      if (v.start <= beforeUnreserved)
        addRange( this, v.start, beforeUnreserved + 1 );
      addRange( this, identType );
    }
  }

  // Add the range `start`..`stop` (without `stop`) to the IntervalSet
  // `interval` using the original `addInterval`; without (truthy) `stop`, just
  // add the single value `start`.
  function addRange( interval, start, stop ) {
    origAddInterval.call( interval, new Interval( start, stop || start + 1 ) );
  }
}

module.exports = {
  KeywordErrorStrategy,
};