antlr4-runtime

JavaScript runtime for ANTLR4

/* Copyright (c) 2012-2022 The ANTLR Project. All rights reserved.
 * Use of this file is governed by the BSD 3-clause license that
 * can be found in the LICENSE.txt file in the project root.
 */
import Token from './Token.js';
import Recognizer from './Recognizer.js';
import CommonTokenFactory from './CommonTokenFactory.js';
import RecognitionException from './error/RecognitionException.js';
import LexerNoViableAltException from './error/LexerNoViableAltException.js';

/**
 * A lexer is a recognizer that draws input symbols from a character stream.
 * Lexer grammars result in a subclass of this object. A Lexer object
 * uses simplified match() and error recovery mechanisms in the interest of speed.
 */
export default class Lexer extends Recognizer {
    constructor(input) {
        super();
        this._input = input;
        this._factory = CommonTokenFactory.DEFAULT;
        this._tokenFactorySourcePair = [ this, input ];
        this._interp = null; // child classes must populate this
        /**
         * The goal of all lexer rules/methods is to create a token object.
         * This is an instance variable as multiple rules may collaborate to
         * create a single token. nextToken will return this object after
         * matching lexer rule(s). If you subclass to allow multiple token
         * emissions, then set this to the last token to be matched or
         * something non-null so that the auto token emit mechanism will not
         * emit another token.
         */
        this._token = null;
        /**
         * What character index in the stream did the current token start at?
         * Needed, for example, to get the text for the current token. Set at
         * the start of nextToken.
         */
        this._tokenStartCharIndex = -1;
        // The line on which the first character of the token resides
        this._tokenStartLine = -1;
        // The character position of the first character within the line
        this._tokenStartColumn = -1;
        // Once we see EOF on the char stream, the next token will be EOF.
        // If you have DONE : EOF ; then you see DONE EOF.
        this._hitEOF = false;
        // The channel number for the current token
        this._channel = Token.DEFAULT_CHANNEL;
        // The token type for the current token
        this._type = Token.INVALID_TYPE;
        this._modeStack = [];
        this._mode = Lexer.DEFAULT_MODE;
        /**
         * You can set the text for the current token to override what is in
         * the input char buffer. Use setText() or set this instance var directly.
         */
        this._text = null;
    }

    reset() {
        // reset Lexer state variables
        if (this._input !== null) {
            this._input.seek(0); // rewind the input
        }
        this._token = null;
        this._type = Token.INVALID_TYPE;
        this._channel = Token.DEFAULT_CHANNEL;
        this._tokenStartCharIndex = -1;
        this._tokenStartColumn = -1;
        this._tokenStartLine = -1;
        this._text = null;
        this._hitEOF = false;
        this._mode = Lexer.DEFAULT_MODE;
        this._modeStack = [];
        this._interp.reset();
    }

    // Return a token from this source; i.e., match a token on the char stream.
    nextToken() {
        if (this._input === null) {
            throw "nextToken requires a non-null input stream.";
        }
        /**
         * Mark start location in char stream so unbuffered streams are
         * guaranteed to at least have the text of the current token
         */
        const tokenStartMarker = this._input.mark();
        try {
            for (;;) {
                if (this._hitEOF) {
                    this.emitEOF();
                    return this._token;
                }
                this._token = null;
                this._channel = Token.DEFAULT_CHANNEL;
                this._tokenStartCharIndex = this._input.index;
                this._tokenStartColumn = this._interp.column;
                this._tokenStartLine = this._interp.line;
                this._text = null;
                let continueOuter = false;
                for (;;) {
                    this._type = Token.INVALID_TYPE;
                    let ttype = Lexer.SKIP;
                    try {
                        ttype = this._interp.match(this._input, this._mode);
                    } catch (e) {
                        if (e instanceof RecognitionException) {
                            this.notifyListeners(e); // report error
                            this.recover(e);
                        } else {
                            console.log(e.stack);
                            throw e;
                        }
                    }
                    if (this._input.LA(1) === Token.EOF) {
                        this._hitEOF = true;
                    }
                    if (this._type === Token.INVALID_TYPE) {
                        this._type = ttype;
                    }
                    if (this._type === Lexer.SKIP) {
                        continueOuter = true;
                        break;
                    }
                    if (this._type !== Lexer.MORE) {
                        break;
                    }
                }
                if (continueOuter) {
                    continue;
                }
                if (this._token === null) {
                    this.emit();
                }
                return this._token;
            }
        } finally {
            // make sure we release the marker after the match or an
            // unbuffered char stream will keep buffering
            this._input.release(tokenStartMarker);
        }
    }

    /**
     * Instruct the lexer to skip creating a token for the current lexer rule
     * and look for another token. nextToken() knows to keep looking when
     * a lexer rule finishes with the token set to SKIP_TOKEN. Recall that
     * if token==null at the end of any token rule, it creates one for you
     * and emits it.
     */
    skip() {
        this._type = Lexer.SKIP;
    }

    more() {
        this._type = Lexer.MORE;
    }

    mode(m) {
        this._mode = m;
    }

    pushMode(m) {
        if (this._interp.debug) {
            console.log("pushMode " + m);
        }
        this._modeStack.push(this._mode);
        this.mode(m);
    }

    popMode() {
        if (this._modeStack.length === 0) {
            throw "Empty Stack";
        }
        if (this._interp.debug) {
            console.log("popMode back to " + this._modeStack.slice(0, -1));
        }
        this.mode(this._modeStack.pop());
        return this._mode;
    }

    /**
     * By default does not support multiple emits per nextToken invocation
     * for efficiency reasons. Subclass and override this method, nextToken,
     * and getToken (to push tokens into a list and pull from that list
     * rather than a single variable as this implementation does).
     */
    emitToken(token) {
        this._token = token;
    }

    /**
     * The standard method called to automatically emit a token at the
     * outermost lexical rule. The token object should point into the
     * char buffer start..stop. If there is a text override in 'text',
     * use that to set the token's text. Override this method to emit
     * custom Token objects or provide a new factory.
     */
    emit() {
        const t = this._factory.create(this._tokenFactorySourcePair, this._type,
            this._text, this._channel, this._tokenStartCharIndex,
            this.getCharIndex() - 1, this._tokenStartLine, this._tokenStartColumn);
        this.emitToken(t);
        return t;
    }

    emitEOF() {
        const cpos = this.column;
        const lpos = this.line;
        const eof = this._factory.create(this._tokenFactorySourcePair, Token.EOF,
            null, Token.DEFAULT_CHANNEL, this._input.index,
            this._input.index - 1, lpos, cpos);
        this.emitToken(eof);
        return eof;
    }

    // What is the index of the current character of lookahead?
    getCharIndex() {
        return this._input.index;
    }

    /**
     * Return a list of all Token objects in the input char stream.
     * Forces a load of all tokens. Does not include the EOF token.
     */
    getAllTokens() {
        const tokens = [];
        let t = this.nextToken();
        while (t.type !== Token.EOF) {
            tokens.push(t);
            t = this.nextToken();
        }
        return tokens;
    }

    notifyListeners(e) {
        const start = this._tokenStartCharIndex;
        const stop = this._input.index;
        const text = this._input.getText(start, stop);
        const msg = "token recognition error at: '" + this.getErrorDisplay(text) + "'";
        const listener = this.getErrorListenerDispatch();
        listener.syntaxError(this, null, this._tokenStartLine,
            this._tokenStartColumn, msg, e);
    }

    getErrorDisplay(s) {
        const d = [];
        for (let i = 0; i < s.length; i++) {
            d.push(s[i]);
        }
        return d.join('');
    }

    getErrorDisplayForChar(c) {
        if (c.charCodeAt(0) === Token.EOF) {
            return "<EOF>";
        } else if (c === '\n') {
            return "\\n";
        } else if (c === '\t') {
            return "\\t";
        } else if (c === '\r') {
            return "\\r";
        } else {
            return c;
        }
    }

    getCharErrorDisplay(c) {
        return "'" + this.getErrorDisplayForChar(c) + "'";
    }

    /**
     * Lexers can normally match any char in their vocabulary after matching
     * a token, so do the easy thing and just kill a character and hope
     * it all works out. You can instead use the rule invocation stack
     * to do sophisticated error recovery if you are in a fragment rule.
     */
    recover(re) {
        if (this._input.LA(1) !== Token.EOF) {
            if (re instanceof LexerNoViableAltException) {
                // skip a char and try again
                this._interp.consume(this._input);
            } else {
                // TODO: Do we lose character or line position information?
                this._input.consume();
            }
        }
    }

    get inputStream() {
        return this._input;
    }

    set inputStream(input) {
        this._input = null;
        this._tokenFactorySourcePair = [ this, this._input ];
        this.reset();
        this._input = input;
        this._tokenFactorySourcePair = [ this, this._input ];
    }

    get sourceName() {
        return this._input.sourceName;
    }

    get type() {
        return this._type;
    }

    set type(type) {
        this._type = type;
    }

    get line() {
        return this._interp.line;
    }

    set line(line) {
        this._interp.line = line;
    }

    get column() {
        return this._interp.column;
    }

    set column(column) {
        this._interp.column = column;
    }

    get text() {
        if (this._text !== null) {
            return this._text;
        } else {
            return this._interp.getText(this._input);
        }
    }

    set text(text) {
        this._text = text;
    }
}

Lexer.DEFAULT_MODE = 0;
Lexer.MORE = -2;
Lexer.SKIP = -3;

Lexer.DEFAULT_TOKEN_CHANNEL = Token.DEFAULT_CHANNEL;
Lexer.HIDDEN = Token.HIDDEN_CHANNEL;
Lexer.MIN_CHAR_VALUE = 0x0000;
Lexer.MAX_CHAR_VALUE = 0x10FFFF;
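
For reference, a minimal usage sketch (not part of Lexer.js): in practice this class is driven through a subclass generated by the ANTLR tool. The example assumes a generated lexer named MyGrammarLexer (a hypothetical name), the standard antlr4 package exports (InputStream, CommonTokenStream), and that the package is installed as 'antlr4'; adjust names to your setup.

// usage-sketch.js (illustrative only)
import antlr4 from 'antlr4';
import MyGrammarLexer from './MyGrammarLexer.js'; // hypothetical generated subclass of Lexer

// Wrap the source text in a character stream and hand it to the lexer.
const chars = new antlr4.InputStream('a = 1;');
const lexer = new MyGrammarLexer(chars);

// getAllTokens() drains the stream up to, but not including, EOF (see above).
for (const token of lexer.getAllTokens()) {
    console.log(token.type, JSON.stringify(token.text));
}

In a full pipeline the lexer is usually wrapped in a CommonTokenStream and fed to a generated parser rather than drained directly with getAllTokens().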