@gobstones/gobstones-parser
Version:
Gobstones parser
598 lines (557 loc) • 20 kB
text/typescript
/* eslint-disable no-underscore-dangle */
import { GbsSyntaxError, GbsWarning } from './exceptions';
import { Input, MultifileReader, SourceReader } from './reader';
import {
T_AND,
T_ARROW,
T_ASSIGN,
T_CASE,
T_CHOOSE,
T_COMMA,
T_CONCAT,
T_DIV,
T_ELLIPSIS,
T_ELSE,
T_ELSEIF,
T_EOF,
T_EQ,
T_FIELD,
T_FOREACH,
T_FUNCTION,
T_GE,
T_GETS,
T_GT,
T_IF,
T_IN,
T_INTERACTIVE,
T_IS,
T_LBRACE,
T_LBRACK,
T_LE,
T_LET,
T_LOWERID,
T_LPAREN,
T_LT,
T_MATCHING,
T_MINUS,
T_MOD,
T_NE,
T_NOT,
T_NUM,
T_ON,
T_OR,
T_OTHERWISE,
T_PIPE,
T_PLUS,
T_POW,
T_PROCEDURE,
T_PROGRAM,
T_RANGE,
T_RBRACE,
T_RBRACK,
T_RECORD,
T_REPEAT,
T_RETURN,
T_RPAREN,
T_SELECT,
T_SEMICOLON,
T_STRING,
T_SWITCH,
T_THEN,
T_TIMEOUT,
T_TIMES,
T_TO,
T_TYPE,
T_UNDERSCORE,
T_UPPERID,
T_VARIANT,
T_WHEN,
T_WHILE,
Token
} from './token';
import { ASTDef } from './ast';
import { i18n } from './i18n';
const isWhitespace = (chr: string): boolean =>
chr === ' ' || chr === '\t' || chr === '\r' || chr === '\n';
const isDigit = (chr: string): boolean => chr >= '0' && chr <= '9';
/* We define a character to be alphabetic if it has two distinct forms:
* an uppercase form and a lowercase form.
*
* This accepts alphabetic Unicode characters but rejects numbers and symbols.
*/
const isAlpha = (chr: string): boolean => chr.toUpperCase() !== chr.toLowerCase();
/* An uppercase character is an alphabetic character that coincides with
* its uppercase form */
const isUpper = (chr: string): boolean => isAlpha(chr) && chr.toUpperCase() === chr;
/* A lowercase character is an alphabetic character that coincides with
* its lowercase form */
const isLower = (chr: string): boolean => isAlpha(chr) && chr.toLowerCase() === chr;
const isIdent = (chr: string): boolean =>
isAlpha(chr) || isDigit(chr) || chr === '_' || chr === "'";
const KEYWORDS = {
program: T_PROGRAM,
interactive: T_INTERACTIVE,
procedure: T_PROCEDURE,
function: T_FUNCTION,
return: T_RETURN,
/* Control structures */
if: T_IF,
then: T_THEN,
elseif: T_ELSEIF,
else: T_ELSE,
choose: T_CHOOSE,
when: T_WHEN,
otherwise: T_OTHERWISE,
repeat: T_REPEAT,
foreach: T_FOREACH,
in: T_IN,
while: T_WHILE,
switch: T_SWITCH,
to: T_TO,
matching: T_MATCHING,
select: T_SELECT,
on: T_ON,
/* Assignment */
let: T_LET,
/* Operators */
not: T_NOT,
div: T_DIV,
mod: T_MOD,
/* Records/variants */
type: T_TYPE,
is: T_IS,
record: T_RECORD,
variant: T_VARIANT,
case: T_CASE,
field: T_FIELD,
/* Default case in a switch/match */
_: T_UNDERSCORE
};
/* Pattern for timeouts in an interactive program */
// eslint-disable-next-line @typescript-eslint/ban-types
const i: string | Function = i18n('CONS:TIMEOUT');
const t: string = typeof i === 'string' ? i : '';
KEYWORDS[t] = T_TIMEOUT;
/* Note: the order is relevant so that the 'maximal munch' rule applies. */
const SYMBOLS = [
/* Various delimiters */
{ symbol: '(', tag: T_LPAREN },
{ symbol: ')', tag: T_RPAREN },
{ symbol: '{', tag: T_LBRACE },
{ symbol: '}', tag: T_RBRACE },
{ symbol: '[', tag: T_LBRACK }, // For lists and ranges
{ symbol: ']', tag: T_RBRACK },
{ symbol: ',', tag: T_COMMA },
{ symbol: ';', tag: T_SEMICOLON },
{ symbol: '...', tag: T_ELLIPSIS }, // For intentionally incomplete programs
/* Range operator */
{ symbol: '..', tag: T_RANGE },
/* Assignment */
{ symbol: ':=', tag: T_ASSIGN },
/* Logical operators */
{ symbol: '&&', tag: T_AND },
{ symbol: '||', tag: T_OR },
/* Fields */
{ symbol: '<-', tag: T_GETS }, // Field initializer, e.g. Coord(x <- 1, y <- 2)
{ symbol: '|', tag: T_PIPE }, // Field update, e.g. Coord(c | x <- 2)
/* Pattern matching */
{ symbol: '->', tag: T_ARROW }, // For the branches of a switch
/* Relational operators */
{ symbol: '==', tag: T_EQ },
{ symbol: '/=', tag: T_NE },
{ symbol: '<=', tag: T_LE },
{ symbol: '>=', tag: T_GE },
{ symbol: '<', tag: T_LT },
{ symbol: '>', tag: T_GT },
/* Functions */
{ symbol: '++', tag: T_CONCAT },
{ symbol: '+', tag: T_PLUS },
{ symbol: '-', tag: T_MINUS },
{ symbol: '*', tag: T_TIMES },
{ symbol: '^', tag: T_POW }
];
/* Valid language options accepted by the LANGUAGE pragma */
const LANGUAGE_OPTIONS = ['DestructuringForeach', 'AllowRecursion'];
const leadingZeroes = (string: string): boolean => string.length >= 0 && string[0] === '0';
// eslint-disable-next-line @typescript-eslint/explicit-function-return-type
function fail(startPos: SourceReader, endPos: SourceReader, reason: string, args: any[]) {
throw new GbsSyntaxError(startPos, endPos, reason, args);
}
const CLOSING_DELIMITERS = {
'(': ')',
'[': ']',
'{': '}'
};
/* An instance of Lexer scans source code for tokens.
* Example:
*
* let tok = new Lexer('if (a)');
* tok.nextToken(); // ~~> new Token(T_IF, null, ...)
* tok.nextToken(); // ~~> new Token(T_LPAREN, null, ...)
* tok.nextToken(); // ~~> new Token(T_LOWERID, 'a', ...)
* tok.nextToken(); // ~~> new Token(T_RPAREN, null, ...)
* tok.nextToken(); // ~~> new Token(T_EOF, null, ...)
*
* The 'input' parameter is either a string or a mapping
* from filenames to strings.
*/
export class Lexer {
private _multifileReader: MultifileReader;
private _reader: SourceReader;
private _warnings: GbsWarning[];
private _delimiterStack: any[];
private _pendingAttributes: Record<string, ASTDef>;
private _languageOptions: string[];
public constructor(input: Input) {
this._multifileReader = new MultifileReader(input);
this._reader = this._multifileReader.readCurrentFile();
this._warnings = [];
/* A stack of tokens '(', '[' and '{', to provide more helpful
* error reporting if delimiters are not balanced. */
this._delimiterStack = [];
/* A dictionary of pending attributes, set by the ATTRIBUTE pragma.
* Pending attributes are used by the parser to decorate any procedure
* or function definition. */
this._pendingAttributes = {};
/* A list of language options, enabled by the LANGUAGE pragma.
* Language options are interpreted by the runner to initialize.
* the remaining modules (linter, compiler, runtime, ...)
* accordingly. */
this._languageOptions = [];
}
/* Return the next token from the input */
public nextToken(): Token {
if (!this._findNextToken()) {
const token = new Token(T_EOF, undefined, this._reader, this._reader);
this._checkBalancedDelimiters(token);
return token;
}
if (isDigit(this._reader.peek())) {
const startPos = this._reader;
const value = this._readStringWhile(isDigit);
const endPos = this._reader;
if (leadingZeroes(value) && value.length > 1) {
fail(startPos, endPos, 'numeric-constant-should-not-have-leading-zeroes', []);
}
return new Token(T_NUM, value, startPos, endPos);
} else if (isIdent(this._reader.peek())) {
const startPos = this._reader;
const value = this._readStringWhile(isIdent);
const endPos = this._reader;
if (value in KEYWORDS) {
return new Token(KEYWORDS[value], value, startPos, endPos);
} else if (isUpper(value[0])) {
return new Token(T_UPPERID, value, startPos, endPos);
} else if (isLower(value[0])) {
return new Token(T_LOWERID, value, startPos, endPos);
} else {
fail(startPos, endPos, 'identifier-must-start-with-alphabetic-character', []);
}
} else if (this._reader.peek() === '"') {
return this._readStringConstant();
} else {
return this._readSymbol();
}
}
/* When tokenization is done, this function returns the list of all
* the warnings collected during tokenization */
public warnings(): GbsWarning[] {
return this._warnings;
}
/* Skip whitespace and advance through files until the start of the next
* token is found. Return false if EOF is found. */
public _findNextToken(): boolean {
for (;;) {
this._ignoreWhitespaceAndComments();
if (!this._reader.eof()) {
break;
}
if (this._multifileReader.moreFiles()) {
this._multifileReader.nextFile();
this._reader = this._multifileReader.readCurrentFile();
} else {
return false;
}
}
return true;
}
/* Read a string while the given condition holds for the current
* character */
// eslint-disable-next-line @typescript-eslint/ban-types
public _readStringWhile(condition: Function): string {
const result = [];
while (!this._reader.eof()) {
if (!condition(this._reader.peek())) {
break;
}
result.push(this._reader.peek());
this._reader = this._reader.consumeCharacter();
}
return result.join('');
}
/* Reads a quote-delimited string constant.
* Escapes are recognized. */
public _readStringConstant(): Token {
const startPos = this._reader;
const result = [];
this._reader = this._reader.consumeCharacter();
while (!this._reader.eof()) {
const c = this._reader.peek();
if (c === '"') {
this._reader = this._reader.consumeCharacter();
return new Token(T_STRING, result.join(''), startPos, this._reader);
} else if (c === '\\') {
this._reader = this._reader.consumeCharacter();
if (this._reader.eof()) {
break;
}
const c2 = this._reader.peek();
this._reader = this._reader.consumeCharacter();
switch (c2) {
case 'a':
result.push('\u0007');
break;
case 'b':
result.push('\u0008');
break;
case 'f':
result.push('\u000c');
break;
case 'n':
result.push('\u000a');
break;
case 'r':
result.push('\u000d');
break;
case 't':
result.push('\u0009');
break;
case 'v':
result.push('\u000b');
break;
default:
result.push(c2);
break;
}
} else {
result.push(c);
this._reader = this._reader.consumeCharacter();
}
}
fail(startPos, this._reader, 'unclosed-string-constant', []);
}
/* Read a symbol */
public _readSymbol(): Token {
for (const { symbol, tag } of SYMBOLS) {
if (this._reader.startsWith(symbol)) {
const startPos = this._reader;
this._reader = this._reader.consumeString(symbol);
const endPos = this._reader;
const token = new Token(tag, symbol, startPos, endPos);
this._checkBalancedDelimiters(token);
return token;
}
}
fail(this._reader, this._reader, 'unknown-token', [this._reader.peek()]);
}
public _ignoreWhitespaceAndComments(): void {
while (this._ignoreWhitespace() || this._ignoreComments()) {
/* continue */
}
}
public _ignoreWhitespace(): boolean {
if (!this._reader.eof() && isWhitespace(this._reader.peek())) {
this._reader = this._reader.consumeCharacter();
return true;
} else {
return false;
}
}
/* Skips comments and pragmas, returns false if there are no comments */
public _ignoreComments(): boolean {
if (this._startSingleLineComment()) {
this._ignoreSingleLineComment();
return true;
} else if (this._reader.startsWith('/*@')) {
const startPos = this._reader;
this._evaluatePragma(startPos, this._readInvisiblePragma('/*', '*/', '@'));
return true;
} else if (this._reader.startsWith('{-')) {
this._ignoreMultilineComment('{-', '-}');
return true;
} else if (this._reader.startsWith('/*')) {
this._ignoreMultilineComment('/*', '*/');
return true;
} else {
return false;
}
}
/* Returns true if a single-line comment starts here */
public _startSingleLineComment(): boolean {
return (
this._reader.startsWith('--') ||
this._reader.startsWith('//') ||
this._reader.startsWith('#')
);
}
/* Skips a single-line comment */
public _ignoreSingleLineComment(): void {
while (!this._reader.eof()) {
this._reader = this._reader.consumeCharacter();
if (this._reader.peek() === '\n') {
break;
}
}
}
/* Skips a multiline comment with the given left/right delimiters.
* Multi-line comments may be nested. */
public _ignoreMultilineComment(left: string, right: string): void {
let nesting = 0;
const startPos = this._reader;
while (!this._reader.eof()) {
if (this._reader.startsWith(left)) {
this._reader = this._reader.consumeString(left);
nesting++;
} else if (this._reader.startsWith(right)) {
this._reader = this._reader.consumeString(right);
nesting--;
if (nesting === 0) {
return;
}
} else {
this._reader = this._reader.consumeCharacter();
}
}
fail(startPos, this._reader, 'unclosed-multiline-comment', []);
}
/* Read a pragma. A pragma is a comment delimited by the
* given left / *
* and right * /
* comment delimiters.
* It has N >= 0 parts delimited by the pragma delimiter @
* @part1@part2@...@partN@
*/
public _readInvisiblePragma(left: string, right: string, pragmaDelim: string): string[] {
const pragma = [];
const startPos = this._reader;
this._reader = this._reader.consumeInvisibleString(left);
this._reader = this._reader.consumeInvisibleString(pragmaDelim);
while (!this._reader.eof()) {
pragma.push(this._readInvisibleStringUntilDelimiter(pragmaDelim));
this._reader = this._reader.consumeInvisibleString(pragmaDelim);
if (this._reader.startsWith(right)) {
this._reader = this._reader.consumeInvisibleString(right);
return pragma;
}
}
fail(startPos, this._reader, 'unclosed-multiline-comment', []);
}
/* Read an invisible string until the given delimiter is found */
public _readInvisibleStringUntilDelimiter(delimiter: string): string {
const startPos = this._reader;
const result = [];
while (!this._reader.eof()) {
if (this._reader.peek() === delimiter) {
return result.join('');
}
result.push(this._reader.peek());
this._reader = this._reader.consumeInvisibleCharacter();
}
fail(startPos, this._reader, 'unclosed-multiline-comment', []);
}
public _evaluatePragma(startPos: SourceReader, pragma: string[]): void {
if (pragma.length === 0) {
this._emitWarning(startPos, this._reader, 'empty-pragma', []);
} else if (pragma[0] === 'BEGIN_REGION') {
const region = pragma[1];
this._reader = this._reader.beginRegion(region);
} else if (pragma[0] === 'END_REGION') {
this._reader = this._reader.endRegion();
} else if (pragma[0] === 'ATTRIBUTE' && pragma.length >= 2) {
const key = pragma[1];
const value = pragma.slice(2, pragma.length).join('@');
this.setAttribute(key, value);
} else if (pragma[0] === 'LANGUAGE' && pragma.length === 2) {
const languageOption = pragma[1];
this.addLanguageOption(languageOption);
} else {
this._emitWarning(startPos, this._reader, 'unknown-pragma', [pragma[0]]);
}
}
public _emitWarning(
startPos: SourceReader,
endPos: SourceReader,
reason: string,
args: any[]
): void {
this._warnings.push(new GbsWarning(startPos, endPos, reason, args));
}
/* Check that reading a delimiter keeps the delimiter stack balanced. */
public _checkBalancedDelimiters(token: Token): void {
if (token.tag === T_EOF && this._delimiterStack.length > 0) {
const openingDelimiter = this._delimiterStack.pop();
fail(
openingDelimiter.startPos,
openingDelimiter.endPos,
'unmatched-opening-delimiter',
[openingDelimiter.value]
);
} else if (token.tag === T_LPAREN || token.tag === T_LBRACE || token.tag === T_LBRACK) {
this._delimiterStack.push(token);
} else if (token.tag === T_RPAREN || token.tag === T_RBRACE || token.tag === T_RBRACK) {
if (this._delimiterStack.length === 0) {
fail(token.startPos, token.endPos, 'unmatched-closing-delimiter', [token.value]);
}
const openingDelimiter = this._delimiterStack.pop();
if (CLOSING_DELIMITERS[openingDelimiter.value] !== token.value) {
fail(
openingDelimiter.startPos,
openingDelimiter.endPos,
'unmatched-opening-delimiter',
[openingDelimiter.value]
);
}
}
}
/*
* Interface for handling attributes.
*
* The pragma ATTRIBUTE@key@value
* establishes the attribute given by <key> to <value>.
*
* Whenever the parser finds a definition of the following kinds:
* procedure
* function
* program
* interactive program
* type
* it gets decorated with the pending attributes.
*/
public getPendingAttributes(): Record<string, ASTDef> {
const a = this._pendingAttributes;
this._pendingAttributes = {};
return a;
}
// eslint-disable-next-line @typescript-eslint/explicit-module-boundary-types
public setAttribute(key: string, value: any): void {
this._pendingAttributes[key] = value;
}
/*
* Interface for handling language options.
*
* The pragma LANGUAGE@option sets the given option.
*
* The runner module reads these options to initialize the
* linter/compiler/runtime.
*/
public getLanguageOptions(): string[] {
return this._languageOptions;
}
public addLanguageOption(option: string): void {
if (LANGUAGE_OPTIONS.indexOf(option) !== -1) {
this._languageOptions.push(option);
} else {
fail(this._reader, this._reader, 'unknown-language-option', [option]);
}
}
}