UNPKG

@gobstones/gobstones-parser

Version:
598 lines (557 loc) 20 kB
/* eslint-disable no-underscore-dangle */
import { GbsSyntaxError, GbsWarning } from './exceptions';
import { Input, MultifileReader, SourceReader } from './reader';
import {
    T_AND,
    T_ARROW,
    T_ASSIGN,
    T_CASE,
    T_CHOOSE,
    T_COMMA,
    T_CONCAT,
    T_DIV,
    T_ELLIPSIS,
    T_ELSE,
    T_ELSEIF,
    T_EOF,
    T_EQ,
    T_FIELD,
    T_FOREACH,
    T_FUNCTION,
    T_GE,
    T_GETS,
    T_GT,
    T_IF,
    T_IN,
    T_INTERACTIVE,
    T_IS,
    T_LBRACE,
    T_LBRACK,
    T_LE,
    T_LET,
    T_LOWERID,
    T_LPAREN,
    T_LT,
    T_MATCHING,
    T_MINUS,
    T_MOD,
    T_NE,
    T_NOT,
    T_NUM,
    T_ON,
    T_OR,
    T_OTHERWISE,
    T_PIPE,
    T_PLUS,
    T_POW,
    T_PROCEDURE,
    T_PROGRAM,
    T_RANGE,
    T_RBRACE,
    T_RBRACK,
    T_RECORD,
    T_REPEAT,
    T_RETURN,
    T_RPAREN,
    T_SELECT,
    T_SEMICOLON,
    T_STRING,
    T_SWITCH,
    T_THEN,
    T_TIMEOUT,
    T_TIMES,
    T_TO,
    T_TYPE,
    T_UNDERSCORE,
    T_UPPERID,
    T_VARIANT,
    T_WHEN,
    T_WHILE,
    Token
} from './token';

import { ASTDef } from './ast';
import { i18n } from './i18n';

/* Whitespace is a blank, a tab, a carriage return or a line feed. */
const isWhitespace = (chr: string): boolean =>
    chr === ' ' || chr === '\t' || chr === '\r' || chr === '\n';

/* Only the ASCII digits '0'..'9' count as digits. */
const isDigit = (chr: string): boolean => chr >= '0' && chr <= '9';

/* We define a character to be alphabetic if it has two distinct forms:
 * an uppercase form and a lowercase form.
 *
 * This accepts alphabetic Unicode characters but rejects numbers and symbols.
 */
const isAlpha = (chr: string): boolean => chr.toUpperCase() !== chr.toLowerCase();

/* An uppercase character is an alphabetic character that coincides with
 * its uppercase form */
const isUpper = (chr: string): boolean => isAlpha(chr) && chr.toUpperCase() === chr;

/* A lowercase character is an alphabetic character that coincides with
 * its lowercase form */
const isLower = (chr: string): boolean => isAlpha(chr) && chr.toLowerCase() === chr;

/* Characters that may appear in an identifier: letters, digits,
 * underscores and single quotes. */
const isIdent = (chr: string): boolean =>
    isAlpha(chr) || isDigit(chr) || chr === '_' || chr === "'";

/* Reserved words, mapped to the tag of the token they produce. */
const KEYWORDS = {
    program: T_PROGRAM,
    interactive: T_INTERACTIVE,
    procedure: T_PROCEDURE,
    function: T_FUNCTION,
    return: T_RETURN,
    /* Control structures */
    if: T_IF,
    then: T_THEN,
    elseif: T_ELSEIF,
    else: T_ELSE,
    choose: T_CHOOSE,
    when: T_WHEN,
    otherwise: T_OTHERWISE,
    repeat: T_REPEAT,
    foreach: T_FOREACH,
    in: T_IN,
    while: T_WHILE,
    switch: T_SWITCH,
    to: T_TO,
    matching: T_MATCHING,
    select: T_SELECT,
    on: T_ON,
    /* Assignment */
    let: T_LET,
    /* Operators */
    not: T_NOT,
    div: T_DIV,
    mod: T_MOD,
    /* Records/variants */
    type: T_TYPE,
    is: T_IS,
    record: T_RECORD,
    variant: T_VARIANT,
    case: T_CASE,
    field: T_FIELD,
    /* Default case in a switch/match */
    _: T_UNDERSCORE
};

/* Pattern for timeouts in an interactive program.
 * The keyword is localized; it is registered only when the translation
 * lookup yields a non-empty string, so that no empty keyword is stored. */
// eslint-disable-next-line @typescript-eslint/ban-types
const timeoutKeyword: string | Function = i18n('CONS:TIMEOUT');
if (typeof timeoutKeyword === 'string' && timeoutKeyword !== '') {
    KEYWORDS[timeoutKeyword] = T_TIMEOUT;
}

/* Tell whether the given word is a reserved word.
 *
 * Only own properties of KEYWORDS are considered: the 'in' operator also
 * finds members inherited from Object.prototype, which would classify
 * identifiers such as 'constructor' or 'toString' as keywords. */
const isKeyword = (word: string): boolean =>
    Object.prototype.hasOwnProperty.call(KEYWORDS, word);

/* Note: the order is relevant so that the 'maximal munch' rule applies:
 * the table is scanned front to back and the first match wins, so every
 * symbol must come before any other symbol that is a prefix of it.
 */
const SYMBOLS = [
    /* Various delimiters */
    { symbol: '(', tag: T_LPAREN },
    { symbol: ')', tag: T_RPAREN },
    { symbol: '{', tag: T_LBRACE },
    { symbol: '}', tag: T_RBRACE },
    { symbol: '[', tag: T_LBRACK }, // For lists and ranges
    { symbol: ']', tag: T_RBRACK },
    { symbol: ',', tag: T_COMMA },
    { symbol: ';', tag: T_SEMICOLON },
    { symbol: '...', tag: T_ELLIPSIS }, // For intentionally incomplete programs
    /* Range operator */
    { symbol: '..', tag: T_RANGE },
    /* Assignment */
    { symbol: ':=', tag: T_ASSIGN },
    /* Logical operators */
    { symbol: '&&', tag: T_AND },
    { symbol: '||', tag: T_OR },
    /* Fields */
    { symbol: '<-', tag: T_GETS }, // Field initializer, e.g. Coord(x <- 1, y <- 2)
    { symbol: '|', tag: T_PIPE }, // Field update, e.g. Coord(c | x <- 2)
    /* Pattern matching */
    { symbol: '->', tag: T_ARROW }, // For the branches of a switch
    /* Relational operators */
    { symbol: '==', tag: T_EQ },
    { symbol: '/=', tag: T_NE },
    { symbol: '<=', tag: T_LE },
    { symbol: '>=', tag: T_GE },
    { symbol: '<', tag: T_LT },
    { symbol: '>', tag: T_GT },
    /* Functions */
    { symbol: '++', tag: T_CONCAT },
    { symbol: '+', tag: T_PLUS },
    { symbol: '-', tag: T_MINUS },
    { symbol: '*', tag: T_TIMES },
    { symbol: '^', tag: T_POW }
];

/* Valid language options accepted by the LANGUAGE pragma */
const LANGUAGE_OPTIONS = ['DestructuringForeach', 'AllowRecursion'];

/* Tell whether a string of digits starts with the digit zero. */
const leadingZeroes = (string: string): boolean => string.length > 0 && string[0] === '0';

/* Abort tokenization by throwing a GbsSyntaxError spanning the given
 * positions. This function never returns. */
function fail(startPos: SourceReader, endPos: SourceReader, reason: string, args: any[]): never {
    throw new GbsSyntaxError(startPos, endPos, reason, args);
}

/* The closing delimiter expected for each opening delimiter. */
const CLOSING_DELIMITERS = {
    '(': ')',
    '[': ']',
    '{': '}'
};

/* An instance of Lexer scans source code for tokens.
 * Example:
 *
 *   let tok = new Lexer('if (a)');
 *   tok.nextToken(); // ~~> new Token(T_IF, null, ...)
 *   tok.nextToken(); // ~~> new Token(T_LPAREN, null, ...)
 *   tok.nextToken(); // ~~> new Token(T_LOWERID, 'a', ...)
 *   tok.nextToken(); // ~~> new Token(T_RPAREN, null, ...)
 *   tok.nextToken(); // ~~> new Token(T_EOF, null, ...)
 *
 * The 'input' parameter is either a string or a mapping
 * from filenames to strings.
 */
export class Lexer {
    private _multifileReader: MultifileReader;
    private _reader: SourceReader;
    private _warnings: GbsWarning[];
    private _delimiterStack: any[];
    private _pendingAttributes: Record<string, ASTDef>;
    private _languageOptions: string[];

    public constructor(input: Input) {
        this._multifileReader = new MultifileReader(input);
        this._reader = this._multifileReader.readCurrentFile();
        this._warnings = [];

        /* A stack of tokens '(', '[' and '{', to provide more helpful
         * error reporting if delimiters are not balanced. */
        this._delimiterStack = [];

        /* A dictionary of pending attributes, set by the ATTRIBUTE pragma.
         * Pending attributes are used by the parser to decorate any procedure
         * or function definition. */
        this._pendingAttributes = {};

        /* A list of language options, enabled by the LANGUAGE pragma.
         * Language options are interpreted by the runner to initialize
         * the remaining modules (linter, compiler, runtime, ...)
         * accordingly. */
        this._languageOptions = [];
    }

    /* Return the next token from the input.
     *
     * Once the input is exhausted a T_EOF token is returned, after checking
     * that no opening delimiter is left unclosed. Throws a GbsSyntaxError
     * on malformed input. */
    public nextToken(): Token {
        if (!this._findNextToken()) {
            const token = new Token(T_EOF, undefined, this._reader, this._reader);
            this._checkBalancedDelimiters(token);
            return token;
        }

        const first = this._reader.peek();

        if (isDigit(first)) {
            const startPos = this._reader;
            const value = this._readStringWhile(isDigit);
            const endPos = this._reader;
            /* A lone '0' is fine; '007' is not. */
            if (leadingZeroes(value) && value.length > 1) {
                fail(startPos, endPos, 'numeric-constant-should-not-have-leading-zeroes', []);
            }
            return new Token(T_NUM, value, startPos, endPos);
        }

        if (isIdent(first)) {
            const startPos = this._reader;
            const value = this._readStringWhile(isIdent);
            const endPos = this._reader;
            if (isKeyword(value)) {
                return new Token(KEYWORDS[value], value, startPos, endPos);
            }
            if (isUpper(value[0])) {
                return new Token(T_UPPERID, value, startPos, endPos);
            }
            if (isLower(value[0])) {
                return new Token(T_LOWERID, value, startPos, endPos);
            }
            /* The word starts with a digit-free non-letter, i.e. with an
             * underscore or a single quote, and is not a keyword. */
            fail(startPos, endPos, 'identifier-must-start-with-alphabetic-character', []);
        }

        if (first === '"') {
            return this._readStringConstant();
        }

        return this._readSymbol();
    }

    /* When tokenization is done, this function returns the list of all
     * the warnings collected during tokenization */
    public warnings(): GbsWarning[] {
        return this._warnings;
    }

    /* Skip whitespace and advance through files until the start of the next
     * token is found. Return false if EOF is found.
     */
    public _findNextToken(): boolean {
        for (;;) {
            this._ignoreWhitespaceAndComments();
            if (!this._reader.eof()) {
                return true;
            }
            if (!this._multifileReader.moreFiles()) {
                return false;
            }
            /* The current file is exhausted: continue with the next one. */
            this._multifileReader.nextFile();
            this._reader = this._multifileReader.readCurrentFile();
        }
    }

    /* Read a string while the given condition holds for the current
     * character. The result is empty if the condition fails right away
     * or the reader is already at the end of the file. */
    // eslint-disable-next-line @typescript-eslint/ban-types
    public _readStringWhile(condition: Function): string {
        const result = [];
        while (!this._reader.eof() && condition(this._reader.peek())) {
            result.push(this._reader.peek());
            this._reader = this._reader.consumeCharacter();
        }
        return result.join('');
    }

    /* Reads a quote-delimited string constant, starting at the opening
     * quote. Escapes are recognized: a backslash followed by one of
     * a, b, f, n, r, t, v yields the corresponding control character,
     * and a backslash followed by any other character yields that
     * character. Fails if the closing quote is missing.
     */
    public _readStringConstant(): Token {
        const startPos = this._reader;
        const result = [];
        /* Skip the opening quote. */
        this._reader = this._reader.consumeCharacter();
        while (!this._reader.eof()) {
            const c = this._reader.peek();
            if (c === '"') {
                this._reader = this._reader.consumeCharacter();
                return new Token(T_STRING, result.join(''), startPos, this._reader);
            } else if (c === '\\') {
                this._reader = this._reader.consumeCharacter();
                if (this._reader.eof()) {
                    /* A dangling backslash at the end of the input. */
                    break;
                }
                const c2 = this._reader.peek();
                this._reader = this._reader.consumeCharacter();
                switch (c2) {
                    case 'a':
                        result.push('\u0007');
                        break;
                    case 'b':
                        result.push('\u0008');
                        break;
                    case 'f':
                        result.push('\u000c');
                        break;
                    case 'n':
                        result.push('\u000a');
                        break;
                    case 'r':
                        result.push('\u000d');
                        break;
                    case 't':
                        result.push('\u0009');
                        break;
                    case 'v':
                        result.push('\u000b');
                        break;
                    default:
                        result.push(c2);
                        break;
                }
            } else {
                result.push(c);
                this._reader = this._reader.consumeCharacter();
            }
        }
        fail(startPos, this._reader, 'unclosed-string-constant', []);
    }

    /* Read a symbol: the first entry of SYMBOLS that the input starts with.
     * Delimiters are checked for balance. Fails on an unknown token. */
    public _readSymbol(): Token {
        for (const { symbol, tag } of SYMBOLS) {
            if (this._reader.startsWith(symbol)) {
                const startPos = this._reader;
                this._reader = this._reader.consumeString(symbol);
                const endPos = this._reader;
                const token = new Token(tag, symbol, startPos, endPos);
                this._checkBalancedDelimiters(token);
                return token;
            }
        }
        fail(this._reader, this._reader, 'unknown-token', [this._reader.peek()]);
    }

    /* Skip any run of whitespace characters, comments and pragmas. */
    public _ignoreWhitespaceAndComments(): void {
        while (this._ignoreWhitespace() || this._ignoreComments()) {
            /* continue */
        }
    }

    /* Skip a single whitespace character, returns false if there is none */
    public _ignoreWhitespace(): boolean {
        if (this._reader.eof() || !isWhitespace(this._reader.peek())) {
            return false;
        }
        this._reader = this._reader.consumeCharacter();
        return true;
    }

    /* Skips comments and pragmas, returns false if there are no comments.
     * A pragma is tested for before a plain slash-star comment, since both
     * start with the same two characters. */
    public _ignoreComments(): boolean {
        if (this._startSingleLineComment()) {
            this._ignoreSingleLineComment();
            return true;
        }
        if (this._reader.startsWith('/*@')) {
            const startPos = this._reader;
            this._evaluatePragma(startPos, this._readInvisiblePragma('/*', '*/', '@'));
            return true;
        }
        if (this._reader.startsWith('{-')) {
            this._ignoreMultilineComment('{-', '-}');
            return true;
        }
        if (this._reader.startsWith('/*')) {
            this._ignoreMultilineComment('/*', '*/');
            return true;
        }
        return false;
    }

    /* Returns true if a single-line comment starts here */
    public _startSingleLineComment(): boolean {
        return (
            this._reader.startsWith('--') ||
            this._reader.startsWith('//') ||
            this._reader.startsWith('#')
        );
    }

    /* Skips a single-line comment. The current character is always
     * consumed; the reader is then advanced up to (not including) the
     * next line feed, or up to the end of the file. */
    public _ignoreSingleLineComment(): void {
        if (this._reader.eof()) {
            return;
        }
        this._reader = this._reader.consumeCharacter();
        /* Test for EOF before peeking, so we never peek past the input. */
        while (!this._reader.eof() && this._reader.peek() !== '\n') {
            this._reader = this._reader.consumeCharacter();
        }
    }

    /* Skips a multiline comment with the given left/right delimiters.
     * Multi-line comments may be nested.
     * Fails if the end of the file is reached before the comment is closed.
     */
    public _ignoreMultilineComment(left: string, right: string): void {
        let nesting = 0;
        const startPos = this._reader;
        while (!this._reader.eof()) {
            if (this._reader.startsWith(left)) {
                this._reader = this._reader.consumeString(left);
                nesting++;
            } else if (this._reader.startsWith(right)) {
                this._reader = this._reader.consumeString(right);
                nesting--;
                if (nesting === 0) {
                    return;
                }
            } else {
                this._reader = this._reader.consumeCharacter();
            }
        }
        fail(startPos, this._reader, 'unclosed-multiline-comment', []);
    }

    /* Read a pragma. A pragma is a comment delimited by the given left
     * (slash-star) and right (star-slash) comment delimiters, whose parts
     * are delimited by the pragma delimiter '@':
     *
     *   @part1@part2@...@partN@
     *
     * Returns the list of parts. At least one part is read before the
     * closing delimiter is looked for. Fails if the pragma is not closed.
     *
     * NOTE(review): the text is consumed through the reader's "invisible"
     * operations; presumably these do not affect reported positions --
     * confirm against the SourceReader implementation.
     */
    public _readInvisiblePragma(left: string, right: string, pragmaDelim: string): string[] {
        const pragma = [];
        const startPos = this._reader;
        this._reader = this._reader.consumeInvisibleString(left);
        this._reader = this._reader.consumeInvisibleString(pragmaDelim);
        while (!this._reader.eof()) {
            pragma.push(this._readInvisibleStringUntilDelimiter(pragmaDelim));
            this._reader = this._reader.consumeInvisibleString(pragmaDelim);
            if (this._reader.startsWith(right)) {
                this._reader = this._reader.consumeInvisibleString(right);
                return pragma;
            }
        }
        fail(startPos, this._reader, 'unclosed-multiline-comment', []);
    }

    /* Read an invisible string until the given delimiter is found.
     * The delimiter itself is not consumed. Fails if the end of the file
     * is reached first. */
    public _readInvisibleStringUntilDelimiter(delimiter: string): string {
        const startPos = this._reader;
        const result = [];
        while (!this._reader.eof()) {
            if (this._reader.peek() === delimiter) {
                return result.join('');
            }
            result.push(this._reader.peek());
            this._reader = this._reader.consumeInvisibleCharacter();
        }
        fail(startPos, this._reader, 'unclosed-multiline-comment', []);
    }

    /* Act on a pragma, given as its list of parts:
     *
     *   BEGIN_REGION@name      begins a region in the reader
     *   END_REGION             ends the current region in the reader
     *   ATTRIBUTE@key@value    sets a pending attribute; the value may
     *                          itself contain '@'
     *   LANGUAGE@option        enables a language option
     *
     * An empty or unrecognized pragma only produces a warning.
     */
    public _evaluatePragma(startPos: SourceReader, pragma: string[]): void {
        if (pragma.length === 0) {
            this._emitWarning(startPos, this._reader, 'empty-pragma', []);
        } else if (pragma[0] === 'BEGIN_REGION') {
            const region = pragma[1];
            this._reader = this._reader.beginRegion(region);
        } else if (pragma[0] === 'END_REGION') {
            this._reader = this._reader.endRegion();
        } else if (pragma[0] === 'ATTRIBUTE' && pragma.length >= 2) {
            const key = pragma[1];
            /* Rejoin the remaining parts: the value was split on '@' too. */
            const value = pragma.slice(2, pragma.length).join('@');
            this.setAttribute(key, value);
        } else if (pragma[0] === 'LANGUAGE' && pragma.length === 2) {
            const languageOption = pragma[1];
            this.addLanguageOption(languageOption);
        } else {
            this._emitWarning(startPos, this._reader, 'unknown-pragma', [pragma[0]]);
        }
    }

    /* Record a warning; warnings do not interrupt tokenization. */
    public _emitWarning(
        startPos: SourceReader,
        endPos: SourceReader,
        reason: string,
        args: any[]
    ): void {
        this._warnings.push(new GbsWarning(startPos, endPos, reason, args));
    }

    /* Check that reading a delimiter keeps the delimiter stack balanced.
     *
     * Opening delimiters are pushed onto the stack. A closing delimiter
     * must match the opening delimiter on top of the stack, and at EOF
     * the stack must be empty. Tokens of any other kind are ignored. */
    public _checkBalancedDelimiters(token: Token): void {
        if (token.tag === T_EOF && this._delimiterStack.length > 0) {
            const openingDelimiter = this._delimiterStack.pop();
            fail(
                openingDelimiter.startPos,
                openingDelimiter.endPos,
                'unmatched-opening-delimiter',
                [openingDelimiter.value]
            );
        } else if (token.tag === T_LPAREN || token.tag === T_LBRACE || token.tag === T_LBRACK) {
            this._delimiterStack.push(token);
        } else if (token.tag === T_RPAREN || token.tag === T_RBRACE || token.tag === T_RBRACK) {
            if (this._delimiterStack.length === 0) {
                fail(token.startPos, token.endPos, 'unmatched-closing-delimiter', [token.value]);
            }
            const openingDelimiter = this._delimiterStack.pop();
            if (CLOSING_DELIMITERS[openingDelimiter.value] !== token.value) {
                /* The error is reported at the opening delimiter. */
                fail(
                    openingDelimiter.startPos,
                    openingDelimiter.endPos,
                    'unmatched-opening-delimiter',
                    [openingDelimiter.value]
                );
            }
        }
    }

    /*
     * Interface for handling attributes.
     *
     * The pragma ATTRIBUTE@key@value
     * establishes the attribute given by <key> to <value>.
     *
     * Whenever the parser finds a definition of the following kinds:
     *   procedure
     *   function
     *   program
     *   interactive program
     *   type
     * it gets decorated with the pending attributes.
     */

    /* Return the pending attributes and reset them to an empty dictionary. */
    public getPendingAttributes(): Record<string, ASTDef> {
        const pending = this._pendingAttributes;
        this._pendingAttributes = {};
        return pending;
    }

    /* Set a pending attribute, replacing any previous value for the key. */
    // eslint-disable-next-line @typescript-eslint/explicit-module-boundary-types
    public setAttribute(key: string, value: any): void {
        this._pendingAttributes[key] = value;
    }

    /*
     * Interface for handling language options.
     *
     * The pragma LANGUAGE@option sets the given option.
     *
     * The runner module reads these options to initialize the
     * linter/compiler/runtime.
     */

    /* Return the language options enabled so far. */
    public getLanguageOptions(): string[] {
        return this._languageOptions;
    }

    /* Enable a language option. Fails if the option is not one of
     * LANGUAGE_OPTIONS. */
    public addLanguageOption(option: string): void {
        if (LANGUAGE_OPTIONS.indexOf(option) !== -1) {
            this._languageOptions.push(option);
        } else {
            fail(this._reader, this._reader, 'unknown-language-option', [option]);
        }
    }
}