UNPKG

js-slang

Version:

JavaScript-based implementations of Source, written in TypeScript

589 lines (587 loc) 24 kB
"use strict";
/*
 * Full disclosure: The general structure of this file is adapted from my own
 * Rust implementation of a scanner
 * https://github.com/Fidget-Spinner/crafting_interpreters/blob/main/rust/src/scanner.rs.
 * That in turn is adapted from the Java code written by the excellent book,
 * "Crafting Interpreters" https://craftinginterpreters.com/scanning.html.
 * Said book's copyright is under Robert Nystrom.
 * I've included the MIT license that code snippets from
 * the book is licensed under down below. See
 * https://github.com/munificent/craftinginterpreters/blob/master/LICENSE
 *
 * The changes I've made: I have rewritten basically everything from scratch.
 * Only the method names and overall method APIs
 * are the same. Their internal behaviors are quite different as the scanner
 * in the book parses a JS-like language, not Python.
 *
 * - The book was written in Java. I have written this in TypeScript.
 * - The scanner supports a whitespace significant language now.
 * - Also added support for column numbers for better error messages in the future.
 * - Also added better errors.
 * - Also added forbidden identifiers.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.Tokenizer = exports.SPECIAL_IDENTIFIER_TOKENS = exports.Token = void 0;
const tokens_1 = require("./tokens");
const errors_1 = require("./errors");

/**
 * A single token produced by the Tokenizer.
 */
class Token {
    /**
     * @param type - The token's TokenType.
     * @param {string} lexeme - The source text associated with the token.
     * @param {number} line - Tokenizer line counter when the token was added.
     * @param {number} col - Tokenizer column counter when the token was added.
     * @param {number} indexInSource - Offset into the source string recorded for the token.
     */
    constructor(type, lexeme, line, col, indexInSource) {
        this.type = type;
        this.lexeme = lexeme;
        this.line = line;
        this.col = col;
        this.indexInSource = indexInSource;
    }
}
exports.Token = Token;

// Reserved words that map directly onto their own token type.
const specialIdentifiers = new Map([
    ["and", tokens_1.TokenType.AND],
    ["or", tokens_1.TokenType.OR],
    ["while", tokens_1.TokenType.WHILE],
    ["for", tokens_1.TokenType.FOR],
    ["None", tokens_1.TokenType.NONE],
    ["is", tokens_1.TokenType.IS],
    ["not", tokens_1.TokenType.NOT],
    ["pass", tokens_1.TokenType.PASS],
    ["def", tokens_1.TokenType.DEF],
    ["lambda", tokens_1.TokenType.LAMBDA],
    ["from", tokens_1.TokenType.FROM],
    ["True", tokens_1.TokenType.TRUE],
    ["False", tokens_1.TokenType.FALSE],
    ["break", tokens_1.TokenType.BREAK],
    ["continue", tokens_1.TokenType.CONTINUE],
    ["return", tokens_1.TokenType.RETURN],
    ["assert", tokens_1.TokenType.ASSERT],
    ["import", tokens_1.TokenType.IMPORT],
    ["global", tokens_1.TokenType.GLOBAL],
    ["nonlocal", tokens_1.TokenType.NONLOCAL],
    ["if", tokens_1.TokenType.IF],
    ["elif", tokens_1.TokenType.ELIF],
    ["else", tokens_1.TokenType.ELSE],
    ["in", tokens_1.TokenType.IN],
]);
exports.SPECIAL_IDENTIFIER_TOKENS = Array.from(specialIdentifiers.values());

/**
 * Converts a source string into a flat list of Tokens, emitting NEWLINE,
 * INDENT and DEDENT tokens for significant whitespace (indentation must be a
 * multiple of four spaces).
 */
class Tokenizer {
    /**
     * @param {string} source - The program text to tokenize.
     */
    constructor(source) {
        this.source = source;
        this.tokens = [];
        // Offset of the first character of the token currently being scanned.
        this.start = 0;
        // Offset of the next character to be consumed.
        this.current = 0;
        this.line = 0;
        this.col = 0;
        // Indentation levels currently open; always starts with level 0.
        this.indentStack = [0];
        this.specialIdentifiers = specialIdentifiers;
        // Not used by us, but should be kept reserved as per Python spec
        this.forbiddenIdentifiers = new Map([
            ["async", tokens_1.TokenType.ASYNC],
            ["await", tokens_1.TokenType.AWAIT],
            ["yield", tokens_1.TokenType.YIELD],
            ["with", tokens_1.TokenType.WITH],
            ["del", tokens_1.TokenType.DEL],
            ["try", tokens_1.TokenType.TRY],
            ["except", tokens_1.TokenType.EXCEPT],
            ["finally", tokens_1.TokenType.FINALLY],
            ["raise", tokens_1.TokenType.RAISE],
        ]);
        // Operators that are valid in Python but invalid for our use case
        // (e.g. '@' and augmented assignments such as '+=') are rejected
        // directly in scanToken() and matchForbiddenOperator().
        // Number of currently unclosed '(' — newlines inside are not significant.
        this.parenthesesLevel = 0;
    }

    /** @returns {boolean} true once every character of the source has been consumed. */
    isAtEnd() {
        return this.current >= this.source.length;
    }

    /**
     * Consumes one character and returns it, bumping the column counter and,
     * when the consumed character is a line feed, the line counter.
     *
     * NOTE(review): the newline handling in scanNewline() also increments
     * this.line after advance() has consumed a '\n', so a bare LF appears to
     * be counted twice while CRLF is counted once. Left untouched because
     * consumers may depend on the current numbering — confirm before changing.
     */
    advance() {
        const res = this.source[this.current];
        if (this.peek() === '\n') {
            this.line += 1;
        }
        this.current += 1;
        this.col += 1;
        return res;
    }

    /** Single character lookahead; returns '\0' at end of input. */
    peek() {
        return this.isAtEnd() ? '\0' : this.source[this.current];
    }

    /**
     * Replaces the most recently added token with one of the given type whose
     * lexeme stretches from that token's start up to the current position.
     * Used to merge two-word operators such as 'is not'.
     */
    overwriteToken(type) {
        const previousToken = this.tokens[this.tokens.length - 1];
        const lexeme = this.source.slice(previousToken.indexInSource, this.current);
        this.tokens[this.tokens.length - 1] = new Token(type, lexeme, previousToken.line, previousToken.col, previousToken.indexInSource);
    }

    /** Appends a token whose lexeme is the source text between start and current. */
    addToken(type) {
        const line = this.line;
        const col = this.col;
        const lexeme = this.source.slice(this.start, this.current);
        this.tokens.push(new Token(type, lexeme, line, col, this.current - lexeme.length));
    }

    /** Appends a string token, dropping one quote character from each end of the lexeme. */
    addStringToken(type) {
        const line = this.line;
        const col = this.col;
        // Remove starting and ending quotes when slicing
        // Ensures that string is parsed properly
        const lexeme = this.source.slice(this.start + 1, this.current - 1);
        this.tokens.push(new Token(type, lexeme, line, col, this.current - lexeme.length));
    }

    /** Appends a string token, dropping three quote characters from each end of the lexeme. */
    addMultiLineStringToken(type) {
        const line = this.line;
        const col = this.col;
        // Remove three starting and ending quotes when slicing
        const lexeme = this.source.slice(this.start + 3, this.current - 3);
        this.tokens.push(new Token(type, lexeme, line, col, this.current - lexeme.length));
    }

    /**
     * Checks that the current character matches a pattern. If so the
     * character is consumed, else nothing is consumed.
     * @param {string} pattern - The single character to compare against.
     * @returns {boolean} whether the character was consumed.
     */
    matches(pattern) {
        if (this.isAtEnd() || this.source[this.current] !== pattern) {
            return false;
        }
        this.col += 1;
        this.current += 1;
        return true;
    }

    isAlpha(c) {
        return /^[A-Za-z]$/i.test(c);
    }

    isDigit(c) {
        return /^[0-9]/.test(c);
    }

    isHexa(c) {
        return /^[0-9A-F]$/i.test(c);
    }

    isOcta(c) {
        return /^[0-7]/.test(c);
    }

    isBinary(c) {
        return /^[0-1]/.test(c);
    }

    /** @returns {boolean} whether c may appear inside an identifier. */
    isIdentifier(c) {
        return c === '_' || this.isAlpha(c) || this.isDigit(c);
    }

    /**
     * Consumes the radix letter of a prefixed literal (0x / 0o / 0b) followed
     * by one or more digits accepted by isValidDigit.
     * @throws InvalidNumberError when no valid digit follows the prefix.
     */
    prefixedDigits(isValidDigit) {
        this.advance(); // the radix letter
        if (!isValidDigit(this.peek())) {
            throw new errors_1.TokenizerErrors.InvalidNumberError(this.line, this.col, this.source, this.start, this.current);
        }
        while (isValidDigit(this.peek())) {
            this.advance();
        }
    }

    /**
     * Scans a numeric literal whose first (already consumed) character is '0':
     * a hexadecimal, octal or binary literal, or otherwise an ordinary
     * decimal literal handled by number().
     */
    baseNumber() {
        switch (this.peek()) {
            case 'x':
                this.prefixedDigits((c) => this.isHexa(c));
                break;
            case 'o':
                this.prefixedDigits((c) => this.isOcta(c));
                break;
            case 'b':
                this.prefixedDigits((c) => this.isBinary(c));
                break;
            default:
                // No radix prefix: same grammar as any other decimal literal.
                this.number();
                return;
        }
        this.addToken(tokens_1.TokenType.NUMBER);
    }

    /**
     * Scans a decimal literal. Digits with neither a '.' nor an 'e' following
     * produce a BIGINT token; anything with a fraction and/or exponent
     * produces a NUMBER token.
     * @throws InvalidNumberError when an exponent has no digits.
     */
    number() {
        while (this.isDigit(this.peek())) {
            this.advance();
        }
        if (this.peek() !== '.' && this.peek() !== 'e') {
            this.addToken(tokens_1.TokenType.BIGINT);
            return;
        }
        // Fractional part
        if (this.peek() === '.') {
            this.advance();
            while (this.isDigit(this.peek())) {
                this.advance();
            }
        }
        // Exponent part
        if (this.peek() === 'e') {
            this.advance();
            if (this.peek() === '-') {
                this.advance();
            }
            if (!this.isDigit(this.peek())) {
                throw new errors_1.TokenizerErrors.InvalidNumberError(this.line, this.col, this.source, this.start, this.current);
            }
            while (this.isDigit(this.peek())) {
                this.advance();
            }
        }
        this.addToken(tokens_1.TokenType.NUMBER);
    }

    /**
     * Scans an identifier or keyword. Reserved-but-unsupported words raise,
     * keywords get their dedicated token type, and the word pairs 'is not' /
     * 'not in' are merged into a single ISNOT / NOTIN token.
     * @throws ForbiddenIdentifierError for words listed in forbiddenIdentifiers.
     */
    name() {
        while (this.isIdentifier(this.peek())) {
            this.advance();
        }
        const identifier = this.source.slice(this.start, this.current);
        // has() rather than a truthiness test on get(): a mapped enum value
        // of 0 must still count as forbidden.
        if (this.forbiddenIdentifiers.has(identifier)) {
            throw new errors_1.TokenizerErrors.ForbiddenIdentifierError(this.line, this.col, this.source, this.start);
        }
        const specialIdent = this.specialIdentifiers.get(identifier);
        if (specialIdent === undefined) {
            this.addToken(tokens_1.TokenType.NAME);
            return;
        }
        // There is no previous token when the keyword is the very first
        // token of the source (e.g. "not x"); nothing can be merged then.
        const previousType = this.tokens.length > 0
            ? this.tokens[this.tokens.length - 1].type
            : undefined;
        /* Merge multi-token operators, like 'is not', 'not in' */
        if (specialIdent === tokens_1.TokenType.NOT && previousType === tokens_1.TokenType.IS) {
            this.overwriteToken(tokens_1.TokenType.ISNOT);
        }
        else if (specialIdent === tokens_1.TokenType.IN && previousType === tokens_1.TokenType.NOT) {
            this.overwriteToken(tokens_1.TokenType.NOTIN);
        }
        else {
            this.addToken(specialIdent);
        }
    }

    /** Consumes characters up to (not including) the next '\n' or '\r', or end of input. */
    skipComment() {
        while (this.peek() !== '\n' && this.peek() !== '\r' && !this.isAtEnd()) {
            this.advance();
        }
    }

    /**
     * Consumes consecutive space characters.
     * @returns {number} how many spaces were consumed.
     */
    countLeadingSpaces() {
        let count = 0;
        while (this.peek() === " " && !this.isAtEnd()) {
            count += 1;
            this.advance();
        }
        return count;
    }

    /**
     * Handles a line break that has just been consumed: emits NEWLINE, skips
     * blank lines, measures the next line's indentation and emits the
     * matching INDENT / DEDENT tokens. Inside parentheses the line break is
     * not significant and only the position counters are updated.
     * @throws NonFourIndentError when indentation is not a multiple of four.
     * @throws InconsistentIndentError when a dedent would close more levels than are open.
     */
    scanNewline() {
        if (this.parenthesesLevel > 0) {
            this.line += 1;
            this.col = 0;
            return;
        }
        this.addToken(tokens_1.TokenType.NEWLINE);
        this.line += 1;
        this.col = 0;
        // Detect significant whitespace
        let accLeadingWhiteSpace = this.countLeadingSpaces();
        // Handles comments
        if (this.peek() === "#") {
            this.skipComment();
        }
        // The following block handles things like
        /*
        def foo():
            pass
                     <---- this newline should be zapped
            pass     <---- this should be part of the block
        */
        while ((this.peek() === "\n" || this.peek() === "\r") && !this.isAtEnd()) {
            // Handle \r\n on Windows
            if (this.peek() === "\r") {
                this.advance();
                if (this.peek() === "\n") {
                    this.advance();
                }
            }
            else {
                this.advance();
            }
            this.line += 1;
            this.col = 0;
            accLeadingWhiteSpace = this.countLeadingSpaces();
        }
        if (accLeadingWhiteSpace % 4 !== 0) {
            throw new errors_1.TokenizerErrors.NonFourIndentError(this.line, this.col, this.source, this.current);
        }
        const tos = this.indentStack[this.indentStack.length - 1];
        if (accLeadingWhiteSpace > tos) {
            // Push one stack entry per INDENT token so that every later
            // DEDENT pops exactly one entry and the stack can never underflow.
            for (let level = tos + 4; level <= accLeadingWhiteSpace; level += 4) {
                this.indentStack.push(level);
                this.addToken(tokens_1.TokenType.INDENT);
            }
        }
        else if (accLeadingWhiteSpace < tos) {
            const dedents = Math.floor((tos - accLeadingWhiteSpace) / 4);
            // The base level 0 must always stay on the stack.
            if (dedents >= this.indentStack.length) {
                throw new errors_1.TokenizerErrors.InconsistentIndentError(this.line, this.col, this.source, this.current);
            }
            for (let i = 0; i < dedents; ++i) {
                this.indentStack.pop();
                this.addToken(tokens_1.TokenType.DEDENT);
            }
        }
    }

    /**
     * Scans a string literal whose opening quote character has already been
     * consumed. Supports single-line strings, the empty string, and
     * triple-quoted strings (which may span lines and contain lone quote
     * characters).
     * @param {string} quote - The opening quote character, either ' or ".
     * @throws UnterminatedStringError when the closing delimiter is missing.
     */
    scanString(quote) {
        if (this.peek() !== quote) {
            // Single-line string: runs until the matching quote on this line.
            while (this.peek() !== quote && this.peek() !== '\n' && !this.isAtEnd()) {
                this.advance();
            }
            if (this.peek() === '\n' || this.isAtEnd()) {
                throw new errors_1.TokenizerErrors.UnterminatedStringError(this.line, this.col, this.source, this.start, this.current);
            }
            // Consume closing quote
            this.advance();
            this.addStringToken(tokens_1.TokenType.STRING);
            return;
        }
        this.advance(); // second quote found and consumed
        if (this.peek() !== quote) {
            // Only two quotes in a row: the empty string.
            this.addStringToken(tokens_1.TokenType.STRING);
            return;
        }
        this.advance(); // third quote consumed
        // Only three consecutive quotes close the literal; a lone quote
        // character inside the body is ordinary content.
        const closing = quote.repeat(3);
        while (!this.isAtEnd() && !this.source.startsWith(closing, this.current)) {
            this.advance();
        }
        if (this.isAtEnd()) {
            throw new errors_1.TokenizerErrors.UnterminatedStringError(this.line, this.col, this.source, this.start, this.current);
        }
        // Consume the three closing quotes.
        this.advance();
        this.advance();
        this.advance();
        this.addMultiLineStringToken(tokens_1.TokenType.STRING);
    }

    /**
     * Consumes one character and scans the token (if any) that begins with it.
     * @throws ForbiddenOperatorError, NonMatchingParenthesesError,
     *         UnknownTokenError, plus whatever the delegated scanners throw.
     */
    scanToken() {
        const c = this.advance();
        // KJ: I really hope the JS runtime optimizes this to a jump table...
        switch (c) {
            //// SPECIAL MARKERS
            // Comment -- advance to end of line.
            case '#':
                this.skipComment();
                break;
            case ':':
                this.addToken(this.matches(':') ? tokens_1.TokenType.DOUBLECOLON : tokens_1.TokenType.COLON);
                break;
            // All non-significant whitespace
            case ' ':
                break;
            // CR LF on Windows; a lone CR is ignored.
            case '\r':
                if (!this.matches('\n')) {
                    break;
                }
            // fall through
            case '\n':
                this.scanNewline();
                break;
            // String
            case '"':
            case "'":
                this.scanString(c);
                break;
            // Number
            case '0':
                this.baseNumber();
                break;
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
                this.number();
                break;
            //// Everything else
            case '(':
                this.addToken(tokens_1.TokenType.LPAR);
                this.parenthesesLevel++;
                break;
            case ')':
                this.addToken(tokens_1.TokenType.RPAR);
                if (this.parenthesesLevel === 0) {
                    throw new errors_1.TokenizerErrors.NonMatchingParenthesesError(this.line, this.col, this.source, this.current);
                }
                this.parenthesesLevel--;
                break;
            case ',':
                this.addToken(tokens_1.TokenType.COMMA);
                break;
            //// OPERATORS
            case '-':
                if (this.matches('=')) {
                    this.raiseForbiddenOperator();
                }
                this.addToken(tokens_1.TokenType.MINUS);
                break;
            case '+':
                if (this.matches('=')) {
                    this.raiseForbiddenOperator();
                }
                this.addToken(tokens_1.TokenType.PLUS);
                break;
            case '*':
                if (this.matches('=')) {
                    this.raiseForbiddenOperator();
                }
                this.addToken(this.matches('*') ? tokens_1.TokenType.DOUBLESTAR : tokens_1.TokenType.STAR);
                break;
            case '/':
                if (this.matches('=')) {
                    this.raiseForbiddenOperator();
                }
                this.addToken(this.matches('/') ? tokens_1.TokenType.DOUBLESLASH : tokens_1.TokenType.SLASH);
                break;
            case '%':
                if (this.matches('=')) {
                    this.raiseForbiddenOperator();
                }
                this.addToken(tokens_1.TokenType.PERCENT);
                break;
            case '!':
                this.addToken(this.matches('=') ? tokens_1.TokenType.NOTEQUAL : tokens_1.TokenType.BANG);
                break;
            case '=':
                this.addToken(this.matches('=') ? tokens_1.TokenType.DOUBLEEQUAL : tokens_1.TokenType.EQUAL);
                break;
            case '<':
                this.addToken(this.matches('=') ? tokens_1.TokenType.LESSEQUAL : tokens_1.TokenType.LESS);
                break;
            case '>':
                this.addToken(this.matches('=') ? tokens_1.TokenType.GREATEREQUAL : tokens_1.TokenType.GREATER);
                break;
            default:
                // Identifier start
                if (c === '_' || this.isAlpha(c)) {
                    this.name();
                    break;
                }
                this.matchForbiddenOperator(c);
                throw new errors_1.TokenizerErrors.UnknownTokenError(c, this.line, this.col, this.source, this.current);
        }
    }

    /**
     * Raises ForbiddenOperatorError when ch starts one of the operators that
     * are rejected outright ('@', '|', '&', '~', '^', optionally followed by
     * '='); returns normally for any other character.
     */
    matchForbiddenOperator(ch) {
        switch (ch) {
            case '@':
            case '|':
            case '&':
            case '~':
            case '^':
                // Swallow a trailing '=' so the error covers the full operator.
                this.matches('=');
                this.raiseForbiddenOperator();
                break;
            default:
                break;
        }
    }

    /**
     * Tokenizes the whole source, closes every indentation level still open
     * with DEDENT tokens and appends a final ENDMARKER token.
     * @returns {Token[]} the accumulated token list.
     */
    scanEverything() {
        while (!this.isAtEnd()) {
            this.start = this.current;
            this.scanToken();
        }
        // Unravel the indent stack. The length check guarantees termination
        // even if the stack has somehow been emptied.
        while (this.indentStack.length > 0 &&
            this.indentStack[this.indentStack.length - 1] !== 0) {
            this.indentStack.pop();
            this.addToken(tokens_1.TokenType.DEDENT);
        }
        this.tokens.push(new Token(tokens_1.TokenType.ENDMARKER, "", this.line, this.col, this.current));
        return this.tokens;
    }

    /** Logs every accumulated token to the console (debugging aid). */
    printTokens() {
        for (const token of this.tokens) {
            console.log(`${token.indexInSource}:${token.line}-${token.line},${token.indexInSource + token.lexeme.length}\t\t\t\ ${tokens_1.TokenType[token.type]}\t\t\t'${token.lexeme}'`);
        }
    }

    /** Always throws ForbiddenOperatorError for the text between start and current. */
    raiseForbiddenOperator() {
        throw new errors_1.TokenizerErrors.ForbiddenOperatorError(this.line, this.col, this.source, this.start, this.current);
    }
}
exports.Tokenizer = Tokenizer;
//# sourceMappingURL=tokenizer.js.map