"use strict";
// Thanks to Ken Jin (py-slang) for the great resource
// https://craftinginterpreters.com/scanning.html
// This tokenizer/lexer is a modified version, inspired by both the
// tokenizer/lexer above as well as Ken Jin's py-slang tokenizer/lexer.
// It has been adapted to be written in TypeScript for Scheme.
// Crafting Interpreters: https://craftinginterpreters.com/
// py-slang: https://github.com/source-academy/py-slang
Object.defineProperty(exports, "__esModule", { value: true });
exports.SchemeLexer = void 0;
const core_math_1 = require("../../stdlib/core-math");
const token_1 = require("../types/tokens/token");
const token_type_1 = require("../types/tokens/token-type");
const LexerError = require("./lexer-error");
// syntactic keywords in the Scheme language
const keywords = new Map([
[".", token_type_1.TokenType.DOT],
["if", token_type_1.TokenType.IF],
["let", token_type_1.TokenType.LET],
["cond", token_type_1.TokenType.COND],
["else", token_type_1.TokenType.ELSE],
["set!", token_type_1.TokenType.SET],
["begin", token_type_1.TokenType.BEGIN],
["delay", token_type_1.TokenType.DELAY],
["quote", token_type_1.TokenType.QUOTE],
["export", token_type_1.TokenType.EXPORT],
["import", token_type_1.TokenType.IMPORT],
["define", token_type_1.TokenType.DEFINE],
["lambda", token_type_1.TokenType.LAMBDA],
]);
class SchemeLexer {
constructor(source) {
this.start = 0;
this.current = 0;
this.line = 1;
this.col = 0;
this.source = source;
this.tokens = [];
}
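    // true once the scanner has consumed the entire source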
isAtEnd() {
return this.current >= this.source.length;
}
advance() {
        // consume and return the next character, advancing the column counter
this.col++;
return this.source.charAt(this.current++);
}
jump() {
        // consume a character without including it in the current lexeme
this.start = this.current;
this.col++;
this.current++;
}
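    // emit a token for the lexeme spanning [start, current), with its position info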
addToken(type, literal = null) {
const text = this.source.substring(this.start, this.current);
this.tokens.push(new token_1.Token(type, text, literal, this.start, this.current, this.line, this.col));
}
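    // entry point: scan the whole source, then append a terminating EOF token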
scanTokens() {
while (!this.isAtEnd()) {
this.start = this.current;
this.scanToken();
}
this.tokens.push(new token_1.Token(token_type_1.TokenType.EOF, "", null, this.start, this.current, this.line, this.col));
return this.tokens;
}
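    // scan a single token, dispatching on its first character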
scanToken() {
const c = this.advance();
switch (c) {
case "(":
this.addToken(token_type_1.TokenType.LEFT_PAREN);
break;
case ")":
this.addToken(token_type_1.TokenType.RIGHT_PAREN);
break;
case "[":
this.addToken(token_type_1.TokenType.LEFT_BRACKET);
break;
case "]":
this.addToken(token_type_1.TokenType.RIGHT_BRACKET);
break;
case "'":
this.addToken(token_type_1.TokenType.APOSTROPHE);
break;
case "`":
this.addToken(token_type_1.TokenType.BACKTICK);
break;
case ",":
if (this.match("@")) {
this.addToken(token_type_1.TokenType.COMMA_AT);
break;
}
this.addToken(token_type_1.TokenType.COMMA);
break;
case "#":
                // "#" introduces booleans, comments, or vectors; by itself it is an error
if (this.match("t") || this.match("f")) {
this.booleanToken();
}
else if (this.match("|")) {
                    // a "#|...|#" block comment
this.comment();
}
else if (this.match(";")) {
// a datum comment
this.addToken(token_type_1.TokenType.HASH_SEMICOLON);
}
else if (this.peek() === "(" || this.peek() === "[") {
                    // Keep the hash and the following parenthesis/bracket
                    // as separate tokens: merging them would give the
                    // parenthesis-matching logic four possible left grouping
                    // tokens to handle. We only peek here, so the
                    // parenthesis is scanned as its own token next.
this.addToken(token_type_1.TokenType.HASH_VECTOR);
}
else {
                    // character literals (e.g. #\a) are not currently supported
throw new LexerError.UnexpectedCharacterError(this.line, this.col, c);
}
break;
case ";":
                // a line comment: runs to the end of the line
                while (this.peek() !== "\n" && !this.isAtEnd())
this.advance();
break;
            // two-character tokens are not currently needed
case " ":
case "\r":
case "\t":
// ignore whitespace
break;
case "\n":
this.line++;
this.col = 0;
break;
case '"':
this.stringToken();
break;
case "|":
this.identifierTokenLoose();
break;
default:
// Deviates slightly from the original lexer.
// Scheme allows for identifiers to start with a digit
// or include a specific set of symbols.
if (this.isDigit(c) ||
c === "-" ||
c === "+" ||
c === "." ||
c === "i" || // inf
c === "n" // nan
) {
// may or may not be a number
this.identifierNumberToken();
}
else if (this.isValidIdentifier(c)) {
// filtered out the potential numbers
// these are definitely identifiers
this.identifierToken();
}
else {
throw new LexerError.UnexpectedCharacterError(this.line, this.col, c);
}
break;
}
}
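    // consume a "#|...|#" block comment; the leading "#|" has already been matched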
comment() {
        while (!(this.peek() === "|" && this.peekNext() === "#") && !this.isAtEnd()) {
if (this.peek() === "\n") {
this.line++;
this.col = 0;
}
this.advance();
}
if (this.isAtEnd()) {
throw new LexerError.UnexpectedEOFError(this.line, this.col);
}
        // skip the closing "|#" without adding it to any token
        this.jump();
        this.jump();
}
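    // scan the rest of an ordinary identifier (the first character is
    // already consumed) and classify it as a keyword or identifier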
identifierToken() {
while (this.isValidIdentifier(this.peek()))
this.advance();
this.addToken(this.checkKeyword());
}
identifierTokenLoose() {
        // |...| identifiers may contain any characters, including
        // whitespace and special syntax characters
        // consume the opening |
        this.advance();
        while (this.peek() !== "|" && !this.isAtEnd()) {
if (this.peek() === "\n") {
this.line++;
this.col = 0;
}
this.advance();
}
if (this.isAtEnd()) {
throw new LexerError.UnexpectedEOFError(this.line, this.col);
}
        // consume the closing |
this.advance();
this.addToken(this.checkKeyword());
}
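    // scan a lexeme that may be a number (e.g. -1, .5, inf) or a plain
    // identifier; it is classified only after the whole lexeme is read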
identifierNumberToken() {
// we first obtain the entire identifier
while (this.isValidIdentifier(this.peek())) {
this.advance();
}
const lexeme = this.source.substring(this.start, this.current);
if ((0, core_math_1.stringIsSchemeNumber)(lexeme)) {
this.addToken(token_type_1.TokenType.NUMBER, lexeme);
return;
}
this.addToken(this.checkKeyword());
}
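    // map the current lexeme to its keyword token type, defaulting to IDENTIFIER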
checkKeyword() {
        const text = this.source.substring(this.start, this.current);
if (keywords.has(text)) {
return keywords.get(text);
}
return token_type_1.TokenType.IDENTIFIER;
}
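    // scan a double-quoted string (newlines allowed); the literal value
    // excludes the surrounding quotes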
stringToken() {
        while (this.peek() !== '"' && !this.isAtEnd()) {
if (this.peek() === "\n") {
this.line++;
this.col = 0;
}
this.advance();
}
if (this.isAtEnd()) {
throw new LexerError.UnexpectedEOFError(this.line, this.col);
}
// closing "
this.advance();
// trim the surrounding quotes
const value = this.source.substring(this.start + 1, this.current - 1);
this.addToken(token_type_1.TokenType.STRING, value);
}
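    // the "#" and "t"/"f" are already consumed; peekPrev distinguishes #t from #f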
booleanToken() {
        this.addToken(token_type_1.TokenType.BOOLEAN, this.peekPrev() === "t");
}
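    // consume the next character only if it matches the expected one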
match(expected) {
if (this.isAtEnd())
return false;
        if (this.source.charAt(this.current) !== expected)
            return false;
        this.col++; // keep column tracking in sync with the consumed character
        this.current++;
return true;
}
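    // lookahead helpers: return "\0" when out of range and never consume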
peek() {
if (this.isAtEnd())
return "\0";
return this.source.charAt(this.current);
}
peekNext() {
if (this.current + 1 >= this.source.length)
return "\0";
return this.source.charAt(this.current + 1);
}
peekPrev() {
if (this.current - 1 < 0)
return "\0";
return this.source.charAt(this.current - 1);
}
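    // character classification helpers ("\0" counts as whitespace so that
    // end-of-source cleanly terminates identifiers)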
isDigit(c) {
return c >= "0" && c <= "9";
}
isSpecialSyntax(c) {
return (c === "(" || c === ")" || c === "[" || c === "]" || c === ";" || c === "|");
}
isValidIdentifier(c) {
return !this.isWhitespace(c) && !this.isSpecialSyntax(c);
}
isWhitespace(c) {
return c === " " || c === "\0" || c === "\n" || c === "\r" || c === "\t";
}
}
exports.SchemeLexer = SchemeLexer;
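// Usage sketch (illustrative only, not part of this module):
//   const { SchemeLexer } = require("./scheme-lexer");
//   const lexer = new SchemeLexer("(define (square x) (* x x))");
//   const tokens = lexer.scanTokens();
//   // => LEFT_PAREN, DEFINE, LEFT_PAREN, IDENTIFIER, IDENTIFIER,
//   //    RIGHT_PAREN, ..., RIGHT_PAREN, RIGHT_PAREN, EOF
//   // Each token carries its lexeme, literal value, offsets, line, and column.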
//# sourceMappingURL=scheme-lexer.js.map