swahili-lang
Version:
A new programming language with semantics borrowed from the Swahili language to help teach programming concepts to Swahili speaking students.
382 lines (338 loc) • 11 kB
JavaScript
const TT = require('./tokenTypes');
const LEX = require('./lexemes');
const Token = require('./token');
const Position = require('./position');
const { IllegalCharError, ExpectedCharError } = require('../interpreter/error');
/**
* Performs a lexical analysis to ensure correct syntax of the programming language
*/
class Lexer {
/**
* Instantiates a lexer
* @param {String} fileName The name of the file the lexer is running in
* @param {String} text The content/text to be tokenized
*/
constructor(fileName, text) {
this.fileName = fileName;
this.pos = new Position(-1, 0, -1, fileName, text);
this.currentChar = null;
// treat semicolons as line endings if input is received from <stdin>
// semicolons are otherwise ILLEGAL CHARACTERS!
if (fileName === '<stdin>') text = text.replace(LEX.semi, '\n');
// convert all line endings to an @ sign for consistent lexing
this.text = text.replace(LEX.line, '@');
this.advance();
}
/**
* advances to the next character in the text
*/
advance() {
this.pos.advance(this.currentChar);
this.currentChar =
this.pos.idx < this.text.length ? this.text[this.pos.idx] : null;
}
/**
* generates a number token after encountering a digit in the text
* @returns {Token}
*/
makeNumber() {
let numStr = '';
let dotCount = 0;
let posStart = this.pos.copy();
// keep going while character is a digit or a dot, and we haven't seen a dot yet
while (
this.currentChar !== null &&
(LEX.digits.test(this.currentChar) || LEX.dot.test(this.currentChar))
) {
if (LEX.dot.test(this.currentChar)) {
if (dotCount === 1) break;
dotCount++;
}
numStr += this.currentChar;
this.advance();
}
// check if INT or FLOAT
if (dotCount === 0) {
return new Token(TT.INT, parseInt(numStr), posStart, this.pos);
} else {
return new Token(TT.FLOAT, parseFloat(numStr), posStart, this.pos);
}
}
/**
* generates a string after encountering double quotes in the text
* @returns {Token}
*/
makeString() {
let string = '';
let posStart = this.pos.copy();
let escapeCharacter = false;
this.advance();
while (
this.currentChar !== null &&
(!LEX.doubleQuotes.test(this.currentChar) || escapeCharacter)
) {
if (escapeCharacter) {
if (LEX.doubleQuotes.test(this.currentChar)) {
string += this.currentChar;
} else {
string += '\\' + this.currentChar;
}
escapeCharacter = false;
} else {
if (LEX.backSlash.test(this.currentChar)) {
escapeCharacter = true;
} else {
string += this.currentChar;
}
}
this.advance();
}
this.advance();
return new Token(TT.STRING, string, posStart, this.pos);
}
/**
* generates an identifier token after encountering an alphabetic character in the text
* @returns {Token}
*/
makeIdentifier() {
let idStr = '';
let posStart = this.pos.copy();
// keep going while character is a alphanumeric or an underscore
while (
this.currentChar !== null &&
(LEX.digits.test(this.currentChar) || LEX.alpha.test(this.currentChar))
) {
idStr += this.currentChar;
this.advance();
}
// check if KEYWORD or IDENTIFIER
const keywords = Object.values(LEX.keywords);
let tokType = keywords.includes(idStr) ? TT.KEYWORD : TT.IDENTIFIER;
return new Token(tokType, idStr, posStart, this.pos);
}
/**
* generates an AND token after encountering a ampersand in the text
* @returns {Token}
*/
makeAnd() {
let posStart = this.pos.copy();
this.advance();
if (LEX.ampersand.test(this.currentChar)) {
this.advance();
return [new Token(TT.AND, null, posStart, this.pos), null];
}
this.advance();
return [
null,
new ExpectedCharError(posStart, this.pos, `'${LEX.ampersand.source}'`),
];
}
/**
* generates an OR token after encountering a pipe in the text
* @returns {Token}
*/
makeOr() {
let posStart = this.pos.copy();
this.advance();
if (LEX.pipe.test(this.currentChar)) {
this.advance();
return [new Token(TT.OR, null, posStart, this.pos), null];
}
this.advance();
return [
null,
new ExpectedCharError(posStart, this.pos, `'${LEX.pipe.source}'`),
];
}
/**
* generates a NOT/NE token after encountering an exclamation in the text
* @returns {Token}
*/
makeNotEquals() {
let tokType = TT.NOT;
let posStart = this.pos.copy();
this.advance();
if (LEX.equals.test(this.currentChar)) {
this.advance();
tokType = TT.NE;
}
return new Token(tokType, null, posStart, this.pos);
}
/**
* generates an EQ/EE token after encountering an equals sign in the text
* @returns {Token}
*/
makeEquals() {
let tokType = TT.EQ;
let posStart = this.pos.copy();
this.advance();
if (LEX.equals.test(this.currentChar)) {
this.advance();
tokType = TT.EE;
}
return new Token(tokType, null, posStart, this.pos);
}
/**
* generates a LT/LTE token after encountering a leftArrow in the text
* @returns {Token}
*/
makeLessThan() {
let tokType = TT.LT;
let posStart = this.pos.copy();
this.advance();
if (LEX.equals.test(this.currentChar)) {
this.advance();
tokType = TT.LTE;
}
return new Token(tokType, null, posStart, this.pos);
}
/**
* generates a GT/GTE token after encountering a '>' in the text
* @returns {Token}
*/
makeGreaterThan() {
let tokType = TT.GT;
let posStart = this.pos.copy();
this.advance();
if (LEX.equals.test(this.currentChar)) {
this.advance();
tokType = TT.GTE;
}
return new Token(tokType, null, posStart, this.pos);
}
/** makes a div token or skips a comment block */
makeDivide() {
let posStart = this.pos.copy();
this.advance();
if (
LEX.forwardSlash.test(this.currentChar) ||
LEX.asterisk.test(this.currentChar)
) {
this.skipComment();
} else {
return new Token(TT.DIV, null, posStart);
}
}
/** grabs all characters in a comment and ignores them */
skipComment() {
if (LEX.forwardSlash.test(this.currentChar)) {
// if next char is a forward slash, line comment
// keep going until new line
this.advance();
while (
this.currentChar !== null &&
!LEX.lineEndings.test(this.currentChar)
) {
this.advance();
}
} else if (LEX.asterisk.test(this.currentChar)) {
// if next char is an asterisk, block comment
// keep going until find other asterisk
this.advance();
while (
this.currentChar !== null &&
!LEX.asterisk.test(this.currentChar)
) {
this.advance();
if (LEX.asterisk.test(this.currentChar)) {
this.advance();
// if char after that is forward slash, done
if (LEX.forwardSlash.test(this.currentChar)) {
this.advance();
break;
}
}
}
}
}
/**
* generates a list of tokens by going through each char in the text
* @returns {[Token[], Error]}
*/
makeTokens() {
let tokens = [];
while (this.currentChar !== null) {
if (LEX.spacesAndTabs.test(this.currentChar)) {
this.advance(); // ignore spaces and tabs
} else if (LEX.lineEndings.test(this.currentChar)) {
tokens.push(new Token(TT.NEWLINE, null, this.pos));
this.advance();
} else if (LEX.digits.test(this.currentChar)) {
tokens.push(this.makeNumber());
} else if (LEX.alpha.test(this.currentChar)) {
tokens.push(this.makeIdentifier());
} else if (LEX.doubleQuotes.test(this.currentChar)) {
tokens.push(this.makeString());
} else if (LEX.plus.test(this.currentChar)) {
tokens.push(new Token(TT.PLUS, null, this.pos));
this.advance();
} else if (LEX.hyphen.test(this.currentChar)) {
tokens.push(new Token(TT.MINUS, null, this.pos));
this.advance();
} else if (LEX.asterisk.test(this.currentChar)) {
tokens.push(new Token(TT.MUL, null, this.pos));
this.advance();
} else if (LEX.forwardSlash.test(this.currentChar)) {
let tok = this.makeDivide();
if (tok) tokens.push(tok);
} else if (LEX.caret.test(this.currentChar)) {
tokens.push(new Token(TT.POW, null, this.pos));
this.advance();
} else if (LEX.modulo.test(this.currentChar)) {
tokens.push(new Token(TT.MOD, null, this.pos));
this.advance();
} else if (LEX.leftParen.test(this.currentChar)) {
tokens.push(new Token(TT.LPAREN, null, this.pos));
this.advance();
} else if (LEX.rightParen.test(this.currentChar)) {
tokens.push(new Token(TT.RPAREN, null, this.pos));
this.advance();
} else if (LEX.leftSquare.test(this.currentChar)) {
tokens.push(new Token(TT.LSQUARE, null, this.pos));
this.advance();
} else if (LEX.rightSquare.test(this.currentChar)) {
tokens.push(new Token(TT.RSQUARE, null, this.pos));
this.advance();
} else if (LEX.leftCurly.test(this.currentChar)) {
tokens.push(new Token(TT.LCURL, null, this.pos));
this.advance();
} else if (LEX.rightCurly.test(this.currentChar)) {
tokens.push(new Token(TT.RCURL, null, this.pos));
this.advance();
} else if (LEX.dot.test(this.currentChar)) {
tokens.push(new Token(TT.DOT, null, this.pos));
this.advance();
} else if (LEX.col.test(this.currentChar)) {
tokens.push(new Token(TT.COL, null, this.pos));
this.advance();
} else if (LEX.comma.test(this.currentChar)) {
tokens.push(new Token(TT.COMMA, null, this.pos));
this.advance();
} else if (LEX.ampersand.test(this.currentChar)) {
let [tok, error] = this.makeAnd();
if (error) return [[], error];
tokens.push(tok);
} else if (LEX.pipe.test(this.currentChar)) {
let [tok, error] = this.makeOr();
if (error) return [[], error];
tokens.push(tok);
} else if (LEX.exclamation.test(this.currentChar)) {
tokens.push(this.makeNotEquals());
} else if (LEX.equals.test(this.currentChar)) {
tokens.push(this.makeEquals());
} else if (LEX.leftArrow.test(this.currentChar)) {
tokens.push(this.makeLessThan());
} else if (LEX.rightArrow.test(this.currentChar)) {
tokens.push(this.makeGreaterThan());
} else {
let posStart = this.pos.copy();
let char = this.currentChar;
this.advance();
return [[], new IllegalCharError(posStart, this.pos, `'${char}'`)];
}
}
tokens.push(new Token(TT.EOF, null, this.pos));
return [tokens, null];
}
}
module.exports = Lexer;