// adder-script
// Version:
// Python-like language for executing untrusted code in browsers and Node.js.
// 477 lines (390 loc) • 15.7 kB
// JavaScript
"use strict";
/**
* The lexer takes a code string and converts it into a list of tokens.
*
* Author: Ronen Ness.
* Since: 2016
*/
// include jsface for classes
var jsface = require("./../dependencies/jsface"),
Class = jsface.Class,
extend = jsface.extend;
// include errors
var Errors = require("./../errors");
// include console
var Console = require("./../console");
// Operator tables used by the lexer.
// Order matters inside signOperators: readOperator returns the first entry that matches,
// so every multi-character operator (==, **, >=, ...) must come before its
// single-character prefix (=, *, >, ...).
var signOperators = [
    // compound assignments
    "+=", "-=", "*=", "/=", "|=", "&=", "%=",
    // other two-character operators
    "**", "==", "!=", ">=", "<=",
    // single-character operators
    ">", "<", "+", "-", "*", "/", "|", "&", "%", "=", "."
];
// operators spelled as words; the two-word forms come before the single words they contain.
var wordsOperators = [
    "not in", "is not",
    "not", "in", "or", "and", "is"
];
// every operator: sign operators first, then word operators.
var operators = [].concat(signOperators, wordsOperators);
// get default flags
var defaultFlags = require("./default_flags");
// character that starts a comment; the comment runs until the end of the line.
var commentPrefix = "#";
// get utils
var Utils = require("./../utils");
// all token types
var TokenTypes = require("./tokens");
// values that break between words (i.e. terminate a word or a number).
// note: undefined is to identify out-of-string-range (reading past the end of the input).
// note: '\t' is listed because parseExpression skips tabs as whitespace; without it a tab
// that directly follows a word or number would be swallowed into that token.
var breakingSigns = [' ', '\t', '(', ')', '[', ']', undefined, ',', ':', ';', '\n', '\\'];
// operators also end a word. isBreaker looks up one character at a time, so presumably
// only the single-character operators can ever match - TODO confirm against Utils.toSet.
breakingSigns = breakingSigns.concat(operators);
// convert to a set (queried through .has() in Lexer.isBreaker)
breakingSigns = Utils.toSet(breakingSigns);
/**
 * The Lexer class - converts a code string into a flat list of tokens.
 *
 * Every token is a plain object {t: <type>, v: <value>} (see makeToken).
 * The main entry point is parseExpression(); the read*() helpers each consume a single
 * token starting at a given index and return [value, lastIndex], where lastIndex is the
 * index of the last character that belongs to the token.
 */
var Lexer = Class({

    // static stuff
    $static: {
        wordsOperators: wordsOperators,
        operators: operators,
    },

    /**
     * Lexer constructor.
     * @param {Object} [flags] lexer flags; the default flags are used when omitted.
     *                         This class only reads `spacesNeededForBlockIndent` from it.
     */
    constructor: function(flags)
    {
        // store flags
        this._flags = flags || defaultFlags;
    },

    /**
     * Create a token instance.
     *
     * token types:
     *   "p" // punctuation: commas, etc.
     *   "n" // numbers.
     *   "s" // strings.
     *   "v" // identifiers / variables / keywords.
     *   "o" // operators.
     *   "b" // line / command break.
     *   "_" // change in block index
     *
     * @param {String} type token type.
     * @param {*} val token value.
     * @returns {{t: String, v: *}} the token object.
     */
    makeToken: function(type, val)
    {
        return {t: type, v: val};
    },

    /**
     * Return if the character at `pos` is a breaker, eg something that separates words:
     * the start of a comment, or any value listed in breakingSigns (which includes
     * undefined, i.e. reading past the end of the input).
     * @param {String} expression code being parsed.
     * @param {Number} pos index to test.
     * @returns {Boolean}
     */
    isBreaker: function(expression, pos)
    {
        // a comment always ends the current word
        if (this.isComment(expression, pos)) {
            return true;
        }

        // check if space, undefined (means end of string), operator, etc.
        return breakingSigns.has(expression[pos]);
    },

    /**
     * Return if a character is a decimal digit ('0'..'9').
     * @param {String} c single character (undefined yields false).
     * @returns {Boolean}
     */
    isNumber: function(c)
    {
        return (c >= '0' && c <= '9');
    },

    /**
     * Return if a character opens a string literal (single or double quote).
     * @param {String} c single character.
     * @returns {Boolean}
     */
    isOpeningString: function(c)
    {
        return c === '"' || c === "'";
    },

    /**
     * Return if a character is a punctuation (comma or colon).
     * @param {String} c single character.
     * @returns {Boolean}
     */
    isPunc: function(c)
    {
        return c === "," || c === ":";
    },

    /**
     * Return if a comment starts at the given position.
     * @param {String} expression code being parsed.
     * @param {Number} start index to test.
     * @returns {Boolean}
     */
    isComment: function(expression, start)
    {
        return expression[start] === commentPrefix;
    },

    /**
     * Read a whole number from starting pos until the end of the number.
     * A single decimal point is allowed.
     * @param {String} expression code being parsed.
     * @param {Number} start index of the first digit.
     * @returns {Array} [number (as string), last_index]
     * @throws {Errors.IllegalExpression} on a non-digit inside the number or a second decimal point.
     */
    readNumber: function(expression, start)
    {
        var pos = start;
        var alreadyGotDot = false;

        // '.' is listed as an operator (hence a breaker), so it is let through explicitly
        while (expression[pos] === '.' || !this.isBreaker(expression, pos))
        {
            var c = expression[pos];
            var isDot = c === '.';

            // if we got non-digit (it means something like this: "4d41") its illegal expression.
            if (!this.isNumber(c) && !isDot) {
                throw new Errors.IllegalExpression(expression, "Invalid syntax (non-digits inside a number)", this.lineIndex);
            }

            if (isDot)
            {
                // a second dot in the same number is a syntax error
                if (alreadyGotDot) {
                    throw new Errors.IllegalExpression(expression, "Invalid syntax (multiple decimal marks in float)", this.lineIndex);
                }
                alreadyGotDot = true;
            }

            pos++;
        }

        // pos points at the breaker, so the number ends one character before it
        return [expression.substring(start, pos), pos - 1];
    },

    /**
     * Read the whole operator from string pos.
     * Word operators are tried first, then sign operators, both in table order.
     * @param {String} expression code being parsed.
     * @param {Number} start index where the operator may begin.
     * @returns {Array|null} [operator, last_index], or null if no operator starts here.
     */
    readOperator: function(expression, start)
    {
        // get current part that might contain the operator
        var currSeg = expression.substring(start, start + 10);

        // first check word operators.
        // NOTE(review): a word operator only matches when followed by a space, so input like
        // "not(x)" falls through and is read as an identifier - confirm this is intended.
        for (var w = 0; w < wordsOperators.length; ++w) {
            var wordOp = wordsOperators[w];
            if (currSeg.indexOf(wordOp + " ") === 0) {
                return [wordOp, start + wordOp.length - 1];
            }
        }

        // now iterate over sign operators (longer operators are listed before their prefixes)
        for (var s = 0; s < signOperators.length; ++s)
        {
            var signOp = signOperators[s];
            if (currSeg.substr(0, signOp.length) === signOp) {
                return [signOp, start + signOp.length - 1];
            }
        }

        // if operator not found return null
        return null;
    },

    /**
     * Read the whole string literal from string pos.
     * The literal ends at the first closing quote that is not escaped with a backslash.
     * The returned value is the raw literal: it includes both quotes and keeps escape
     * sequences untouched.
     * @param {String} expression code being parsed.
     * @param {Number} start index of the opening quote.
     * @returns {Array} [string, last_index] (last_index is the closing quote).
     * @throws {Errors.IllegalExpression} if the input ends before the closing quote.
     */
    readString: function(expression, start)
    {
        // check if our quote sign is ' or "
        var quote = expression[start] === '"' ? '"' : "'";

        // scan for the closing quote, starting right after the opening one
        var i = start + 1;
        var c;
        while ((c = expression[i]) !== undefined)
        {
            // a backslash escapes the next character (including a quote or another
            // backslash), so skip both of them.
            if (c === '\\') {
                i += 2;
                continue;
            }
            if (c === quote) break;
            i++;
        }

        // didn't find closing quotes?
        if (c === undefined) {
            throw new Errors.IllegalExpression(expression, "EOL while scanning string literal.", this.lineIndex);
        }

        // return the literal, quotes included
        return [expression.substring(start, i + 1), i];
    },

    /**
     * Read the whole punctuation from string pos (always a single character).
     * @param {String} expression code being parsed.
     * @param {Number} start index of the punctuation.
     * @returns {Array} [punctuation, last_index]
     */
    readPunctuation: function(expression, start)
    {
        return [expression[start], start];
    },

    /**
     * Read a comment until the end of the line.
     * Unlike the other 'read' functions, this won't return the actual comment, just the
     * ending position: the index of the line break that ends the comment, or the
     * expression length if the input ends first.
     * @param {String} expression code being parsed.
     * @param {Number} start index of the comment sign.
     * @returns {Array} ["", end_index]
     */
    readComment: function(expression, start) {

        // iterate until end of string or until line break
        var pos = start;
        while (expression[pos] !== undefined && expression[pos] !== "\n") {
            pos++;
        }

        return ["", pos];
    },

    /**
     * Read a word (keyword, variable name, etc.) from string pos up to the next breaker.
     * @param {String} expression code being parsed.
     * @param {Number} start index of the first character.
     * @returns {Array} [word, last_index]; the word is "" when `start` itself is a breaker.
     */
    readWord: function(expression, start)
    {
        // read characters until complete current word
        var pos = start;
        while (!this.isBreaker(expression, pos))
        {
            pos++;
        }

        // take one char back so we won't skip the breaking character
        return [expression.substring(start, pos), pos - 1];
    },

    /**
     * Convert string expression into list of tokens.
     *
     * Besides the plain tokens this also emits:
     *  - a line-break token for every '\n' and ';' (unless the '\n' was escaped with '\'),
     *    and for the line break that ends a comment;
     *  - a block-change token whenever the indentation level changes, and around inline
     *    blocks (code following ':' on the same line).
     * Sets this.lineIndex (1-based line counter, used in error reports) as a side effect.
     *
     * @param {String} expression code to tokenize.
     * @returns {Array} list of tokens (see makeToken).
     * @throws {Errors.SyntaxError} on bad block indent or an invalid character after '\'.
     * @throws {Errors.IllegalExpression} on malformed numbers / strings or unexpected characters.
     */
    parseExpression: function(expression) {

        // return list
        var ret = [];

        // last and current significant characters parsed.
        // note: c is the FIRST character of the previous token, not necessarily the
        // character just before the current one.
        var lastC; var c;

        // data of the token currently being read: [value, last_index]
        var tokenData;

        // count lines
        this.lineIndex = 1;

        // if true we need to skip next line break
        var skipNextLineBreak = false;

        // last block indent
        var lastBlockIndent = 0;

        // are we at the beginning of a command, eg right after a line break?
        var wasLineBreak = false;

        // how many inline blocks are currently open
        var inlineBlocks = 0;

        // count spaces we had in a row after line break
        var spacesInRow = 0;

        // iterate over all characters of expression
        for (var i = 0; i < expression.length; ++i)
        {
            // skip white spaces (but count them after a line break to measure indentation)
            if (expression[i] === ' ') {
                if (wasLineBreak) spacesInRow++;
                continue;
            }
            // a tab counts as one full indent level
            if (expression[i] === '\t') {
                if (wasLineBreak) spacesInRow += this._flags.spacesNeededForBlockIndent;
                continue;
            }

            // first non-space character after a line break: calc block indent
            if (wasLineBreak) {

                // if this is break after inline block(s), close them all
                if (inlineBlocks > 0)
                {
                    lastBlockIndent -= inlineBlocks;
                    ret.push(this.makeToken(TokenTypes.cblock, lastBlockIndent));
                    inlineBlocks = 0;
                }
                // if its a regular block and line break wasn't ';'
                else if (lastC !== ';')
                {
                    // get spaces needed for block indent
                    var spacesForIndent = this._flags.spacesNeededForBlockIndent;

                    // empty lines and comment-only lines carry no code, so they must not
                    // validate the indent nor change the current block.
                    if (expression[i] !== '\n' && !this.isComment(expression, i))
                    {
                        // indentation must be a whole number of indent units
                        if ((spacesInRow % spacesForIndent) !== 0) {
                            throw new Errors.SyntaxError("Bad block indent (spaces not multiply of " +
                                                        this._flags.spacesNeededForBlockIndent + ")", this.lineIndex);
                        }

                        // calc current block indent and add block change token
                        var blockIndent = spacesInRow / spacesForIndent;
                        if (blockIndent !== lastBlockIndent)
                        {
                            ret.push(this.makeToken(TokenTypes.cblock, blockIndent));
                            lastBlockIndent = blockIndent;
                        }
                    }
                }

                // zero spaces count
                spacesInRow = 0;
            }

            // if its a comment - skip it
            if (this.isComment(expression, i))
            {
                // jump to the end of the comment (the line break, or the end of input)
                tokenData = this.readComment(expression, i);
                i = tokenData[1];

                // add line break after the comment, but only if didn't reach the end.
                // the loop's ++i then steps over the '\n' itself.
                if (expression[i])
                {
                    this.lineIndex++;
                    wasLineBreak = true;

                    // record the line break as the last character seen. otherwise a comment
                    // right after ':' would leave ':' as the last character and the next
                    // line would be mistaken for an inline block, and a comment after ';'
                    // would disable the indent check of the next line.
                    lastC = c = '\n';

                    ret.push(this.makeToken(TokenTypes.lbreak, '\n'));
                }

                // continue to next character
                continue;
            }

            // store last character and get current character
            lastC = c;
            c = expression[i];

            // special case - command break
            if (c === ';' || c === '\n') {

                // only a real new line advances the line counter; ';' stays on the same line
                if (c === '\n') {
                    this.lineIndex++;
                }

                // if should skip line break (it was escaped with '\') skip it
                if (skipNextLineBreak) {
                    skipNextLineBreak = false;
                    continue;
                }

                // do line break. lastC is set here because the indent logic above reads it
                // before it is refreshed on the next token.
                lastC = c;
                wasLineBreak = true;
                ret.push(this.makeToken(TokenTypes.lbreak, c));
                continue;
            }
            // special case 2 - anti-line break, eg character that combine lines together
            else if (c === '\\') {
                if (expression[i+1] !== '\n') {
                    throw new Errors.SyntaxError("Invalid character after \\ sign.", this.lineIndex);
                }
                skipNextLineBreak = true;
                continue;
            }

            // special case - if last character was ':', but we didn't get a new line,
            // it means its an inline block
            if (lastC === ":") {
                // add break + open block
                ret.push(this.makeToken(TokenTypes.lbreak, ";"));
                lastBlockIndent++;
                ret.push(this.makeToken(TokenTypes.cblock, lastBlockIndent));
                inlineBlocks++;
            }

            // not a line break
            wasLineBreak = false;

            // parenthesis are emitted as operator tokens
            if (c === "(" || c === ")")
            {
                ret.push(this.makeToken('o', c));
                continue;
            }

            // if punctuation
            if (this.isPunc(c))
            {
                tokenData = this.readPunctuation(expression, i);
                i = tokenData[1];
                ret.push(this.makeToken(TokenTypes.punctuation, tokenData[0]));
                continue;
            }

            // if a number
            if (this.isNumber(c))
            {
                tokenData = this.readNumber(expression, i);
                i = tokenData[1];
                ret.push(this.makeToken(TokenTypes.number, tokenData[0]));
                continue;
            }

            // try to read an operator
            // with operators its a little different - we just try to read it and get null if not found
            tokenData = this.readOperator(expression, i);
            if (tokenData)
            {
                i = tokenData[1];
                ret.push(this.makeToken(TokenTypes.operator, tokenData[0]));
                continue;
            }

            // if got string read it all until its closed
            if (this.isOpeningString(c))
            {
                tokenData = this.readString(expression, i);
                i = tokenData[1];
                ret.push(this.makeToken(TokenTypes.string, tokenData[0]));
                continue;
            }

            // if got here it means its a keyword, var-name, statement, etc..
            // read word and add it
            tokenData = this.readWord(expression, i);
            var word = tokenData[0];
            i = tokenData[1];

            // an empty word means the current character is a breaker nothing above handled
            if (word === "") {
                throw new Errors.IllegalExpression(expression, "Invalid or unexpected token '" + c + "'!", this.lineIndex);
            }

            // add identifier to tokens list
            ret.push(this.makeToken(TokenTypes.identifier, word));
        }

        // return parsed expression
        Console.debug("Lexer parsed", ret);
        return ret;
    },
});
// export the Lexer class
module.exports = Lexer;