// jison-lex
// Lexical analyzer generator used by jison.
// Listing metadata: 604 lines (523 loc), 20.2 kB, JavaScript.
// Basic Lexer implemented using JavaScript regular expressions
// MIT Licensed
;
var lexParser = require('lex-parser');
var version = require('./package.json').version;
// expand macros and convert matchers to RegExp's
function prepareRules(rules, macros, actions, tokens, startConditions, caseless) {
var m,i,k,action,conditions,
newRules = [];
if (macros) {
macros = prepareMacros(macros);
}
function tokenNumberReplacement (str, token) {
return "return " + (tokens[token] || "'" + token + "'");
}
actions.push('switch($avoiding_name_collisions) {');
for (i=0;i < rules.length; i++) {
if (Object.prototype.toString.apply(rules[i][0]) !== '[object Array]') {
// implicit add to all inclusive start conditions
for (k in startConditions) {
if (startConditions[k].inclusive) {
startConditions[k].rules.push(i);
}
}
} else if (rules[i][0][0] === '*') {
// Add to ALL start conditions
for (k in startConditions) {
startConditions[k].rules.push(i);
}
rules[i].shift();
} else {
// Add to explicit start conditions
conditions = rules[i].shift();
for (k=0;k<conditions.length;k++) {
startConditions[conditions[k]].rules.push(i);
}
}
m = rules[i][0];
if (typeof m === 'string') {
for (k in macros) {
if (macros.hasOwnProperty(k)) {
m = m.split("{" + k + "}").join('(' + macros[k] + ')');
}
}
m = new RegExp("^(?:" + m + ")", caseless ? 'i':'');
}
newRules.push(m);
if (typeof rules[i][1] === 'function') {
rules[i][1] = String(rules[i][1]).replace(/^\s*function \(\)\s?\{/, '').replace(/\}\s*$/, '');
}
action = rules[i][1];
if (tokens && action.match(/return '[^']+'/)) {
action = action.replace(/return '([^']+)'/g, tokenNumberReplacement);
}
actions.push('case ' + i + ':' + action + '\nbreak;');
}
actions.push("}");
return newRules;
}
// expand macros within macros
function prepareMacros (macros) {
var cont = true,
m,i,k,mnew;
while (cont) {
cont = false;
for (i in macros) if (macros.hasOwnProperty(i)) {
m = macros[i];
for (k in macros) if (macros.hasOwnProperty(k) && i !== k) {
mnew = m.split("{" + k + "}").join('(' + macros[k] + ')');
if (mnew !== m) {
cont = true;
macros[i] = mnew;
}
}
}
}
return macros;
}
function prepareStartConditions (conditions) {
var sc,
hash = {};
for (sc in conditions) if (conditions.hasOwnProperty(sc)) {
hash[sc] = {rules:[],inclusive:!!!conditions[sc]};
}
return hash;
}
function buildActions (dict, tokens) {
var actions = [dict.actionInclude || '', "var YYSTATE=YY_START;"];
var tok;
var toks = {};
for (tok in tokens) {
toks[tokens[tok]] = tok;
}
if (dict.options && dict.options.flex) {
dict.rules.push([".", "console.log(yytext);"]);
}
this.rules = prepareRules(dict.rules, dict.macros, actions, tokens && toks, this.conditions, this.options["case-insensitive"]);
var fun = actions.join("\n");
"yytext yyleng yylineno yylloc".split(' ').forEach(function (yy) {
fun = fun.replace(new RegExp("\\b(" + yy + ")\\b", "g"), "yy_.$1");
});
return "function anonymous(yy,yy_,$avoiding_name_collisions,YY_START) {" + fun + "\n}";
}
function RegExpLexer (dict, input, tokens) {
var opts = processGrammar(dict, tokens);
var source = generateModuleBody(opts);
var lexer = eval(source);
lexer.yy = {};
if (input) {
lexer.setInput(input);
}
lexer.generate = function () { return generateFromOpts(opts); };
lexer.generateModule = function () { return generateModule(opts); };
lexer.generateCommonJSModule = function () { return generateCommonJSModule(opts); };
lexer.generateAMDModule = function () { return generateAMDModule(opts); };
return lexer;
}
RegExpLexer.prototype = {
EOF: 1,
parseError: function parseError(str, hash) {
if (this.yy.parser) {
this.yy.parser.parseError(str, hash);
} else {
throw new Error(str);
}
},
// resets the lexer, sets new input
setInput: function (input, yy) {
this.yy = yy || this.yy || {};
this._input = input;
this._more = this._backtrack = this.done = false;
this.yylineno = this.yyleng = 0;
this.yytext = this.matched = this.match = '';
this.conditionStack = ['INITIAL'];
this.yylloc = {
first_line: 1,
first_column: 0,
last_line: 1,
last_column: 0
};
if (this.options.ranges) {
this.yylloc.range = [0,0];
}
this.offset = 0;
return this;
},
// consumes and returns one char from the input
input: function () {
var ch = this._input[0];
this.yytext += ch;
this.yyleng++;
this.offset++;
this.match += ch;
this.matched += ch;
var lines = ch.match(/(?:\r\n?|\n).*/g);
if (lines) {
this.yylineno++;
this.yylloc.last_line++;
} else {
this.yylloc.last_column++;
}
if (this.options.ranges) {
this.yylloc.range[1]++;
}
this._input = this._input.slice(1);
return ch;
},
// unshifts one char (or a string) into the input
unput: function (ch) {
var len = ch.length;
var lines = ch.split(/(?:\r\n?|\n)/g);
this._input = ch + this._input;
this.yytext = this.yytext.substr(0, this.yytext.length - len);
//this.yyleng -= len;
this.offset -= len;
var oldLines = this.match.split(/(?:\r\n?|\n)/g);
this.match = this.match.substr(0, this.match.length - 1);
this.matched = this.matched.substr(0, this.matched.length - 1);
if (lines.length - 1) {
this.yylineno -= lines.length - 1;
}
var r = this.yylloc.range;
this.yylloc = {
first_line: this.yylloc.first_line,
last_line: this.yylineno + 1,
first_column: this.yylloc.first_column,
last_column: lines ?
(lines.length === oldLines.length ? this.yylloc.first_column : 0)
+ oldLines[oldLines.length - lines.length].length - lines[0].length :
this.yylloc.first_column - len
};
if (this.options.ranges) {
this.yylloc.range = [r[0], r[0] + this.yyleng - len];
}
this.yyleng = this.yytext.length;
return this;
},
// When called from action, caches matched text and appends it on next action
more: function () {
this._more = true;
return this;
},
// When called from action, signals the lexer that this rule fails to match the input, so the next matching rule (regex) should be tested instead.
reject: function () {
if (this.options.backtrack_lexer) {
this._backtrack = true;
} else {
return this.parseError('Lexical error on line ' + (this.yylineno + 1) + '. You can only invoke reject() in the lexer when the lexer is of the backtracking persuasion (options.backtrack_lexer = true).\n' + this.showPosition(), {
text: "",
token: null,
line: this.yylineno
});
}
return this;
},
// retain first n characters of the match
less: function (n) {
this.unput(this.match.slice(n));
},
// displays already matched input, i.e. for error messages
pastInput: function () {
var past = this.matched.substr(0, this.matched.length - this.match.length);
return (past.length > 20 ? '...':'') + past.substr(-20).replace(/\n/g, "");
},
// displays upcoming input, i.e. for error messages
upcomingInput: function () {
var next = this.match;
if (next.length < 20) {
next += this._input.substr(0, 20-next.length);
}
return (next.substr(0,20) + (next.length > 20 ? '...' : '')).replace(/\n/g, "");
},
// displays the character position where the lexing error occurred, i.e. for error messages
showPosition: function () {
var pre = this.pastInput();
var c = new Array(pre.length + 1).join("-");
return pre + this.upcomingInput() + "\n" + c + "^";
},
// test the lexed token: return FALSE when not a match, otherwise return token
test_match: function(match, indexed_rule) {
var token,
lines,
backup;
if (this.options.backtrack_lexer) {
// save context
backup = {
yylineno: this.yylineno,
yylloc: {
first_line: this.yylloc.first_line,
last_line: this.last_line,
first_column: this.yylloc.first_column,
last_column: this.yylloc.last_column
},
yytext: this.yytext,
match: this.match,
matches: this.matches,
matched: this.matched,
yyleng: this.yyleng,
offset: this.offset,
_more: this._more,
_input: this._input,
yy: this.yy,
conditionStack: this.conditionStack.slice(0),
done: this.done
};
if (this.options.ranges) {
backup.yylloc.range = this.yylloc.range.slice(0);
}
}
lines = match[0].match(/(?:\r\n?|\n).*/g);
if (lines) {
this.yylineno += lines.length;
}
this.yylloc = {
first_line: this.yylloc.last_line,
last_line: this.yylineno + 1,
first_column: this.yylloc.last_column,
last_column: lines ?
lines[lines.length - 1].length - lines[lines.length - 1].match(/\r?\n?/)[0].length :
this.yylloc.last_column + match[0].length
};
this.yytext += match[0];
this.match += match[0];
this.matches = match;
this.yyleng = this.yytext.length;
if (this.options.ranges) {
this.yylloc.range = [this.offset, this.offset += this.yyleng];
}
this._more = false;
this._backtrack = false;
this._input = this._input.slice(match[0].length);
this.matched += match[0];
token = this.performAction.call(this, this.yy, this, indexed_rule, this.conditionStack[this.conditionStack.length - 1]);
if (this.done && this._input) {
this.done = false;
}
if (token) {
return token;
} else if (this._backtrack) {
// recover context
for (var k in backup) {
this[k] = backup[k];
}
return false; // rule action called reject() implying the next rule should be tested instead.
}
return false;
},
// return next match in input
next: function () {
if (this.done) {
return this.EOF;
}
if (!this._input) {
this.done = true;
}
var token,
match,
tempMatch,
index;
if (!this._more) {
this.yytext = '';
this.match = '';
}
var rules = this._currentRules();
for (var i = 0; i < rules.length; i++) {
tempMatch = this._input.match(this.rules[rules[i]]);
if (tempMatch && (!match || tempMatch[0].length > match[0].length)) {
match = tempMatch;
index = i;
if (this.options.backtrack_lexer) {
token = this.test_match(tempMatch, rules[i]);
if (token !== false) {
return token;
} else if (this._backtrack) {
match = false;
continue; // rule action called reject() implying a rule MISmatch.
} else {
// else: this is a lexer rule which consumes input without producing a token (e.g. whitespace)
return false;
}
} else if (!this.options.flex) {
break;
}
}
}
if (match) {
token = this.test_match(match, rules[index]);
if (token !== false) {
return token;
}
// else: this is a lexer rule which consumes input without producing a token (e.g. whitespace)
return false;
}
if (this._input === "") {
return this.EOF;
} else {
return this.parseError('Lexical error on line ' + (this.yylineno + 1) + '. Unrecognized text.\n' + this.showPosition(), {
text: "",
token: null,
line: this.yylineno
});
}
},
// return next match that has a token
lex: function lex () {
var r = this.next();
if (r) {
return r;
} else {
return this.lex();
}
},
// activates a new lexer condition state (pushes the new lexer condition state onto the condition stack)
begin: function begin (condition) {
this.conditionStack.push(condition);
},
// pop the previously active lexer condition state off the condition stack
popState: function popState () {
var n = this.conditionStack.length - 1;
if (n > 0) {
return this.conditionStack.pop();
} else {
return this.conditionStack[0];
}
},
// produce the lexer rule set which is active for the currently active lexer condition state
_currentRules: function _currentRules () {
if (this.conditionStack.length && this.conditionStack[this.conditionStack.length - 1]) {
return this.conditions[this.conditionStack[this.conditionStack.length - 1]].rules;
} else {
return this.conditions["INITIAL"].rules;
}
},
// return the currently active lexer condition state; when an index argument is provided it produces the N-th previous condition state, if available
topState: function topState (n) {
n = this.conditionStack.length - 1 - Math.abs(n || 0);
if (n >= 0) {
return this.conditionStack[n];
} else {
return "INITIAL";
}
},
// alias for begin(condition)
pushState: function pushState (condition) {
this.begin(condition);
},
// return the number of states pushed
stateStackSize: function stateStackSize() {
return this.conditionStack.length;
}
};
// generate lexer source from a grammar
function generate (dict, tokens) {
var opt = processGrammar(dict, tokens);
return generateFromOpts(opt);
}
// process the grammar and build final data structures and functions
function processGrammar(dict, tokens) {
var opts = {};
if (typeof dict === 'string') {
dict = lexParser.parse(dict);
}
dict = dict || {};
opts.options = dict.options || {};
opts.moduleType = opts.options.moduleType;
opts.moduleName = opts.options.moduleName;
opts.conditions = prepareStartConditions(dict.startConditions);
opts.conditions.INITIAL = {rules:[],inclusive:true};
opts.performAction = buildActions.call(opts, dict, tokens);
opts.conditionStack = ['INITIAL'];
opts.moduleInclude = (dict.moduleInclude || '').trim();
return opts;
}
// Assemble the final source from the processed grammar
function generateFromOpts (opt) {
var code = "";
if (opt.moduleType === 'commonjs') {
code = generateCommonJSModule(opt);
} else if (opt.moduleType === 'amd') {
code = generateAMDModule(opt);
} else {
code = generateModule(opt);
}
return code;
}
function generateModuleBody (opt) {
var functionDescriptions = {
setInput: "resets the lexer, sets new input",
input: "consumes and returns one char from the input",
unput: "unshifts one char (or a string) into the input",
more: "When called from action, caches matched text and appends it on next action",
reject: "When called from action, signals the lexer that this rule fails to match the input, so the next matching rule (regex) should be tested instead.",
less: "retain first n characters of the match",
pastInput: "displays already matched input, i.e. for error messages",
upcomingInput: "displays upcoming input, i.e. for error messages",
showPosition: "displays the character position where the lexing error occurred, i.e. for error messages",
test_match: "test the lexed token: return FALSE when not a match, otherwise return token",
next: "return next match in input",
lex: "return next match that has a token",
begin: "activates a new lexer condition state (pushes the new lexer condition state onto the condition stack)",
popState: "pop the previously active lexer condition state off the condition stack",
_currentRules: "produce the lexer rule set which is active for the currently active lexer condition state",
topState: "return the currently active lexer condition state; when an index argument is provided it produces the N-th previous condition state, if available",
pushState: "alias for begin(condition)",
stateStackSize: "return the number of states currently on the stack"
};
var out = "({\n";
var p = [];
var descr;
for (var k in RegExpLexer.prototype) {
if (RegExpLexer.prototype.hasOwnProperty(k) && k.indexOf("generate") === -1) {
// copy the function description as a comment before the implementation; supports multi-line descriptions
descr = "\n";
if (functionDescriptions[k]) {
descr += "// " + functionDescriptions[k].replace(/\n/g, "\n\/\/ ") + "\n";
}
p.push(descr + k + ":" + (RegExpLexer.prototype[k].toString() || '""'));
}
}
out += p.join(",\n");
if (opt.options) {
out += ",\noptions: " + JSON.stringify(opt.options);
}
out += ",\nperformAction: " + String(opt.performAction);
out += ",\nrules: [" + opt.rules + "]";
out += ",\nconditions: " + JSON.stringify(opt.conditions);
out += "\n})";
return out;
}
function generateModule(opt) {
opt = opt || {};
var out = "/* generated by jison-lex " + version + " */";
var moduleName = opt.moduleName || "lexer";
out += "\nvar " + moduleName + " = (function(){\nvar lexer = "
+ generateModuleBody(opt);
if (opt.moduleInclude) {
out += ";\n" + opt.moduleInclude;
}
out += ";\nreturn lexer;\n})();";
return out;
}
function generateAMDModule(opt) {
var out = "/* generated by jison-lex " + version + " */";
out += "define([], function(){\nvar lexer = "
+ generateModuleBody(opt);
if (opt.moduleInclude) {
out += ";\n" + opt.moduleInclude;
}
out += ";\nreturn lexer;"
+ "\n});";
return out;
}
function generateCommonJSModule(opt) {
opt = opt || {};
var out = "";
var moduleName = opt.moduleName || "lexer";
out += generateModule(opt);
out += "\nexports.lexer = " + moduleName;
out += ";\nexports.lex = function () { return " + moduleName + ".lex.apply(lexer, arguments); };";
return out;
}
RegExpLexer.generate = generate;
module.exports = RegExpLexer;