/*
 *  tokenizr -- String Tokenization Library for JavaScript
 *  Version: (not stated in this listing)
 *  Listing: 563 lines (562 loc), 19 kB, JavaScript
 */
/* render the first UTF-16 code unit of a string as upper-case hexadecimal (no zero padding) */
const hex = (ch) => {
    const codeUnit = ch.charCodeAt(0);
    return codeUnit.toString(16).toUpperCase();
};
/*
 * Cut an escaped excerpt out of `txt` around offset `o`.
 *
 * The window spans up to 20 characters before the offset and up to 19
 * characters after it (the end bound is `o + 20`, clamped to the text
 * length). The result separates the text before the offset (prolog), the
 * single character at the offset (token) and the text after it (epilog),
 * and flags whether the window had to be truncated on either side.
 */
const excerpt = (txt, o) => {
    const total = txt.length;
    const start = o - 20 < 0 ? 0 : o - 20;
    const stop = o + 20 > total ? total : o + 20;

    /* ordered escape rules: the backslash rule runs first so that the
       backslashes introduced by the later rules are not doubled again */
    const escapes = [
        [/\\/g, "\\\\"],
        [/\x08/g, "\\b"],
        [/\t/g, "\\t"],
        [/\n/g, "\\n"],
        [/\f/g, "\\f"],
        [/\r/g, "\\r"],
        /* single-hex-digit control characters get a padding zero */
        [/[\x00-\x07\x0B\x0E\x0F]/g, (ch) => "\\x0" + hex(ch)],
        [/[\x10-\x1F\x80-\xFF]/g, (ch) => "\\x" + hex(ch)],
        /* three-hex-digit code units get a padding zero */
        [/[\u0100-\u0FFF]/g, (ch) => "\\u0" + hex(ch)],
        [/[\u1000-\uFFFF]/g, (ch) => "\\u" + hex(ch)]
    ];

    /* take `len` characters starting at `pos` and make them printable */
    const extract = (txt2, pos, len) => {
        let piece = txt2.substring(pos, pos + len);
        for (const [pattern, replacement] of escapes)
            piece = piece.replaceAll(pattern, replacement);
        return piece;
    };

    return {
        prologTrunc: start > 0,
        prologText: extract(txt, start, o - start),
        tokenText: extract(txt, o, 1),
        epilogText: extract(txt, o + 1, stop - (o + 1)),
        epilogTrunc: stop < total
    };
};
/* a single token: its type, its (possibly converted) value, the matched
   text and the position (offset, line, column) it was recorded at */
class Token {
    /* store all token attributes; the position attributes default to 0 */
    constructor(type, value, text, pos = 0, line = 0, column = 0) {
        Object.assign(this, { type, value, text, pos, line, column });
    }
    /* render a one-line description; `colorize(field, text)` may decorate
       each rendered field and defaults to returning the text unchanged */
    toString(colorize = (type, text) => text) {
        const type = colorize("type", this.type);
        const value = colorize("value", JSON.stringify(this.value));
        const text = colorize("text", JSON.stringify(this.text));
        const pos = colorize("pos", this.pos.toString());
        const line = colorize("line", this.line.toString());
        const column = colorize("column", this.column.toString());
        return `${type} (value: ${value}, text: ${text}, pos: ${pos}, line: ${line}, column: ${column})`;
    }
    /* check whether this token has the given type and, if a value is
       supplied, also exactly (strictly equal) that value */
    isA(type, value) {
        const typeMatches = type === this.type;
        const valueMatches = value === void 0 || value === this.value;
        return typeMatches && valueMatches;
    }
}
/* error raised for problems in the tokenized input; it remembers the
   offending position and the complete input for later rendering */
class ParsingError extends Error {
    /* create the error and record position (offset, line, column) and input */
    constructor(message, pos, line, column, input) {
        super(message);
        Object.assign(this, { name: "ParsingError", message, pos, line, column, input });
    }
    /* render a three-line report: the message, the input excerpt around
       the error position, and a caret placed under the offending character */
    toString() {
        const context = excerpt(this.input, this.pos);
        const location = `line ${this.line} (column ${this.column}): `;
        const headline = "Parsing Error: " + this.message;
        const quoted = location + context.prologText + context.tokenText + context.epilogText;
        const pointer = " ".repeat(location.length + context.prologText.length) + "^";
        return [headline, quoted, pointer].join("\n");
    }
}
/* the object handed to rule actions: it carries user data and the current
   match, and lets an action steer the tokenizer it is attached to */
class ActionContext {
    /* attach to a tokenizer and start with empty data and cleared flags */
    constructor(tokenizr) {
        Object.assign(this, {
            _tokenizr: tokenizr,
            _data: {},
            _repeat: false,
            _reject: false,
            _ignore: false,
            _match: null
        });
    }
    /* get the user data stored under `key`; when called with exactly two
       arguments also store `value` -- the previous value is returned */
    data(key, value) {
        const previous = this._data[key];
        if (arguments.length === 2)
            this._data[key] = value;
        return previous;
    }
    /* report the tokenizer position and the length of the current match
       (0 when there is no current match) */
    info() {
        const { _line: line, _column: column, _pos: pos } = this._tokenizr;
        const len = this._match?.[0]?.length ?? 0;
        return { line, column, pos, len };
    }
    /* delegate: push a state onto the tokenizer (chainable) */
    push(state) {
        this._tokenizr.push(state);
        return this;
    }
    /* delegate: pop a state from the tokenizer and return its result */
    pop() {
        return this._tokenizr.pop();
    }
    /* delegate: without an argument return the tokenizer's current state,
       otherwise set it (chainable) */
    state(state) {
        if (state === void 0)
            return this._tokenizr.state();
        this._tokenizr.state(state);
        return this;
    }
    /* delegate: set a tag on the tokenizer (chainable) */
    tag(tag) {
        this._tokenizr.tag(tag);
        return this;
    }
    /* delegate: ask the tokenizer whether a tag is set */
    tagged(tag) {
        return this._tokenizr.tagged(tag);
    }
    /* delegate: remove a tag from the tokenizer (chainable) */
    untag(tag) {
        this._tokenizr.untag(tag);
        return this;
    }
    /* flag the current match so that matching restarts from scratch */
    repeat() {
        this._tokenizr._log(" REPEAT");
        this._repeat = true;
        return this;
    }
    /* flag the current match as rejected */
    reject() {
        this._tokenizr._log(" REJECT");
        this._reject = true;
        return this;
    }
    /* flag the current match as ignored */
    ignore() {
        this._tokenizr._log(" IGNORE");
        this._ignore = true;
        return this;
    }
    /* queue a new token for the current match; a missing (null/undefined)
       value falls back to the matched text */
    accept(type, value) {
        const matched = this._match?.[0];
        value = value ?? matched;
        const tokenizer = this._tokenizr;
        tokenizer._log(` ACCEPT: type: ${type}, value: ${JSON.stringify(value)} (${typeof value}), text: "${matched ?? ""}"`);
        const token = new Token(type, value, matched ?? "", tokenizer._pos, tokenizer._line, tokenizer._column);
        tokenizer._pending.push(token);
        return this;
    }
    /* ask the tokenizer to stop tokenizing */
    stop() {
        this._tokenizr._stopped = true;
        return this;
    }
}
/*
 * Rule-driven string tokenizer.
 *
 * Rules (a regular expression plus an action callback, optionally bound to
 * states and tags) are registered with rule(); input is supplied with
 * input(); tokens are then pulled with token(), tokens(), peek(), skip()
 * or consume(). Consumed tokens can be tracked in nested transactions
 * (begin/commit/rollback/alternatives) so that they can be handed back.
 */
class Tokenizr {
    /* construct and initialize the object */
    constructor() {
        this._before = null;
        this._after = null;
        this._finish = null;
        this._rules = [];
        this._debug = false;
        this._input = "";
        this._len = 0;
        this._eof = false;
        this._pos = 0;
        this._line = 1;
        this._column = 1;
        this._state = ["default"];
        this._tag = {};
        this._transaction = [];
        this._pending = [];
        this._stopped = false;
        this._ctx = new ActionContext(this);
    }
    /* reset the internal state (input, position, states, tags, transactions
       and pending tokens); rules, callbacks and the debug flag are kept */
    reset() {
        this._input = "";
        this._len = 0;
        this._eof = false;
        this._pos = 0;
        this._line = 1;
        this._column = 1;
        this._state = ["default"];
        this._tag = {};
        this._transaction = [];
        this._pending = [];
        this._stopped = false;
        this._ctx = new ActionContext(this);
        return this;
    }
    /* create (but do not throw) a ParsingError for the current position */
    error(message) {
        return new ParsingError(message, this._pos, this._line, this._column, this._input);
    }
    /* configure debug operation */
    debug(debug) {
        this._debug = debug;
        return this;
    }
    /* output a debug message (only when debugging is enabled) */
    _log(msg) {
        if (this._debug)
            console.log(`tokenizr: ${msg}`);
    }
    /* provide (new) input string to tokenize; this resets the internal state.
       Throws an Error if "input" is not a string. */
    input(input) {
        if (typeof input !== "string")
            throw new Error('parameter "input" not a String');
        this.reset();
        this._input = input;
        this._len = input.length;
        return this;
    }
    /* push a state onto the state stack (exactly one string argument required) */
    push(state) {
        if (arguments.length !== 1)
            throw new Error("invalid number of arguments");
        if (typeof state !== "string")
            throw new Error('parameter "state" not a String');
        this._log(` STATE (PUSH): old: <${this._state[this._state.length - 1]}>, new: <${state}>`);
        this._state.push(state);
        return this;
    }
    /* pop the top state from the state stack and return it; the bottom-most
       state can never be popped */
    pop() {
        if (arguments.length !== 0)
            throw new Error("invalid number of arguments");
        if (this._state.length < 2)
            throw new Error("no more custom states to pop");
        this._log(` STATE (POP): old: <${this._state[this._state.length - 1]}>, new: <${this._state[this._state.length - 2]}>`);
        return this._state.pop();
    }
    /* with one argument: replace the top state (chainable);
       without arguments: return the top state */
    state(state) {
        if (arguments.length === 1) {
            if (typeof state !== "string")
                throw new Error('parameter "state" not a String');
            this._log(` STATE (SET): old: <${this._state[this._state.length - 1]}>, new: <${state}>`);
            this._state[this._state.length - 1] = state;
            return this;
        } else if (arguments.length === 0)
            return this._state[this._state.length - 1];
        throw new Error("invalid number of arguments");
    }
    /* set a tag */
    tag(tag) {
        if (arguments.length !== 1)
            throw new Error("invalid number of arguments");
        if (typeof tag !== "string")
            throw new Error('parameter "tag" not a String');
        this._log(` TAG (ADD): ${tag}`);
        this._tag[tag] = true;
        return this;
    }
    /* check whether tag is set */
    tagged(tag) {
        if (arguments.length !== 1)
            throw new Error("invalid number of arguments");
        if (typeof tag !== "string")
            throw new Error('parameter "tag" not a String');
        return this._tag[tag] === true;
    }
    /* unset a tag */
    untag(tag) {
        if (arguments.length !== 1)
            throw new Error("invalid number of arguments");
        if (typeof tag !== "string")
            throw new Error('parameter "tag" not a String');
        this._log(` TAG (DEL): ${tag}`);
        delete this._tag[tag];
        return this;
    }
    /* configure a callback invoked as (ctx, match, rule) before each rule action */
    before(action) {
        this._before = action;
        return this;
    }
    /* configure a callback invoked as (ctx, match, rule) after each rule action */
    after(action) {
        this._after = action;
        return this;
    }
    /* configure a callback invoked as (ctx) once, right before the EOF token is queued */
    finish(action) {
        this._finish = action;
        return this;
    }
    /*
     * Register a tokenization rule.
     *
     * Accepted call forms:
     *   rule(pattern, action)
     *   rule(pattern, action, name)
     *   rule(state, pattern, action)
     *   rule(state, pattern, action, name)
     *
     * "state" is a comma-separated list of entries; every entry consists of
     * exactly one state name ("*" matches any state) and optional "#tag"
     * items separated by whitespace. Throws an Error on invalid parameters.
     */
    rule(state, pattern, action, name = "unknown") {
        /* support the short forms without a leading state: shift the parameters */
        if (arguments.length === 2 && typeof pattern === "function") {
            [pattern, action] = [state, pattern];
            state = "*";
        } else if (arguments.length === 3 && typeof pattern === "function") {
            [pattern, action, name] = [state, pattern, action];
            state = "*";
        }
        if (typeof state !== "string")
            throw new Error('parameter "state" not a String');
        if (!(typeof pattern === "object" && pattern instanceof RegExp))
            throw new Error('parameter "pattern" not a RegExp');
        if (typeof action !== "function")
            throw new Error('parameter "action" not a Function');
        if (typeof name !== "string")
            throw new Error('parameter "name" not a String');
        /* split the state specification into { state, tags } entries */
        const parsedState = state.split(/\s*,\s*/g).map((entry) => {
            const items = entry.split(/\s+/g);
            const states = items.filter((item) => item.match(/^#/) === null);
            const tags = items.filter((item) => item.match(/^#/) !== null).map((tag) => tag.replace(/^#/, ""));
            if (states.length !== 1)
                throw new Error("exactly one state required");
            return { state: states[0], tags };
        });
        /* prefer sticky matching ("y") and fall back to global matching ("g");
           the probe is deliberately best-effort, so a failure just keeps "g" */
        let flags = "g";
        try {
            const regexp = new RegExp("", "y");
            if (typeof regexp.sticky === "boolean")
                flags = "y";
        } catch (ex) {
            /* sticky flag not supported: keep the "g" fallback */
        }
        /* carry over the relevant flags of the supplied pattern */
        if (typeof pattern.multiline === "boolean" && pattern.multiline)
            flags += "m";
        if (typeof pattern.dotAll === "boolean" && pattern.dotAll)
            flags += "s";
        if (typeof pattern.ignoreCase === "boolean" && pattern.ignoreCase)
            flags += "i";
        if (typeof pattern.unicode === "boolean" && pattern.unicode)
            flags += "u";
        const processedPattern = new RegExp(pattern.source, flags);
        this._log(`rule: configure rule (state: ${state}, pattern: ${processedPattern.source})`);
        this._rules.push({ state: parsedState, pattern: processedPattern, action, name });
        return this;
    }
    /* progress the line/column counter over the input characters in [from, until) */
    _progress(from, until) {
        const line = this._line;
        const column = this._column;
        const s = this._input;
        for (let i = from; i < until; i++) {
            const c = s.charAt(i);
            if (c === "\r")
                this._column = 1;
            else if (c === "\n") {
                this._line++;
                this._column = 1;
            } else if (c === "\t")
                /* a tab advances the column to the next multiple of 8; the
                   comparison has to be against the tab character, otherwise
                   every ordinary space would jump to a tab stop */
                this._column += 8 - this._column % 8;
            else
                this._column++;
        }
        this._log(` PROGRESS: characters: ${until - from}, from: <line ${line}, column ${column}>, to: <line ${this._line}, column ${this._column}>`);
    }
    /*
     * Run the rules at the current position until at least one token is
     * pending (or the end of input is reached, which queues the EOF token).
     * Throws a ParsingError if no rule matches and an Error if a matching
     * action neither accepts, rejects, repeats nor ignores.
     */
    _tokenize() {
        /* on first use: run the finish callback and queue the EOF token */
        const finish = () => {
            if (!this._eof) {
                if (this._finish !== null)
                    this._finish.call(this._ctx, this._ctx);
                this._eof = true;
                this._pending.push(new Token("EOF", "", "", this._pos, this._line, this._column));
            }
        };
        if (this._stopped || this._pos >= this._len) {
            finish();
            return;
        }
        /* "continued" requests another pass over all rules (after repeat/ignore) */
        let continued = true;
        while (continued) {
            continued = false;
            if (this._debug) {
                const e = excerpt(this._input, this._pos);
                const tags = Object.keys(this._tag).map((tag) => `#${tag}`).join(" ");
                this._log(`INPUT: state: <${this._state[this._state.length - 1]}>, tags: <${tags}>, text: ` + (e.prologTrunc ? "..." : '"') + `${e.prologText}<${e.tokenText}>${e.epilogText}` + (e.epilogTrunc ? "..." : '"') + `, at: <line ${this._line}, column ${this._column}>`);
            }
            for (let i = 0; i < this._rules.length; i++) {
                const rule = this._rules[i];
                if (this._debug) {
                    const state = rule.state.map((item) => {
                        let output = item.state;
                        if (item.tags.length > 0)
                            output += " " + item.tags.map((tag) => `#${tag}`).join(" ");
                        return output;
                    }).join(", ");
                    this._log(` RULE: state(s): <${state}>, pattern: ${rule.pattern.source}`);
                }
                /* a rule applies if one of its entries names "*" or the current
                   state and all tags of that entry are currently set */
                let matches = false;
                const states = rule.state.map((item) => item.state);
                let idx = states.indexOf("*");
                if (idx < 0)
                    idx = states.indexOf(this._state[this._state.length - 1]);
                if (idx >= 0) {
                    const requiredTags = rule.state[idx].tags;
                    /* strict comparison (as in tagged()): a plain truthiness
                       test would treat inherited object properties such as
                       "constructor" as set tags */
                    matches = requiredTags.every((tag) => this._tag[tag] === true);
                }
                if (!matches)
                    continue;
                /* match exactly at the current position (the index check covers
                   the non-sticky "g" fallback) */
                rule.pattern.lastIndex = this._pos;
                const found = rule.pattern.exec(this._input);
                if (found !== null && found.index === this._pos) {
                    if (this._debug)
                        this._log(" MATCHED: " + JSON.stringify(found));
                    this._ctx._match = found;
                    this._ctx._repeat = false;
                    this._ctx._reject = false;
                    this._ctx._ignore = false;
                    if (this._before !== null)
                        this._before.call(this._ctx, this._ctx, found, rule);
                    rule.action.call(this._ctx, this._ctx, found);
                    if (this._after !== null)
                        this._after.call(this._ctx, this._ctx, found, rule);
                    if (this._ctx._reject)
                        /* rejected: try the next rule */
                        continue;
                    else if (this._ctx._repeat) {
                        /* repeat: start over with the first rule, position unchanged */
                        continued = true;
                        break;
                    } else if (this._ctx._ignore) {
                        /* ignore: skip the matched text and start over */
                        this._progress(this._pos, rule.pattern.lastIndex);
                        this._pos = rule.pattern.lastIndex;
                        if (this._pos >= this._len) {
                            finish();
                            return;
                        }
                        continued = true;
                        break;
                    } else if (this._pending.length > 0) {
                        /* accepted: skip the matched text and hand over the token(s) */
                        this._progress(this._pos, rule.pattern.lastIndex);
                        this._pos = rule.pattern.lastIndex;
                        if (this._pos >= this._len)
                            finish();
                        return;
                    } else
                        throw new Error('action of pattern "' + rule.pattern.source + '" neither rejected nor accepted any token(s)');
                }
            }
        }
        throw this.error("token not recognized");
    }
    /* determine and return the next token, or null if no token is available
       any more; the token is recorded in the innermost open transaction */
    token() {
        if (this._pending.length === 0)
            this._tokenize();
        if (this._pending.length > 0) {
            const token = this._pending.shift();
            if (this._transaction.length > 0)
                this._transaction[0].push(token);
            this._log(`TOKEN: ${token.toString()}`);
            return token;
        }
        return null;
    }
    /* determine and return all remaining tokens */
    tokens() {
        const result = [];
        let token;
        while ((token = this.token()) !== null)
            result.push(token);
        return result;
    }
    /* peek at the next token or the token at a particular offset (default 0)
       without consuming it; throws an Error if not enough tokens exist */
    peek(offset) {
        if (offset === void 0)
            offset = 0;
        if (typeof offset !== "number" || offset < 0)
            throw new Error('parameter "offset" not a positive Number');
        while (offset >= this._pending.length) {
            this._tokenize();
            if (this._pending.length === 0)
                break;
        }
        if (offset >= this._pending.length)
            throw new Error("not enough tokens available for peek operation");
        this._log(`PEEK: ${this._pending[offset].toString()}`);
        return this._pending[offset];
    }
    /* skip one or more tokens (default 1); throws an Error if fewer tokens
       than requested are available */
    skip(len) {
        if (len === void 0)
            len = 1;
        for (let i = 0; i < len; i++)
            this._tokenize();
        if (len > this._pending.length)
            throw new Error("not enough tokens available for skip operation");
        while (len-- > 0)
            this.token();
        return this;
    }
    /* consume the current token (by expecting it to be a particular symbol);
       throws a ParsingError if the token does not match the expected type
       (and value, if given) and an Error if no token is available */
    consume(type, value) {
        /* tokenize only when nothing is pending: looking further ahead would
           run rule actions (and their state/tag changes) for input the
           caller has not reached yet */
        if (this._pending.length === 0)
            this._tokenize();
        if (this._pending.length === 0)
            throw new Error("not enough tokens available for consume operation");
        const token = this.token();
        this._log(`CONSUME: ${token.toString()}`);
        const raiseError = () => {
            throw new ParsingError(`expected: <type: ${type}, value: ${JSON.stringify(value)} (${typeof value})>, found: <type: ${token.type}, value: ${JSON.stringify(token.value)} (${typeof token.value})>`, token.pos, token.line, token.column, this._input);
        };
        if (arguments.length === 2 && !token.isA(type, value))
            raiseError();
        else if (!token.isA(type))
            raiseError();
        return token;
    }
    /* open tokenization transaction */
    begin() {
        this._log(`BEGIN: level ${this._transaction.length}`);
        this._transaction.unshift([]);
        return this;
    }
    /* determine the number of tokens consumed in the innermost open transaction */
    depth() {
        if (this._transaction.length === 0)
            throw new Error("cannot determine depth -- no active transaction");
        return this._transaction[0].length;
    }
    /* close (successfully) tokenization transaction; its tokens are handed
       over to the enclosing transaction, if there is one */
    commit() {
        if (this._transaction.length === 0)
            throw new Error("cannot commit transaction -- no active transaction");
        const committed = this._transaction.shift();
        if (this._transaction.length > 0)
            this._transaction[0] = this._transaction[0].concat(committed);
        this._log(`COMMIT: level ${this._transaction.length}`);
        return this;
    }
    /* close (unsuccessfully) tokenization transaction; its tokens are put
       back in front of the pending tokens */
    rollback() {
        if (this._transaction.length === 0)
            throw new Error("cannot rollback transaction -- no active transaction");
        const rolledback = this._transaction.shift();
        this._pending = rolledback.concat(this._pending);
        this._log(`ROLLBACK: level ${this._transaction.length}`);
        return this;
    }
    /* execute multiple alternative callbacks: each one runs (with "this" set
       to the tokenizer) inside its own transaction; the result of the first
       one that does not throw is returned. If all of them throw, the
       exception of the alternative with the smallest transaction depth is
       re-thrown. */
    alternatives(...alternatives) {
        let result = null;
        /* track success explicitly: a successful alternative may
           legitimately return null */
        let succeeded = false;
        const failures = [];
        for (const alternative of alternatives) {
            try {
                this.begin();
                result = alternative.call(this);
                this.commit();
                succeeded = true;
                break;
            } catch (ex) {
                if (ex instanceof Error) {
                    this._log(`EXCEPTION: ${ex.message}`);
                    failures.push({ ex, depth: this.depth() });
                } else {
                    this._log("EXCEPTION: alternative failed");
                    failures.push({ ex: new Error("alternative failed"), depth: this.depth() });
                }
                this.rollback();
            }
        }
        if (!succeeded && failures.length > 0) {
            failures.sort((a, b) => a.depth - b.depth);
            throw failures[0].ex;
        }
        return result;
    }
    /* expose the helper classes as static properties */
    static {
        this.Token = Token;
    }
    static {
        this.ParsingError = ParsingError;
    }
    static {
        this.ActionContext = ActionContext;
    }
}
/* expose the Tokenizr class as the default export of this module */
export {
Tokenizr as default
};