tokenizr

String Tokenization Library for JavaScript

const hex = (ch) => ch.charCodeAt(0).toString(16).toUpperCase();
const excerpt = (txt, o) => {
  const l = txt.length;
  let b = o - 20;
  if (b < 0) b = 0;
  let e = o + 20;
  if (e > l) e = l;
  const extract = (txt2, pos, len) => txt2.substring(pos, pos + len)
    .replaceAll(/\\/g, "\\\\")
    .replaceAll(/\x08/g, "\\b")
    .replaceAll(/\t/g, "\\t")
    .replaceAll(/\n/g, "\\n")
    .replaceAll(/\f/g, "\\f")
    .replaceAll(/\r/g, "\\r")
    .replaceAll(/[\x00-\x07\x0B\x0E\x0F]/g, (ch) => "\\x0" + hex(ch))
    .replaceAll(/[\x10-\x1F\x80-\xFF]/g, (ch) => "\\x" + hex(ch))
    .replaceAll(/[\u0100-\u0FFF]/g, (ch) => "\\u0" + hex(ch))
    .replaceAll(/[\u1000-\uFFFF]/g, (ch) => "\\u" + hex(ch));
  return {
    prologTrunc: b > 0,
    prologText: extract(txt, b, o - b),
    tokenText: extract(txt, o, 1),
    epilogText: extract(txt, o + 1, e - (o + 1)),
    epilogTrunc: e < l
  };
};
class Token {
  /* construct and initialize object */
  constructor(type, value, text, pos = 0, line = 0, column = 0) {
    this.type = type;
    this.value = value;
    this.text = text;
    this.pos = pos;
    this.line = line;
    this.column = column;
  }
  /* render a useful string representation */
  toString(colorize = (type, text) => text) {
    return `${colorize("type", this.type)} (value: ${colorize("value", JSON.stringify(this.value))}, text: ${colorize("text", JSON.stringify(this.text))}, pos: ${colorize("pos", this.pos.toString())}, line: ${colorize("line", this.line.toString())}, column: ${colorize("column", this.column.toString())})`;
  }
  /* check whether value is a Token */
  isA(type, value) {
    if (type !== this.type) return false;
    if (value !== void 0 && value !== this.value) return false;
    return true;
  }
}
class ParsingError extends Error {
  /* construct and initialize object */
  constructor(message, pos, line, column, input) {
    super(message);
    this.name = "ParsingError";
    this.message = message;
    this.pos = pos;
    this.line = line;
    this.column = column;
    this.input = input;
  }
  /* render a useful string representation */
  toString() {
    const l = excerpt(this.input, this.pos);
    const prefix1 = `line ${this.line} (column ${this.column}): `;
    const prefix2 = " ".repeat(prefix1.length + l.prologText.length);
    const msg = "Parsing Error: " + this.message + "\n" +
      prefix1 + l.prologText + l.tokenText + l.epilogText + "\n" + prefix2 + "^";
    return msg;
  }
}
class ActionContext {
  /* construct and initialize the object */
  constructor(tokenizr) {
    this._tokenizr = tokenizr;
    this._data = {};
    this._repeat = false;
    this._reject = false;
    this._ignore = false;
    this._match = null;
  }
  /* store and retrieve user data attached to context */
  data(key, value) {
    const valueOld = this._data[key];
    if (arguments.length === 2) this._data[key] = value;
    return valueOld;
  }
  /* retrieve information of current matching */
  info() {
    return {
      line: this._tokenizr._line,
      column: this._tokenizr._column,
      pos: this._tokenizr._pos,
      len: this._match?.[0]?.length ?? 0
    };
  }
  /* pass-through functions to attached tokenizer */
  push(state) {
    this._tokenizr.push(state);
    return this;
  }
  pop() {
    return this._tokenizr.pop();
  }
  state(state) {
    if (state !== void 0) {
      this._tokenizr.state(state);
      return this;
    }
    return this._tokenizr.state();
  }
  tag(tag) {
    this._tokenizr.tag(tag);
    return this;
  }
  tagged(tag) {
    return this._tokenizr.tagged(tag);
  }
  untag(tag) {
    this._tokenizr.untag(tag);
    return this;
  }
  /* mark current matching to be repeated from scratch */
  repeat() {
    this._tokenizr._log(" REPEAT");
    this._repeat = true;
    return this;
  }
  /* mark current matching to be rejected */
  reject() {
    this._tokenizr._log(" REJECT");
    this._reject = true;
    return this;
  }
  /* mark current matching to be ignored */
  ignore() {
    this._tokenizr._log(" IGNORE");
    this._ignore = true;
    return this;
  }
  /* accept current matching as a new token */
  accept(type, value) {
    value = value ?? this._match?.[0];
    this._tokenizr._log(` ACCEPT: type: ${type}, value: ${JSON.stringify(value)} (${typeof value}), text: "${this._match?.[0] ?? ""}"`);
    this._tokenizr._pending.push(new Token(
      type, value,
      this._match?.[0] ?? "",
      this._tokenizr._pos,
      this._tokenizr._line,
      this._tokenizr._column
    ));
    return this;
  }
  /* immediately stop tokenization */
  stop() {
    this._tokenizr._stopped = true;
    return this;
  }
}
class Tokenizr {
  /* construct and initialize the object */
  constructor() {
    this._before = null;
    this._after = null;
    this._finish = null;
    this._rules = [];
    this._debug = false;
    this._input = "";
    this._len = 0;
    this._eof = false;
    this._pos = 0;
    this._line = 1;
    this._column = 1;
    this._state = ["default"];
    this._tag = {};
    this._transaction = [];
    this._pending = [];
    this._stopped = false;
    this._ctx = new ActionContext(this);
  }
  /* reset the internal state */
  reset() {
    this._input = "";
    this._len = 0;
    this._eof = false;
    this._pos = 0;
    this._line = 1;
    this._column = 1;
    this._state = ["default"];
    this._tag = {};
    this._transaction = [];
    this._pending = [];
    this._stopped = false;
    this._ctx = new ActionContext(this);
    return this;
  }
  /* create an error message for the current position */
  error(message) {
    return new ParsingError(message, this._pos, this._line, this._column, this._input);
  }
  /* configure debug operation */
  debug(debug) {
    this._debug = debug;
    return this;
  }
  /* output a debug message */
  _log(msg) {
    if (this._debug) console.log(`tokenizr: ${msg}`);
  }
  /* provide (new) input string to tokenize */
  input(input) {
    if (typeof input !== "string") throw new Error('parameter "input" not a String');
    this.reset();
    this._input = input;
    this._len = input.length;
    return this;
  }
  /* push state */
  push(state) {
    if (arguments.length !== 1) throw new Error("invalid number of arguments");
    if (typeof state !== "string") throw new Error('parameter "state" not a String');
    this._log(` STATE (PUSH): old: <${this._state[this._state.length - 1]}>, new: <${state}>`);
    this._state.push(state);
    return this;
  }
  /* pop state */
  pop() {
    if (arguments.length !== 0) throw new Error("invalid number of arguments");
    if (this._state.length < 2) throw new Error("no more custom states to pop");
    this._log(` STATE (POP): old: <${this._state[this._state.length - 1]}>, new: <${this._state[this._state.length - 2]}>`);
    return this._state.pop();
  }
  state(state) {
    if (arguments.length === 1) {
      if (typeof state !== "string") throw new Error('parameter "state" not a String');
      this._log(` STATE (SET): old: <${this._state[this._state.length - 1]}>, new: <${state}>`);
      this._state[this._state.length - 1] = state;
      return this;
    } else if (arguments.length === 0)
      return this._state[this._state.length - 1];
    throw new Error("invalid number of arguments");
  }
  /* set a tag */
  tag(tag) {
    if (arguments.length !== 1) throw new Error("invalid number of arguments");
    if (typeof tag !== "string") throw new Error('parameter "tag" not a String');
    this._log(` TAG (ADD): ${tag}`);
    this._tag[tag] = true;
    return this;
  }
  /* check whether tag is set */
  tagged(tag) {
    if (arguments.length !== 1) throw new Error("invalid number of arguments");
    if (typeof tag !== "string") throw new Error('parameter "tag" not a String');
    return this._tag[tag] === true;
  }
  /* unset a tag */
  untag(tag) {
    if (arguments.length !== 1) throw new Error("invalid number of arguments");
    if (typeof tag !== "string") throw new Error('parameter "tag" not a String');
    this._log(` TAG (DEL): ${tag}`);
    delete this._tag[tag];
    return this;
  }
  /* configure a tokenization before-rule callback */
  before(action) {
    this._before = action;
    return this;
  }
  /* configure a tokenization after-rule callback */
  after(action) {
    this._after = action;
    return this;
  }
  /* configure a tokenization finish callback */
  finish(action) {
    this._finish = action;
    return this;
  }
  rule(state, pattern, action, name = "unknown") {
    if (arguments.length === 2 && typeof pattern === "function") {
      [pattern, action] = [state, pattern];
      state = "*";
    } else if (arguments.length === 3 && typeof pattern === "function") {
      [pattern, action, name] = [state, pattern, action];
      state = "*";
    }
    if (typeof state !== "string") throw new Error('parameter "state" not a String');
    if (!(typeof pattern === "object" && pattern instanceof RegExp)) throw new Error('parameter "pattern" not a RegExp');
    if (typeof action !== "function") throw new Error('parameter "action" not a Function');
    if (typeof name !== "string") throw new Error('parameter "name" not a String');
    const parsedState = state.split(/\s*,\s*/g).map((entry) => {
      const items = entry.split(/\s+/g);
      const states = items.filter((item) => item.match(/^#/) === null);
      const tags = items.filter((item) => item.match(/^#/) !== null).map((tag) => tag.replace(/^#/, ""));
      if (states.length !== 1) throw new Error("exactly one state required");
      return { state: states[0], tags };
    });
    let flags = "g";
    try {
      const regexp = new RegExp("", "y");
      if (typeof regexp.sticky === "boolean") flags = "y";
    } catch (ex) {
    }
    if (typeof pattern.multiline === "boolean" && pattern.multiline) flags += "m";
    if (typeof pattern.dotAll === "boolean" && pattern.dotAll) flags += "s";
    if (typeof pattern.ignoreCase === "boolean" && pattern.ignoreCase) flags += "i";
    if (typeof pattern.unicode === "boolean" && pattern.unicode) flags += "u";
    const processedPattern = new RegExp(pattern.source, flags);
    this._log(`rule: configure rule (state: ${state}, pattern: ${processedPattern.source})`);
    this._rules.push({ state: parsedState, pattern: processedPattern, action, name });
    return this;
  }
  /* progress the line/column counter */
  _progress(from, until) {
    const line = this._line;
    const column = this._column;
    const s = this._input;
    for (let i = from; i < until; i++) {
      const c = s.charAt(i);
      if (c === "\r") this._column = 1;
      else if (c === "\n") {
        this._line++;
        this._column = 1;
      } else if (c === "\t") this._column += 8 - this._column % 8;
      else this._column++;
    }
    this._log(` PROGRESS: characters: ${until - from}, from: <line ${line}, column ${column}>, to: <line ${this._line}, column ${this._column}>`);
  }
  /* determine and provide the next token */
  _tokenize() {
    const finish = () => {
      if (!this._eof) {
        if (this._finish !== null) this._finish.call(this._ctx, this._ctx);
        this._eof = true;
        this._pending.push(new Token("EOF", "", "", this._pos, this._line, this._column));
      }
    };
    if (this._stopped || this._pos >= this._len) {
      finish();
      return;
    }
    let continued = true;
    while (continued) {
      continued = false;
      if (this._debug) {
        const e = excerpt(this._input, this._pos);
        const tags = Object.keys(this._tag).map((tag) => `#${tag}`).join(" ");
        this._log(
          `INPUT: state: <${this._state[this._state.length - 1]}>, tags: <${tags}>, text: ` +
          (e.prologTrunc ? "..." : '"') +
          `${e.prologText}<${e.tokenText}>${e.epilogText}` +
          (e.epilogTrunc ? "..." : '"') +
          `, at: <line ${this._line}, column ${this._column}>`
        );
      }
      for (let i = 0; i < this._rules.length; i++) {
        if (this._debug) {
          const state = this._rules[i].state.map((item) => {
            let output = item.state;
            if (item.tags.length > 0) output += " " + item.tags.map((tag) => `#${tag}`).join(" ");
            return output;
          }).join(", ");
          this._log(` RULE: state(s): <${state}>, pattern: ${this._rules[i].pattern.source}`);
        }
        let matches = false;
        const states = this._rules[i].state.map((item) => item.state);
        let idx = states.indexOf("*");
        if (idx < 0) idx = states.indexOf(this._state[this._state.length - 1]);
        if (idx >= 0) {
          const requiredTags = this._rules[i].state[idx].tags;
          matches = requiredTags.every((tag) => this._tag[tag]);
        }
        if (!matches) continue;
        this._rules[i].pattern.lastIndex = this._pos;
        const found = this._rules[i].pattern.exec(this._input);
        if (found !== null && found.index === this._pos) {
          if (this._debug) this._log(" MATCHED: " + JSON.stringify(found));
          this._ctx._match = found;
          this._ctx._repeat = false;
          this._ctx._reject = false;
          this._ctx._ignore = false;
          if (this._before !== null) this._before.call(this._ctx, this._ctx, found, this._rules[i]);
          this._rules[i].action.call(this._ctx, this._ctx, found);
          if (this._after !== null) this._after.call(this._ctx, this._ctx, found, this._rules[i]);
          if (this._ctx._reject) continue;
          else if (this._ctx._repeat) {
            continued = true;
            break;
          } else if (this._ctx._ignore) {
            this._progress(this._pos, this._rules[i].pattern.lastIndex);
            this._pos = this._rules[i].pattern.lastIndex;
            if (this._pos >= this._len) {
              finish();
              return;
            }
            continued = true;
            break;
          } else if (this._pending.length > 0) {
            this._progress(this._pos, this._rules[i].pattern.lastIndex);
            this._pos = this._rules[i].pattern.lastIndex;
            if (this._pos >= this._len) finish();
            return;
          } else
            throw new Error('action of pattern "' + this._rules[i].pattern.source + '" neither rejected nor accepted any token(s)');
        }
      }
    }
    throw this.error("token not recognized");
  }
  /* determine and return next token */
  token() {
    if (this._pending.length === 0) this._tokenize();
    if (this._pending.length > 0) {
      const token = this._pending.shift();
      if (this._transaction.length > 0) this._transaction[0].push(token);
      this._log(`TOKEN: ${token.toString()}`);
      return token;
    }
    return null;
  }
  /* determine and return all tokens */
  tokens() {
    const result = [];
    let token;
    while ((token = this.token()) !== null) result.push(token);
    return result;
  }
  /* peek at the next token or token at particular offset */
  peek(offset) {
    if (offset === void 0) offset = 0;
    if (typeof offset !== "number" || offset < 0) throw new Error('parameter "offset" not a positive Number');
    while (offset >= this._pending.length) {
      this._tokenize();
      if (this._pending.length === 0) break;
    }
    if (offset >= this._pending.length) throw new Error("not enough tokens available for peek operation");
    this._log(`PEEK: ${this._pending[offset].toString()}`);
    return this._pending[offset];
  }
  /* skip one or more tokens */
  skip(len) {
    if (len === void 0) len = 1;
    for (let i = 0; i < len; i++) this._tokenize();
    if (len > this._pending.length) throw new Error("not enough tokens available for skip operation");
    while (len-- > 0) this.token();
    return this;
  }
  /* consume the current token (by expecting it to be a particular symbol) */
  consume(type, value) {
    for (let i = 0; i < this._pending.length + 1; i++) this._tokenize();
    if (this._pending.length === 0) throw new Error("not enough tokens available for consume operation");
    const token = this.token();
    this._log(`CONSUME: ${token.toString()}`);
    const raiseError = () => {
      throw new ParsingError(
        `expected: <type: ${type}, value: ${JSON.stringify(value)} (${typeof value})>, found: <type: ${token.type}, value: ${JSON.stringify(token.value)} (${typeof token.value})>`,
        token.pos, token.line, token.column, this._input
      );
    };
    if (arguments.length === 2 && !token.isA(type, value)) raiseError();
    else if (!token.isA(type)) raiseError();
    return token;
  }
  /* open tokenization transaction */
  begin() {
    this._log(`BEGIN: level ${this._transaction.length}`);
    this._transaction.unshift([]);
    return this;
  }
  /* determine depth of still open tokenization transaction */
  depth() {
    if (this._transaction.length === 0) throw new Error("cannot determine depth -- no active transaction");
    return this._transaction[0].length;
  }
  /* close (successfully) tokenization transaction */
  commit() {
    if (this._transaction.length === 0) throw new Error("cannot commit transaction -- no active transaction");
    const committed = this._transaction.shift();
    if (this._transaction.length > 0) this._transaction[0] = this._transaction[0].concat(committed);
    this._log(`COMMIT: level ${this._transaction.length}`);
    return this;
  }
  /* close (unsuccessfully) tokenization transaction */
  rollback() {
    if (this._transaction.length === 0) throw new Error("cannot rollback transaction -- no active transaction");
    const rolledback = this._transaction.shift();
    this._pending = rolledback.concat(this._pending);
    this._log(`ROLLBACK: level ${this._transaction.length}`);
    return this;
  }
  /* execute multiple alternative callbacks */
  alternatives(...alternatives) {
    let result = null;
    let depths = [];
    for (let i = 0; i < alternatives.length; i++) {
      try {
        this.begin();
        result = alternatives[i].call(this);
        this.commit();
        break;
      } catch (ex) {
        if (ex instanceof Error) {
          this._log(`EXCEPTION: ${ex.message}`);
          depths.push({ ex, depth: this.depth() });
        } else {
          this._log("EXCEPTION: alternative failed");
          depths.push({ ex: new Error("alternative failed"), depth: this.depth() });
        }
        this.rollback();
        continue;
      }
    }
    if (result === null && depths.length > 0) {
      depths = depths.sort((a, b) => a.depth - b.depth);
      throw depths[0].ex;
    }
    return result;
  }
  static {
    this.Token = Token;
  }
  static {
    this.ParsingError = ParsingError;
  }
  static {
    this.ActionContext = ActionContext;
  }
}
export { Tokenizr as default };
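Usage sketch. The following is a minimal, illustrative example of driving the API defined above: rules are registered with rule(), a rule's action calls accept() or ignore() on the ActionContext, and tokens() drains the resulting token stream. The sample grammar, the rule patterns, and the import from the published "tokenizr" npm package are assumptions for illustration, not part of this file.

import Tokenizr from "tokenizr";

const lexer = new Tokenizr();

/* rules apply to the implicit "default" state when no state is given */
lexer.rule(/[a-zA-Z_][a-zA-Z0-9_]*/, (ctx, match) => {
  ctx.accept("id");                               /* token value defaults to the matched text */
});
lexer.rule(/[0-9]+/, (ctx, match) => {
  ctx.accept("number", parseInt(match[0], 10));   /* attach a converted value */
});
lexer.rule(/\/\/[^\r\n]*\r?\n/, (ctx, match) => {
  ctx.ignore();                                   /* consume input without emitting a token */
});
lexer.rule(/[ \t\r\n]+/, (ctx, match) => {
  ctx.ignore();
});
lexer.rule(/./, (ctx, match) => {
  ctx.accept("char");                             /* catch-all for single characters */
});

lexer.input("foo = 42 // answer\n");
lexer.tokens().forEach((token) => {
  console.log(token.toString());                  /* type, value, text, pos, line, column */
});

Each emitted Token carries its type, value, matched text and position, and the stream is terminated by an "EOF" token, so the loop above logs every token including the final EOF.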
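A second sketch, also illustrative, of the lookahead and transaction facilities: peek() inspects pending tokens without consuming them, consume() asserts the type of the next token, and begin()/commit()/rollback() make a consumed token sequence replayable, which is the same mechanism alternatives() uses internally. The grammar and the helper name parseList are made up for this example.

import Tokenizr from "tokenizr";

/* a tiny lexer for comma-separated identifiers (illustrative grammar) */
const lexer = new Tokenizr();
lexer.rule(/[a-zA-Z_][a-zA-Z0-9_]*/, (ctx) => { ctx.accept("id"); });
lexer.rule(/,/, (ctx) => { ctx.accept("comma"); });
lexer.rule(/[ \t\r\n]+/, (ctx) => { ctx.ignore(); });
lexer.input("foo, bar, quux");

/* speculative parsing: tokens consumed after begin() are replayed by rollback() */
const parseList = () => {
  const ids = [ lexer.consume("id").value ];
  while (lexer.peek(0).isA("comma")) {
    lexer.consume("comma");
    ids.push(lexer.consume("id").value);
  }
  return ids;
};

lexer.begin();
try {
  const ids = parseList();
  lexer.commit();                    /* keep the consumed tokens consumed */
  console.log(ids);                  /* [ "foo", "bar", "quux" ] */
} catch (ex) {
  lexer.rollback();                  /* push the consumed tokens back onto the stream */
  console.log("failed:", ex.toString());
}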