UNPKG

tokenizr

Version:

String Tokenization Library for JavaScript

810 lines (731 loc) 28.2 kB
/*
**  Tokenizr -- String Tokenization Library
**  Copyright (c) 2015-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
**  Licensed under MIT license <https://spdx.org/licenses/MIT>
*/

/*  utility function: create a source excerpt  */
interface ExcerptResult {
    prologTrunc: boolean
    prologText:  string
    tokenText:   string
    epilogText:  string
    epilogTrunc: boolean
}

/*  render the code of the first character as an upper-case hex string  */
const hex = (ch: string) => ch.charCodeAt(0).toString(16).toUpperCase()

/**
 * Cut out up to 20 characters before and after offset `o` of `txt` and
 * escape everything which is not plain printable text.
 *
 * @param txt - the complete input text
 * @param o   - the offset of the character to highlight
 * @returns the escaped text before, at and after the offset, plus flags
 *          telling whether the text was cut at the start and/or end
 */
const excerpt = (txt: string, o: number): ExcerptResult => {
    const l = txt.length
    let b = o - 20
    if (b < 0)
        b = 0
    let e = o + 20
    if (e > l)
        e = l

    /*  all patterns are global, so replace() already replaces every occurrence  */
    const extract = (txt: string, pos: number, len: number) =>
        txt.substring(pos, pos + len)
            .replace(/\\/g,   "\\\\")
            .replace(/\x08/g, "\\b")
            .replace(/\t/g,   "\\t")
            .replace(/\n/g,   "\\n")
            .replace(/\f/g,   "\\f")
            .replace(/\r/g,   "\\r")
            .replace(/[\x00-\x07\x0B\x0E\x0F]/g, (ch) => "\\x0" + hex(ch))
            .replace(/[\x10-\x1F\x80-\xFF]/g,    (ch) => "\\x"  + hex(ch))
            .replace(/[\u0100-\u0FFF]/g,         (ch) => "\\u0" + hex(ch))
            .replace(/[\u1000-\uFFFF]/g,         (ch) => "\\u"  + hex(ch))
    return {
        prologTrunc: b > 0,
        prologText:  extract(txt, b, o - b),
        tokenText:   extract(txt, o, 1),
        epilogText:  extract(txt, o + 1, e - (o + 1)),
        epilogTrunc: e < l
    }
}

/*  the base flag for all rule patterns: "y" (sticky) where the
    engine supports it, else "g" (ECMAScript <= 5)  */
const baseFlag = (() => {
    try {
        const regexp = new RegExp("", "y")
        if (typeof regexp.sticky === "boolean")
            return "y"
    }
    catch (ex) {
        /*  no-op  */
    }
    return "g"
})()

/*  helper class for token representation  */
class Token {
    public type:   string
    public value:  unknown
    public text:   string
    public pos:    number
    public line:   number
    public column: number

    /*  construct and initialize object  */
    constructor (type: string, value: unknown, text: string, pos = 0, line = 0, column = 0) {
        this.type   = type
        this.value  = value
        this.text   = text
        this.pos    = pos
        this.line   = line
        this.column = column
    }

    /**
     * Render a useful string representation.
     *
     * @param colorize - optional callback which receives the field name
     *                   and its rendered text and returns the text to emit
     */
    toString (colorize = (type: string, text: string) => text) {
        return `${colorize("type", this.type)} ` +
            `(value: ${colorize("value", JSON.stringify(this.value))}, ` +
            `text: ${colorize("text", JSON.stringify(this.text))}, ` +
            `pos: ${colorize("pos", this.pos.toString())}, ` +
            `line: ${colorize("line", this.line.toString())}, ` +
            `column: ${colorize("column", this.column.toString())})`
    }

    /**
     * Check whether this token has a particular type and, in case a
     * value other than `undefined` is given, also exactly this value.
     */
    isA (type: string, value?: unknown) {
        if (type !== this.type)
            return false
        if (value !== undefined && value !== this.value)
            return false
        return true
    }
}

/*  helper class for tokenization error reporting  */
class ParsingError extends Error {
    public name:    string
    public message: string
    public pos:     number
    public line:    number
    public column:  number
    public input:   string

    /*  construct and initialize object  */
    constructor (message: string, pos: number, line: number, column: number, input: string) {
        super(message)
        this.name    = "ParsingError"
        this.message = message
        this.pos     = pos
        this.line    = line
        this.column  = column
        this.input   = input
    }

    /*  render a useful string representation: the message, the input
        excerpt around the error position and a "^" marker below it  */
    toString () {
        const l = excerpt(this.input, this.pos)
        const prefix1 = `line ${this.line} (column ${this.column}): `
        const prefix2 = " ".repeat(prefix1.length + l.prologText.length)
        const msg = "Parsing Error: " + this.message + "\n" +
            prefix1 + l.prologText + l.tokenText + l.epilogText + "\n" +
            prefix2 + "^"
        return msg
    }
}

/*  helper class for action context  */
export interface TokenInfo {
    line:   number
    column: number
    pos:    number
    len:    number
}
class ActionContext {
    private _tokenizr: Tokenizr
    private _data:     { [ key: string ]: unknown }
    public  _repeat:   boolean
    public  _reject:   boolean
    public  _ignore:   boolean
    public  _match:    RegExpExecArray | null

    /*  construct and initialize the object  */
    constructor (tokenizr: Tokenizr) {
        this._tokenizr = tokenizr
        this._data     = {}
        this._repeat   = false
        this._reject   = false
        this._ignore   = false
        this._match    = null
    }

    /*  store and retrieve user data attached to context
        (always returns the value stored before the call)  */
    data (key: string, value?: unknown) {
        const valueOld = this._data[key]
        if (arguments.length === 2)
            this._data[key] = value
        return valueOld
    }

    /*  retrieve information of current matching  */
    info (): TokenInfo {
        return {
            line:   this._tokenizr._line,
            column: this._tokenizr._column,
            pos:    this._tokenizr._pos,
            len:    this._match?.[0]?.length ?? 0
        }
    }

    /*  pass-through functions to attached tokenizer  */
    push (state: string) {
        this._tokenizr.push(state)
        return this
    }
    pop () {
        return this._tokenizr.pop()
    }
    state (): string
    state (state: string): this
    state (state?: string): this | string {
        if (state !== undefined) {
            this._tokenizr.state(state)
            return this
        }
        return this._tokenizr.state()
    }
    tag (tag: string) {
        this._tokenizr.tag(tag)
        return this
    }
    tagged (tag: string) {
        return this._tokenizr.tagged(tag)
    }
    untag (tag: string) {
        this._tokenizr.untag(tag)
        return this
    }

    /*  mark current matching to be repeated from scratch  */
    repeat () {
        this._tokenizr._log("    REPEAT")
        this._repeat = true
        return this
    }

    /*  mark current matching to be rejected  */
    reject () {
        this._tokenizr._log("    REJECT")
        this._reject = true
        return this
    }

    /*  mark current matching to be ignored  */
    ignore () {
        this._tokenizr._log("    IGNORE")
        this._ignore = true
        return this
    }

    /**
     * Accept current matching as a new token.
     *
     * @param type  - the type of the new token
     * @param value - the value of the new token; when omitted (or
     *                `undefined`) the matched text is used. An explicit
     *                `null` is preserved as the token value.
     */
    accept (type: string, value?: unknown) {
        /*  only default on "undefined": "??" would also clobber an
            intentionally passed "null" value  */
        if (value === undefined)
            value = this._match?.[0]
        this._tokenizr._log(`    ACCEPT: type: ${type}, value: ` +
            `${JSON.stringify(value)} (${typeof value}), text: "${this._match?.[0] ?? ""}"`)
        this._tokenizr._pending.push(new Token(
            type, value, this._match?.[0] ?? "",
            this._tokenizr._pos, this._tokenizr._line, this._tokenizr._column
        ))
        return this
    }

    /*  immediately stop tokenization  */
    stop (): this {
        this._tokenizr._stopped = true
        return this
    }
}

/*  external API class  */
export interface RuleState {
    state: string
    tags:  string[]
}
export type RuleAction = (
    this:  ActionContext,
    ctx:   ActionContext,
    found: RegExpExecArray
) => void
export interface Rule {
    state:   RuleState[]
    pattern: RegExp
    action:  RuleAction
    name:    string
}
export type BeforeAfterAction = (
    this:  ActionContext,
    ctx:   ActionContext,
    match: RegExpExecArray,
    rule:  Rule
) => void
export type FinishAction = (
    this:  ActionContext,
    ctx:   ActionContext
) => void
export default class Tokenizr {
    private _before:      BeforeAfterAction | null
    private _after:       BeforeAfterAction | null
    private _finish:      FinishAction | null
    private _rules:       Rule[]
    private _debug:       boolean
    private _input:       string
    private _len:         number
    private _eof:         boolean
    public  _pos:         number
    public  _line:        number
    public  _column:      number
    private _state:       string[]
    private _tag:         { [key: string]: boolean }
    private _transaction: Token[][]
    public  _pending:     Token[]
    public  _stopped:     boolean
    private _ctx:         ActionContext

    /*  construct and initialize the object  */
    constructor () {
        this._before = null
        this._after  = null
        this._finish = null
        this._rules  = []
        this._debug  = false

        /*  inlined reset (kept inline so that all fields are
            definitely assigned inside the constructor)  */
        this._input       = ""
        this._len         = 0
        this._eof         = false
        this._pos         = 0
        this._line        = 1
        this._column      = 1
        this._state       = [ "default" ]
        this._tag         = {}
        this._transaction = []
        this._pending     = []
        this._stopped     = false
        this._ctx         = new ActionContext(this)
    }

    /*  reset the internal state (rules, callbacks and the
        debug setting are intentionally kept)  */
    reset () {
        this._input       = ""
        this._len         = 0
        this._eof         = false
        this._pos         = 0
        this._line        = 1
        this._column      = 1
        this._state       = [ "default" ]
        this._tag         = {}
        this._transaction = []
        this._pending     = []
        this._stopped     = false
        this._ctx         = new ActionContext(this)
        return this
    }

    /*  create an error message for the current position  */
    error (message: string) {
        return new ParsingError(message, this._pos, this._line, this._column, this._input)
    }

    /*  configure debug operation  */
    debug (debug: boolean) {
        this._debug = debug
        return this
    }

    /*  output a debug message  */
    _log (msg: string) {
        /* eslint no-console: off */
        /* oxlint-disable no-console */
        if (this._debug)
            console.log(`tokenizr: ${msg}`)
    }

    /*  provide (new) input string to tokenize  */
    input (input: string) {
        /*  sanity check arguments  */
        if (typeof input !== "string")
            throw new Error("parameter \"input\" not a String")

        /*  reset state and store new input  */
        this.reset()
        this._input = input
        this._len   = input.length
        return this
    }

    /*  push state  */
    push (state: string) {
        /*  sanity check arguments  */
        if (arguments.length !== 1)
            throw new Error("invalid number of arguments")
        if (typeof state !== "string")
            throw new Error("parameter \"state\" not a String")

        /*  push new state  */
        this._log("    STATE (PUSH): " +
            `old: <${this._state[this._state.length - 1]}>, ` +
            `new: <${state}>`)
        this._state.push(state)
        return this
    }

    /*  pop state (the initial state can never be popped)  */
    pop () {
        /*  sanity check arguments  */
        if (arguments.length !== 0)
            throw new Error("invalid number of arguments")
        if (this._state.length < 2)
            throw new Error("no more custom states to pop")

        /*  pop old state  */
        this._log("    STATE (POP): " +
            `old: <${this._state[this._state.length - 1]}>, ` +
            `new: <${this._state[this._state.length - 2]}>`)
        return this._state.pop()!
    }

    /*  get/set state  */
    state (): string
    state (state: string): this
    state (state?: string): this | string {
        if (arguments.length === 1) {
            /*  sanity check arguments  */
            if (typeof state !== "string")
                throw new Error("parameter \"state\" not a String")

            /*  change current state  */
            this._log("    STATE (SET): " +
                `old: <${this._state[this._state.length - 1]}>, ` +
                `new: <${state}>`)
            this._state[this._state.length - 1] = state
            return this
        }
        else if (arguments.length === 0)
            return this._state[this._state.length - 1]
        throw new Error("invalid number of arguments")
    }

    /*  set a tag  */
    tag (tag: string) {
        /*  sanity check arguments  */
        if (arguments.length !== 1)
            throw new Error("invalid number of arguments")
        if (typeof tag !== "string")
            throw new Error("parameter \"tag\" not a String")

        /*  set tag  */
        this._log(`    TAG (ADD): ${tag}`)
        this._tag[tag] = true
        return this
    }

    /*  check whether tag is set  */
    tagged (tag: string) {
        /*  sanity check arguments  */
        if (arguments.length !== 1)
            throw new Error("invalid number of arguments")
        if (typeof tag !== "string")
            throw new Error("parameter \"tag\" not a String")

        /*  check tag  */
        return (this._tag[tag] === true)
    }

    /*  unset a tag  */
    untag (tag: string) {
        /*  sanity check arguments  */
        if (arguments.length !== 1)
            throw new Error("invalid number of arguments")
        if (typeof tag !== "string")
            throw new Error("parameter \"tag\" not a String")

        /*  delete tag  */
        this._log(`    TAG (DEL): ${tag}`)
        delete this._tag[tag]
        return this
    }

    /*  configure a tokenization before-rule callback  */
    before (action: BeforeAfterAction) {
        this._before = action
        return this
    }

    /*  configure a tokenization after-rule callback  */
    after (action: BeforeAfterAction) {
        this._after = action
        return this
    }

    /*  configure a tokenization finish callback  */
    finish (action: FinishAction) {
        this._finish = action
        return this
    }

    /**
     * Configure a tokenization rule.
     *
     * @param state   - optional comma-separated list of entries of the
     *                  form `state #tag1 #tag2 ...`; defaults to `*`
     * @param pattern - the regular expression to match at the current position
     * @param action  - the callback to execute on a match
     * @param name    - optional name of the rule (default: `unknown`)
     * @throws Error on invalid arguments or if an entry does not
     *         contain exactly one state
     */
    rule (state: string, pattern: RegExp, action: RuleAction, name?: string): this
    rule (pattern: RegExp, action: RuleAction, name?: string): this
    rule (state: string | RegExp, pattern?: RegExp | RuleAction, action?: RuleAction | string, name = "unknown"): this {
        /*  support optional states  */
        if (arguments.length === 2 && typeof pattern === "function") {
            [ pattern, action ] = [ state as RegExp, pattern as RuleAction ]
            state = "*"
        }
        else if (arguments.length === 3 && typeof pattern === "function") {
            [ pattern, action, name ] = [ state as RegExp, pattern as RuleAction, action as string ]
            state = "*"
        }

        /*  sanity check arguments  */
        if (typeof state !== "string")
            throw new Error("parameter \"state\" not a String")
        if (!(typeof pattern === "object" && pattern instanceof RegExp))
            throw new Error("parameter \"pattern\" not a RegExp")
        if (typeof action !== "function")
            throw new Error("parameter \"action\" not a Function")
        if (typeof name !== "string")
            throw new Error("parameter \"name\" not a String")

        /*  post-process state (entries are trimmed, as leading or trailing
            whitespace would else be counted as an additional empty state)  */
        const parsedState = state.split(/\s*,\s*/g).map((entry: string) => {
            const items  = entry.trim().split(/\s+/g)
            const states = items.filter((item: string) => item.match(/^#/) === null)
            const tags   = items.filter((item: string) => item.match(/^#/) !== null)
                .map((tag: string) => tag.replace(/^#/, ""))
            if (states.length !== 1)
                throw new Error("exactly one state required")
            return { state: states[0], tags }
        })

        /*  post-process pattern: enforce anchoring at "lastIndex"
            and take over the relevant flags of the original pattern  */
        let flags = baseFlag
        if (typeof pattern.multiline === "boolean" && pattern.multiline)
            flags += "m"
        if (typeof pattern.dotAll === "boolean" && pattern.dotAll)
            flags += "s"
        if (typeof pattern.ignoreCase === "boolean" && pattern.ignoreCase)
            flags += "i"
        if (typeof pattern.unicode === "boolean" && pattern.unicode)
            flags += "u"
        const processedPattern = new RegExp(pattern.source, flags)

        /*  store rule  */
        this._log(`rule: configure rule (state: ${state}, pattern: ${processedPattern.source})`)
        this._rules.push({ state: parsedState, pattern: processedPattern, action, name })

        return this
    }

    /*  progress the line/column counter  */
    _progress (from: number, until: number) {
        const line   = this._line
        const column = this._column
        const s = this._input
        for (let i = from; i < until; i++) {
            const c = s.charAt(i)
            if (c === "\r")
                this._column = 1
            else if (c === "\n") {
                this._line++
                this._column = 1
            }
            else if (c === "\t")
                /*  NOTE(review): with the 1-based column this yields the
                    tab stops 8, 16, 24, ... -- kept as is, confirm intent  */
                this._column += 8 - (this._column % 8)
            else
                this._column++
        }
        this._log(`    PROGRESS: characters: ${until - from}, ` +
            `from: <line ${line}, column ${column}>, ` +
            `to: <line ${this._line}, column ${this._column}>`)
    }

    /**
     * Determine and provide the next token(s).
     *
     * Appends the token(s) accepted by the first matching rule to the
     * pending queue. Once the input is exhausted (or tokenization was
     * stopped) the finish callback is executed and a single `EOF` token
     * is appended; every subsequent call is then a no-op.
     *
     * @throws ParsingError if no rule matches at the current position
     * @throws Error if a rule action neither accepts, rejects, ignores
     *         nor repeats
     */
    _tokenize () {
        /*  helper function for finishing parsing  */
        const finish = () => {
            if (!this._eof) {
                if (this._finish !== null)
                    this._finish.call(this._ctx, this._ctx)
                this._eof = true
                this._pending.push(new Token("EOF", "", "", this._pos, this._line, this._column))
            }
        }

        /*  tokenize only as long as we were not stopped and there is input left  */
        if (this._stopped || this._pos >= this._len) {
            finish()
            return
        }

        /*  loop...  */
        let continued = true
        while (continued) {
            continued = false

            /*  some optional debugging context  */
            if (this._debug) {
                const e = excerpt(this._input, this._pos)
                const tags = Object.keys(this._tag).map((tag: string) => `#${tag}`).join(" ")
                this._log(`INPUT: state: <${this._state[this._state.length - 1]}>, tags: <${tags}>, text: ` +
                    (e.prologTrunc ? "..." : "\"") + `${e.prologText}<${e.tokenText}>${e.epilogText}` +
                    (e.epilogTrunc ? "..." : "\"") + `, at: <line ${this._line}, column ${this._column}>`)
            }

            /*  iterate over all rules...  */
            for (const rule of this._rules) {
                if (this._debug) {
                    const state = rule.state.map((item: RuleState) => {
                        let output = item.state
                        if (item.tags.length > 0)
                            output += " " + item.tags.map((tag: string) => `#${tag}`).join(" ")
                        return output
                    }).join(", ")
                    this._log(`  RULE: state(s): <${state}>, ` +
                        `pattern: ${rule.pattern.source}`)
                }

                /*  one of rule's states (and all of its tags) has to match
                    (every entry is considered, so a state may be listed more
                    than once with different tag sets; the current state is
                    determined per rule, as a rejecting action could have changed it)  */
                const current = this._state[this._state.length - 1]
                const matches = rule.state.some((item: RuleState) =>
                    (item.state === "*" || item.state === current) &&
                    item.tags.every((tag: string) => this._tag[tag] === true))
                if (!matches)
                    continue

                /*  match pattern at the last position  */
                rule.pattern.lastIndex = this._pos
                const found = rule.pattern.exec(this._input)
                if (found !== null && found.index === this._pos) {
                    if (this._debug)
                        this._log("    MATCHED: " + JSON.stringify(found))

                    /*  pattern found, so give action a chance to operate
                        on it and act according to its results  */
                    this._ctx._match  = found
                    this._ctx._repeat = false
                    this._ctx._reject = false
                    this._ctx._ignore = false
                    if (this._before !== null)
                        this._before.call(this._ctx, this._ctx, found, rule)
                    rule.action.call(this._ctx, this._ctx, found)
                    if (this._after !== null)
                        this._after.call(this._ctx, this._ctx, found, rule)
                    if (this._ctx._reject)
                        /*  reject current action, continue matching  */
                        continue
                    else if (this._ctx._repeat) {
                        /*  repeat matching from scratch  */
                        continued = true
                        break
                    }
                    else if (this._ctx._ignore) {
                        /*  ignore token  */
                        this._progress(this._pos, rule.pattern.lastIndex)
                        this._pos = rule.pattern.lastIndex
                        if (this._pos >= this._len) {
                            finish()
                            return
                        }
                        continued = true
                        break
                    }
                    else if (this._pending.length > 0) {
                        /*  accept token(s)  */
                        this._progress(this._pos, rule.pattern.lastIndex)
                        this._pos = rule.pattern.lastIndex
                        if (this._pos >= this._len)
                            finish()
                        return
                    }
                    else
                        throw new Error("action of pattern \"" +
                            rule.pattern.source + "\" neither rejected nor accepted any token(s)")
                }
            }
        }

        /*  no pattern matched at all  */
        throw this.error("token not recognized")
    }

    /*  determine and return next token (or null once even
        the EOF token was already delivered)  */
    token (): Token | null {
        /*  if no more tokens are pending, try to determine a new one  */
        if (this._pending.length === 0)
            this._tokenize()

        /*  return now potentially pending token  */
        if (this._pending.length > 0) {
            const token = this._pending.shift()!
            if (this._transaction.length > 0)
                this._transaction[0].push(token)
            this._log(`TOKEN: ${token.toString()}`)
            return token
        }

        /*  no more tokens  */
        return null
    }

    /*  determine and return all tokens (including the final EOF token)  */
    tokens () {
        const result: Token[] = []
        let token: Token | null
        while ((token = this.token()) !== null)
            result.push(token)
        return result
    }

    /**
     * Peek at the next token or the token at a particular offset.
     *
     * @param offset - zero-based offset into the upcoming tokens (default: 0)
     * @throws Error if `offset` is invalid or lies beyond the last
     *         available token (the EOF token)
     */
    peek (offset?: number) {
        if (offset === undefined)
            offset = 0
        if (typeof offset !== "number" || offset < 0)
            throw new Error("parameter \"offset\" not a positive Number")

        /*  if not enough tokens are pending, try to determine new ones
            (stop as soon as the tokenizer makes no more progress, as
            after EOF it never provides an additional token again)  */
        while (offset >= this._pending.length) {
            const before = this._pending.length
            this._tokenize()
            if (this._pending.length === before)
                break
        }
        if (offset >= this._pending.length)
            throw new Error("not enough tokens available for peek operation")
        this._log(`PEEK: ${this._pending[offset].toString()}`)
        return this._pending[offset]
    }

    /**
     * Skip one or more tokens.
     *
     * @param len - number of tokens to skip (default: 1)
     * @throws Error if less than `len` tokens are available
     */
    skip (len?: number) {
        if (len === undefined)
            len = 1

        /*  tokenize only as far as required  */
        while (this._pending.length < len) {
            const before = this._pending.length
            this._tokenize()
            if (this._pending.length === before)
                break
        }
        if (len > this._pending.length)
            throw new Error("not enough tokens available for skip operation")
        while (len-- > 0)
            this.token()
        return this
    }

    /**
     * Consume the current token (by expecting it to be a particular symbol).
     *
     * @param type  - the expected token type
     * @param value - the expected token value (only checked if passed)
     * @returns the consumed token
     * @throws ParsingError if the token does not match the expectation
     * @throws Error if no more token is available
     */
    consume (type: string, value?: unknown) {
        /*  tokenize lazily: just ensure a single token is pending
            instead of tokenizing the entire remaining input upfront  */
        if (this._pending.length === 0)
            this._tokenize()
        if (this._pending.length === 0)
            throw new Error("not enough tokens available for consume operation")
        const token = this.token()!
        this._log(`CONSUME: ${token.toString()}`)
        const matched = (arguments.length === 2 ? token.isA(type, value) : token.isA(type))
        if (!matched)
            throw new ParsingError(
                `expected: <type: ${type}, value: ${JSON.stringify(value)} (${typeof value})>, ` +
                `found: <type: ${token.type}, value: ${JSON.stringify(token.value)} (${typeof token.value})>`,
                token.pos, token.line, token.column, this._input
            )
        return token
    }

    /*  open tokenization transaction  */
    begin () {
        this._log(`BEGIN: level ${this._transaction.length}`)
        this._transaction.unshift([])
        return this
    }

    /*  determine depth of still open tokenization transaction
        (the number of tokens delivered since its begin)  */
    depth () {
        if (this._transaction.length === 0)
            throw new Error("cannot determine depth -- no active transaction")
        return this._transaction[0].length
    }

    /*  close (successfully) tokenization transaction  */
    commit () {
        if (this._transaction.length === 0)
            throw new Error("cannot commit transaction -- no active transaction")

        /*  remove current transaction  */
        const committed = this._transaction.shift()!

        /*  in case we were a nested transaction, still remember the tokens  */
        if (this._transaction.length > 0)
            this._transaction[0] = this._transaction[0].concat(committed)

        this._log(`COMMIT: level ${this._transaction.length}`)
        return this
    }

    /*  close (unsuccessfully) tokenization transaction  */
    rollback () {
        if (this._transaction.length === 0)
            throw new Error("cannot rollback transaction -- no active transaction")

        /*  remove current transaction  */
        const rolledback = this._transaction.shift()!

        /*  make the tokens available again, as new pending tokens  */
        this._pending = rolledback.concat(this._pending)

        this._log(`ROLLBACK: level ${this._transaction.length}`)
        return this
    }

    /**
     * Execute multiple alternative callbacks.
     *
     * Each callback runs inside its own transaction. The result of the
     * first one which does not throw is returned (whatever its value is,
     * including `null`); the transactions of failed ones are rolled back.
     *
     * @param alternatives - the callbacks to try in sequence
     * @returns the result of the first succeeding callback
     *          (or `null` if no callbacks were given)
     * @throws the recorded exception with the smallest transaction depth,
     *         if all callbacks failed
     */
    alternatives (...alternatives: ((this: Tokenizr) => unknown)[]) {
        let result: unknown = null
        let succeeded = false
        const depths: { ex: Error, depth: number }[] = []
        for (const alternative of alternatives) {
            try {
                this.begin()
                result = alternative.call(this)
                this.commit()

                /*  track success explicitly, as "null" is a valid result  */
                succeeded = true
                break
            }
            catch (ex) {
                if (ex instanceof Error) {
                    this._log(`EXCEPTION: ${ex.message}`)
                    depths.push({ ex, depth: this.depth() })
                }
                else {
                    this._log("EXCEPTION: alternative failed")
                    depths.push({ ex: new Error("alternative failed"), depth: this.depth() })
                }
                this.rollback()
            }
        }
        if (!succeeded && depths.length > 0) {
            /*  NOTE(review): the ascending order selects the failure with the
                smallest depth -- kept as is, confirm this is the intended one  */
            depths.sort((a, b) => a.depth - b.depth)
            throw depths[0].ex
        }
        return result
    }

    /*  expose the utility classes, too  */
    static readonly Token         = Token
    static readonly ParsingError  = ParsingError
    static readonly ActionContext = ActionContext
}