// apg-unicode
// Version:
// JavaScript APG parser of Unicode code point arrays
// 697 lines (677 loc) • 25.1 kB
// JavaScript
// # Overview of the Parser Class
// The `Parser` class serves as the core interface to the `apg-unicode` parsing engine.
// It encapsulates the primary `parse` function and exposes modular facilities for
// enhanced parsing diagnostics and customization.
// ## Design Philosophy
// - **Modular Diagnostics**: AST, Trace, and Stats are opt-in, allowing lightweight parsing when needed and deep introspection when desired.
// - **Callback-Driven Customization**: Callback functions can be attached to Rule and UDT names for input string translation during the pass through the parse tree.
// - **Semantic Clarity**: Each method is purpose-built to expose a distinct parsing concern, following the single-responsibility principle.
import { identifiers as id } from '../src/identifiers.js';
import { utilities as utils } from '../src/utilities.js';
export { Parser };
class Parser {
  /* Prefix prepended to every error message thrown by this class. */
  #FILENAME = 'parser.js: ';
  /* Identifier of the input array type (one of id.UINT8, id.UINT16, id.UINT32, id.ARRAY). */
  #charType;
  /* Optional diagnostic objects attached with setAst(), setStats() and setTrace(). */
  #ast;
  #stats;
  #trace;
  /* Nesting depth of look-ahead (AND/NOT) operators currently being executed. */
  #lookAhead;
  /* Parse statistics reported in the object returned by parse(). */
  #treeDepth;
  #maxTreeDepth;
  #nodeHits;
  #maxMatched;
  /* Grammar data, taken from the grammar object given to the constructor. */
  #rules;
  #udts;
  /* Opcodes of the rule currently being executed. */
  #opcodes;
  /* The input characters and the [charStart, charEnd) interval being parsed. */
  #chars;
  #charStart;
  #charEnd;
  /* Callback functions, indexed by rule index and UDT index respectively. */
  #ruleCallbacks;
  #udtCallbacks;
  /* The application's callback data, passed through to every callback untouched. */
  #userData;
  /* The `state` and `phraseLength` of the most recently executed node. */
  #sysData = {};

  // The constructor takes a single argument, an SABNF grammar object.
  // Only its `rules` and `udts` arrays are read here.
  // See any of the examples in the `examples` directory for usage.
  constructor(grammar) {
    this.#rules = grammar.rules;
    this.#udts = grammar.udts;
    this.#clearCallbacks();
  }

  // Resets both callback tables to "no callback" for every rule and UDT.
  // Both tables are always arrays, even when the grammar has no UDTs,
  // so that the constructor and `setCallback(undefined)` leave the same state.
  #clearCallbacks() {
    this.#ruleCallbacks = Array(this.#rules.length).fill(undefined);
    this.#udtCallbacks = Array(this.#udts.length).fill(undefined);
  }

  // Attach a Trace object to the parser.
  // The parser will interact with the Trace object
  // generating a trace of the parser through all of the nodes
  // of the parse tree. See `examples/trace` for usage.
  // Any falsy argument detaches the current Trace object.
  setTrace(trace) {
    this.#trace = trace || undefined;
  }

  // Attach a Stats object to the parser.
  // The parser will interact with the Stats object
  // generating a collection of node hits by the parser.
  // See `examples/stats` for usage.
  // Any falsy argument detaches the current Stats object.
  setStats(stats) {
    this.#stats = stats || undefined;
  }

  // Attach an Ast object to the parser.
  // The parser will interact with the Ast object
  // generating an AST.
  // See `examples/ast` for usage.
  // Any falsy argument detaches the current Ast object.
  setAst(ast) {
    if (!ast) {
      this.#ast = undefined;
      return;
    }
    this.#ast = ast;
    /* the Ast object is given the grammar's rules and UDTs as soon as it is attached */
    ast.initGrammar(this.#rules, this.#udts);
  }

  // Attach a callback function to a rule or UDT named in the SABNF grammar.
  // * @param {string | undefined} name - a valid rule or UDT name (case-insensitive).
  // If `undefined` all callback functions will be cleared.
  // * @param {function} callback - a user-written function for processing the node phrases
  // * @throws {Error} if the argument types are wrong or `name` is neither a rule name nor a UDT name.
  setCallback(name, callback) {
    if (name === undefined) {
      this.#clearCallbacks();
      return;
    }
    if (typeof name !== 'string' || typeof callback !== 'function') {
      throw new Error(`${this.#FILENAME}setCallback() argument types not "string", "function"`);
    }
    const lower = name.toLowerCase();
    /* rule names are searched first, then UDT names */
    for (const rule of this.#rules) {
      if (lower === rule.lower) {
        this.#ruleCallbacks[rule.index] = callback;
        return;
      }
    }
    for (const udt of this.#udts) {
      if (lower === udt.lower) {
        this.#udtCallbacks[udt.index] = callback;
        return;
      }
    }
    throw new Error(`${this.#FILENAME}setCallback name not a rule name or UDT name: ${name}`);
  }

  // Parse an input string against the configured SABNF grammar.
  // * @param {string} startName - Name of the start rule (case-insensitive).
  // * @param {*} input - The input string. May be of type:
  //   * string (converted internally with `utils.stringToCodePoints()`)
  //   * Array
  //   * Buffer
  //   * Uint8Array
  //   * Uint16Array
  //   * Uint32Array
  // * @param {*} callbackData - User callback data. Not used by the parser but
  // available to callback functions for the application's use.
  // * @param {number} startChar - Index of the first character (inclusive) of the substring to parse. (default = 0)
  // * @param {number} endChar - Index of the last character (exclusive) of the substring to parse. (default = input.length)
  // * **Note:** startChar and endChar follow the same rules as for
  // [String.prototype.slice()](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/slice).
  // * @returns - An object with the parser results:
  //   * success: true if the state is `MATCH` or `EMPTY` *and* the matched characters is the string or substring length
  //   * state: the final state identifier
  //   * stateName: the final state name, `MATCH`, `EMPTY`, or `NOMATCH`
  //   * length: the length of the substring being parsed, charEnd - charStart
  //   * matched: the number of characters matched
  //   * maxMatched: the maximum number of characters matched
  //   * maxTreeDepth: the maximum tree depth reached
  //   * nodeHits: the number of nodes visited
  // * @throws {Error} if a UDT has no callback, the input type is not supported,
  // or the start name is not a rule name.
  // * @throws {TypeError} if `startName` is not a string.
  parse(startName, input, callbackData, startChar, endChar) {
    const FUNCTION_NAME = `${this.#FILENAME}parse(): `;
    /* every UDT in the grammar must have a callback before parsing can begin */
    for (const udt of this.#udts) {
      if (!this.#udtCallbacks[udt.index]) {
        throw new Error(`${this.#FILENAME}UDT callback not defined: ${udt.name}`);
      }
    }
    this.#reset();
    /* validate the input type; a Buffer is accepted by the Uint8Array test */
    this.#chars = input;
    if (typeof input === 'string') {
      this.#charType = id.UINT32;
      this.#chars = utils.stringToCodePoints(input);
    } else if (input instanceof Uint8Array) {
      this.#charType = id.UINT8;
    } else if (input instanceof Uint16Array) {
      this.#charType = id.UINT16;
    } else if (input instanceof Uint32Array) {
      this.#charType = id.UINT32;
    } else if (Array.isArray(input)) {
      this.#charType = id.ARRAY;
    } else {
      throw new Error(
        `${FUNCTION_NAME}invalid input - must be a string, Array, Buffer, Uint8Array, Uint16Array or Uint32Array`
      );
    }
    /* get the sub-string, if any */
    const interval = utils.sliceInterval(this.#chars.length, startChar, endChar);
    this.#charStart = interval.indexStart;
    this.#charEnd = interval.indexEnd;
    /* find the start rule index (case-insensitive) */
    if (typeof startName !== 'string') {
      throw new TypeError(`${FUNCTION_NAME}start name must be a string`);
    }
    const lower = startName.toLowerCase();
    let startIndex;
    for (const rule of this.#rules) {
      if (lower === rule.lower) {
        startIndex = rule.index;
        break;
      }
    }
    if (startIndex === undefined) {
      throw new Error(`${FUNCTION_NAME}start name not a valid grammar rule name: '${startName}'`);
    }
    /* initialize AST/stats/trace if provided */
    if (this.#trace) {
      this.#trace.init(this.#rules, this.#udts, this.#chars, this.#charEnd, this.#charType);
    }
    if (this.#stats) {
      this.#stats.init(this.#rules, this.#udts);
    }
    if (this.#ast) {
      this.#ast.initChars(this.#chars);
    }
    /* create a dummy opcode for the root (start rule) node and begin parsing */
    this.#opcodes = [{ type: id.RNM, index: startIndex }];
    this.#userData = callbackData;
    this.#opExecute(0, this.#charStart);
    this.#opcodes = undefined;
    /* determine the final result from sysData */
    const { state, phraseLength } = this.#sysData;
    const length = this.#charEnd - this.#charStart;
    let success;
    switch (state) {
      case id.ACTIVE:
        throw new Error(`${FUNCTION_NAME}final state should never be 'ACTIVE'`);
      case id.NOMATCH:
        success = false;
        break;
      case id.EMPTY:
      case id.MATCH:
        /* success requires that the entire (sub)string was matched */
        success = phraseLength === length;
        break;
      default:
        /* should never get here, but just in case */
        throw new Error(`${FUNCTION_NAME}unrecognized final state: ${state}`);
    }
    return {
      success,
      state,
      stateName: id.idName(state),
      length,
      matched: phraseLength,
      maxMatched: this.#maxMatched - this.#charStart,
      maxTreeDepth: this.#maxTreeDepth,
      nodeHits: this.#nodeHits,
    };
  }

  // Resets the internal state variables. Called by the parser before each parse.
  #reset() {
    this.#lookAhead = 0;
    this.#treeDepth = 0;
    this.#maxTreeDepth = 0;
    this.#nodeHits = 0;
    this.#maxMatched = 0;
    this.#opcodes = undefined;
    this.#chars = undefined;
    this.#charStart = 0;
    this.#charEnd = 0;
    this.#sysData.state = id.ACTIVE;
    this.#sysData.phraseLength = 0;
    this.#userData = undefined;
  }

  // For each node visit in the parse tree, executes the appropriate operation.
  // Having a single point of execution for each node hit facilitates
  // `trace` and `stats` collection as well as other parsing statistics
  // reported in the return object.
  // * @param {number} opIndex - index of the opcode to execute in the current opcode array
  // * @param {number} phraseIndex - index of the first input character of the phrase to match
  // * @throws {Error} if the opcode type is not recognized.
  #opExecute(opIndex, phraseIndex) {
    const FUNCTION_NAME = `${this.#FILENAME}#opExecute(): `;
    const op = this.#opcodes[opIndex];
    this.#nodeHits++;
    /* the depth is recorded before it is incremented, so the root node is at depth 0 */
    if (this.#treeDepth > this.#maxTreeDepth) {
      this.#maxTreeDepth = this.#treeDepth;
    }
    this.#treeDepth++;
    /* every node starts ACTIVE with an empty phrase */
    this.#sysData.state = id.ACTIVE;
    this.#sysData.phraseLength = 0;
    if (this.#trace) {
      this.#trace.down(op, phraseIndex);
    }
    switch (op.type) {
      case id.ALT:
        this.#opALT(opIndex, phraseIndex);
        break;
      case id.CAT:
        this.#opCAT(opIndex, phraseIndex);
        break;
      case id.REP:
        this.#opREP(opIndex, phraseIndex);
        break;
      case id.RNM:
        this.#opRNM(opIndex, phraseIndex);
        break;
      case id.TRG:
        this.#opTRG(opIndex, phraseIndex);
        break;
      case id.TBS:
        this.#opTBS(opIndex, phraseIndex);
        break;
      case id.TLS:
        this.#opTLS(opIndex, phraseIndex);
        break;
      case id.UDT:
        this.#opUDT(opIndex, phraseIndex);
        break;
      case id.AND:
        this.#opAND(opIndex, phraseIndex);
        break;
      case id.NOT:
        this.#opNOT(opIndex, phraseIndex);
        break;
      default:
        throw new Error(`${FUNCTION_NAME}unrecognized operator`);
    }
    /* phrases matched inside a look-ahead do not count toward the maximum match */
    if (!this.#lookAhead) {
      const phraseEnd = phraseIndex + this.#sysData.phraseLength;
      if (phraseEnd > this.#maxMatched) {
        this.#maxMatched = phraseEnd;
      }
    }
    if (this.#stats) {
      this.#stats.collect(op, this.#sysData);
    }
    if (this.#trace) {
      this.#trace.up(op, this.#sysData.state, phraseIndex, this.#sysData.phraseLength);
    }
    this.#treeDepth--;
  }

  // The alternation `ALT` operator.<br>
  // Executes its child nodes, from left to right, until it finds a match.
  // Fails if *all* of its child nodes fail.
  // The result is left in `sysData` by the last child executed.
  #opALT(opIndex, phraseIndex) {
    const op = this.#opcodes[opIndex];
    for (const childIndex of op.children) {
      this.#opExecute(childIndex, phraseIndex);
      if (this.#sysData.state !== id.NOMATCH) {
        break;
      }
    }
  }

  // The concatenation `CAT` operator.<br>
  // Executes all of its child nodes, from left to right,
  // concatenating the matched phrases.
  // Fails if *any* child nodes fail.
  #opCAT(opIndex, phraseIndex) {
    const op = this.#opcodes[opIndex];
    /* remember the AST length so that a failed concatenation can discard its partial records */
    const astLength = this.#ast ? this.#ast.getLength() : undefined;
    let catPhrase = 0;
    let success = true;
    for (const childIndex of op.children) {
      this.#opExecute(childIndex, phraseIndex + catPhrase);
      if (this.#sysData.state === id.NOMATCH) {
        success = false;
        break;
      }
      catPhrase += this.#sysData.phraseLength;
    }
    if (success) {
      this.#sysData.state = catPhrase === 0 ? id.EMPTY : id.MATCH;
      this.#sysData.phraseLength = catPhrase;
      return;
    }
    this.#sysData.state = id.NOMATCH;
    this.#sysData.phraseLength = 0;
    if (this.#ast) {
      this.#ast.setLength(astLength);
    }
  }

  // The repetition `REP` operator.<br>
  // Repeatedly executes its single child node,
  // concatenating each of the matched phrases found.
  // The number of repetitions executed and its final `sysData` depends
  // on its `min` & `max` repetition values.
  // Zero repetitions (`0*0RuleName` or `0RuleName`) will represent an empty string
  // but is deprecated.
  #opREP(opIndex, phraseIndex) {
    const op = this.#opcodes[opIndex];
    if (op.max === 0) {
      /* this is an empty-string acceptor
       * deprecated: use the TLS empty string operator, "", instead
       */
      this.#sysData.state = id.EMPTY;
      this.#sysData.phraseLength = 0;
      return;
    }
    /* remember the AST length so that a failed repetition can discard its partial records */
    const astLength = this.#ast ? this.#ast.getLength() : undefined;
    let repCharIndex = phraseIndex;
    let repPhrase = 0;
    let repCount = 0;
    /* the loop also exits on end of input string */
    while (repCharIndex < this.#charEnd) {
      this.#opExecute(opIndex + 1, repCharIndex);
      if (this.#sysData.state === id.NOMATCH) {
        /* always end if the child node fails */
        break;
      }
      if (this.#sysData.state === id.EMPTY) {
        /* REP always succeeds when the child node returns an empty phrase */
        /* this may not seem obvious, but that's the way it works out */
        break;
      }
      repCount += 1;
      repPhrase += this.#sysData.phraseLength;
      repCharIndex += this.#sysData.phraseLength;
      if (repCount === op.max) {
        /* end on maxed out reps */
        break;
      }
    }
    /* evaluate the match count according to the min, max values */
    if (this.#sysData.state === id.EMPTY || repCount >= op.min) {
      this.#sysData.state = repPhrase === 0 ? id.EMPTY : id.MATCH;
      this.#sysData.phraseLength = repPhrase;
      return;
    }
    this.#sysData.state = id.NOMATCH;
    this.#sysData.phraseLength = 0;
    if (this.#ast) {
      this.#ast.setLength(astLength);
    }
  }

  // Validate the rule callback function's returned `sysData` values.
  // It's the application's responsibility to get them right
  // but `RNM` fails if not.
  // * @param {object} rule - the rule whose callback was called
  // * @param {object} sysData - the `sysData` object as left by the callback; normalized in place
  // * @param {number} charsLeft - number of input characters from the phrase index to the end of the (sub)string
  // * @param {boolean} down - true when validating the call made going down the tree,
  // the only direction in which the ACTIVE state is accepted
  // * @throws {Error} on a phrase length that is too long or a state that is not allowed.
  #validateRnmCallbackResult(rule, sysData, charsLeft, down) {
    const FUNCTION_NAME = `${this.#FILENAME}opRNM`;
    if (sysData.phraseLength > charsLeft) {
      let str = `${FUNCTION_NAME}(${rule.name}): callback function error: `;
      str += `sysData.phraseLength: ${sysData.phraseLength}`;
      str += ` must be <= remaining chars: ${charsLeft}`;
      throw new Error(str);
    }
    switch (sysData.state) {
      case id.ACTIVE:
        if (!down) {
          throw new Error(
            `${FUNCTION_NAME}(${rule.name}): callback function return error. ACTIVE state not allowed.`
          );
        }
        break;
      case id.EMPTY:
        sysData.phraseLength = 0;
        break;
      case id.MATCH:
        /* a zero-length MATCH is reported as EMPTY */
        if (sysData.phraseLength === 0) {
          sysData.state = id.EMPTY;
        }
        break;
      case id.NOMATCH:
        sysData.phraseLength = 0;
        break;
      default:
        throw new Error(
          `${FUNCTION_NAME}(${rule.name}): callback function return error. Unrecognized return state: ${sysData.state}`
        );
    }
  }

  // Executes the first opcode of `rule` at `phraseIndex`,
  // restoring the caller's opcode array afterwards.
  #executeRule(rule, phraseIndex) {
    const savedOpcodes = this.#opcodes;
    this.#opcodes = rule.opcodes;
    this.#opExecute(0, phraseIndex);
    this.#opcodes = savedOpcodes;
  }

  // The rule name `RNM` operator.<br>
  // This operator acts as a root node for a parse tree branch below and
  // returns the matched phrase to its parent.
  // However, its larger responsibility is handling user-defined callback functions and `AST` nodes.
  // Note that the `AST` is a separate object, but `RNM` calls its functions to create its nodes.
  #opRNM(opIndex, phraseIndex) {
    const op = this.#opcodes[opIndex];
    const rule = this.#rules[op.index];
    const callback = this.#ruleCallbacks[rule.index];
    /* ignore AST in look ahead (AND or NOT operator above) */
    const astDefined = !this.#lookAhead && this.#ast && this.#ast.ruleDefined(op.index);
    let astLength;
    if (astDefined) {
      astLength = this.#ast.getLength();
      this.#ast.down(op.index, rule.name);
    }
    if (callback) {
      /* call application's callback going down the parse tree */
      const charsLeft = this.#charEnd - phraseIndex;
      callback(this.#sysData, this.#chars, phraseIndex, this.#userData);
      this.#validateRnmCallbackResult(rule, this.#sysData, charsLeft, true);
      if (this.#sysData.state === id.ACTIVE) {
        this.#executeRule(rule, phraseIndex);
        /* call application's callback going up the parse tree */
        callback(this.#sysData, this.#chars, phraseIndex, this.#userData);
        this.#validateRnmCallbackResult(rule, this.#sysData, charsLeft, false);
      } /* implied else clause: just accept the callback sysData - RNM acting as UDT */
    } else {
      /* no callback - just execute the rule */
      this.#executeRule(rule, phraseIndex);
    }
    /* end AST: discard the node on failure, otherwise close it */
    if (astDefined) {
      if (this.#sysData.state === id.NOMATCH) {
        this.#ast.setLength(astLength);
      } else {
        this.#ast.up(op.index, rule.name, phraseIndex, this.#sysData.phraseLength);
      }
    }
  }

  // Validate the UDT callback function's returned `sysData` values.
  // It's the application's responsibility to get it right but `UDT` fails if not.
  // * @param {object} udt - the UDT whose callback was called; `udt.empty` tells whether it may return EMPTY
  // * @param {object} sysData - the `sysData` object as left by the callback; normalized in place
  // * @param {number} charsLeft - number of input characters from the phrase index to the end of the (sub)string
  // * @throws {Error} on a phrase length that is too long or a state that is not allowed.
  #validateUdtCallbackResult(udt, sysData, charsLeft) {
    const FUNCTION_NAME = `${this.#FILENAME}opUDT`;
    if (sysData.phraseLength > charsLeft) {
      let str = `${FUNCTION_NAME}(${udt.name}): callback function error: `;
      str += `sysData.phraseLength: ${sysData.phraseLength}`;
      str += ` must be <= remaining chars: ${charsLeft}`;
      throw new Error(str);
    }
    switch (sysData.state) {
      case id.ACTIVE:
        throw new Error(`${FUNCTION_NAME}(${udt.name}) ACTIVE state return not allowed.`);
      case id.EMPTY:
        if (!udt.empty) {
          throw new Error(`${FUNCTION_NAME}(${udt.name}) may not return EMPTY.`);
        }
        sysData.phraseLength = 0;
        break;
      case id.MATCH:
        /* a zero-length MATCH is an EMPTY result and must be allowed by the UDT */
        if (sysData.phraseLength === 0) {
          if (!udt.empty) {
            throw new Error(`${FUNCTION_NAME}(${udt.name}) may not return EMPTY.`);
          }
          sysData.state = id.EMPTY;
        }
        break;
      case id.NOMATCH:
        sysData.phraseLength = 0;
        break;
      default:
        throw new Error(
          `${FUNCTION_NAME}(${udt.name}): callback function return error. Unrecognized return state: ${sysData.state}`
        );
    }
  }

  // The User-Defined Terminal `UDT` operator.<br>
  // Simply calls the application's callback function, but operates like `RNM` with regard to the `AST`.
  // There is some ambiguity here. `UDT`s act as terminals for phrase recognition but as named rules
  // for `AST` nodes.
  #opUDT(opIndex, phraseIndex) {
    const op = this.#opcodes[opIndex];
    const udt = this.#udts[op.index];
    this.#sysData.UdtIndex = udt.index;
    /* ignore AST in look ahead */
    const astDefined = !this.#lookAhead && this.#ast && this.#ast.udtDefined(op.index);
    let astIndex;
    let astLength;
    if (astDefined) {
      /* UDT nodes are numbered after all of the rule nodes */
      astIndex = this.#rules.length + op.index;
      astLength = this.#ast.getLength();
      this.#ast.down(astIndex, udt.name);
    }
    /* call the UDT */
    const charsLeft = this.#charEnd - phraseIndex;
    this.#udtCallbacks[op.index](this.#sysData, this.#chars, phraseIndex, this.#userData);
    this.#validateUdtCallbackResult(udt, this.#sysData, charsLeft);
    /* end AST: discard the node on failure, otherwise close it */
    if (astDefined) {
      if (this.#sysData.state === id.NOMATCH) {
        this.#ast.setLength(astLength);
      } else {
        this.#ast.up(astIndex, udt.name, phraseIndex, this.#sysData.phraseLength);
      }
    }
  }

  // The Terminal Literal String `TLS` operator.<br>
  // Matches its pre-defined phrase against the input string.
  // A case-insensitive match is attempted for ASCII alphabetical characters:
  // input characters `A-Z` (65-90) are lowered before being compared to `op.string`.
  // The `TLS` explicitly allows empty phrases.
  // This is the preferred method of explicitly representing an empty string.
  #opTLS(opIndex, phraseIndex) {
    const op = this.#opcodes[opIndex];
    const len = op.string.length;
    if (len === 0) {
      /* EMPTY match allowed for TLS */
      this.#sysData.state = id.EMPTY;
      return;
    }
    this.#sysData.state = id.NOMATCH;
    if (phraseIndex + len > this.#charEnd) {
      /* not enough characters left */
      return;
    }
    for (let i = 0; i < len; i += 1) {
      let code = this.#chars[phraseIndex + i];
      if (code >= 65 && code <= 90) {
        code += 32;
      }
      if (code !== op.string[i]) {
        return;
      }
    }
    this.#sysData.state = id.MATCH;
    this.#sysData.phraseLength = len;
  }

  // The Terminal Binary String `TBS` operator.<br>
  // Matches its pre-defined phrase against the input string.
  // All characters must match exactly.
  // Case-sensitive literal strings (`'string'` & `%s"string"`) are translated to `TBS`
  // operators by `apg`.
  // '' or %s"" will represent an empty string
  // but is deprecated.
  #opTBS(opIndex, phraseIndex) {
    const op = this.#opcodes[opIndex];
    const len = op.string.length;
    this.#sysData.state = id.NOMATCH;
    if (phraseIndex + len > this.#charEnd) {
      /* not enough characters left */
      return;
    }
    for (let i = 0; i < len; i += 1) {
      if (this.#chars[phraseIndex + i] !== op.string[i]) {
        return;
      }
    }
    /* a zero-length phrase is EMPTY, never MATCH: REP only stops on NOMATCH or EMPTY,
     * so a zero-length MATCH would never advance an unbounded repetition */
    this.#sysData.state = len === 0 ? id.EMPTY : id.MATCH;
    this.#sysData.phraseLength = len;
  }

  // The Terminal Range `TRG` operator.<br>
  // Succeeds if the single first character of the phrase is
  // within the `min - max` range.
  #opTRG(opIndex, phraseIndex) {
    const op = this.#opcodes[opIndex];
    this.#sysData.state = id.NOMATCH;
    if (phraseIndex >= this.#charEnd) {
      return;
    }
    const code = this.#chars[phraseIndex];
    if (op.min <= code && code <= op.max) {
      this.#sysData.state = id.MATCH;
      this.#sysData.phraseLength = 1;
    }
  }

  // The positive lookahead, `AND`, operator.<br>
  // Executes its single child node, returning the EMPTY state
  // if it succeeds and NOMATCH if it fails.
  // *Always* backtracks on any matched phrase and returns EMPTY on success.
  // * @throws {Error} if the child node leaves any state other than EMPTY, MATCH or NOMATCH.
  #opAND(opIndex, phraseIndex) {
    const FUNCTION_NAME = `${this.#FILENAME}opAND: `;
    this.#lookAhead++;
    this.#opExecute(opIndex + 1, phraseIndex);
    this.#lookAhead--;
    this.#sysData.phraseLength = 0;
    switch (this.#sysData.state) {
      case id.EMPTY:
      case id.MATCH:
        this.#sysData.state = id.EMPTY;
        break;
      case id.NOMATCH:
        break;
      default:
        throw new Error(`${FUNCTION_NAME}invalid state ${this.#sysData.state}`);
    }
  }

  // The negative look ahead, `NOT`, operator.<br>
  // Executes its single child node, returning the EMPTY state
  // if it **fails** and NOMATCH if it succeeds.
  // *Always* backtracks on any matched phrase and returns EMPTY
  // on success (failure of its child node).
  // * @throws {Error} if the child node leaves any state other than EMPTY, MATCH or NOMATCH.
  #opNOT(opIndex, phraseIndex) {
    this.#lookAhead++;
    this.#opExecute(opIndex + 1, phraseIndex);
    this.#lookAhead--;
    this.#sysData.phraseLength = 0;
    switch (this.#sysData.state) {
      case id.EMPTY:
      case id.MATCH:
        this.#sysData.state = id.NOMATCH;
        break;
      case id.NOMATCH:
        this.#sysData.state = id.EMPTY;
        break;
      default:
        throw new Error(`${this.#FILENAME}opNOT: invalid state ${this.#sysData.state}`);
    }
  }
}