UNPKG

apg-unicode

Version:

JavaScript APG parser of Unicode code point arrays

697 lines (677 loc) 25.1 kB
// # Overview of the Parser Class
// The `Parser` class serves as the core interface to the `apg-unicode` parsing engine.
// It encapsulates the primary `parse` function and exposes modular facilities for
// enhanced parsing diagnostics and customization.
// ## Design Philosophy
// - **Modular Diagnostics**: AST, Trace, and Stats are opt-in, allowing lightweight parsing
//   when needed and deep introspection when desired.
// - **Callback-Driven Customization**: Callback functions can be attached to Rule and UDT names
//   for input string translation during the pass through the parse tree.
// - **Semantic Clarity**: Each method is purpose-built to expose a distinct parsing concern,
//   following the single-responsibility principle.
import { identifiers as id } from '../src/identifiers.js';
import { utilities as utils } from '../src/utilities.js';
export { Parser };

class Parser {
  /* prefix for all error messages thrown from this file */
  #FILENAME = 'parser.js: ';
  /* identifier of the input array type (id.UINT8, id.UINT16, id.UINT32 or id.ARRAY) */
  #charType;
  /* optional helper objects attached by the application */
  #ast;
  #stats;
  #trace;
  /* > 0 while executing below an AND or NOT (look ahead) operator */
  #lookAhead;
  /* parse tree statistics reported in the `parse()` return object */
  #treeDepth;
  #maxTreeDepth;
  #nodeHits;
  #maxMatched;
  /* grammar data */
  #rules;
  #udts;
  /* the opcodes of the rule currently being executed */
  #opcodes;
  /* the input characters and the [start, end) interval being parsed */
  #chars;
  #charStart;
  #charEnd;
  /* callback functions, indexed by rule/UDT index */
  #ruleCallbacks;
  #udtCallbacks;
  /* the application's data, passed through to the callback functions */
  #userData;
  /* the state and phrase length of the most recently executed node; */
  /* this same object is handed to the callback functions */
  #sysData = {};

  // The constructor takes a single argument, an SABNF grammar object.
  // See any of the examples in the `examples` directory for usage.
  constructor(grammar) {
    this.#rules = grammar.rules;
    this.#udts = grammar.udts;
    this.#clearCallbacks();
  }

  // Removes all rule and UDT callback functions.
  // Both lists are always arrays (empty when the grammar has no rules/UDTs)
  // so that the constructor and `setCallback(undefined)` leave the parser in the same state.
  #clearCallbacks() {
    this.#ruleCallbacks = Array(this.#rules.length).fill(undefined);
    this.#udtCallbacks = Array(this.#udts.length).fill(undefined);
  }

  // Attach a Trace object to the parser.
  // The parser will interact with the Trace object
  // generating a trace of the parser through all of the nodes
  // of the parse tree. See `examples/trace` for usage.
  // Any falsy argument detaches the current Trace object.
  setTrace(trace) {
    this.#trace = trace || undefined;
  }

  // Attach a Stats object to the parser.
  // The parser will interact with the Stats object
  // generating a collection of node hits by the parser.
  // See `examples/stats` for usage.
  // Any falsy argument detaches the current Stats object.
  setStats(stats) {
    this.#stats = stats || undefined;
  }

  // Attach an Ast object to the parser.
  // The parser will interact with the Ast object
  // generating an AST.
  // See `examples/ast` for usage.
  // Any falsy argument detaches the current Ast object.
  setAst(ast) {
    if (!ast) {
      this.#ast = undefined;
      return;
    }
    this.#ast = ast;
    this.#ast.initGrammar(this.#rules, this.#udts);
  }

  // Attach a callback function to a rule or UDT named in the SABNF grammar.
  // * @param {string | undefined} name - a valid rule or UDT name (case-insensitive).
  //   If `undefined` all callback functions will be cleared.
  // * @param {function} callback - a user-written function for processing the node phrases
  // * @throws {Error} if the argument types are wrong or `name` is neither a rule nor a UDT name
  setCallback(name, callback) {
    if (name === undefined) {
      /* clear all callbacks */
      this.#clearCallbacks();
      return;
    }
    if (typeof name !== 'string' || typeof callback !== 'function') {
      throw new Error(`${this.#FILENAME}setCallback() argument types not "string", "function"`);
    }

    /* see if this is a rule name */
    const lower = name.toLowerCase();
    for (const rule of this.#rules) {
      if (lower === rule.lower) {
        this.#ruleCallbacks[rule.index] = callback;
        return;
      }
    }

    /* see if this is a UDT name */
    for (const udt of this.#udts) {
      if (lower === udt.lower) {
        this.#udtCallbacks[udt.index] = callback;
        return;
      }
    }
    throw new Error(`${this.#FILENAME}setCallback name not a rule name or UDT name: ${name}`);
  }

  // Parse an input string against the configured SABNF grammar.
  // * @param {string} startName - Name of the start rule.
  // * @param {*} input - The input string. May be of type:
  //   * string (converted internally to an array of code points)
  //   * Array
  //   * Buffer
  //   * Uint8Array
  //   * Uint16Array
  //   * Uint32Array
  // * @param {*} callbackData - User callback data. Not used by the parser but
  //   available to callback functions for the application's use.
  // * @param {number} startChar - Index of the first character (inclusive) of the substring to parse. (default = 0)
  // * @param {number} endChar - Index of the last character (exclusive) of the substring to parse. (default = input.length)
  // * **Note:** startChar and endChar follow the same rules as for
  //   [String.prototype.slice()](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/slice).
  // * @returns - An object with the parser results:
  //   * success: true if the state is `MATCH` or `EMPTY` *and* the matched characters is the string or substring length
  //   * state: the final state identifier
  //   * stateName: the final state name, `MATCH`, `EMPTY`, or `NOMATCH`
  //   * length: the length of the substring being parsed, charEnd - charStart
  //   * matched: the number of characters matched
  //   * maxMatched: the maximum number of characters matched
  //   * maxTreeDepth: the maximum tree depth reached
  //   * nodeHits: the number of nodes visited
  // * @throws {Error} on a missing UDT callback, an unsupported input type,
  //   an unknown start rule name or an invalid final parser state
  parse(startName, input, callbackData, startChar, endChar) {
    const FUNCTION_NAME = `${this.#FILENAME}parse(): `;

    /* check for all UDT's defined */
    for (const udt of this.#udts) {
      if (!this.#udtCallbacks[udt.index]) {
        throw new Error(`${this.#FILENAME}UDT callback not defined: ${udt.name}`);
      }
    }
    this.#reset();

    /* validate input string type */
    /* (a Node.js Buffer is a Uint8Array and is handled by that branch) */
    this.#chars = input;
    if (typeof input === 'string') {
      this.#charType = id.UINT32;
      this.#chars = utils.stringToCodePoints(input);
    } else if (input instanceof Uint8Array) {
      this.#charType = id.UINT8;
    } else if (input instanceof Uint16Array) {
      this.#charType = id.UINT16;
    } else if (input instanceof Uint32Array) {
      this.#charType = id.UINT32;
    } else if (Array.isArray(input)) {
      this.#charType = id.ARRAY;
    } else {
      throw new Error(
        `${FUNCTION_NAME}invalid input - must be string, Array, Buffer, Uint8Array, Uint16Array or Uint32Array`
      );
    }

    /* get the sub-string, if any */
    const interval = utils.sliceInterval(this.#chars.length, startChar, endChar);
    this.#charStart = interval.indexStart;
    this.#charEnd = interval.indexEnd;

    /* Find start rule index (case-insensitive) */
    const lower = startName.toLowerCase();
    let startIndex;
    for (const rule of this.#rules) {
      if (lower === rule.lower) {
        startIndex = rule.index;
        break;
      }
    }
    if (startIndex === undefined) {
      throw new Error(`${FUNCTION_NAME}start name not a valid grammar rule name: '${startName}'`);
    }

    /* Initialize AST/stats/trace if provided */
    if (this.#trace) {
      this.#trace.init(this.#rules, this.#udts, this.#chars, this.#charEnd, this.#charType);
    }
    if (this.#stats) {
      this.#stats.init(this.#rules, this.#udts);
    }
    if (this.#ast) {
      this.#ast.initChars(this.#chars);
    }

    /* Create a dummy opcode for the root (start rule) node and begin parsing */
    this.#opcodes = [
      {
        type: id.RNM,
        index: startIndex,
      },
    ];
    this.#userData = callbackData;
    this.#opExecute(0, this.#charStart);
    this.#opcodes = undefined;

    /* Determine final result from sysData */
    const length = this.#charEnd - this.#charStart;
    let success = false;
    switch (this.#sysData.state) {
      case id.ACTIVE:
        throw new Error(`${FUNCTION_NAME}final state should never be 'ACTIVE'`);
      case id.NOMATCH:
        success = false;
        break;
      case id.EMPTY:
      case id.MATCH:
        /* success only if the entire (sub)string was matched */
        success = this.#sysData.phraseLength === length;
        break;
      default:
        /* should never get here, but just in case */
        throw new Error(`${FUNCTION_NAME}unrecognized final state: ${this.#sysData.state}`);
    }
    return {
      success,
      state: this.#sysData.state,
      stateName: id.idName(this.#sysData.state),
      length,
      matched: this.#sysData.phraseLength,
      maxMatched: this.#maxMatched - this.#charStart,
      maxTreeDepth: this.#maxTreeDepth,
      nodeHits: this.#nodeHits,
    };
  }

  // Resets the internal state variables. Called by the parser before each parse.
  #reset() {
    this.#lookAhead = 0;
    this.#treeDepth = 0;
    this.#maxTreeDepth = 0;
    this.#nodeHits = 0;
    this.#maxMatched = 0;
    this.#opcodes = undefined;
    this.#chars = undefined;
    this.#charStart = 0;
    this.#charEnd = 0;
    this.#sysData.state = id.ACTIVE;
    this.#sysData.phraseLength = 0;
    this.#userData = undefined;
  }

  // For each node visit in the parse tree, executes the appropriate operation.
  // Having a single point of execution for each node hit facilitates
  // `trace` and `stats` collection as well as other parsing statistics
  // reported in the return object.
  // * @param {number} opIndex - index of the opcode to execute in the current opcode list
  // * @param {number} phraseIndex - index of the first input character of the phrase to match
  #opExecute(opIndex, phraseIndex) {
    const FUNCTION_NAME = `${this.#FILENAME}#opExecute(): `;
    const op = this.#opcodes[opIndex];
    this.#nodeHits++;
    if (this.#treeDepth > this.#maxTreeDepth) {
      this.#maxTreeDepth = this.#treeDepth;
    }
    this.#treeDepth++;

    /* every node starts out ACTIVE with an empty phrase */
    this.#sysData.state = id.ACTIVE;
    this.#sysData.phraseLength = 0;
    if (this.#trace) {
      this.#trace.down(op, phraseIndex);
    }
    switch (op.type) {
      case id.ALT:
        this.#opALT(opIndex, phraseIndex);
        break;
      case id.CAT:
        this.#opCAT(opIndex, phraseIndex);
        break;
      case id.REP:
        this.#opREP(opIndex, phraseIndex);
        break;
      case id.RNM:
        this.#opRNM(opIndex, phraseIndex);
        break;
      case id.TRG:
        this.#opTRG(opIndex, phraseIndex);
        break;
      case id.TBS:
        this.#opTBS(opIndex, phraseIndex);
        break;
      case id.TLS:
        this.#opTLS(opIndex, phraseIndex);
        break;
      case id.UDT:
        this.#opUDT(opIndex, phraseIndex);
        break;
      case id.AND:
        this.#opAND(opIndex, phraseIndex);
        break;
      case id.NOT:
        this.#opNOT(opIndex, phraseIndex);
        break;
      default:
        throw new Error(`${FUNCTION_NAME}unrecognized operator`);
    }

    /* phrases matched in look ahead mode don't count toward the maximum matched */
    if (!this.#lookAhead) {
      const phraseEnd = phraseIndex + this.#sysData.phraseLength;
      if (phraseEnd > this.#maxMatched) {
        this.#maxMatched = phraseEnd;
      }
    }
    if (this.#stats) {
      this.#stats.collect(op, this.#sysData);
    }
    if (this.#trace) {
      this.#trace.up(op, this.#sysData.state, phraseIndex, this.#sysData.phraseLength);
    }
    this.#treeDepth--;
  }

  // The alternation `ALT` operator.<br>
  // Executes its child nodes, from left to right, until it finds a match.
  // Fails if *all* of its child nodes fail.
  #opALT(opIndex, phraseIndex) {
    const op = this.#opcodes[opIndex];
    for (const child of op.children) {
      this.#opExecute(child, phraseIndex);
      if (this.#sysData.state !== id.NOMATCH) {
        /* first alternative that doesn't fail wins - its sysData is the result */
        break;
      }
    }
  }

  // The concatenation `CAT` operator.<br>
  // Executes all of its child nodes, from left to right,
  // concatenating the matched phrases.
  // Fails if *any* child nodes fail.
  #opCAT(opIndex, phraseIndex) {
    const op = this.#opcodes[opIndex];
    let success = true;
    let catCharIndex = phraseIndex;
    let catPhrase = 0;

    /* remember the AST length so that it can be restored on failure */
    let astLength;
    if (this.#ast) {
      astLength = this.#ast.getLength();
    }
    for (const child of op.children) {
      this.#opExecute(child, catCharIndex);
      if (this.#sysData.state === id.NOMATCH) {
        success = false;
        break;
      }
      catCharIndex += this.#sysData.phraseLength;
      catPhrase += this.#sysData.phraseLength;
    }
    if (success) {
      this.#sysData.state = catPhrase === 0 ? id.EMPTY : id.MATCH;
      this.#sysData.phraseLength = catPhrase;
    } else {
      this.#sysData.state = id.NOMATCH;
      this.#sysData.phraseLength = 0;
      if (this.#ast) {
        /* discard AST nodes collected by the children that matched before the failure */
        this.#ast.setLength(astLength);
      }
    }
  }

  // The repetition `REP` operator.<br>
  // Repeatedly executes its single child node,
  // concatenating each of the matched phrases found.
  // The number of repetitions executed and its final `sysData` depends
  // on its `min` & `max` repetition values.
  // Zero repetitions (`0*0RuleName` or `0RuleName`) will represent an empty string
  // but is deprecated.
  #opREP(opIndex, phraseIndex) {
    const op = this.#opcodes[opIndex];
    if (op.max === 0) {
      /* this is an empty-string acceptor
       * deprecated: use the TLS empty string operator, "", instead */
      this.#sysData.state = id.EMPTY;
      this.#sysData.phraseLength = 0;
      return;
    }
    let repCharIndex = phraseIndex;
    let repPhrase = 0;
    let repCount = 0;

    /* remember the AST length so that it can be restored on failure */
    let astLength;
    if (this.#ast) {
      astLength = this.#ast.getLength();
    }
    while (true) {
      if (repCharIndex >= this.#charEnd) {
        /* exit on end of input string */
        break;
      }

      /* the single child node is the opcode immediately following the REP opcode */
      this.#opExecute(opIndex + 1, repCharIndex);
      if (this.#sysData.state === id.NOMATCH) {
        /* always end if the child node fails */
        break;
      }
      if (this.#sysData.state === id.EMPTY) {
        /* REP always succeeds when the child node returns an empty phrase */
        /* this may not seem obvious, but that's the way it works out */
        break;
      }
      repCount += 1;
      repPhrase += this.#sysData.phraseLength;
      repCharIndex += this.#sysData.phraseLength;
      if (repCount === op.max) {
        /* end on maxed out reps */
        break;
      }
    }

    /* evaluate the match count according to the min, max values */
    if (this.#sysData.state === id.EMPTY || repCount >= op.min) {
      this.#sysData.state = repPhrase === 0 ? id.EMPTY : id.MATCH;
      this.#sysData.phraseLength = repPhrase;
    } else {
      this.#sysData.state = id.NOMATCH;
      this.#sysData.phraseLength = 0;
      if (this.#ast) {
        this.#ast.setLength(astLength);
      }
    }
  }

  // Validate the rule callback function's returned `sysData` values.
  // It's the application's responsibility to get them right
  // but `RNM` fails if not.
  // * @param {object} rule - the rule the callback function is attached to
  // * @param {object} sysData - the `sysData` object as left by the callback function
  // * @param {number} charsLeft - the number of input characters remaining at the phrase start
  // * @param {boolean} down - true if validating the call going down the parse tree
  //   (the only direction in which the `ACTIVE` state is allowed)
  // * @throws {Error} if the phrase length or state is invalid
  #validateRnmCallbackResult(rule, sysData, charsLeft, down) {
    const FUNCTION_NAME = `${this.#FILENAME}opRNM`;
    if (sysData.phraseLength > charsLeft) {
      let str = `${FUNCTION_NAME}(${rule.name}): callback function error: `;
      str += `sysData.phraseLength: ${sysData.phraseLength}`;
      str += ` must be <= remaining chars: ${charsLeft}`;
      throw new Error(str);
    }
    switch (sysData.state) {
      case id.ACTIVE:
        if (!down) {
          throw new Error(
            `${FUNCTION_NAME}(${rule.name}): callback function return error. ACTIVE state not allowed.`
          );
        }
        break;
      case id.EMPTY:
        sysData.phraseLength = 0;
        break;
      case id.MATCH:
        /* normalize a zero-length MATCH to EMPTY */
        if (sysData.phraseLength === 0) {
          sysData.state = id.EMPTY;
        }
        break;
      case id.NOMATCH:
        sysData.phraseLength = 0;
        break;
      default:
        throw new Error(
          `${FUNCTION_NAME}(${rule.name}): callback function return error. Unrecognized return state: ${sysData.state}`
        );
    }
  }

  // Executes the opcodes of `rule` at `phraseIndex`,
  // restoring the caller's opcode list afterwards.
  #executeRule(rule, phraseIndex) {
    const savedOpcodes = this.#opcodes;
    this.#opcodes = rule.opcodes;
    this.#opExecute(0, phraseIndex);
    this.#opcodes = savedOpcodes;
  }

  // The rule name `RNM` operator.<br>
  // This operator acts as a root node for a parse tree branch below and
  // returns the matched phrase to its parent.
  // However, its larger responsibility is handling user-defined callback functions and `AST` nodes.
  // Note that the `AST` is a separate object, but `RNM` calls its functions to create its nodes.
  #opRNM(opIndex, phraseIndex) {
    let astLength;
    let astDefined;
    const op = this.#opcodes[opIndex];
    const rule = this.#rules[op.index];
    const callback = this.#ruleCallbacks[rule.index];

    /* ignore AST in look ahead (AND or NOT operator above) */
    if (!this.#lookAhead) {
      astDefined = this.#ast && this.#ast.ruleDefined(op.index);
      if (astDefined) {
        astLength = this.#ast.getLength();
        this.#ast.down(op.index, rule.name);
      }
    }
    if (callback) {
      /* call application's callback going down the parse tree*/
      const charsLeft = this.#charEnd - phraseIndex;
      callback(this.#sysData, this.#chars, phraseIndex, this.#userData);
      this.#validateRnmCallbackResult(rule, this.#sysData, charsLeft, true);
      if (this.#sysData.state === id.ACTIVE) {
        this.#executeRule(rule, phraseIndex);

        /* call application's callback going up the parse tree*/
        callback(this.#sysData, this.#chars, phraseIndex, this.#userData);
        this.#validateRnmCallbackResult(rule, this.#sysData, charsLeft, false);
      }
      /* implied else clause: just accept the callback sysData - RNM acting as UDT */
    } else {
      /* no callback - just execute the rule */
      this.#executeRule(rule, phraseIndex);
    }

    /* end AST (astDefined is only ever set when not in look ahead mode) */
    if (astDefined) {
      if (this.#sysData.state === id.NOMATCH) {
        this.#ast.setLength(astLength);
      } else {
        this.#ast.up(op.index, rule.name, phraseIndex, this.#sysData.phraseLength);
      }
    }
  }

  // Validate the UDT callback function's returned `sysData` values.
  // It's the application's responsibility to get it right but `UDT` fails if not.
  // * @param {object} udt - the UDT the callback function is attached to
  // * @param {object} sysData - the `sysData` object as left by the callback function
  // * @param {number} charsLeft - the number of input characters remaining at the phrase start
  // * @throws {Error} if the phrase length or state is invalid,
  //   or if an empty phrase is returned by a UDT that is not flagged `empty`
  #validateUdtCallbackResult(udt, sysData, charsLeft) {
    const FUNCTION_NAME = `${this.#FILENAME}opUDT`;
    if (sysData.phraseLength > charsLeft) {
      let str = `${FUNCTION_NAME}(${udt.name}): callback function error: `;
      str += `sysData.phraseLength: ${sysData.phraseLength}`;
      str += ` must be <= remaining chars: ${charsLeft}`;
      throw new Error(str);
    }
    switch (sysData.state) {
      case id.ACTIVE:
        throw new Error(`${FUNCTION_NAME}(${udt.name}) ACTIVE state return not allowed.`);
      case id.EMPTY:
        if (!udt.empty) {
          throw new Error(`${FUNCTION_NAME}(${udt.name}) may not return EMPTY.`);
        }
        sysData.phraseLength = 0;
        break;
      case id.MATCH:
        if (sysData.phraseLength === 0) {
          /* a zero-length MATCH is an EMPTY result */
          if (!udt.empty) {
            throw new Error(`${FUNCTION_NAME}(${udt.name}) may not return EMPTY.`);
          }
          sysData.state = id.EMPTY;
        }
        break;
      case id.NOMATCH:
        sysData.phraseLength = 0;
        break;
      default:
        throw new Error(
          `${FUNCTION_NAME}(${udt.name}): callback function return error. Unrecognized return state: ${sysData.state}`
        );
    }
  }

  // The User-Defined Terminal `UDT` operator.<br>
  // Simply calls the application's callback function, but operates like `RNM` with regard to the `AST`.
  // There is some ambiguity here. `UDT`s act as terminals for phrase recognition but as named rules
  // for `AST` nodes.
  #opUDT(opIndex, phraseIndex) {
    let astLength;
    let astIndex;
    let astDefined;
    const op = this.#opcodes[opIndex];
    const udt = this.#udts[op.index];
    this.#sysData.UdtIndex = udt.index;

    /* ignore AST in look ahead */
    if (!this.#lookAhead) {
      astDefined = this.#ast && this.#ast.udtDefined(op.index);
      if (astDefined) {
        /* UDT nodes are indexed after the rule nodes in the AST */
        astIndex = this.#rules.length + op.index;
        astLength = this.#ast.getLength();
        this.#ast.down(astIndex, udt.name);
      }
    }

    /* call the UDT */
    const charsLeft = this.#charEnd - phraseIndex;
    this.#udtCallbacks[op.index](this.#sysData, this.#chars, phraseIndex, this.#userData);
    this.#validateUdtCallbackResult(udt, this.#sysData, charsLeft);

    /* end AST (astDefined is only ever set when not in look ahead mode) */
    if (astDefined) {
      if (this.#sysData.state === id.NOMATCH) {
        this.#ast.setLength(astLength);
      } else {
        this.#ast.up(astIndex, udt.name, phraseIndex, this.#sysData.phraseLength);
      }
    }
  }

  // The Terminal Literal String `TLS` operator.<br>
  // Matches its pre-defined phrase against the input string.
  // A case-insensitive match is attempted for ASCII alphabetical characters.
  // The `TLS` explicitly allows empty phrases.
  // This is the preferred method of explicitly representing an empty string.
  #opTLS(opIndex, phraseIndex) {
    const op = this.#opcodes[opIndex];
    this.#sysData.state = id.NOMATCH;
    const len = op.string.length;
    if (len === 0) {
      /* EMPTY match allowed for TLS */
      this.#sysData.state = id.EMPTY;
      return;
    }
    if (phraseIndex + len > this.#charEnd) {
      /* not enough input characters left - NOMATCH */
      return;
    }
    for (let i = 0; i < len; i += 1) {
      let code = this.#chars[phraseIndex + i];
      if (code >= 65 && code <= 90) {
        /* fold ASCII upper case (A-Z) to lower case before comparing */
        code += 32;
      }
      if (code !== op.string[i]) {
        return;
      }
    }
    this.#sysData.state = id.MATCH;
    this.#sysData.phraseLength = len;
  }

  // The Terminal Binary String `TBS` operator.<br>
  // Matches its pre-defined phrase against the input string.
  // All characters must match exactly.
  // Case-sensitive literal strings (`'string'` & `%s"string"`) are translated to `TBS`
  // operators by `apg`.
  // '' or %s"" will represent an empty string
  // but is deprecated.
  #opTBS(opIndex, phraseIndex) {
    const op = this.#opcodes[opIndex];
    const len = op.string.length;
    this.#sysData.state = id.NOMATCH;
    if (phraseIndex + len > this.#charEnd) {
      /* not enough input characters left - NOMATCH */
      return;
    }
    for (let i = 0; i < len; i += 1) {
      if (this.#chars[phraseIndex + i] !== op.string[i]) {
        return;
      }
    }
    this.#sysData.state = id.MATCH;
    this.#sysData.phraseLength = len;
  }

  // The Terminal Range `TRG` operator.<br>
  // Succeeds if the single first character of the phrase is
  // within the `min - max` range.
  #opTRG(opIndex, phraseIndex) {
    const op = this.#opcodes[opIndex];
    this.#sysData.state = id.NOMATCH;
    if (phraseIndex < this.#charEnd) {
      const code = this.#chars[phraseIndex];
      if (op.min <= code && code <= op.max) {
        this.#sysData.state = id.MATCH;
        this.#sysData.phraseLength = 1;
      }
    }
  }

  // The positive look ahead, `AND`, operator.<br>
  // Executes its single child node, returning the EMPTY state
  // if it succeeds and NOMATCH if it fails.
  // *Always* backtracks on any matched phrase and returns EMPTY on success.
  #opAND(opIndex, phraseIndex) {
    const FUNCTION_NAME = `${this.#FILENAME}opAND: `;
    this.#lookAhead++;
    this.#opExecute(opIndex + 1, phraseIndex);
    this.#lookAhead--;

    /* look ahead never consumes input */
    this.#sysData.phraseLength = 0;
    switch (this.#sysData.state) {
      case id.EMPTY:
      case id.MATCH:
        this.#sysData.state = id.EMPTY;
        break;
      case id.NOMATCH:
        this.#sysData.state = id.NOMATCH;
        break;
      default:
        throw new Error(`${FUNCTION_NAME}invalid state ${this.#sysData.state}`);
    }
  }

  // The negative look ahead, `NOT`, operator.<br>
  // Executes its single child node, returning the EMPTY state
  // if it **fails** and NOMATCH if it succeeds.
  // *Always* backtracks on any matched phrase and returns EMPTY
  // on success (failure of its child node).
  #opNOT(opIndex, phraseIndex) {
    const FUNCTION_NAME = `${this.#FILENAME}opNOT: `;
    this.#lookAhead++;
    this.#opExecute(opIndex + 1, phraseIndex);
    this.#lookAhead--;

    /* look ahead never consumes input */
    this.#sysData.phraseLength = 0;
    switch (this.#sysData.state) {
      case id.EMPTY:
      case id.MATCH:
        this.#sysData.state = id.NOMATCH;
        break;
      case id.NOMATCH:
        this.#sysData.state = id.EMPTY;
        break;
      default:
        throw new Error(`${FUNCTION_NAME}invalid state ${this.#sysData.state}`);
    }
  }
}