UNPKG

buffer-apg-js

Version:

JavaScript APG, an ABNF Parser Generator

github.com/tfalencar/buffer-apg-js

tfalencar/buffer-apg-js

1,466 lines (1,463 loc) • 50.6 kB

JavaScript

/* eslint-disable func-names */ /* eslint-disable no-restricted-syntax */ /* eslint-disable new-cap */ /* eslint-disable guard-for-in */ /* ************************************************************************************* * copyright: Copyright (c) 2021 Lowell D. Thomas, all rights reserved * license: BSD-2-Clause (https://opensource.org/licenses/BSD-2-Clause) * ********************************************************************************* */ // This is the primary object of `apg-lib`. Calling its `parse()` member function // walks the parse tree of opcodes, matching phrases from the input string as it goes. // The working code for all of the operators, `ALT`, `CAT`, etc. is in this module. module.exports = function parser() { const id = require('./identifiers'); const utils = require('./utilities'); const thisFileName = 'parser.js: '; const thisThis = this; let opExecute; this.ast = null; this.stats = null; this.trace = null; this.callbacks = []; let opcodes = null; let chars = null; let charsBegin; let charsLength; let charsEnd; let lookAround; let treeDepth = 0; let maxTreeDepth = 0; let nodeHits = 0; let ruleCallbacks = null; let udtCallbacks = null; let rules = null; let udts = null; let syntaxData = null; let maxMatched = 0; let limitTreeDepth = Infinity; let limitNodeHits = Infinity; // Evaluates any given rule. This can be called from the syntax callback // functions to evaluate any rule in the grammar's rule list. Great caution // should be used. Use of this function will alter the language that the // parser accepts. 
const evaluateRule = function evaluateRule(ruleIndex, phraseIndex, sysData) {
  const functionName = `${thisFileName}evaluateRule(): `;
  /* negated range tests: negative, NaN and undefined values are rejected as well */
  if (!(ruleIndex >= 0 && ruleIndex < rules.length)) {
    throw new Error(`${functionName}rule index: ${ruleIndex} out of range`);
  }
  if (!(phraseIndex >= 0 && phraseIndex < charsEnd)) {
    throw new Error(`${functionName}phrase index: ${phraseIndex} out of range`);
  }
  /* append a temporary RNM opcode to the opcode list that is currently active */
  const activeOpcodes = opcodes;
  const tempIndex = activeOpcodes.length;
  activeOpcodes.push({
    type: id.RNM,
    index: ruleIndex,
  });
  try {
    opExecute(tempIndex, phraseIndex, sysData);
  } finally {
    /* remove the temporary opcode from the list it was pushed on, even if the evaluation threw */
    activeOpcodes.pop();
  }
};
// Evaluates any given UDT. This can be called from the syntax callback
// functions to evaluate any UDT in the grammar's UDT list. Great caution
// should be used. Use of this function will alter the language that the
// parser accepts.
// - *udtIndex* - index of the UDT in the grammar's UDT list
// - *phraseIndex* - index of the first input character to evaluate
// - *sysData* - the system data object that receives the result
// Throws an `Error` if either index is out of range.
const evaluateUdt = function (udtIndex, phraseIndex, sysData) {
  const functionName = `${thisFileName}evaluateUdt(): `;
  /* negated range tests: negative, NaN and undefined values are rejected as well */
  if (!(udtIndex >= 0 && udtIndex < udts.length)) {
    throw new Error(`${functionName}udt index: ${udtIndex} out of range`);
  }
  if (!(phraseIndex >= 0 && phraseIndex < charsEnd)) {
    throw new Error(`${functionName}phrase index: ${phraseIndex} out of range`);
  }
  /* append a temporary UDT opcode to the opcode list that is currently active */
  const activeOpcodes = opcodes;
  const tempIndex = activeOpcodes.length;
  activeOpcodes.push({
    type: id.UDT,
    empty: udts[udtIndex].empty,
    index: udtIndex,
  });
  try {
    opExecute(tempIndex, phraseIndex, sysData);
  } finally {
    /* remove the temporary opcode from the list it was pushed on, even if the evaluation threw */
    activeOpcodes.pop();
  }
};
/* Clears this object of any/all data that has been initialized or added to it. */
/* Called by parse() on initialization, allowing this object to be re-used for multiple parsing calls. */
const clear = function () {
  treeDepth = 0;
  maxTreeDepth = 0;
  nodeHits = 0;
  maxMatched = 0;
  /* the bottom entry of the look around stack means "not in look around" */
  lookAround = [
    {
      lookAround: id.LOOKAROUND_NONE,
      anchor: 0,
      charsEnd: 0,
      charsLength: 0,
    },
  ];
  rules = null;
  udts = null;
  chars = null;
  charsBegin = 0;
  charsLength = 0;
  charsEnd = 0;
  ruleCallbacks = null;
  udtCallbacks = null;
  syntaxData = null;
  opcodes = null;
};
/* object for maintaining a stack of back reference frames */
/* each frame maps the lower-case name of a back referenced rule or UDT to its last saved phrase (or null) */
const backRef = function () {
  const stack = [];
  /* build the first frame with a null entry for every back referenced rule and UDT */
  const init = function () {
    const obj = {};
    rules.forEach((rule) => {
      if (rule.isBkr) {
        obj[rule.lower] = null;
      }
    });
    udts.forEach((udt) => {
      if (udt.isBkr) {
        obj[udt.lower] = null;
      }
    });
    stack.push(obj);
  };
  /* shallow copy of the top frame */
  const copy = function () {
    return { ...stack[stack.length - 1] };
  };
  /* push a copy of the top frame */
  this.push = function push() {
    stack.push(copy());
  };
  /* truncate the stack to `lengthArg` frames (default: drop one frame) and return the new top frame */
  this.pop = function pop(lengthArg) {
    let length = lengthArg;
    if (!length) {
      length = stack.length - 1;
    }
    if (length < 1 || length > stack.length) {
      throw new Error(`${thisFileName}backRef.pop(): bad length: ${length}`);
    }
    stack.length = length;
    return stack[stack.length - 1];
  };
  /* number of frames on the stack */
  this.length = function length() {
    return stack.length;
  };
  /* record the phrase last matched by `name` on the top frame */
  this.savePhrase = function savePhrase(name, index, length) {
    stack[stack.length - 1][name] = {
      phraseIndex: index,
      phraseLength: length,
    };
  };
  /* fetch the phrase saved for `name` on the top frame */
  this.getPhrase = function (name) {
    return stack[stack.length - 1][name];
  };
  /* constructor */
  init();
};
// The system data structure that relays system information to and from the rule and UDT callback functions.
// - *state* - the state of the parser, ACTIVE, MATCH, EMPTY or NOMATCH (see the `identifiers` object in
// [`apg-lib`](https://github.com/ldthomas/apg-js2-lib))
// - *phraseLength* - the number of characters matched if the state is MATCHED or EMPTY
// - *lookaround* - the top of the stack holds the current look around state,
// LOOKAROUND_NONE, LOOKAROUND_AHEAD or LOOKAROUND_BEHIND,
// - *uFrame* - the "universal" back reference frame.
// Holds the last matched phrase for each of the back referenced rules and UDTs.
// - *pFrame* - the stack of "parent" back reference frames.
// Holds the matched phrase from the parent frame of each back referenced rules and UDTs.
// - *evaluateRule* - a reference to this object's `evaluateRule()` function.
// Can be called from a callback function (use with extreme caution!)
// - *evaluateUdt* - a reference to this object's `evaluateUdt()` function.
// Can be called from a callback function (use with extreme caution!)
const systemData = function systemData() {
  const thisData = this;
  this.state = id.ACTIVE;
  this.phraseLength = 0;
  this.ruleIndex = 0;
  this.udtIndex = 0;
  this.lookAround = lookAround[lookAround.length - 1];
  this.uFrame = new backRef();
  this.pFrame = new backRef();
  this.evaluateRule = evaluateRule;
  this.evaluateUdt = evaluateUdt;
  /* refresh the parser state for the next operation */
  this.refresh = function refresh() {
    thisData.state = id.ACTIVE;
    thisData.phraseLength = 0;
    thisData.lookAround = lookAround[lookAround.length - 1];
  };
};
/* some look around helper functions */
/* return the entry on top of the look around stack */
const lookAroundValue = function lookAroundValue() {
  return lookAround[lookAround.length - 1];
};
/* return true if parser is in look around (ahead or behind) state */
const inLookAround = function inLookAround() {
  return lookAround.length > 1;
};
/* return true if parser is in look behind state */
const inLookBehind = function () {
  return lookAround[lookAround.length - 1].lookAround === id.LOOKAROUND_BEHIND;
};
/* called by parse() to initialize the AST object, if one has been defined */
const initializeAst = function () {
  const functionName = `${thisFileName}initializeAst(): `;
  /* normalize "never assigned" to null */
  if (thisThis.ast === undefined) {
    thisThis.ast = null;
  }
  if (thisThis.ast === null) {
    return;
  }
  if (thisThis.ast.astObject !== 'astObject') {
    throw new Error(`${functionName}ast object not recognized`);
  }
  thisThis.ast.init(rules, udts, chars);
};
/* called by parse() to initialize the trace object, if one has been defined */
const initializeTrace = function () {
  const functionName = `${thisFileName}initializeTrace(): `;
  /* normalize "never assigned" to null */
  if (thisThis.trace === undefined) {
    thisThis.trace = null;
  }
  if (thisThis.trace === null) {
    return;
  }
  if (thisThis.trace.traceObject !== 'traceObject') {
    throw new Error(`${functionName}trace object not recognized`);
  }
  thisThis.trace.init(rules, udts, chars);
};
/* called by parse() to initialize the statistics object, if one has been defined */
const initializeStats = function () {
  const functionName = `${thisFileName}initializeStats(): `;
  /* normalize "never assigned" to null */
  if (thisThis.stats === undefined) {
    thisThis.stats = null;
  }
  if (thisThis.stats === null) {
    return;
  }
  if (thisThis.stats.statsObject !== 'statsObject') {
    throw new Error(`${functionName}stats object not recognized`);
  }
  thisThis.stats.init(rules, udts);
};
/* called by parse() to initialize the rules & udts from the grammar object */
/* (the grammar object generated previously by apg) */
const initializeGrammar = function (grammar) {
  const functionName = `${thisFileName}initializeGrammar(): `;
  if (!grammar) {
    throw new Error(`${functionName}grammar object undefined`);
  }
  if (grammar.grammarObject !== 'grammarObject') {
    throw new Error(`${functionName}bad grammar object`);
  }
  rules = grammar.rules;
  udts = grammar.udts;
};
/* called by parse() to initialize the start rule */
/* accepts a rule index or a (case-insensitive) rule name and returns the rule index */
const initializeStartRule = function (startRule) {
  const functionName = `${thisFileName}initializeStartRule(): `;
  if (typeof startRule === 'number') {
    if (startRule >= rules.length) {
      throw new Error(`${functionName}start rule index too large: max: ${rules.length}: index: ${startRule}`);
    }
    /* negative, fractional and NaN values do not address any rule */
    if (!Number.isInteger(startRule) || startRule < 0) {
      throw new Error(`${functionName}start rule index must be a non-negative integer: index: ${startRule}`);
    }
    return startRule;
  }
  if (typeof startRule === 'string') {
    const lower = startRule.toLowerCase();
    const found = rules.find((rule) => rule.lower === lower);
    if (found === undefined) {
      throw new Error(`${functionName}start rule name '${startRule}' not recognized`);
    }
    return found.index;
  }
  throw new Error(`${functionName}type of start rule '${typeof startRule}' not recognized`);
};
/* called by parse() to initialize the array of characters codes representing the input string */
/* inputArg: string or array of integers; begArg/lenArg: optional sub-string begin index and length */
const initializeInputChars = function initializeInputChars(inputArg, begArg, lenArg) {
  const functionName = `${thisFileName}initializeInputChars(): `;
  /* verify and normalize input */
  if (inputArg === undefined) {
    throw new Error(`${functionName}input string is undefined`);
  }
  if (inputArg === null) {
    throw new Error(`${functionName}input string is null`);
  }
  let input = inputArg;
  if (typeof input === 'string') {
    input = utils.stringToChars(input);
  } else if (!Array.isArray(input)) {
    throw new Error(`${functionName}input string is not a string or array`);
  }
  if (input.length > 0 && typeof input[0] !== 'number') {
    throw new Error(`${functionName}input string not an array of integers`);
  }
  /* verify and normalize beginning index (negated test so that NaN is rejected too) */
  let beg = 0;
  if (typeof begArg === 'number') {
    beg = Math.floor(begArg);
    if (!(beg >= 0 && beg <= input.length)) {
      throw new Error(`${functionName}input beginning index out of range: ${beg}`);
    }
  }
  /* verify and normalize input length (negated test so that NaN is rejected too) */
  let len = input.length - beg;
  if (typeof lenArg === 'number') {
    len = Math.floor(lenArg);
    if (!(len >= 0 && len <= input.length - beg)) {
      throw new Error(`${functionName}input length out of range: ${len}`);
    }
  }
  chars = input;
  charsBegin = beg;
  charsLength = len;
  charsEnd = charsBegin + charsLength;
};
/* called by parse() to initialize the user-written, syntax callback functions, if any */
const initializeCallbacks = function () {
  const functionName = `${thisFileName}initializeCallbacks(): `;
  ruleCallbacks = new Array(rules.length).fill(null);
  udtCallbacks = new Array(udts.length).fill(null);
  /* rule names first, then UDT names: a list index >= rules.length addresses a UDT */
  const names = [...rules.map((rule) => rule.lower), ...udts.map((udt) => udt.lower)];
  for (const name in thisThis.callbacks) {
    const slot = names.indexOf(name.toLowerCase());
    if (slot < 0) {
      throw new Error(`${functionName}syntax callback '${name}' not a rule or udt name`);
    }
    /* any falsy value means "no callback" */
    const func = thisThis.callbacks[name] || null;
    if (typeof func !== 'function' && func !== null) {
      throw new Error(
        `${functionName}syntax callback[${name}] must be function reference or 'false' (false/null/undefined/etc.)`
      );
    }
    if (slot < rules.length) {
      ruleCallbacks[slot] = func;
    } else {
      udtCallbacks[slot - rules.length] = func;
    }
  }
  /* make sure all udts have been defined - the parser can't work without them */
  udts.forEach((udt, i) => {
    if (udtCallbacks[i] === null) {
      throw new Error(
        `${functionName}all UDT callbacks must be defined. UDT callback[${udt.lower}] not a function reference`
      );
    }
  });
};
// Set the maximum parse tree depth allowed. The default is `Infinity`.
// A limit is not normally needed, but can be used to protect against an
// exponential or "catastrophically backtracking" grammar.
// <ul>
// <li>
// depth - max allowed parse tree depth. An exception is thrown if exceeded.
// </li>
// </ul>
this.setMaxTreeDepth = function (depth) {
  if (typeof depth !== 'number') {
    throw new Error(`parser: max tree depth must be integer > 0: ${depth}`);
  }
  /* validate before assigning so that a rejected value (including NaN) leaves the current limit intact */
  const limit = Math.floor(depth);
  if (!(limit > 0)) {
    throw new Error(`parser: max tree depth must be integer > 0: ${depth}`);
  }
  limitTreeDepth = limit;
};
// Set the maximum number of node hits (parser unit steps or opcode function calls) allowed.
// The default is `Infinity`.
// A limit is not normally needed, but can be used to protect against an
// exponential or "catastrophically backtracking" grammar.
// <ul>
// <li>
// hits - maximum number of node hits or parser unit steps allowed.
// An exception thrown if exceeded.
// </li>
// </ul>
this.setMaxNodeHits = function (hits) {
  if (typeof hits !== 'number') {
    throw new Error(`parser: max node hits must be integer > 0: ${hits}`);
  }
  /* validate before assigning so that a rejected value (including NaN) leaves the current limit intact */
  const limit = Math.floor(hits);
  if (!(limit > 0)) {
    throw new Error(`parser: max node hits must be integer > 0: ${hits}`);
  }
  limitNodeHits = limit;
};
/* the main parser function */
/* expects clear() and initializeInputChars() to have been called first (see parse() and parseSubstring()) */
const privateParse = function (grammar, startRuleArg, callbackData) {
  const functionName = `${thisFileName}parse(): `;
  initializeGrammar(grammar);
  const startRule = initializeStartRule(startRuleArg);
  initializeCallbacks();
  initializeTrace();
  initializeStats();
  initializeAst();
  const sysData = new systemData();
  if (!(callbackData === undefined || callbackData === null)) {
    syntaxData = callbackData;
  }
  /* create a dummy opcode for the start rule */
  opcodes = [
    {
      type: id.RNM,
      index: startRule,
    },
  ];
  /* execute the start rule */
  opExecute(0, charsBegin, sysData);
  opcodes = null;
  /* test and return the sysData */
  let success;
  switch (sysData.state) {
    case id.ACTIVE:
      throw new Error(`${functionName}final state should never be 'ACTIVE'`);
    case id.NOMATCH:
      success = false;
      break;
    case id.EMPTY:
    case id.MATCH:
      /* success only if the entire (sub-)string was matched */
      success = sysData.phraseLength === charsLength;
      break;
    default:
      throw new Error('unrecognized state');
  }
  return {
    success,
    state: sysData.state,
    length: charsLength,
    matched: sysData.phraseLength,
    maxMatched,
    maxTreeDepth,
    nodeHits,
    inputLength: chars.length,
    subBegin: charsBegin,
    subEnd: charsEnd,
    subLength: charsLength,
  };
};
// This form allows parsing of a sub-string of the full input string.
// <ul>
// <li>*inputIndex* - index of the first character in the sub-string</li>
// <li>*inputLength* - length of the sub-string</li>
// </ul>
// All other parameters as for the function `parse()` below.
this.parseSubstring = function parseSubstring(grammar, startRule, inputChars, inputIndex, inputLength, callbackData) {
  clear();
  initializeInputChars(inputChars, inputIndex, inputLength);
  return privateParse(grammar, startRule, callbackData);
};
// This is the main function, called to parse an input string.
// <ul>
// <li>*grammar* - an instantiated grammar object - the output of `apg` for a
// specific SABNF grammar</li>
// <li>*startRule* - the rule name or rule index to be used as the root of the
// parse tree. This is usually the first rule, index = 0, of the grammar
// but can be any rule defined in the above grammar object.</li>
// <li>*inputChars* - the input string. Can be a string or an array of integer character codes representing the
// string.</li>
// <li>*callbackData* - user-defined data object to be passed to the user's
// callback functions.
// This is not used by the parser in any way, merely passed on to the user.
// May be `null` or omitted.</li>
// </ul>
this.parse = function parse(grammar, startRule, inputChars, callbackData) {
  clear();
  /* The length is deliberately omitted: initializeInputChars() then validates the input */
  /* (null/undefined get its descriptive error) and derives the length from the normalized */
  /* character array rather than from the raw argument. */
  initializeInputChars(inputChars, 0);
  return privateParse(grammar, startRule, callbackData);
};
// The `ALT` operator.<br>
// Executes its child nodes, from left to right, until it finds a match.
// Fails if *all* of its child nodes fail.
const opALT = function (opIndex, phraseIndex, sysData) {
  const { children } = opcodes[opIndex];
  /* try each alternative at the same position; the first one that does not fail wins */
  for (const child of children) {
    opExecute(child, phraseIndex, sysData);
    if (sysData.state !== id.NOMATCH) {
      return;
    }
  }
};
// The `CAT` operator.<br>
// Runs every child node in order, each one starting where the previous one stopped,
// and reports the total length matched.
// A single failing child fails the whole concatenation.
const opCAT = function (opIndex, phraseIndex, sysData) {
  const { children } = opcodes[opIndex];
  /* remember the frame depths and AST length so a failure can be rolled back */
  const uDepth = sysData.uFrame.length();
  const pDepth = sysData.pFrame.length();
  let astMark;
  if (thisThis.ast) {
    astMark = thisThis.ast.getLength();
  }
  let cursor = phraseIndex;
  let consumed = 0;
  for (const child of children) {
    opExecute(child, cursor, sysData);
    if (sysData.state === id.NOMATCH) {
      sysData.state = id.NOMATCH;
      sysData.phraseLength = 0;
      /* reset the back referencing frames on failure */
      sysData.uFrame.pop(uDepth);
      sysData.pFrame.pop(pDepth);
      if (thisThis.ast) {
        thisThis.ast.setLength(astMark);
      }
      return;
    }
    cursor += sysData.phraseLength;
    consumed += sysData.phraseLength;
  }
  sysData.state = consumed === 0 ? id.EMPTY : id.MATCH;
  sysData.phraseLength = consumed;
};
// The `REP` operator.<br>
// Repeatedly executes its single child node,
// concatenating each of the matched phrases found.
// The number of repetitions executed and its final sysData depends
// on its `min` & `max` repetition values.
const opREP = function (opIndex, phraseIndex, sysData) {
  const op = opcodes[opIndex];
  let cursor = phraseIndex;
  let matchedTotal = 0;
  let reps = 0;
  /* remember the frame depths and AST length so a failure can be rolled back */
  const uDepth = sysData.uFrame.length();
  const pDepth = sysData.pFrame.length();
  let astMark;
  if (thisThis.ast) {
    astMark = thisThis.ast.getLength();
  }
  for (;;) {
    if (cursor >= charsEnd) {
      /* nothing left of the input string */
      break;
    }
    opExecute(opIndex + 1, cursor, sysData);
    /* a failing child ends the repetition; an empty child ends it too (and makes REP succeed below) */
    if (sysData.state === id.NOMATCH || sysData.state === id.EMPTY) {
      break;
    }
    reps += 1;
    matchedTotal += sysData.phraseLength;
    cursor += sysData.phraseLength;
    if (reps === op.max) {
      /* maximum number of repetitions reached */
      break;
    }
  }
  /* success if the child ended on an empty phrase or if enough repetitions were found */
  if (sysData.state === id.EMPTY || reps >= op.min) {
    sysData.state = matchedTotal === 0 ? id.EMPTY : id.MATCH;
    sysData.phraseLength = matchedTotal;
    return;
  }
  sysData.state = id.NOMATCH;
  sysData.phraseLength = 0;
  /* reset the back referencing frames on failure */
  sysData.uFrame.pop(uDepth);
  sysData.pFrame.pop(pDepth);
  if (thisThis.ast) {
    thisThis.ast.setLength(astMark);
  }
};
// Validate the callback function's returned sysData values.
// It's the user's responsibility to get them right
// but `RNM` fails if not.
const validateRnmCallbackResult = function (rule, sysData, charsLeft, down) {
  const where = `${thisFileName}opRNM(${rule.name}): `;
  /* a callback may never claim more characters than remain in the input */
  if (sysData.phraseLength > charsLeft) {
    throw new Error(
      `${where}callback function error: sysData.phraseLength: ${sysData.phraseLength} must be <= remaining chars: ${charsLeft}`
    );
  }
  const { state } = sysData;
  if (state === id.ACTIVE) {
    /* ACTIVE is only acceptable when `down` is exactly true */
    if (down !== true) {
      throw new Error(`${where}callback function return error. ACTIVE state not allowed.`);
    }
    return;
  }
  if (state === id.EMPTY || state === id.NOMATCH) {
    /* neither state carries a phrase */
    sysData.phraseLength = 0;
    return;
  }
  if (state === id.MATCH) {
    /* a zero-length match is reported as EMPTY */
    if (sysData.phraseLength === 0) {
      sysData.state = id.EMPTY;
    }
    return;
  }
  throw new Error(`${where}callback function return error. Unrecognized return state: ${sysData.state}`);
};
// The `RNM` operator.<br>
// This operator will acts as a root node for a parse tree branch below and
// returns the matched phrase to its parent.
// However, its larger responsibility is handling user-defined callback functions, back references and `AST` nodes.
// Note that the `AST` is a separate object, but `RNM` calls its functions to create its nodes.
// See [`ast.js`](./ast.html) for usage.
const opRNM = function (opIndex, phraseIndex, sysData) {
  const op = opcodes[opIndex];
  const rule = rules[op.index];
  const callback = ruleCallbacks[rule.index];
  /* the AST and the back reference frames are only maintained outside of look around */
  const tracking = !inLookAround();
  let astActive;
  let astMark;
  let uDepth;
  let pDepth;
  let parentFrame;
  /* execute the rule's own opcode list, then switch back to the caller's list */
  const descend = function () {
    const callerOpcodes = opcodes;
    opcodes = rule.opcodes;
    opExecute(0, phraseIndex, sysData);
    opcodes = callerOpcodes;
  };
  if (tracking) {
    /* open the AST node and a fresh pair of back reference frames */
    astActive = thisThis.ast && thisThis.ast.ruleDefined(op.index);
    if (astActive) {
      astMark = thisThis.ast.getLength();
      thisThis.ast.down(op.index, rule.name);
    }
    uDepth = sysData.uFrame.length();
    pDepth = sysData.pFrame.length();
    sysData.uFrame.push();
    sysData.pFrame.push();
    parentFrame = sysData.pFrame;
    sysData.pFrame = new backRef();
  }
  if (callback === null) {
    /* no user callback: the rule is simply executed */
    descend();
  } else {
    /* first call of the user's callback, made before the rule is executed */
    const charsLeft = charsEnd - phraseIndex;
    sysData.ruleIndex = rule.index;
    callback(sysData, chars, phraseIndex, syntaxData);
    validateRnmCallbackResult(rule, sysData, charsLeft, true);
    if (sysData.state === id.ACTIVE) {
      /* the callback left the state ACTIVE: execute the rule, then call the callback a second time */
      descend();
      sysData.ruleIndex = rule.index;
      callback(sysData, chars, phraseIndex, syntaxData);
      validateRnmCallbackResult(rule, sysData, charsLeft, false);
    }
    /* otherwise the callback's own result is accepted - RNM acting as UDT */
  }
  if (!tracking) {
    return;
  }
  /* close the AST node */
  if (astActive) {
    if (sysData.state === id.NOMATCH) {
      thisThis.ast.setLength(astMark);
    } else {
      thisThis.ast.up(op.index, rule.name, phraseIndex, sysData.phraseLength);
    }
  }
  /* close the back reference frames */
  sysData.pFrame = parentFrame;
  if (sysData.state === id.NOMATCH) {
    sysData.uFrame.pop(uDepth);
    sysData.pFrame.pop(pDepth);
  } else if (rule.isBkr) {
    /* save phrase on both the parent and universal frames */
    /* BKR operator will decide which to use later */
    sysData.pFrame.savePhrase(rule.lower, phraseIndex, sysData.phraseLength);
    sysData.uFrame.savePhrase(rule.lower, phraseIndex, sysData.phraseLength);
  }
};
// Checks the sysData values a UDT callback function handed back.
// Getting them right is the user's responsibility; `UDT` throws if they are not.
const validateUdtCallbackResult = function (udt, sysData, charsLeft) {
  const where = `${thisFileName}opUDT(${udt.name}): `;
  /* a callback may never claim more characters than remain in the input */
  if (sysData.phraseLength > charsLeft) {
    throw new Error(
      `${where}callback function error: sysData.phraseLength: ${sysData.phraseLength} must be <= remaining chars: ${charsLeft}`
    );
  }
  const { state } = sysData;
  if (state === id.ACTIVE) {
    throw new Error(`${where}callback function return error. ACTIVE state not allowed.`);
  }
  if (state === id.NOMATCH) {
    sysData.phraseLength = 0;
    return;
  }
  if (state === id.EMPTY || state === id.MATCH) {
    /* EMPTY, or MATCH with a zero-length phrase, is only legal when `udt.empty` is not false */
    const isEmptyPhrase = state === id.EMPTY || sysData.phraseLength === 0;
    if (!isEmptyPhrase) {
      return;
    }
    if (udt.empty === false) {
      throw new Error(`${where}callback function return error. May not return EMPTY.`);
    }
    if (state === id.EMPTY) {
      sysData.phraseLength = 0;
    } else {
      sysData.state = id.EMPTY;
    }
    return;
  }
  throw new Error(`${where}callback function return error. Unrecognized return state: ${sysData.state}`);
};
// The `UDT` operator.<br>
// Simply calls the user's callback function, but operates like `RNM` with regard to the `AST`
// and back referencing.
// There is some ambiguity here. `UDT`s act as terminals for phrase recognition but as named rules
// for `AST` nodes and back referencing.
// See [`ast.js`](./ast.html) for usage.
const opUDT = function (opIndex, phraseIndex, sysData) {
  let astLength;
  let astIndex;
  let astDefined;
  let ulen;
  let plen;
  let saveFrame;
  const op = opcodes[opIndex];
  const udt = udts[op.index];
  /* publish the index of the UDT under `udtIndex`, the property that systemData declares */
  /* (the counterpart of the `ruleIndex` that opRNM sets) */
  sysData.udtIndex = udt.index;
  /* legacy alias: this operator used to write the mis-capitalized `UdtIndex` only; */
  /* it is still set so that existing callbacks reading it keep working */
  sysData.UdtIndex = udt.index;
  const notLookAround = !inLookAround();
  /* ignore AST and back references in lookaround */
  if (notLookAround) {
    /* begin AST and back reference */
    astDefined = thisThis.ast && thisThis.ast.udtDefined(op.index);
    if (astDefined) {
      /* UDT nodes are numbered after the rules */
      astIndex = rules.length + op.index;
      astLength = thisThis.ast.getLength();
      thisThis.ast.down(astIndex, udt.name);
    }
    /* NOTE: push and pop of the back reference frame is normally not necessary */
    /* only in the case that the UDT calls evaluateRule() or evaluateUdt() */
    ulen = sysData.uFrame.length();
    plen = sysData.pFrame.length();
    sysData.uFrame.push();
    sysData.pFrame.push();
    saveFrame = sysData.pFrame;
    sysData.pFrame = new backRef();
  }
  /* call the UDT */
  const charsLeft = charsEnd - phraseIndex;
  udtCallbacks[op.index](sysData, chars, phraseIndex, syntaxData);
  validateUdtCallbackResult(udt, sysData, charsLeft);
  if (notLookAround) {
    /* end AST */
    if (astDefined) {
      if (sysData.state === id.NOMATCH) {
        thisThis.ast.setLength(astLength);
      } else {
        thisThis.ast.up(astIndex, udt.name, phraseIndex, sysData.phraseLength);
      }
    }
    /* end back reference */
    sysData.pFrame = saveFrame;
    if (sysData.state === id.NOMATCH) {
      sysData.uFrame.pop(ulen);
      sysData.pFrame.pop(plen);
    } else if (udt.isBkr) {
      /* save phrase on both the parent and universal frames */
      /* BKR operator will decide which to use later */
      sysData.pFrame.savePhrase(udt.lower, phraseIndex, sysData.phraseLength);
      sysData.uFrame.savePhrase(udt.lower, phraseIndex, sysData.phraseLength);
    }
  }
};
// The `AND` operator.<br>
// This is the positive `look ahead` operator.
// Executes its single child node, returning the EMPTY state
// if it succeeds and NOMATCH if it fails.
// *Always* backtracks on any matched phrase and returns EMPTY on success.
const opAND = function (opIndex, phraseIndex, sysData) {
  /* enter look ahead mode, saving the current end-of-input limits on the look around stack */
  const entry = {
    lookAround: id.LOOKAROUND_AHEAD,
    anchor: phraseIndex,
    charsEnd,
    charsLength,
  };
  lookAround.push(entry);
  /* while looking ahead the child is allowed to read up to the end of the full input */
  charsEnd = chars.length;
  charsLength = chars.length - charsBegin;
  opExecute(opIndex + 1, phraseIndex, sysData);
  /* leave look ahead mode and restore the saved limits */
  const restored = lookAround.pop();
  charsEnd = restored.charsEnd;
  charsLength = restored.charsLength;
  /* look ahead never consumes input */
  sysData.phraseLength = 0;
  const { state } = sysData;
  if (state === id.EMPTY || state === id.MATCH) {
    sysData.state = id.EMPTY;
  } else if (state !== id.NOMATCH) {
    throw new Error(`opAND: invalid state ${sysData.state}`);
  }
};
// The `NOT` operator.<br>
// The negative `look ahead` operator.
// Runs its single child node and inverts the outcome: EMPTY when the child
// fails, NOMATCH when the child succeeds.
// Whatever the child matched is *always* given back - the phrase length is zero either way.
const opNOT = function (opIndex, phraseIndex, sysData) {
  /* enter look ahead mode, saving the current end-of-input limits on the look around stack */
  const entry = {
    lookAround: id.LOOKAROUND_AHEAD,
    anchor: phraseIndex,
    charsEnd,
    charsLength,
  };
  lookAround.push(entry);
  /* while looking ahead the child is allowed to read up to the end of the full input */
  charsEnd = chars.length;
  charsLength = chars.length - charsBegin;
  opExecute(opIndex + 1, phraseIndex, sysData);
  /* leave look ahead mode and restore the saved limits */
  const restored = lookAround.pop();
  charsEnd = restored.charsEnd;
  charsLength = restored.charsLength;
  /* look ahead never consumes input */
  sysData.phraseLength = 0;
  const { state } = sysData;
  if (state === id.EMPTY || state === id.MATCH) {
    sysData.state = id.NOMATCH;
  } else if (state === id.NOMATCH) {
    sysData.state = id.EMPTY;
  } else {
    throw new Error(`opNOT: invalid state ${sysData.state}`);
  }
};
// The `TRG` operator.<br>
// Matches exactly one character: the character at `phraseIndex` must lie
// in the inclusive range `min - max`.
const opTRG = function (opIndex, phraseIndex, sysData) {
  const op = opcodes[opIndex];
  sysData.state = id.NOMATCH;
  const inRange = phraseIndex < charsEnd && op.min <= chars[phraseIndex] && chars[phraseIndex] <= op.max;
  if (inRange) {
    sysData.state = id.MATCH;
    sysData.phraseLength = 1;
  }
};
// The `TBS` operator.<br>
// Matches its pre-defined phrase against the input string.
// All characters must match exactly.
// Case-sensitive literal strings (`'string'` & `%s"string"`) are translated to `TBS`
// operators by `apg`.
// Phrase length of zero is not allowed.
// Empty phrases can only be defined with `TLS` operators.
const opTBS = function (opIndex, phraseIndex, sysData) {
  const expected = opcodes[opIndex].string;
  const len = expected.length;
  sysData.state = id.NOMATCH;
  if (!(phraseIndex + len <= charsEnd)) {
    /* not enough input left for the phrase */
    return;
  }
  /* count the leading characters that agree with the phrase */
  let same = 0;
  while (same < len && chars[phraseIndex + same] === expected[same]) {
    same += 1;
  }
  if (same === len) {
    sysData.state = id.MATCH;
    sysData.phraseLength = len;
  }
};
// The `TLS` operator.<br>
// Matches its pre-defined phrase against the input string.
// The upper-case ASCII letters (codes 65-90) of the input are folded to lower case before comparing.
// `TLS` is the only operator that explicitly allows empty phrases.
// `apg` will fail for empty `TBS`, case-sensitive strings (`''`) or
// zero repetitions (`0*0RuleName` or `0RuleName`).
const opTLS = function (opIndex, phraseIndex, sysData) {
  const expected = opcodes[opIndex].string;
  const len = expected.length;
  sysData.state = id.NOMATCH;
  if (len === 0) {
    /* EMPTY match allowed for TLS */
    sysData.state = id.EMPTY;
    return;
  }
  if (!(phraseIndex + len <= charsEnd)) {
    /* not enough input left for the phrase */
    return;
  }
  for (let offset = 0; offset < len; offset += 1) {
    const raw = chars[phraseIndex + offset];
    const folded = raw >= 65 && raw <= 90 ? raw + 32 : raw;
    if (folded !== expected[offset]) {
      return;
    }
  }
  sysData.state = id.MATCH;
  sysData.phraseLength = len;
};
// The `ABG` operator.<br>
// This is an "anchor" for the beginning of the string, similar to the familiar regex `^` anchor.
// An anchor matches a position rather than a phrase.
// Returns EMPTY if `phraseIndex` is 0, NOMATCH otherwise.
const opABG = function (opIndex, phraseIndex, sysData) {
  const atBeginning = phraseIndex === 0;
  sysData.phraseLength = 0;
  sysData.state = atBeginning ? id.EMPTY : id.NOMATCH;
};
// The `AEN` operator.<br>
// This is an "anchor" for the end of the string, similar to the familiar regex `$` anchor.
// An anchor matches a position rather than a phrase.
// Returns EMPTY if `phraseIndex` equals the input string length, NOMATCH otherwise.
const opAEN = function (opIndex, phraseIndex, sysData) {
  const atEnd = phraseIndex === chars.length;
  sysData.phraseLength = 0;
  sysData.state = atEnd ? id.EMPTY : id.NOMATCH;
};
// The `BKR` operator.<br>
// The back reference operator.
// Matches the last matched phrase of the named rule or UDT against the input string.
// For ASCII alphabetical characters the match may be case sensitive (`%s`) or insensitive (`%i`),
// depending on the back reference definition.
// For `universal` mode (`%u`) matches the last phrase found anywhere in the grammar.
// For `parent frame` mode (`%p`) matches the last phrase found in the parent rule only.
const opBKR = function (opIndex, phraseIndex, sysData) {
  const op = opcodes[opIndex];
  sysData.state = id.NOMATCH;
  /* opcode indexes below rules.length name a rule, the others name a UDT */
  const source = op.index < rules.length ? rules[op.index] : udts[op.index - rules.length];
  const name = source.lower;
  const frame = op.bkrMode === id.BKR_MODE_PM ? sysData.pFrame.getPhrase(name) : sysData.uFrame.getPhrase(name);
  const insensitive = op.bkrCase === id.BKR_MODE_CI;
  if (frame === null) {
    /* nothing has been saved for this name yet */
    return;
  }
  const savedIndex = frame.phraseIndex;
  const len = frame.phraseLength;
  if (len === 0) {
    sysData.state = id.EMPTY;
    return;
  }
  if (!(phraseIndex + len <= charsEnd)) {
    /* not enough input left for the saved phrase */
    return;
  }
  /* for a case-insensitive reference both sides are folded to lower case (codes 65-90 only) */
  const normalize = function (code) {
    return insensitive && code >= 65 && code <= 90 ? code + 32 : code;
  };
  for (let offset = 0; offset < len; offset += 1) {
    if (normalize(chars[phraseIndex + offset]) !== normalize(chars[savedIndex + offset])) {
      return;
    }
  }
  sysData.state = id.MATCH;
  sysData.phraseLength = len;
};
// The `BKA` operator.<br>
// This is the positive `look behind` operator.
// Its child node is executed in look behind mode.
// Returns the EMPTY state if a match is found, NOMATCH otherwise.
// Like the look ahead operators, it never consumes input.
const opBKA = function (opIndex, phraseIndex, sysData) {
  /* enter look behind mode for the duration of the child node */
  lookAround.push({
    lookAround: id.LOOKAROUND_BEHIND,
    anchor: phraseIndex,
  });
  opExecute(opIndex + 1, phraseIndex, sysData);
  lookAround.pop();
  sysData.phraseLength = 0;
  const { state } = sysData;
  if (state === id.EMPTY || state === id.MATCH) {
    sysData.state = id.EMPTY;
  } else if (state !== id.NOMATCH) {
    throw new Error(`opBKA: invalid state ${sysData.state}`);
  }
};
// The `BKN` operator.<br>
// This is the negative `look behind` operator.
// It's child node is parsed right-to-left.
// Returns the EMPTY state if a match is *not* found, NOMATCH otherwise.
// Like the look ahead operators, it always backtracks to `phraseIndex`.
const opBKN = function (opIndex, phraseIndex, sysData) {
  /* evaluate the child node (the opcode following this one) in look-behind mode, */
  /* recording the current phrase index as the look-around anchor */
  lookAround.push({
    lookAround: id.LOOKAROUND_BEHIND,
    anchor: phraseIndex,
  });
  opExecute(opIndex + 1, phraseIndex, sysData);
  lookAround.pop();

  /* a look-behind never reports a phrase of its own */
  sysData.phraseLength = 0;

  /* negate the child's result: any match fails, a failure yields EMPTY */
  const childState = sysData.state;
  if (childState === id.EMPTY || childState === id.MATCH) {
    sysData.state = id.NOMATCH;
  } else if (childState === id.NOMATCH) {
    sysData.state = id.EMPTY;
  } else {
    throw new Error(`opBKN: invalid state ${sysData.state}`);
  }
};
// The right-to-left `CAT` operator.<br>
// Stands in for `CAT` while the parser is in look behind mode.
// The child nodes are executed last-to-first and each matched phrase
// moves the working index further to the left of `phraseIndex`.
// On success the state is EMPTY (nothing matched) or MATCH with the total matched length.
// On failure the state is NOMATCH and the back reference frames and the AST (if any)
// are restored to the lengths they had on entry.
const opCATBehind = function (opIndex, phraseIndex, sysData) {
  const op = opcodes[opIndex];
  /* sizes to roll back to if any child fails */
  const uFrameLength = sysData.uFrame.length();
  const pFrameLength = sysData.pFrame.length();
  let savedAstLength;
  if (thisThis.ast) {
    savedAstLength = thisThis.ast.getLength();
  }

  let cursor = phraseIndex;
  let matchedTotal = 0;
  let failed = false;
  let childNumber = op.children.length;
  while (!failed && childNumber > 0) {
    childNumber -= 1;
    opExecute(op.children[childNumber], cursor, sysData);
    /* the phrase length is accumulated before the state is examined */
    cursor -= sysData.phraseLength;
    matchedTotal += sysData.phraseLength;
    failed = sysData.state === id.NOMATCH;
  }

  if (!failed) {
    sysData.state = matchedTotal === 0 ? id.EMPTY : id.MATCH;
    sysData.phraseLength = matchedTotal;
    return;
  }

  /* a child failed: report NOMATCH and discard whatever the earlier children recorded */
  sysData.state = id.NOMATCH;
  sysData.phraseLength = 0;
  sysData.uFrame.pop(uFrameLength);
  sysData.pFrame.pop(pFrameLength);
  if (thisThis.ast) {
    thisThis.ast.setLength(savedAstLength);
  }
};
// The right-to-left `REP` operator.<br>
// Called for `REP` operators in look behind mode.
// Makes repeated calls to its child node, concatenating matched phrases right to left.
const opREPBehind = function (opIndex, phraseIndex, sysData) {
  const op = opcodes[opIndex];
  /* sizes to roll back to if the repetition count falls short of `op.min` */
  const uFrameLength = sysData.uFrame.length();
  const pFrameLength = sysData.pFrame.length();
  let savedAstLength;
  if (thisThis.ast) {
    savedAstLength = thisThis.ast.getLength();
  }

  let cursor = phraseIndex;
  let matchedTotal = 0;
  let repetitions = 0;
  for (;;) {
    /* nothing remains to the left of the working index */
    if (cursor <= 0) {
      break;
    }
    /* the child node is the opcode immediately following this one */
    opExecute(opIndex + 1, cursor, sysData);
    /* stop when the child fails, and also when it matches an empty phrase */
    /* (an empty child match ends the repetition successfully, see below) */
    if (sysData.state === id.NOMATCH || sysData.state === id.EMPTY) {
      break;
    }
    repetitions += 1;
    matchedTotal += sysData.phraseLength;
    cursor -= sysData.phraseLength;
    /* stop once the maximum repetition count is reached */
    if (repetitions === op.max) {
      break;
    }
  }

  /* success if the child last returned EMPTY or the minimum count was met */
  if (sysData.state === id.EMPTY || repetitions >= op.min) {
    sysData.state = matchedTotal === 0 ? id.EMPTY : id.MATCH;
    sysData.phraseLength = matchedTotal;
    return;
  }

  /* too few repetitions: report NOMATCH and discard what the children recorded */
  sysData.state = id.NOMATCH;
  sysData.phraseLength = 0;
  sysData.uFrame.pop(uFrameLength);
  sysData.pFrame.pop(pFrameLength);
  if (thisThis.ast) {
    thisThis.ast.setLength(savedAstLength);
  }
};
// The right-to-left `TRG` operator.<br>
// Stands in for `TRG` while the parser is in look behind mode.
// Tests the single character immediately to the left of `phraseIndex`
// (that is, `chars[phraseIndex - 1]`) against the inclusive range `op.min` - `op.max`.
// Returns MATCH with a phrase length of 1, or NOMATCH with a phrase length of 0.
const opTRGBehind = function (opIndex, phraseIndex, sysData) {
  const range = opcodes[opIndex];
  /* NOMATCH default */
  sysData.state = id.NOMATCH;
  sysData.phraseLength = 0;
  /* there is no character to the left of the start of the input */
  if (!(phraseIndex > 0)) {
    return;
  }
  const previous = chars[phraseIndex - 1];
  const inRange = range.min <= previous && previous <= range.max;
  if (inRange) {
    sysData.state = id.MATCH;
    sysData.phraseLength = 1;
  }
};
// The right-to-left `TBS` operator.<br>
// Called for `TBS` operators in look behind mode.
// Matches the `TBS` phrase to the left of `phraseIndex`.
// Returns MATCH with the string length as the phrase length if the characters
// `chars[phraseIndex - len]` .. `chars[phraseIndex - 1]` equal `op.string`, NOMATCH otherwise.
const opTBSBehind = function (opIndex, phraseIndex, sysData) {
  const op = opcodes[opIndex];
  /* NOMATCH default - the phrase length is reset explicitly, as in the other */
  /* look-behind terminals, rather than relying on the caller having cleared it */
  sysData.state = id.NOMATCH;
  sysData.phraseLength = 0;
  const len = op.string.length;
  const beg = phraseIndex - len;
  if (beg >= 0) {
    for (let i = 0; i < len; i += 1) {
      if (chars[beg + i] !== op.string[i]) {
        return;
      }
    }
    sysData.state = id.MATCH;
    sysData.phraseLength = len;
  }
};
// The right-to-left `TLS` operator.<br>
// Called for `TLS` operators in look behind mode.
// Matches the `TLS` phrase to the left of `phraseIndex`.
// Input characters in the range `A-Z` (65-90) are folded to lower case before being
// compared to `op.string` (presumably stored lower-cased by the generator - not verifiable from here).
// Returns EMPTY for a zero-length string, MATCH on success, NOMATCH otherwise.
const opTLSBehind = function (opIndex, phraseIndex, sysData) {
  const op = opcodes[opIndex];
  /* NOMATCH default, with the phrase length reset explicitly */
  sysData.state = id.NOMATCH;
  sysData.phraseLength = 0;
  const len = op.string.length;
  if (len === 0) {
    /* EMPTY match allowed for TLS */
    sysData.state = id.EMPTY;
    return;
  }
  const beg = phraseIndex - len;
  if (beg >= 0) {
    for (let i = 0; i < len; i += 1) {
      let char = chars[beg + i];
      if (char >= 65 && char <= 90) {
        char += 32;
      }
      if (char !== op.string[i]) {
        return;
      }
    }
    sysData.state = id.MATCH;
    sysData.phraseLength = len;
  }
};
// The right-to-left back reference operator.<br>
// Matches the back referenced phrase to the left of `phraseIndex`.
// The referenced phrase is looked up by the lower-case name of the rule (`op.index < rules.length`)
// or UDT (otherwise) in the parent frame (`BKR_MODE_PM`) or else the universal frame.
// Returns NOMATCH if no phrase has been recorded or the comparison fails,
// EMPTY if the recorded phrase is empty, MATCH with the phrase length otherwise.
const opBKRBehind = function (opIndex, phraseIndex, sysData) {
  const op = opcodes[opIndex];
  /* NOMATCH default */
  sysData.state = id.NOMATCH;
  sysData.phraseLength = 0;
  let lower;
  if (op.index < rules.length) {
    lower = rules[op.index].lower;
  } else {
    lower = udts[op.index - rules.length].lower;
  }
  const frame = op.bkrMode === id.BKR_MODE_PM ? sysData.pFrame.getPhrase(lower) : sysData.uFrame.getPhrase(lower);
  const insensitive = op.bkrCase === id.BKR_MODE_CI;
  if (frame === null) {
    return;
  }
  const lmIndex = frame.phraseIndex;
  const len = frame.phraseLength;
  if (len === 0) {
    sysData.state = id.EMPTY;
    sysData.phraseLength = 0;
    return;
  }
  const beg = phraseIndex - len;
  if (beg >= 0) {
    /* one comparison loop serves both modes; case folding (ASCII `A-Z` only) */
    /* is applied to both sides when the back reference is case-insensitive */
    for (let i = 0; i < len; i += 1) {
      let code = chars[beg + i];
      let lmcode = chars[lmIndex + i];
      if (insensitive) {
        if (code >= 65 && code <= 90) {
          code += 32;
        }
        if (lmcode >= 65 && lmcode <= 90) {
          lmcode += 32;
        }
      }
      if (code !== lmcode) {
        return;
      }
    }
    sysData.state = id.MATCH;
    sysData.phraseLength = len;
  }
};
// Generalized execution function.<br>
// Having a single, generalized function, allows a single location
// for tracing and statistics gathering functions to be called.
// Tracing and statistics are handled in separate objects.
// However, the parser calls their API to build the object data records.
// See [`trace.js`](./trace.html) and [`stats.js`](./stats.html) for their
// usage.
opExecute = function opExecuteFunc(opIndex, phraseIndex, sysData) { let ret = true; const op = opcodes[opIndex]; nodeHits += 1; if (nodeHits > limitNodeHits) { throw new Error(`parser: maximum number of node hits exceeded: ${limitNodeHits}`); } treeDepth += 1; if (treeDepth > maxTreeDepth) { maxTreeDepth = treeDepth; if (maxTreeDepth > limitTreeDepth) { throw new Error(`parser: maximum parse tree depth exceeded: ${limitTreeDepth}`); } } sysData.refresh(); if (thisThis.trace !== null) { /* collect the trace record for down the parse tree */ const lk = lookAroundValue(); thisThis.trace.down(op, sysData.state, phraseIndex, sysData.phraseLength, lk.anchor, lk.lookAround); } if (inLookBehind()) { switch (op.type) { case id.ALT: opALT(opIndex, phraseIndex, sysData); break; case id.CAT: opCATBehind(opIndex, phraseIndex, sysData); break; case id.REP: opREPBehind(opIndex, phraseIndex, sysData); break; case id.RNM: opRNM(opIndex, phraseIndex, sysData); break; case id.UDT: opUDT(opIndex, phraseIndex, sysData); break; case id.AND: opAND(opIndex, phraseIndex, sysData); break; case id.NOT: opNOT(opIndex, phraseIndex, sysData); break; case id.TRG: opTRGBehind(opIndex, phraseIndex, sysData); break; case id.TBS: opTBSBehind(opIndex, phraseIndex, sysData); break; case id.TLS: opTLSBehind(opIndex, phraseIndex, sysData); break; case id.BKR: opBKRBehind(opIndex, phraseIndex, sysData); break; case id.BKA: opBKA(opIndex, phraseIndex, sysData); break; case id.BKN: opBKN(opIndex, phraseIndex, sysData); break; case id.ABG: opABG(opIndex, phraseIndex, sysData); break; case id.AEN: opAEN(opIndex, phraseIndex, sysData); break; default: ret = false; break; } } else { switch (op.type) { case id.ALT: opALT(opIndex, phraseIndex, sysData); break; case id.CAT: opCAT(opIndex, phraseIndex, sysData); break; case id.REP: opREP(opIndex, phraseIndex, sysData); break; case id.RNM: opRNM(opIndex, phraseIndex, sysData); break; case id.UDT: opUDT(opIndex, phraseIndex, sysData); break; case id.AND: 
opAND(opIndex, phraseIndex, sysData); break; case id.NOT: opNOT(opIndex, phraseIndex, sysData); break; case id.TRG: opTRG(opIndex, phraseIndex, sysData); break; case id.TBS: opTBS(opIndex, phraseIndex, sysData); break; case id.TLS: opTLS(opIndex, phraseIndex, sysData); break; case id.BKR: opBKR(opIndex, phraseIndex, sysData); break; case id.BKA: opBKA(opIndex, phraseIndex, sysData); break; case id.BKN: opBKN(opIndex, phraseIndex, sysData); break; case id.ABG: opABG(opIndex, phraseIndex, sysData); break; case id.AEN: opAEN(opIndex, phraseIndex, sysData); break; default: ret = fa