UNPKG

openrosa-xpath-evaluator

Version:

Wrapper for browsers' XPath evaluator with added support for OpenRosa extensions.

677 lines (651 loc) • 27.6 kB

JavaScript

const { handleOperation } = require('./utils/operation'); const { preprocessNativeArgs } = require('./utils/native'); const { toSnapshotResult } = require('./utils/result'); const { asBoolean, asNumber, asString } = require('./utils/xpath-cast'); /* * From http://www.w3.org/TR/xpath/#section-Expressions XPath infix operator * precedence is left-associative. In the constants that follow, all but the * bottom two bits indicate precedence, and the entire value represents the * unique ID of the operator. * * These values are defined here rather than imported in an object so that they * can be inlined. Copy/paste the definitions into other files where they are * used. */ const OR = 0b00000; // --- precedence group separator const AND = 0b00100; // --- precedence group separator const EQ = 0b01000; const NE = 0b01001; // --- precedence group separator const LT = 0b01100; const LTE = 0b01101; const GT = 0b01110; const GTE = 0b01111; // --- precedence group separator const PLUS = 0b10000; const MINUS = 0b10001; // --- precedence group separator const MULT = 0b10100; const DIV = 0b10101; const MOD = 0b10110; // --- precedence group separator const UNION = 0b11000; // --- end operators const FUNCTION_NAME = /^[a-z]/; const D = 0xdead; // dead-end marker for the unevaluated side of a lazy expression module.exports = function (wrapped, extensions) { const extendedFuncs = extensions.func || {}; const extendedProcessors = extensions.process || {}; const toInternalResult = function (r) { let v; let i; let ordrd; switch (r.resultType) { case XPathResult.NUMBER_TYPE: return { t: 'num', v: r.numberValue }; case XPathResult.BOOLEAN_TYPE: return { t: 'bool', v: r.booleanValue }; case XPathResult.STRING_TYPE: return { t: 'str', v: r.stringValue }; case XPathResult.ORDERED_NODE_ITERATOR_TYPE: ordrd = true; /* falls through */ case XPathResult.UNORDERED_NODE_ITERATOR_TYPE: v = []; // eslint-disable-next-line no-cond-assign while ((i = r.iterateNext())) v.push(i); return { t: 'arr', v, ordrd }; case XPathResult.ORDERED_NODE_SNAPSHOT_TYPE: ordrd = true; /* falls through */ case XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE: v = []; for (i = 0; i < r.snapshotLength; ++i) { v.push(r.snapshotItem(i)); } return { t: 'arr', v, ordrd }; case XPathResult.ANY_UNORDERED_NODE_TYPE: case XPathResult.FIRST_ORDERED_NODE_TYPE: return { t: 'arr', v: [r.singleNodeValue] }; default: throw new Error(`no handling for result type: ${r.resultType}`); } }; const toExternalResult = function (r, rt) { if (extendedProcessors.toExternalResult) { const res = extendedProcessors.toExternalResult(r, rt); if (res) return res; } switch (rt) { case null: case undefined: case XPathResult.ANY_TYPE: // derive return type from the return value switch (r.t) { case 'num': return toExternalResult(r, XPathResult.NUMBER_TYPE); case 'str': return toExternalResult(r, XPathResult.STRING_TYPE); case 'bool': return toExternalResult(r, XPathResult.BOOLEAN_TYPE); case 'arr': return toExternalResult( r, XPathResult.UNORDERED_NODE_ITERATOR_TYPE ); default: throw new Error(`unrecognised internal type: ${r.t}`); } case XPathResult.NUMBER_TYPE: return { resultType: rt, stringValue: asString(r), numberValue: asNumber(r), }; case XPathResult.STRING_TYPE: return { resultType: rt, stringValue: asString(r) }; case XPathResult.BOOLEAN_TYPE: return { resultType: rt, stringValue: asString(r), booleanValue: asBoolean(r), }; case XPathResult.UNORDERED_NODE_ITERATOR_TYPE: case XPathResult.ORDERED_NODE_ITERATOR_TYPE: case XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE: case XPathResult.ORDERED_NODE_SNAPSHOT_TYPE: case XPathResult.ANY_UNORDERED_NODE_TYPE: case XPathResult.FIRST_ORDERED_NODE_TYPE: return toSnapshotResult(r, rt); default: throw new Error('unrecognised return type:', rt); } }; const typefor = function (val) { if (extendedProcessors.typefor) { const res = extendedProcessors.typefor(val); if (res) return res; } if (typeof val === 'boolean') return 'bool'; if (typeof val === 'number') return 'num'; return 'str'; }; /** * @type {typeof document.evaluate} * @see https://developer.mozilla.org/en-US/docs/Web/API/Document/evaluate */ const evaluate = function ( input, cN, nR, rT, _, contextSize = 1, contextPosition = 1 ) { let i; let cur; const stack = [{ t: 'root', tokens: [] }]; const peek = () => stack[stack.length - 1]; const pushToken = (t) => { const { tokens } = peek(); if (prevToken() !== D || t !== D) tokens.push(t); }; const isDeadBranch = () => { const { dead, t, tokens } = peek(); if (dead) return true; if (t === 'fn') { return prevToken() === D; } return tokens.includes(D); }; const err = (m) => { throw new Error((m || '') + JSON.stringify({ stack, cur })); }; const newCurrent = function () { cur = { v: '' }; }; const pushOp = function (t) { if (t <= AND) { evalOps(t); } pushToken({ t: 'op', v: t }); if (t <= AND) { const { tokens } = peek(); const prev = asBoolean(tokens[tokens.length - 2]); if (t === OR ? prev : !prev) pushToken(D); } newCurrent(); }; const callFn = function (name, supplied) { // Every second arg should be a comma, but we allow for a trailing comma. // From the spec, this looks valid, if you assume that ExprWhitespace is a // valid Expr. // see: https://www.w3.org/TR/1999/REC-xpath-19991116/#section-Function-Calls const args = []; for (let i = 0; i < supplied.length; ++i) { if (i % 2) { if (supplied[i] !== ',') throw new Error( `Weird args (should be separated by commas):${JSON.stringify( supplied )}` ); } else args.push(supplied[i]); } if (Object.prototype.hasOwnProperty.call(extendedFuncs, name)) { return extendedFuncs[name].apply( { cN, contextSize, contextPosition }, args ); } return callNative(name, preprocessNativeArgs(name, args)); }; const callNative = function (name, args) { let argString = `${name}(`; for (let i = 0; i < args.length; ++i) { if (i) argString += ','; const arg = args[i]; switch (arg.t) { case 'arr': throw new Error( `callNative() can't handle nodeset functions yet for ${name}()` ); case 'bool': argString += `${arg.v}()`; break; case 'num': if (arg.v === Infinity) argString += '( 1 div 0)'; else if (arg.v === -Infinity) argString += '(-1 div 0)'; else argString += arg.v.toFixed(20); // Prevent JS from converting to scientific notation break; case 'str': { const quote = arg.quote || (arg.v.indexOf('"') === -1 ? '"' : "'"); // Firefox's native XPath implementation is 3.0, but Chrome's is 1.0. // XPath 1.0 has no support for escaping quotes in strings, so: if (arg.v.indexOf(quote) !== -1) throw new Error( `Quote character found in String Literal: ${JSON.stringify( arg.v )}` ); argString += quote + arg.v + quote; break; } // there aren't any other native types TODO do we need a hook for allowing date conversion? default: break; } } return toInternalResult( wrapped.evaluate( `${argString})`, cN, nR, XPathResult.ANY_TYPE, null ) ); }; const evalOp = function (lhs, op, rhs) { if (op > AND && (lhs === D || rhs === D)) { return D; } if (extendedProcessors.handleInfix) { let res = extendedProcessors.handleInfix(err, lhs, op, rhs); if (res && res.t === 'continue') { lhs = res.lhs; op = res.op; rhs = res.rhs; res = null; } if (typeof res !== 'undefined' && res !== null) return res; } return handleOperation(lhs, op, rhs); }; const evalOps = function (lastOp) { const { tokens } = peek(); if (tokens.length < 2) return; if (tokens[2] === D && tokens[1].v >= lastOp) { const endExpr = tokens.indexOf(',', 2); tokens.splice(0, endExpr === -1 ? tokens.length : endExpr, { t: 'bool', v: asBoolean(tokens[0]), }); } for (let j = UNION; j >= lastOp; j -= 0b100) { let i = 1; while (i < tokens.length - 1) { if (tokens[i].t === 'op' && tokens[i].v >= j) { const res = evalOp( tokens[i - 1], tokens[i].v, tokens[i + 1] ); tokens.splice(i, 2); tokens[i - 1] = { t: typefor(res), v: res }; } else ++i; } } }; const handleXpathExpr = function () { if (isDeadBranch()) { newCurrent(); return; } let expr = cur.v; const prev = prevToken(); if (prev && prev.t === 'arr') { // chop the leading slash from expr if (expr.charAt(0) !== '/') err( `not sure how to handle expression called on nodeset that doesn't start with a '/': ${expr}` ); // prefix a '.' to make the expression relative to the context node: expr = wrapped.createExpression(`.${expr}`, nR); const newNodeset = []; prev.v.forEach((node) => { const res = toInternalResult(expr.evaluate(node)); newNodeset.push(...res.v); }); prev.v = newNodeset; } else { // This addresses a bug in Chrome and Safari, where an absolute // nodeset expression evaluated with an attribute contex node // does not evaluate to that nodeset as expected. Using the // attribute's owner document evaluates the expression correctly, // ensuring consistent behavior between Chrome, Safari and Firefox. const contextNode = cN?.nodeType === Node.ATTRIBUTE_NODE && expr.startsWith('/') ? cN.ownerDocument : cN; pushToken( toInternalResult( wrapped.evaluate( expr, contextNode, nR, XPathResult.ANY_TYPE, null ) ) ); } newCurrent(); }; const nextChar = function () { return input.charAt(i + 1); }; const finaliseNum = function () { cur.v = parseFloat(cur.str); pushToken(cur); newCurrent(); }; const prevToken = function () { const peeked = peek().tokens; return peeked[peeked.length - 1]; }; const isNum = function (c) { return c >= '0' && c <= '9'; }; newCurrent(); for (i = 0; i < input.length; ++i) { const c = input.charAt(i); if (cur.t === 'sq') { // Build the entire expression found within the square brackets: // // > A predicate filters a node-set with respect to an axis to produce a // > new node-set. For each node in the node-set to be filtered, the // > PredicateExpr is evaluated with that node as the context node, with // > the number of nodes in the node-set as the context size, and with // > the proximity position of the node in the node-set with respect to // > the axis as the context position; if PredicateExpr evaluates to // > true for that node, the node is included in the new node-set; // > otherwise, it is not included. // - https://www.w3.org/TR/1999/REC-xpath-19991116/#predicates // // Note because the ']' character is allowed within a Literal (string), // there is special handling for tracking when we're within a string. if (cur.inString) { if (cur.inString === c) delete cur.inString; } else if (c === '[') { ++cur.depth; } else if (c === "'" || c === '"') { cur.inString = c; } else if (c === ']') { if (--cur.depth) { cur.v += c; } else { if (isDeadBranch()) { newCurrent(); continue; } let contextNodes; const prev = prevToken(); if (prev.t === 'arr') { contextNodes = prev.v; } else throw new Error( 'Not sure how to handle context node for predicate in this situation.' ); // > A PredicateExpr is evaluated by evaluating the Expr and converting // > the result to a boolean. If the result is a number, the result will // > be converted to true if the number is equal to the context position // > and will be converted to false otherwise; if the result is not a // > number, then the result will be converted as if by a call to the // > boolean function. Thus a location path para[3] is equivalent to // > para[position()=3]. // - https://www.w3.org/TR/1999/REC-xpath-19991116/#predicates const expr = cur.v; const filteredNodes = contextNodes.filter((cN, i) => { const res = toInternalResult( evaluate( expr, cN, nR, XPathResult.ANY_TYPE, null, contextNodes.length, i + 1 ) ); return res.t === 'num' ? asNumber(res) === 1 + i : asBoolean(res); }); prev.v = filteredNodes; newCurrent(); } continue; } cur.v += c; continue; } if (cur.t === 'str') { if (c === cur.quote) { pushToken(cur); newCurrent(); } else cur.v += c; continue; } if (cur.t === 'num') { if ( isNum(c) || c === 'e' || (c === '-' && input[i - 1] === 'e') ) { cur.str += c; continue; } else if (c === ' ' && cur.str === '-') { continue; } else if (c === '.' && !cur.decimal) { cur.decimal = 1; cur.str += c; } else finaliseNum(); } if (isNum(c)) { if (cur.v === '') { cur = { t: 'num', str: c }; } else cur.v += c; } else switch (c) { case "'": case '"': if (cur.v === '') { cur = { t: 'str', quote: c, v: '' }; } else err(`Not sure how to handle: ${c}`); break; case '(': stack.push({ v: cur.v, t: 'fn', dead: isDeadBranch(), tokens: [], }); newCurrent(); break; case ')': if (cur.v !== '') handleXpathExpr(); evalOps(OR); cur = stack.pop(); if (cur.t !== 'fn') err('")" outside function!'); if (cur.dead) { pushToken(D); } else if (cur.v) { if (cur.v.charAt(0) === '/') { if (cur.tokens.length) err( 'Unexpected args for node test function!' ); cur.v += '()'; handleXpathExpr(); } else { pushToken(callFn(cur.v, cur.tokens)); } } else { // bracketed expression if (cur.tokens.length !== 1) err( `Expected one token, but found: ${cur.tokens.length}` ); pushToken(cur.tokens[0]); } newCurrent(); break; case ',': if (peek().t !== 'fn') err('Unexpected comma outside function arguments.'); if (cur.v) handleXpathExpr(); pushToken(','); break; case '*': { // check if part of an XPath expression const prev = prevToken(); if ( !prev || prev === ',' || prev.t === 'op' || cur.v ) { cur.v += c; break; } pushOp(MULT); } break; case '-': { const prev = prevToken(); if ( cur.v !== '' && nextChar() !== ' ' && input.charAt(i - 1) !== ' ' ) { // function name expr cur.v += c; } else if ( cur.v === '' && (!prev || // match case: ...+-1 prev.t === 'op' || // previous was a separate function arg prev === ',') ) { // -ve number cur = { t: 'num', str: '-' }; } else { // TODO do we need to check for cur.v here? pushOp(MINUS); } } break; case '=': switch (cur.v) { case '<': pushOp(LTE); break; case '>': pushOp(GTE); break; case '!': pushOp(NE); break; default: if (cur.v) handleXpathExpr(); pushOp(EQ); } break; case '!': if (cur.v) handleXpathExpr(); cur.v = c; break; case '>': case '<': if (cur.v) handleXpathExpr(); if (nextChar() === '=') { cur.v = c; } else { pushOp(c === '>' ? GT : LT); } break; case '+': if (cur.v) handleXpathExpr(); pushOp(PLUS); break; case '|': if (cur.v) handleXpathExpr(); pushOp(UNION); break; case '\n': case '\r': case '\t': case ' ': // whitespace, as defined at https://www.w3.org/TR/REC-xml/#NT-S if (cur.v === '') break; // trim leading whitespace if (!FUNCTION_NAME.test(cur.v)) handleXpathExpr(); break; case 'v': // Mad as it seems, according to https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex, // there is no requirement for ExprWhitespace before or after any // ExprToken, including OperatorName. if (cur.v === 'di') { // OperatorName: 'div' pushOp(DIV); } else cur.v += c; break; case 'r': // Mad as it seems, according to https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex, // there is no requirement for ExprWhitespace before or after any // ExprToken, including OperatorName. if (cur.v === 'o') { // OperatorName: 'or' pushOp(OR); } else cur.v += c; break; case 'd': // Mad as it seems, according to https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex, // there is no requirement for ExprWhitespace before or after any // ExprToken, including OperatorName. if (cur.v === 'an') { // OperatorName: 'and' pushOp(AND); } else if (cur.v === 'mo') { // OperatorName: 'mod' pushOp(MOD); } else cur.v += c; break; case '[': // evaluate previous part if there is any if (cur.v) { handleXpathExpr(); newCurrent(); } cur.t = 'sq'; cur.depth = 1; break; case '.': if (cur.v === '' && isNum(nextChar())) { cur = { t: 'num', str: c }; break; } /* falls through */ default: cur.v += c; } } if (cur.t === 'num') finaliseNum(); if (cur.v) handleXpathExpr(); if (stack.length !== 1) err('Stuff left on stack.'); if (stack[0].t !== 'root') err('Weird stuff on stack.'); if (stack[0].tokens.length === 0) err('No tokens.'); evalOps(OR); if (stack[0].tokens.length !== 1) err('Too many tokens.'); return toExternalResult(stack[0].tokens[0], rT); }; this.evaluate = evaluate; };