@aws-lambda-powertools/jmespath
Version:
A type safe and modern jmespath module to parse and extract data from JSON documents using JMESPath
727 lines (726 loc) • 25.1 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.Parser = void 0;
const node_crypto_1 = require("node:crypto");
const Lexer_js_1 = require("./Lexer.js");
const ParsedResult_js_1 = require("./ParsedResult.js");
const ast_js_1 = require("./ast.js");
const constants_js_1 = require("./constants.js");
const errors_js_1 = require("./errors.js");
/**
* Top down operator precedence parser for JMESPath.
*
* ## References
* The implementation of this Parser is based on the implementation of
* [jmespath.py](https://github.com/jmespath/jmespath.py/), which in turn
* is based on [Vaughan R. Pratt's "Top Down Operator Precedence"](http://dl.acm.org/citation.cfm?doid=512927.512931).
*
* If you don't want to read the full paper, there are some other good
* overviews that explain the general idea:
* - [Pratt Parsers: Expression Parsing Made Easy](https://journal.stuffwithstuff.com/2011/03/19/pratt-parsers-expression-parsing-made-easy/)
* - [Simple Top-Down Parsing in Python](https://11l-lang.org/archive/simple-top-down-parsing/)
* - [Top Down Operator Precedence](http://javascript.crockford.com/tdop/tdop.html)
*/
class Parser {
    /**
     * The maximum binding power for a token
     * that can stop a projection.
     */
    #projectionStop = 10;
    /**
     * Cache of parsed results, keyed by the expression string.
     *
     * A `Map` is used rather than a plain object so that expressions which
     * happen to be named like `Object.prototype` members (for example
     * `toString`, `constructor` or `__proto__`) are never mistaken for
     * cache hits and can never touch the prototype chain.
     */
    #cache = new Map();
    /**
     * The maximum size of the cache.
     */
    #maxCacheSize = 128;
    /**
     * The lexer instance used for the expression currently being parsed.
     */
    #tokenizer;
    /**
     * The tokens of the expression currently being parsed.
     */
    #tokens;
    /**
     * Position of the current token within `#tokens`.
     */
    #index = 0;
    /**
     * @param lookahead Initial length of the (empty) token buffer; the buffer
     * is replaced with the real token list on every parse.
     */
    constructor(lookahead = 2) {
        this.#tokens = Array.from({ length: lookahead });
    }
    /**
     * Parse a JMESPath expression and return the Abstract Syntax Tree (AST)
     * that represents the expression.
     *
     * The AST is cached, so if you parse the same expression multiple times,
     * the AST will be returned from the cache.
     *
     * @param expression The JMESPath expression to parse.
     */
    parse(expression) {
        const cached = this.#cache.get(expression);
        if (cached !== undefined) {
            return cached;
        }
        const parsedResult = this.#doParse(expression);
        this.#cache.set(expression, parsedResult);
        if (this.#cache.size > this.#maxCacheSize) {
            this.#evictCache();
        }
        return parsedResult;
    }
    /**
     * Purge the entire cache.
     */
    purgeCache() {
        this.#cache = new Map();
    }
    /**
     * Do the actual parsing of the expression.
     *
     * Lexer, incomplete-expression and parse errors are enriched with the
     * expression that caused them before being rethrown; any other error
     * is rethrown untouched.
     *
     * @param expression The JMESPath expression to parse.
     */
    #doParse(expression) {
        try {
            return this.#parse(expression);
        }
        catch (error) {
            if (error instanceof errors_js_1.LexerError ||
                error instanceof errors_js_1.IncompleteExpressionError ||
                error instanceof errors_js_1.ParseError) {
                error.setExpression(expression);
            }
            throw error;
        }
    }
    /**
     * Parse a JMESPath expression and return the parsed result.
     *
     * The expression is tokenized up front, the token index is reset and
     * the whole expression is parsed with the lowest binding power. Any
     * token other than `eof` left over afterwards is a parse error.
     *
     * @param expression The JMESPath expression to parse.
     */
    #parse(expression) {
        this.#tokenizer = new Lexer_js_1.Lexer();
        this.#tokens = [...this.#tokenizer.tokenize(expression)];
        this.#index = 0;
        const parsed = this.#expression(0);
        if (this.#currentToken() !== 'eof') {
            this.#throwParseError();
        }
        return new ParsedResult_js_1.ParsedResult(expression, parsed);
    }
    /**
     * Process an expression.
     *
     * The first token is consumed and handled by its nud function; then,
     * for as long as the current token binds tighter than `bindingPower`,
     * that token is consumed and its led function folds it into the left
     * hand side built so far.
     *
     * @param bindingPower The binding power below which parsing stops.
     */
    #expression(bindingPower = 0) {
        const leftToken = this.#lookaheadToken(0);
        this.#advance();
        let left = this.#getNudFunction(leftToken);
        let currentToken = this.#currentToken();
        while (bindingPower < constants_js_1.BINDING_POWER[currentToken]) {
            this.#advance();
            left = this.#getLedFunction(currentToken, left);
            currentToken = this.#currentToken();
        }
        return left;
    }
    /**
     * Run the nud handler for a token. This is the handler that
     * is called when a token is found at the beginning of an expression.
     *
     * @param token The token to run the nud handler for.
     */
    #getNudFunction(token) {
        const { type: tokenType } = token;
        switch (tokenType) {
            case 'literal':
                return (0, ast_js_1.literal)(token.value);
            case 'unquoted_identifier':
                return (0, ast_js_1.field)(token.value);
            case 'quoted_identifier':
                return this.#processQuotedIdentifier(token);
            case 'star':
                return this.#processStarToken();
            case 'filter':
                // A leading filter applies to the current node.
                return this.#getLedFunction(tokenType, (0, ast_js_1.identity)());
            case 'lbrace':
                return this.#parseMultiSelectHash();
            case 'lparen':
                return this.#processLParenTokenNud();
            case 'flatten':
                return this.#processFlattenTokenNud();
            case 'not':
                return (0, ast_js_1.notExpression)(this.#expression(constants_js_1.BINDING_POWER.not));
            case 'lbracket':
                return this.#processLBracketTokenNud();
            case 'current':
                return (0, ast_js_1.currentNode)();
            case 'expref':
                return (0, ast_js_1.expref)(this.#expression(constants_js_1.BINDING_POWER.expref));
            default:
                return this.#processDefaultToken(token);
        }
    }
    /**
     * Process a quoted identifier.
     *
     * A quoted identifier is a string that is enclosed in double quotes.
     * It cannot be followed by a left parenthesis, because quoted
     * identifiers are not allowed as function names.
     *
     * @example s."foo"
     *
     * @param token The token to process
     */
    #processQuotedIdentifier(token) {
        const fieldValue = (0, ast_js_1.field)(token.value);
        if (this.#currentToken() === 'lparen') {
            this.#throwParseError({
                lexPosition: 0,
                reason: 'quoted identifiers cannot be used as a function name',
            });
        }
        return fieldValue;
    }
    /**
     * Process a star token.
     *
     * A star token is a syntax that allows you to project all the
     * elements in a list or dictionary.
     *
     * @example foo[*]
     */
    #processStarToken() {
        const left = (0, ast_js_1.identity)();
        let right;
        if (this.#currentToken() === 'rbracket') {
            right = (0, ast_js_1.identity)();
        }
        else {
            right = this.#parseProjectionRhs(constants_js_1.BINDING_POWER.star);
        }
        return (0, ast_js_1.valueProjection)(left, right);
    }
    /**
     * Process a left parenthesis token.
     *
     * A left parenthesis token is a syntax that allows you to group
     * expressions together.
     *
     * @example (foo.bar)
     */
    #processLParenTokenNud() {
        const expression = this.#expression();
        this.#match('rparen');
        return expression;
    }
    /**
     * Process a flatten token.
     *
     * A flatten token is a syntax that allows you to flatten the
     * results of a subexpression.
     *
     * @example foo[].bar
     */
    #processFlattenTokenNud() {
        const left = (0, ast_js_1.flatten)((0, ast_js_1.identity)());
        const right = this.#parseProjectionRhs(constants_js_1.BINDING_POWER.flatten);
        return (0, ast_js_1.projection)(left, right);
    }
    /**
     * Process a left bracket token found at the beginning of an expression.
     *
     * Depending on what follows the bracket this is an index or slice
     * (`[0]`, `[0:2]`), a list projection (`[*]`) or a multi-select
     * list (`[foo, bar]`).
     *
     * @example [0]
     */
    #processLBracketTokenNud() {
        if (['number', 'colon'].includes(this.#currentToken())) {
            const right = this.#parseIndexExpression();
            return this.#projectIfSlice((0, ast_js_1.identity)(), right);
        }
        if (this.#currentToken() === 'star' && this.#lookahead(1) === 'rbracket') {
            // Skip over the `*` and the `]`
            this.#advance();
            this.#advance();
            const right = this.#parseProjectionRhs(constants_js_1.BINDING_POWER.star);
            return (0, ast_js_1.projection)((0, ast_js_1.identity)(), right);
        }
        return this.#parseMultiSelectList();
    }
    /**
     * Process a token that has no nud handler.
     *
     * Such a token cannot start an expression, so this method always
     * throws: an `IncompleteExpressionError` when the token is `eof`,
     * a `ParseError` otherwise.
     *
     * @param token The token to process
     */
    #processDefaultToken(token) {
        this.#throwUnexpectedToken(token);
    }
    /**
     * Run the led handler for a token. This is the handler that
     * is called when a token is found in the middle of an expression.
     *
     * @param tokenType The type of token to run the led handler for.
     * @param leftNode The left hand side of the expression.
     */
    #getLedFunction(tokenType, leftNode) {
        switch (tokenType) {
            case 'dot':
                return this.#processDotToken(leftNode);
            case 'pipe':
                return this.#processPipeToken(leftNode);
            case 'or':
                return this.#processOrToken(leftNode);
            case 'and':
                return this.#processAndToken(leftNode);
            case 'lparen':
                return this.#processLParenToken(leftNode);
            case 'filter':
                return this.#processFilterToken(leftNode);
            case 'eq':
            case 'ne':
            case 'gt':
            case 'gte':
            case 'lt':
            case 'lte':
                return this.#parseComparator(leftNode, tokenType);
            case 'flatten':
                return this.#processFlattenToken(leftNode);
            case 'lbracket':
                return this.#processLBracketToken(leftNode);
            default:
                return this.#throwParseError();
        }
    }
    /**
     * Process a dot token.
     *
     * A dot token is a syntax that allows you to access
     * fields in a dictionary or elements in a list.
     *
     * @example foo.bar
     *
     * @param leftNode The left hand side of the expression.
     */
    #processDotToken(leftNode) {
        if (this.#currentToken() !== 'star') {
            const right = this.#parseDotRhs(constants_js_1.BINDING_POWER.dot);
            if (leftNode.type === 'subexpression') {
                // Extend the existing subexpression instead of nesting a new one.
                leftNode.children.push(right);
                return leftNode;
            }
            return (0, ast_js_1.subexpression)([leftNode, right]);
        }
        // We are creating a value projection
        this.#advance();
        const right = this.#parseProjectionRhs(constants_js_1.BINDING_POWER.dot);
        return (0, ast_js_1.valueProjection)(leftNode, right);
    }
    /**
     * Process a pipe token.
     *
     * A pipe token is a syntax that allows you to combine two
     * expressions using the pipe operator.
     *
     * @example foo | bar
     *
     * @param leftNode The left hand side of the expression.
     */
    #processPipeToken(leftNode) {
        const right = this.#expression(constants_js_1.BINDING_POWER.pipe);
        return (0, ast_js_1.pipe)(leftNode, right);
    }
    /**
     * Process an or token.
     *
     * An or token is a syntax that allows you to combine two
     * expressions using the logical or operator.
     *
     * @example foo || bar
     *
     * @param leftNode The left hand side of the expression.
     */
    #processOrToken(leftNode) {
        const right = this.#expression(constants_js_1.BINDING_POWER.or);
        return (0, ast_js_1.orExpression)(leftNode, right);
    }
    /**
     * Process an and token.
     *
     * An and token is a syntax that allows you to combine two
     * expressions using the logical and operator.
     *
     * @example foo && bar
     *
     * @param leftNode The left hand side of the expression.
     */
    #processAndToken(leftNode) {
        const right = this.#expression(constants_js_1.BINDING_POWER.and);
        return (0, ast_js_1.andExpression)(leftNode, right);
    }
    /**
     * Process a left parenthesis token found after an expression.
     *
     * This is a function call: the function name is read from the
     * `value` of the left node and the arguments are parsed until the
     * closing parenthesis is reached.
     *
     * @example length(foo)
     *
     * @param leftNode The left hand side of the expression, i.e. the function name.
     */
    #processLParenToken(leftNode) {
        const name = leftNode.value;
        const args = [];
        while (this.#currentToken() !== 'rparen') {
            const expression = this.#expression();
            // NOTE(review): the comma between arguments is consumed when
            // present but is not enforced.
            if (this.#currentToken() === 'comma') {
                this.#match('comma');
            }
            args.push(expression);
        }
        this.#match('rparen');
        return (0, ast_js_1.functionExpression)(name, args);
    }
    /**
     * Process a filter token.
     *
     * The condition is parsed up to the closing bracket, then the right
     * hand side of the resulting filter projection is parsed. When the
     * filter is immediately followed by a flatten token the right hand
     * side is the identity node.
     *
     * @example foo[?bar == `1`]
     *
     * @param leftNode The left hand side of the expression.
     */
    #processFilterToken(leftNode) {
        // Filters are projections
        const condition = this.#expression(0);
        this.#match('rbracket');
        let right;
        if (this.#currentToken() === 'flatten') {
            right = (0, ast_js_1.identity)();
        }
        else {
            right = this.#parseProjectionRhs(constants_js_1.BINDING_POWER.flatten);
        }
        return (0, ast_js_1.filterProjection)(leftNode, right, condition);
    }
    /**
     * Process a flatten token found after an expression.
     *
     * The left node is wrapped in a flatten node which becomes the left
     * hand side of a projection.
     *
     * @example foo[].bar
     *
     * @param leftNode The left hand side of the expression.
     */
    #processFlattenToken(leftNode) {
        const left = (0, ast_js_1.flatten)(leftNode);
        const right = this.#parseProjectionRhs(constants_js_1.BINDING_POWER.flatten);
        return (0, ast_js_1.projection)(left, right);
    }
    /**
     * Process a left bracket token found after an expression.
     *
     * When the bracket is followed by a number or a colon this is an
     * index or slice expression (`foo[0]`, `foo[0:2]`); anything else
     * must be a list projection (`foo[*]`).
     *
     * @example foo[0]
     *
     * @param leftNode The left hand side of the expression.
     */
    #processLBracketToken(leftNode) {
        const token = this.#lookaheadToken(0);
        if (['number', 'colon'].includes(token.type)) {
            const right = this.#parseIndexExpression();
            if (leftNode.type === 'index_expression') {
                // Optimization: if the left node is an index expression
                // we can avoid creating another node and instead just
                // add the right node as a child of the left node.
                leftNode.children.push(right);
                return leftNode;
            }
            return this.#projectIfSlice(leftNode, right);
        }
        // We have a projection
        this.#match('star');
        this.#match('rbracket');
        const right = this.#parseProjectionRhs(constants_js_1.BINDING_POWER.star);
        return (0, ast_js_1.projection)(leftNode, right);
    }
    /**
     * Throw a parse error.
     *
     * This type of error indicates that the parser encountered
     * a syntax error while processing the expression.
     *
     * The error includes the position in the expression where
     * the error occurred, the value of the token that caused
     * the error, the type of the token, and an optional reason.
     * Position, value and type default to those of the current token.
     *
     * @param options The options to use when throwing the error.
     */
    #throwParseError(options) {
        const token = this.#lookaheadToken(0);
        throw new errors_js_1.ParseError({
            lexPosition: options?.lexPosition ?? token.start,
            tokenValue: options?.tokenValue ?? token.value,
            tokenType: options?.tokenType ?? token.type,
            reason: options?.reason,
        });
    }
    /**
     * Throw the error that corresponds to an unexpected token.
     *
     * Running into `eof` means the expression ended too early, which is
     * reported as an `IncompleteExpressionError`; any other token is
     * reported as a `ParseError`.
     *
     * @param token The unexpected token.
     */
    #throwUnexpectedToken(token) {
        const details = {
            lexPosition: token.start,
            tokenValue: token.value,
            tokenType: token.type,
        };
        if (token.type === 'eof') {
            throw new errors_js_1.IncompleteExpressionError(details);
        }
        throw new errors_js_1.ParseError(details);
    }
    /**
     * Process an index expression.
     *
     * An index expression is a syntax that allows you to
     * access elements in a list. For example
     * `foo[0]` accesses the first element in the list `foo`.
     */
    #parseIndexExpression() {
        // We're here:
        // [<current>
        //  ^
        //  | (currentToken)
        if (this.#lookahead(0) === 'colon' || this.#lookahead(1) === 'colon') {
            return this.#parseSliceExpression();
        }
        // Parse the syntax [number]
        const node = (0, ast_js_1.index)(this.#lookaheadToken(0).value);
        this.#advance();
        this.#match('rbracket');
        return node;
    }
    /**
     * Process a slice expression.
     *
     * A slice expression is a syntax that allows you to
     * access a range of elements in a list. For example
     * `foo[0:10:2]` accesses every other element in the
     * list `foo` from index 0 to 10.
     *
     * In a slice expression, the first index represents the
     * start of the slice, the second index represents the
     * end of the slice, and the third index represents the
     * step. All three are optional, and so is the last colon;
     * parts that are omitted are passed on as `undefined`.
     */
    #parseSliceExpression() {
        // [start:end:step]
        // Where start, end, and step are optional.
        // The last colon is optional as well.
        const parts = [];
        let index = 0;
        let currentToken = this.#currentToken();
        while (currentToken !== 'rbracket' && index < 3) {
            if (currentToken === 'colon') {
                index += 1;
                if (index === 3) {
                    // A third colon is one too many
                    this.#throwParseError();
                }
                this.#advance();
            }
            else if (currentToken === 'number') {
                parts[index] = this.#lookaheadToken(0).value;
                this.#advance();
            }
            else {
                this.#throwParseError();
            }
            currentToken = this.#currentToken();
        }
        this.#match('rbracket');
        return (0, ast_js_1.slice)(parts[0], parts[1], parts[2]);
    }
    /**
     * Build an index expression and, if the right hand side
     * is a slice, wrap it in a projection.
     *
     * @param left The left hand side of the index expression.
     * @param right The index or slice node.
     */
    #projectIfSlice(left, right) {
        const idxExpression = (0, ast_js_1.indexExpression)([left, right]);
        if (right.type === 'slice') {
            return (0, ast_js_1.projection)(idxExpression, this.#parseProjectionRhs(constants_js_1.BINDING_POWER.star));
        }
        return idxExpression;
    }
    /**
     * Process a comparator.
     *
     * A comparator is a syntax that allows you to compare
     * two values. For example `foo == bar` compares the
     * value of `foo` with the value of `bar`.
     *
     * @param left The left hand side of the comparator.
     * @param comparatorChar The comparator token type (`eq`, `ne`, `gt`, `gte`, `lt` or `lte`).
     */
    #parseComparator(left, comparatorChar) {
        return (0, ast_js_1.comparator)(comparatorChar, left, this.#expression(constants_js_1.BINDING_POWER[comparatorChar]));
    }
    /**
     * Process a multi-select list.
     *
     * A multi-select list is a syntax that allows you to
     * evaluate several expressions and collect the results
     * in a list. For example `foo.[a, b]`.
     *
     * The opening bracket must already have been consumed.
     */
    #parseMultiSelectList() {
        const expressions = [];
        while (true) {
            const expression = this.#expression();
            expressions.push(expression);
            if (this.#currentToken() === 'rbracket') {
                break;
            }
            this.#match('comma');
        }
        this.#match('rbracket');
        return (0, ast_js_1.multiSelectList)(expressions);
    }
    /**
     * Process a multi-select hash.
     *
     * A multi-select hash is a syntax that allows you to
     * build an object out of several key/expression pairs.
     * For example `foo.{a: a, b: b}`.
     *
     * The opening brace must already have been consumed.
     */
    #parseMultiSelectHash() {
        const pairs = [];
        while (true) {
            const keyToken = this.#lookaheadToken(0);
            // Before getting the token value, verify it's
            // an identifier.
            this.#matchMultipleTokens(['quoted_identifier', 'unquoted_identifier']); // token types
            const keyName = keyToken.value;
            this.#match('colon');
            const value = this.#expression(0);
            const node = (0, ast_js_1.keyValPair)(keyName, value);
            pairs.push(node);
            // NOTE(review): when the value is followed by neither a comma
            // nor a closing brace the loop simply starts over, so the next
            // token is validated as a key by the check above.
            if (this.#currentToken() === 'comma') {
                this.#match('comma');
            }
            else if (this.#currentToken() === 'rbrace') {
                this.#match('rbrace');
                break;
            }
        }
        return (0, ast_js_1.multiSelectObject)(pairs);
    }
    /**
     * Process the right hand side of a projection.
     *
     * @param bindingPower The binding power of the current token.
     */
    #parseProjectionRhs(bindingPower) {
        const tokenType = this.#currentToken();
        // Tokens whose binding power is below the projection stop
        // end the projection.
        if (constants_js_1.BINDING_POWER[tokenType] < this.#projectionStop) {
            return (0, ast_js_1.identity)();
        }
        if (tokenType === 'lbracket' || tokenType === 'filter') {
            return this.#expression(bindingPower);
        }
        if (tokenType === 'dot') {
            this.#match('dot');
            return this.#parseDotRhs(bindingPower);
        }
        this.#throwParseError();
    }
    /**
     * Process the right hand side of a dot expression.
     *
     * @param bindingPower The binding power of the current token.
     */
    #parseDotRhs(bindingPower) {
        // From the grammar:
        // expression '.' ( identifier /
        //                  multi-select-list /
        //                  multi-select-hash /
        //                  function-expression /
        //                  *
        // In terms of tokens that means that after a '.',
        // you can have:
        const lookahead = this.#currentToken();
        // Common case "foo.bar", so first check for an identifier.
        if (['quoted_identifier', 'unquoted_identifier', 'star'].includes(lookahead)) {
            return this.#expression(bindingPower);
        }
        if (lookahead === 'lbracket') {
            this.#match('lbracket');
            return this.#parseMultiSelectList();
        }
        if (lookahead === 'lbrace') {
            this.#match('lbrace');
            return this.#parseMultiSelectHash();
        }
        this.#throwParseError();
    }
    /**
     * Consume the current token, throwing an error if it doesn't match the expected token.
     *
     * @param tokenType The expected token type.
     */
    #match(tokenType) {
        if (this.#currentToken() !== tokenType) {
            this.#throwUnexpectedToken(this.#lookaheadToken(0));
        }
        this.#advance();
    }
    /**
     * Consume the current token, throwing an error if it doesn't match any of the expected tokens.
     *
     * @param tokenTypes The expected token types.
     */
    #matchMultipleTokens(tokenTypes) {
        if (!tokenTypes.includes(this.#currentToken())) {
            this.#throwUnexpectedToken(this.#lookaheadToken(0));
        }
        this.#advance();
    }
    /**
     * Advance the index to the next token.
     */
    #advance() {
        this.#index += 1;
    }
    /**
     * Get the current token type.
     */
    #currentToken() {
        return this.#tokens[this.#index].type;
    }
    /**
     * Look ahead in the token stream and get the type of the token
     *
     * @param number The number of tokens to look ahead.
     */
    #lookahead(number) {
        return this.#tokens[this.#index + number].type;
    }
    /**
     * Look ahead in the token stream and get the token
     *
     * @param number The number of tokens to look ahead.
     */
    #lookaheadToken(number) {
        return this.#tokens[this.#index + number];
    }
    /**
     * Randomly evict roughly half of the cached expressions.
     *
     * Every entry is kept or dropped independently of the others, so the
     * exact number of surviving entries varies from one eviction to the next.
     */
    #evictCache() {
        const survivors = new Map();
        for (const [key, value] of this.#cache) {
            if ((0, node_crypto_1.randomInt)(0, 100) > 50) {
                survivors.set(key, value);
            }
        }
        this.#cache = survivors;
    }
}
exports.Parser = Parser;