UNPKG

@aws-lambda-powertools/jmespath

Version:

A type-safe and modern JMESPath module to parse and extract data from JSON documents using JMESPath expressions

332 lines (331 loc) 10.9 kB
import {
  SIMPLE_TOKENS,
  START_IDENTIFIER,
  VALID_IDENTIFIER,
  VALID_NUMBER,
  WHITESPACE,
} from './constants.js';
import { EmptyExpressionError, LexerError } from './errors.js';

/**
 * Characters that begin an operator needing one character of lookahead:
 * `<`/`<=`, `>`/`>=`, `!`/`!=`, `==`, `|`/`||` and `&`/`&&`.
 *
 * Hoisted to module scope so the collection is not rebuilt for every
 * character the lexer inspects.
 */
const OPERATOR_START_CHARS = new Set(['<', '>', '!', '=', '|', '&']);

/**
 * A lexer for JMESPath expressions.
 *
 * The lexer turns a JMESPath expression into a sequence of tokens. Every token
 * has the shape `{ type, value, start, end }`, where `start` is the inclusive
 * and `end` the exclusive offset of the token in the expression. Offsets are
 * counted in Unicode code points, because the expression is split with
 * `Array.from()` before being scanned.
 *
 * The scanning state lives on the instance and is reset by every call to
 * {@link Lexer.tokenize}, so finish iterating one generator before starting
 * another one on the same instance.
 */
class Lexer {
  /** Index into `#chars` of the current character; equals `#length` at EOF. */
  #position;
  /** The expression split into code points. */
  #chars;
  /** The character at `#position`, or `''` once the input is exhausted. */
  #current;
  /** Number of code points in the expression. */
  #length;

  /**
   * Tokenize a JMESPath expression.
   *
   * This method is a generator that yields the tokens of the given expression
   * followed by a final `eof` token.
   *
   * @param expression The JMESPath expression to tokenize.
   * @throws {EmptyExpressionError} When `expression` is not a string.
   * @throws {LexerError} When the expression contains an invalid token.
   */
  *tokenize(expression) {
    this.#initializeForExpression(expression);
    while (this.#current !== '') {
      const current = this.#current;
      if (SIMPLE_TOKENS.has(current)) {
        // Single-character tokens map directly to their token type.
        yield {
          type: SIMPLE_TOKENS.get(current),
          value: current,
          start: this.#position,
          end: this.#position + 1,
        };
        this.#next();
      } else if (START_IDENTIFIER.has(current)) {
        yield this.#consumeIdentifier();
      } else if (WHITESPACE.has(current)) {
        this.#next();
      } else if (current === '[') {
        yield this.#consumeSquareBracket();
      } else if (current === `'`) {
        yield this.#consumeRawStringLiteral();
      } else if (current === '`') {
        yield this.#consumeLiteral();
      } else if (VALID_NUMBER.has(current)) {
        const start = this.#position;
        const buff = this.#consumeNumber();
        yield {
          type: 'number',
          value: Number.parseInt(buff, 10),
          start,
          end: start + buff.length,
        };
      } else if (current === '-') {
        yield this.#consumeNegativeNumber();
      } else if (current === '"') {
        yield this.#consumeQuotedIdentifier();
      } else if (OPERATOR_START_CHARS.has(current)) {
        yield this.#consumeComparatorSigns(current);
      } else {
        throw new LexerError(this.#position, current);
      }
    }
    yield { type: 'eof', value: '', start: this.#length, end: this.#length };
  }

  /**
   * Consume a comparator sign.
   *
   * This method is called when the lexer encounters one of
   * `<`, `>`, `!`, `=`, `|` or `&`.
   *
   * @param current The current character
   * @throws {LexerError} When a lone `=` is found.
   */
  #consumeComparatorSigns(current) {
    switch (current) {
      case '<':
        return this.#matchOrElse('=', 'lte', 'lt');
      case '>':
        return this.#matchOrElse('=', 'gte', 'gt');
      case '!':
        return this.#matchOrElse('=', 'ne', 'not');
      case '|':
        return this.#matchOrElse('|', 'or', 'pipe');
      case '&':
        return this.#matchOrElse('&', 'and', 'expref');
      default:
        // Only '=' is left: it is valid solely as part of '=='.
        return this.#consumeEqualSign();
    }
  }

  /**
   * Consume an equal sign.
   *
   * This method is called when the lexer encounters an equal sign. A single
   * `=` is not a valid token, so the next character must also be an equal
   * sign.
   *
   * @throws {LexerError} When the `=` is not followed by another `=`.
   */
  #consumeEqualSign() {
    // Capture the offset before advancing so it always points at the first '='.
    const start = this.#position;
    if (this.#next() === '=') {
      this.#next();

      return { type: 'eq', value: '==', start, end: start + 2 };
    }
    throw new LexerError(start, '=');
  }

  /**
   * Consume an unquoted identifier.
   *
   * This method is called when the lexer encounters a character that can start
   * an identifier. It advances the lexer until it finds a character that is
   * not valid inside an identifier and returns the corresponding token.
   */
  #consumeIdentifier() {
    const start = this.#position;
    let buff = this.#current;
    while (VALID_IDENTIFIER.has(this.#next())) {
      buff += this.#current;
    }

    return {
      type: 'unquoted_identifier',
      value: buff,
      start,
      end: start + buff.length,
    };
  }

  /**
   * Consume a negative number.
   *
   * This method is called when the lexer encounters a negative sign.
   * The sign must be followed by at least one digit.
   *
   * @throws {LexerError} When the negative sign is not followed by a digit.
   */
  #consumeNegativeNumber() {
    const start = this.#position;
    // The buffer starts with the '-' itself, hence the `> 1` check below.
    const buff = this.#consumeNumber();
    if (buff.length > 1) {
      return {
        type: 'number',
        value: Number.parseInt(buff, 10),
        start,
        end: start + buff.length,
      };
    }
    // If the negative sign is not followed by a number, it is an error.
    throw new LexerError(start, 'Unknown token after "-"');
  }

  /**
   * Consume a raw string that is a number.
   *
   * Starting with the current character, it advances the lexer until it
   * finds a character that is not a digit and returns the consumed text.
   */
  #consumeNumber() {
    let buff = this.#current;
    while (VALID_NUMBER.has(this.#next())) {
      buff += this.#current;
    }

    return buff;
  }

  /**
   * Consume a square bracket.
   *
   * This method is called when the lexer encounters an opening square bracket.
   * Depending on the next character it returns a `flatten` (`[]`), a `filter`
   * (`[?`) or a plain `lbracket` (`[`) token.
   */
  #consumeSquareBracket() {
    const start = this.#position;
    const nextChar = this.#next();
    if (nextChar === ']') {
      this.#next();

      return { type: 'flatten', value: '[]', start, end: start + 2 };
    }
    if (nextChar === '?') {
      this.#next();

      return { type: 'filter', value: '[?', start, end: start + 2 };
    }

    return { type: 'lbracket', value: '[', start, end: start + 1 };
  }

  /**
   * Initializes the lexer for the given expression.
   *
   * We use a separate method for this instead of the constructor
   * because we want to be able to reuse the same lexer instance
   * and also because we want to be able to expose a public API
   * for tokenizing expressions like `new Lexer().tokenize(expression)`.
   *
   * @param expression The JMESPath expression to tokenize.
   * @throws {EmptyExpressionError} When `expression` is not a string.
   */
  #initializeForExpression(expression) {
    if (typeof expression !== 'string') {
      throw new EmptyExpressionError();
    }

    this.#position = 0;
    this.#chars = Array.from(expression);
    // Count code points, not UTF-16 units, so the length is expressed in the
    // same unit as `#position`, which indexes `#chars`.
    this.#length = this.#chars.length;
    this.#current = this.#chars[0] ?? '';
  }

  /**
   * Advance the lexer to the next character in the expression.
   *
   * Once the end of the expression is reached the position stays at
   * `#length` and the current character becomes the empty string.
   *
   * @returns The new current character, or `''` at the end of the input.
   */
  #next() {
    if (this.#position < this.#length) {
      this.#position += 1;
    }
    this.#current = this.#chars[this.#position] ?? '';

    return this.#current;
  }

  /**
   * Consume until the given delimiter is reached allowing
   * for escaping of the delimiter with a backslash (`\`).
   *
   * The returned text excludes the surrounding delimiters but keeps the
   * backslashes, so callers are responsible for un-escaping.
   *
   * @param delimiter The delimiter to consume until.
   * @throws {LexerError} When the input ends before the delimiter is found.
   */
  #consumeUntil(delimiter) {
    const start = this.#position;
    let buff = '';
    // Skip the opening delimiter
    this.#next();
    while (this.#current !== delimiter) {
      if (this.#current === '\\') {
        // Keep the backslash and take the following character verbatim, so
        // an escaped delimiter does not terminate the token.
        buff += '\\';
        this.#next();
      }
      if (this.#current === '') {
        // We've reached the end of the expression (EOF) before
        // we found the delimiter. This is an error.
        throw new LexerError(start, this.#chars.slice(start).join(''));
      }
      buff += this.#current;
      this.#next();
    }
    // Skip the closing delimiter
    this.#next();

    return buff;
  }

  /**
   * Process a literal.
   *
   * A literal is a JSON value that is enclosed in backticks. Backticks inside
   * the value must be escaped with a backslash.
   *
   * @throws {LexerError} When the literal is unterminated or not valid JSON.
   */
  #consumeLiteral() {
    const start = this.#position;
    // `replaceAll` is required: `replace` with a string pattern would only
    // un-escape the first escaped backtick.
    const lexeme = this.#consumeUntil('`').replaceAll('\\`', '`');
    try {
      return {
        type: 'literal',
        value: JSON.parse(lexeme),
        start,
        end: this.#position,
      };
    } catch {
      throw new LexerError(start, lexeme);
    }
  }

  /**
   * Process a quoted identifier.
   *
   * A quoted identifier is a string that is enclosed in double quotes and is
   * decoded as a JSON string.
   *
   * @throws {LexerError} When the identifier is unterminated or is not a
   * valid JSON string.
   */
  #consumeQuotedIdentifier() {
    const start = this.#position;
    const lexeme = `"${this.#consumeUntil('"')}"`;
    try {
      return {
        type: 'quoted_identifier',
        value: JSON.parse(lexeme),
        start,
        end: this.#position,
      };
    } catch {
      // Report malformed identifiers the same way as malformed literals
      // instead of leaking a raw SyntaxError.
      throw new LexerError(start, lexeme);
    }
  }

  /**
   * Process a raw string literal.
   *
   * A raw string literal is a string that is enclosed in single quotes. Its
   * content is taken verbatim, except that escaped single quotes (`\'`) are
   * un-escaped.
   *
   * @throws {LexerError} When the raw string is unterminated.
   */
  #consumeRawStringLiteral() {
    const start = this.#position;
    // `replaceAll` so that every escaped quote is un-escaped, not just the
    // first one.
    const lexeme = this.#consumeUntil(`'`).replaceAll(`\\'`, `'`);

    return {
      type: 'literal',
      value: lexeme,
      start,
      end: this.#position,
    };
  }

  /**
   * Match the expected character and return the corresponding token.
   *
   * The current character is always consumed; the next one is consumed only
   * when it equals `expected`.
   *
   * @param expected The expected character
   * @param matchType The token type to return if the expected character is found
   * @param elseType The token type to return if the expected character is not found
   */
  #matchOrElse(expected, matchType, elseType) {
    const start = this.#position;
    const current = this.#current;
    const nextChar = this.#next();
    if (nextChar === expected) {
      this.#next();

      return {
        type: matchType,
        value: `${current}${nextChar}`,
        start,
        end: start + 2,
      };
    }

    return {
      type: elseType,
      value: current,
      start,
      end: start + 1,
    };
  }
}

export { Lexer };