@pawel-up/jexl
Version:
Javascript Expression Language: Powerful context-based expression parser and evaluator
397 lines (377 loc) • 14 kB
text/typescript
/* eslint-disable max-len */
import type { Grammar } from './grammar.js'
/**
* Regular expression patterns and constants used by the lexer for tokenization.
* These patterns identify different types of tokens in Jexl expressions.
*/
/** Matches numeric literals (integers and floats, including negative numbers) */
const numericRegex = /^-?(?:(?:[0-9]*\.[0-9]+)|[0-9]+)$/
/** Matches valid identifier names (variables, function names, etc.) */
const identRegex =
/^[a-zA-Zа-яА-Я_\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF$][a-zA-Zа-яА-Я0-9_\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF$]*$/
/** Matches escaped backslashes in string literals */
const escEscRegex = /\\\\/
/** Matches whitespace-only strings */
const whitespaceRegex = /^\s*$/
/**
* Regex elements that are processed before operator elements.
* Includes string literals, whitespace, and boolean literals.
*/
const preOpRegexElements = [
// Strings
"'(?:(?:\\\\')|[^'])*'",
'"(?:(?:\\\\")|[^"])*"',
// Whitespace
'\\s+',
// Booleans
'\\btrue\\b',
'\\bfalse\\b',
// Null
'\\bnull\\b',
// Undefined
'\\bundefined\\b',
// Numerics (without negative symbol)
'(?:[0-9]+(?:\\.[0-9]+)?|\\.[0-9]+)',
]
/**
* Regex elements that are processed after operator elements.
* Includes identifiers.
*/
const postOpRegexElements = [
// Identifiers
'[a-zA-Zа-яА-Я_\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF\\$][a-zA-Z0-9а-яА-Я_\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF\\$]*',
]
/**
* Token types after which a minus sign should be treated as a negation operator
* rather than a binary subtraction operator.
*/
const unaryOpsAfter = ['binaryOp', 'unaryOp', 'openParen', 'openBracket', 'question', 'colon', 'comma']
/**
* Represents a lexical token in a Jexl expression.
* Each token contains information about its type, processed value, and original text.
*/
interface Token {
/** The type of token (e.g., 'literal', 'identifier', 'binaryOp') */
type: string
/** The processed value of the token (e.g., parsed number, unquoted string) */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
value: any
/** The original raw text from the expression string */
raw: string
}
/**
* Lexer is responsible for the lexical analysis phase of Jexl expression parsing.
* It takes a raw expression string and converts it into a sequence of tokens that
* can be consumed by the Parser.
*
* The Lexer's primary responsibilities are:
* - Identifying and categorizing different parts of the expression (literals, operators, etc.)
* - Converting raw text into meaningful tokens with appropriate types and values
* - Handling special cases like negative numbers and string escaping
* - Minimal syntax validation (only what's needed for tokenization)
*
* @example
* ```typescript
* const lexer = new Lexer(grammar)
* const tokens = lexer.tokenize('user.name | upper')
* // Results in tokens like:
* // [
* // { type: 'identifier', value: 'user', raw: 'user' },
* // { type: 'dot', value: '.', raw: '.' },
* // { type: 'identifier', value: 'name', raw: 'name' },
* // { type: 'pipe', value: '|', raw: ' | ' },
* // { type: 'identifier', value: 'upper', raw: 'upper' }
* // ]
* ```
*
* ## Tokenization Process
*
* 1. **Split**: Expression is split into elements using a regex
* 2. **Classify**: Each element is classified by type (literal, operator, etc.)
* 3. **Process**: Values are processed (e.g., parse numbers, unquote strings)
* 4. **Optimize**: Adjacent whitespace is consolidated with neighboring tokens
*
* ## Token Types
*
* - **literal**: String, number, or boolean values
* - **identifier**: Variable names, function names, property names
* - **binaryOp**: Binary operators like `+`, `-`, `==`, `&&`
* - **unaryOp**: Unary operators like `!`, `-` (negation)
* - **dot**: Property access operator `.`
* - **pipe**: Transform operator `|`
* - **openParen/closeParen**: Parentheses for grouping `()`
* - **openBracket/closeBracket**: Brackets for array access/filtering `[]`
* - **openCurl/closeCurl**: Braces for object literals `{}`
* - **question/colon**: Ternary operator parts `?` and `:`
* - **comma**: Argument separator `,`
*
* ## Error Handling
*
* The Lexer performs minimal error checking. It will throw errors only when:
* - It encounters characters that cannot be classified into any token type
* - String literals are malformed (unclosed quotes)
*
* Most syntax errors (like mismatched operators) are left for the Parser to detect.
*/
export default class Lexer {
/** The grammar configuration containing operators and other language elements */
private _grammar: Grammar
/** Cached regex for splitting expressions, built on first use */
private _splitRegex?: RegExp
/**
* Creates a new Lexer instance with the given grammar configuration.
*
* @param grammar The grammar containing operators, functions, and other language elements
*/
constructor(grammar: Grammar) {
this._grammar = grammar
}
/**
* Splits a Jexl expression string into an array of expression elements.
* @param str A Jexl expression string
* @returns An array of substrings defining the functional
* elements of the expression.
*/
getElements(str: string): string[] {
const regex = this._getSplitRegex()
return str.split(regex).filter((elem) => {
// Remove empty strings
return elem
})
}
/**
* Converts an array of expression elements into an array of tokens. Note that
* the resulting array may not equal the element array in length, as any
* elements that consist only of whitespace get appended to the previous
* token's "raw" property. For the structure of a token object, please see
* {@link Lexer#tokenize}.
* @param elements An array of Jexl expression elements to be
* converted to tokens
* @returns An array of token objects.
*/
getTokens(elements: string[]): Token[] {
const tokens: Token[] = []
let negate = false
for (let i = 0; i < elements.length; i++) {
const element = elements[i]
if (!element) continue // Skip empty elements
if (this._isWhitespace(element)) {
if (tokens.length > 0) {
tokens[tokens.length - 1].raw += element
}
} else if ((element === '+' || element === '-') && this._isUnary(tokens)) {
const lastToken = tokens.length > 0 ? tokens[tokens.length - 1] : null
if (
lastToken &&
lastToken.type === 'binaryOp' &&
(lastToken.value === '+' || lastToken.value === '-') &&
!lastToken.raw.match(/\s$/)
) {
throw new Error(`Unexpected token '${element}' after operator '${lastToken.value}'`)
}
let nextElement = ''
for (let j = i + 1; j < elements.length; j++) {
if (!this._isWhitespace(elements[j])) {
nextElement = elements[j]
break
}
}
if (element === '-') {
if (nextElement.match(numericRegex)) {
negate = true
} else {
const token = this._createToken(element)
token.type = 'unaryOp'
tokens.push(token)
}
} else {
// Unary plus. If it's not before a number, it's a unary op.
// Otherwise, it's optional and we can just ignore it.
if (!nextElement.match(numericRegex)) {
const token = this._createToken(element)
token.type = 'unaryOp'
tokens.push(token)
}
}
} else {
if (negate) {
elements[i] = '-' + element
negate = false
}
tokens.push(this._createToken(elements[i]))
}
}
// Catch a - at the end of the string. Let the parser handle that issue.
if (negate) {
tokens.push(this._createToken('-'))
}
return tokens
}
/**
* Converts a Jexl expression string into an array of tokens.
* This is the main entry point for lexical analysis.
*
* Each token is an object with the following structure:
* ```typescript
* {
* type: string, // Token type (e.g., 'literal', 'identifier', 'binaryOp')
* value: any, // Processed value (parsed number, unquoted string, etc.)
* raw: string // Original text including any whitespace
* }
* ```
*
* ## Token Types
*
* - **literal**: String, number, or boolean values
* - **identifier**: Variable names, function names, property names
* - **binaryOp**: Binary operators like `+`, `-`, `==`, `&&`
* - **unaryOp**: Unary operators like `!`, `-` (negation)
* - **Grammar elements**: Control characters defined in grammar (dot, pipe, etc.)
*
* ## Value Processing
*
* - **Strings**: Quotes are removed and escape sequences processed
* - **Numbers**: Converted to numeric values using `parseFloat()`
* - **Booleans**: `"true"` and `"false"` become boolean values
* - **Others**: Remain as original strings
*
* @param str The Jexl expression string to be tokenized
* @returns An array of token objects representing the expression
* @throws {Error} if the string contains invalid tokens
*
* @example
* ```typescript
* lexer.tokenize('user.age >= 18')
* // Returns:
* // [
* // { type: 'identifier', value: 'user', raw: 'user' },
* // { type: 'dot', value: '.', raw: '.' },
* // { type: 'identifier', value: 'age', raw: 'age' },
* // { type: 'binaryOp', value: '>=', raw: ' >= ' },
* // { type: 'literal', value: 18, raw: '18' }
* // ]
* ```
*/
tokenize(str: string): Token[] {
const elements = this.getElements(str)
return this.getTokens(elements)
}
/**
* Creates a new token object from an element of a Jexl string. See
* {@link Lexer#tokenize} for a description of the token object.
* @param element The element from which a token should be made
* @returns A token object describing the provided element.
* @throws {Error} if the provided string is not a valid expression element.
* @private
*/
_createToken(element: string): Token {
const token: Token = {
type: 'literal',
value: element,
raw: element,
}
if (element[0] === '"' || element[0] === "'") {
token.value = this._unquote(element)
} else if (element.match(numericRegex)) {
token.value = parseFloat(element)
} else if (element === 'true' || element === 'false') {
token.value = element === 'true'
} else if (element === 'null') {
token.value = null
} else if (element === 'undefined') {
token.value = undefined
} else if (this._grammar.elements[element]) {
token.type = this._grammar.elements[element].type
} else if (element.match(identRegex)) {
token.type = 'identifier'
} else {
throw new Error(`Invalid expression token: ${element}`)
}
return token
}
/**
* Escapes a string so that it can be treated as a string literal within a
* regular expression.
* @param str The string to be escaped
* @returns the RegExp-escaped string.
* @see https://developer.mozilla.org/en/docs/Web/JavaScript/Guide/Regular_Expressions
* @private
*/
_escapeRegExp(str: string): string {
str = str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
if (str.match(identRegex)) {
str = '\\b' + str + '\\b'
}
return str
}
/**
* Gets a RegEx object appropriate for splitting a Jexl string into its core
* elements.
* @returns {RegExp} An element-splitting RegExp object
* @private
*/
_getSplitRegex(): RegExp {
if (!this._splitRegex) {
// Sort by most characters to least, then regex escape each
const elemArray = Object.keys(this._grammar.elements)
.sort((a, b) => {
return b.length - a.length
})
.map((elem) => {
return this._escapeRegExp(elem)
}, this)
this._splitRegex = new RegExp(
'(' + [preOpRegexElements.join('|'), elemArray.join('|'), postOpRegexElements.join('|')].join('|') + ')'
)
}
return this._splitRegex
}
/**
* Determines whether the addition of a '-' token should be interpreted as a
* negative symbol for an upcoming number, given an array of tokens already
* processed.
* @param {Array<Object>} tokens An array of tokens already processed
* @returns {boolean} true if adding a '-' should be considered a negative
* symbol or a '+' should be considered a positive symbol; false otherwise
* @private
*/
_isUnary(tokens: Token[]): boolean {
if (!tokens.length) return true
const lastToken = tokens[tokens.length - 1]
if (!lastToken) return true
return unaryOpsAfter.some((type) => type === lastToken.type)
}
/**
* A utility function to determine if a string consists of only space
* characters.
* @param {string} str A string to be tested
* @returns {boolean} true if the string is empty or consists of only spaces;
* false otherwise.
* @private
*/
_isWhitespace(str: string): boolean {
return !!str.match(whitespaceRegex)
}
/**
* Removes the beginning and trailing quotes from a string, unescapes any
* escaped quotes on its interior, and unescapes any escaped escape
* characters. Note that this function is not defensive; it assumes that the
* provided string is not empty, and that its first and last characters are
* actually quotes.
* @param {string} str A string whose first and last characters are quotes
* @returns {string} a string with the surrounding quotes stripped and escapes
* properly processed.
* @private
*/
_unquote(str: string): string {
const quote = str[0]
if (!quote) {
throw new Error('Cannot unquote empty string')
}
const escQuoteRegex = new RegExp('\\\\' + quote, 'g')
return str
.substr(1, str.length - 2)
.replace(escQuoteRegex, quote)
.replace(escEscRegex, '\\')
}
}