// @borgar/fx

/*
 * Excel formula language parser
 *
 * This parser is a Top-Down Operator Precedence (Pratt) parser. It's based on
 * the one that Douglas Crockford describes in Chapter 9 of the O'Reilly book
 * Beautiful Code (http://crockford.com/javascript/tdop/tdop.html).
 *
 * The parser handles most basic things Excel/Sheets do except:
 * `Sheet1:Sheet2!A1` references cross contexts (3D references)
 */
import { isReference, isLiteral, isFunction, isWhitespace, isFxPrefix, isOperator, isError } from './isType.ts';
import { UNARY, BINARY, REFERENCE, LITERAL, ERROR_LITERAL, CALL, LAMBDA, ARRAY, IDENTIFIER, NUMBER, BOOLEAN, ERROR, STRING, LET, LET_DECL, REF_NAMED, REF_STRUCT, REF_BEAM } from './constants.ts';
import type { Token } from './types.ts';
import type { ArrayExpression, AstExpression, BinaryExpression, CallExpression, Identifier, LambdaExpression, LetDeclarator, LetExpression, UnaryExpression } from './astTypes.ts';

// Symbol ids for the pseudo-tokens that have no literal operator text.
const END = '(END)';
const FUNCTION = '(FUNCTION)';
const WHITESPACE = '(WHITESPACE)';

// Functions whose calls are accepted as operands of the reference operators.
const refFunctions = [
  'ANCHORARRAY', 'CHOOSE', 'DROP', 'IF', 'IFS', 'INDEX', 'INDIRECT', 'LAMBDA',
  'LET', 'OFFSET', 'REDUCE', 'SINGLE', 'SWITCH', 'TAKE', 'TRIMRANGE', 'XLOOKUP'
];

// Parser state. It lives at module level and is reset by every parse() call.
const symbolTable = {};
let currentNode;
let tokens: Token[];
let tokenIndex: number;
let permitArrayRanges = false;
let permitArrayCalls = false;
let looseRefCalls = false;

/**
 * Tests whether a function name is accepted where a reference is required.
 *
 * @param fnName Function name (case-insensitive). Callers may end up passing a
 *   non-string when a call's callee is not an identifier; that is treated as
 *   "not a reference function" rather than throwing.
 * @returns Truthy when `looseRefCalls` is enabled or the name is listed in `refFunctions`.
 */
const isReferenceFunctionName = (fnName: string) => {
  return looseRefCalls || (
    typeof fnName === 'string' && refFunctions.includes(fnName.toUpperCase())
  );
};

/**
 * Tests whether a raw token can start (or, optionally, continue) a reference expression.
 *
 * @param token The token to test (may be undefined when looking past the end).
 * @param allowOperators Also accept the `:`, `,` and whitespace reference operators.
 */
const isReferenceToken = (token: Token, allowOperators = false) => {
  const value = (token && token.value) + '';
  if (isReference(token)) {
    return true;
  }
  if (allowOperators && isOperator(token) && (value === ':' || value === ',' || !value.trim())) {
    return true; // join, union, intersection
  }
  if (isFunction(token) && isReferenceFunctionName(value)) {
    return true; // function that yields reference
  }
  if (isError(token) && value === '#REF!') {
    return true;
  }
  return false;
};

/**
 * Tests whether a parsed node (or a not-yet-denoted token node) counts as a reference:
 * a reference, a `#REF!` error, a `:` / space / `,` binary operation, or a call to a
 * reference function.
 */
const isReferenceNode = node => {
  return (!!node) && (
    (node.type === REFERENCE) ||
    ((node.type === ERROR_LITERAL || node.type === ERROR) && node.value === '#REF!') ||
    (node.type === BINARY && (
      node.operator === ':' ||
      node.operator === ' ' ||
      node.operator === ',')
    ) ||
    isReference(node) ||
    (node.type === CALL && isReferenceFunctionName(node.callee.name))
  );
};

/**
 * Aborts parsing by throwing an Error decorated with `source` (the formula text rebuilt
 * from the tokens) and `sourceOffset` (character offset of the offending token).
 *
 * @param message Error message.
 * @param atIndex Token index to report; defaults to the current `tokenIndex`.
 * @throws Always.
 */
function halt (message: string, atIndex: number | null = null): never {
  const err = new Error(message);
  // @ts-ignore -- FIXME: use a dedicated error class
  err.source = tokens.map(d => d.value).join('');
  // @ts-ignore
  err.sourceOffset = tokens
    .slice(0, atIndex ?? tokenIndex)
    .reduce((a, d) => a + d.value.length, 0);
  throw err;
}

/**
 * Looks past the token at `tokenIndex`, skipping whitespace and opening parens,
 * and reports whether the next significant token is a reference token.
 */
// A1 A1 | A1 (A1) | A1 ((A1)) | A1 ( (A1) ) | ...
function refIsUpcoming (allowOperators = false): boolean {
  let i = tokenIndex;
  let next: Token;
  do {
    next = tokens[++i];
  }
  while (
    next && (
      isWhitespace(next) ||
      (isOperator(next) && next.value === '(')
    )
  );
  return isReferenceToken(next, allowOperators);
}

/**
 * Moves to the next token and turns it into `currentNode`.
 *
 * Whitespace is skipped unless it may be an intersection operator, i.e. the node to
 * the left is a reference and a reference (or a call paren) follows the whitespace.
 *
 * @param expectNext When set, the id the current node must have before advancing.
 * @param leftNode The node to the left of the upcoming token, used for the whitespace decision.
 * @returns The new current node, or undefined when the end of the tokens is reached
 *   (in which case `currentNode` is the END symbol).
 */
function advance (expectNext: string | null = null, leftNode: unknown = null) {
  if (expectNext && expectNext !== currentNode.id) {
    halt(`Expected ${expectNext} but got ${currentNode.id}`);
  }
  // look ahead to see if we have ( ( " ", "(" )+ REF )
  if (isWhitespace(tokens[tokenIndex])) {
    // potential intersection operation (so don't allow operators as upcoming)
    const haveRef = isReferenceNode(leftNode);
    const possibleWSOp = haveRef && refIsUpcoming(false);
    const nextIsCall = haveRef && tokens[tokenIndex + 1] && tokens[tokenIndex + 1].value === '(';
    if (!possibleWSOp && !nextIsCall) {
      // ignore whitespace
      while (isWhitespace(tokens[tokenIndex])) {
        tokenIndex++;
      }
    }
  }
  // EOT
  if (tokenIndex >= tokens.length) {
    currentNode = symbolTable[END];
    return;
  }

  const token = tokens[tokenIndex];
  tokenIndex += 1;

  if (token.unterminated) {
    halt('Encountered an unterminated token');
  }

  let node;
  if (isOperator(token)) {
    node = symbolTable[token.value];
    if (!node) {
      halt(`Unknown operator ${token.value}`);
    }
  }
  else if (isWhitespace(token)) {
    node = symbolTable[WHITESPACE];
  }
  else if (isLiteral(token)) {
    node = symbolTable[LITERAL];
  }
  else if (isReference(token)) {
    node = symbolTable[REFERENCE];
  }
  else if (isFunction(token)) {
    node = symbolTable[FUNCTION];
  }
  else {
    halt(`Unexpected ${token.type} token: ${token.value}`);
  }

  // The node inherits id/lbp/nud/led (and a default `value` equal to the symbol id)
  // from its symbol through the prototype chain.
  currentNode = Object.create(node);
  currentNode.type = token.type;
  currentNode.value = token.value;
  if (token.loc) {
    currentNode.loc = [ ...token.loc ];
  }
  return currentNode;
}

/**
 * Core of the Pratt parser: parses an expression, consuming operators for as long as
 * their left binding power is greater than `rbp`.
 *
 * @param rbp Right binding power of the caller.
 * @returns The parsed AST node.
 */
function expression (rbp: number) {
  let t = currentNode;
  advance(null, t);
  let left = t.nud();
  while (rbp < currentNode.lbp) {
    t = currentNode;
    advance(null, t);
    left = t.led(left);
  }
  return left;
}

// Default denotations: symbols that do not define their own handlers are syntax errors.
const original_symbol = {
  // null denotation
  nud: () => halt('Invalid syntax'), // Undefined
  // left denotation
  led: () => halt('Missing operator')
};

/**
 * Registers a symbol, or raises the binding power of an existing one.
 *
 * @param id Symbol id (operator text or pseudo-token id).
 * @param bp Left binding power.
 * @returns The symbol record.
 */
// bp = binding power
function symbol (id: string, bp = 0) {
  let s = symbolTable[id];
  if (s) {
    if (bp >= s.lbp) {
      s.lbp = bp;
    }
  }
  else {
    s = { ...original_symbol };
    s.id = id;
    s.value = id;
    s.lbp = bp;
    symbolTable[id] = s;
  }
  return s;
}

/**
 * Registers a left-associative binary operator.
 *
 * @param id Operator text.
 * @param bp Binding power.
 * @param led Optional custom left denotation; defaults to building a binary node.
 */
function infix (id: string, bp: number, led?) {
  const s = symbol(id, bp);
  s.led = led || function (this: BinaryExpression & { value?: string }, left) {
    this.type = BINARY;
    // @ts-expect-error -- we know this is going to be a valid operator
    this.operator = this.value;
    delete this.value;
    const right = expression(bp);
    this.arguments = [ left, right ];
    if (this.loc) {
      this.loc = [ left.loc[0], right.loc[1] ];
    }
    return this;
  };
  return s;
}

/**
 * Registers a postfix unary operator with binding power 70.
 *
 * @param id Operator text.
 * @param led Optional custom left denotation; defaults to building a unary node.
 */
function postfix (id: string, led?) {
  const s = symbol(id, 0);
  s.lbp = 70;
  s.led = led || function (this: UnaryExpression & { value?: string }, left) {
    this.type = UNARY;
    // @ts-expect-error -- we know this is going to be a valid operator
    this.operator = this.value;
    delete this.value;
    this.arguments = [ left ];
    if (this.loc) {
      this.loc[0] = left.loc[0];
    }
    return this;
  };
  return s;
}

/**
 * Registers a prefix operator.
 *
 * @param id Operator text.
 * @param nud Optional custom null denotation; defaults to building a unary node whose
 *   operand is parsed with binding power 70.
 */
function prefix (id: string, nud?) {
  const s = symbol(id);
  s.nud = nud || function (this: UnaryExpression & { value?: string }) {
    this.type = UNARY;
    // @ts-expect-error -- we know this is going to be a valid operator
    this.operator = this.value;
    delete this.value;
    const subexpr = expression(70);
    this.arguments = [ subexpr ];
    if (this.loc) {
      this.loc[1] = subexpr.loc[1];
    }
    return this;
  };
  return s;
}

/**
 * Registers a binary reference operator. Both operands must be reference nodes,
 * otherwise parsing halts.
 *
 * @param id Operator text or the WHITESPACE pseudo-token id.
 * @param bp Binding power.
 */
function rangeInfix (id: string, bp: number) {
  return infix(id, bp, function (this: BinaryExpression & { id?: string, value?: string }, left) {
    if (!isReferenceNode(left)) {
      halt(`Unexpected ${id} operator`);
    }
    const right = expression(bp);
    if (!isReferenceNode(right)) {
      halt(`Unexpected ${currentNode.type} following ${this.id}`);
    }
    this.type = BINARY;
    // @ts-expect-error -- we know this is going to be a valid operator
    this.operator = this.value.trim() ? this.value : ' '; // hack around whitespace op
    delete this.value;
    this.arguments = [ left, right ];
    if (this.loc) {
      this.loc = [ left.loc[0], right.loc[1] ];
    }
    return this;
  });
}

symbol(END);

// reference operators
rangeInfix(':', 80); // range join/extend =B7:OFFSET(A1,10,10)
const comma = rangeInfix(',', 80); // union =B7:D7,C6:C8
rangeInfix(WHITESPACE, 80); // intersect: =B7:D7 C6:C8

// Excel's grammar is ambiguous. This turns the , operator's left binding
// power on/off which allows us to treat , as a symbol where we need.
// Returns the state that was in effect before the call so it can be restored.
const unionRefs = (enable?: boolean) => {
  const currState = comma.lbp > 0;
  if (enable != null) {
    comma.lbp = enable ? 80 : 0;
  }
  return currState;
};

// arithmetic and string operations
postfix('%'); // percent
postfix('#', function (this: Token, left) { // spilled range (_xlfn.ANCHORARRAY)
  if (!isReferenceNode(left)) {
    halt('# expects a reference');
  }
  this.type = UNARY;
  this.operator = this.value;
  delete this.value;
  this.arguments = [ left ];
  // extend the location to cover the operand, as the default postfix handler does
  if (this.loc) {
    this.loc[0] = left.loc[0];
  }
  return this;
}); // range
prefix('+'); // unary plus
prefix('-'); // unary minus
prefix('@'); // implicit intersection (_xlfn.SINGLE)
infix('^', 50); // power
infix('*', 40); // multiply
infix('/', 40); // divide
infix('+', 30); // add
infix('-', 30); // subtract
infix('&', 20); // text concat

// comparison
infix('=', 10);
infix('<', 10);
infix('>', 10);
infix('<=', 10);
infix('>=', 10);
infix('<>', 10);

// Literals: convert the raw token text into a typed value and keep the text in `raw`.
symbol(LITERAL).nud = function () {
  const { type, value } = this;
  this.type = LITERAL;
  this.raw = value;
  if (type === NUMBER) {
    this.value = +value;
  }
  else if (type === BOOLEAN) {
    this.value = value.toUpperCase() === 'TRUE';
  }
  else if (type === ERROR) {
    this.type = ERROR_LITERAL;
    this.value = value.toUpperCase();
  }
  else if (type === STRING) {
    // FIXME: throw an error if the string is unterminated
    this.value = value.slice(1, -1).replace(/""/g, '"');
  }
  else {
    throw new Error('Unsupported literal type: ' + type);
  }
  return this;
};

// References: record which kind of reference token produced the node.
symbol(REFERENCE).nud = function () {
  if (this.type === REF_NAMED) {
    this.kind = 'name';
  }
  else if (this.type === REF_STRUCT) {
    this.kind = 'table'; // structured ?
  }
  else if (this.type === REF_BEAM) {
    this.kind = 'beam';
  }
  else {
    this.kind = 'range';
  }
  this.type = REFERENCE;
  return this;
};

// parens
symbol(')');
prefix('(', function () {
  // inside parentheses the comma is a union operator again
  const prevState = unionRefs(true);
  const e = expression(0);
  advance(')', e);
  unionRefs(prevState);
  return e;
});

// function call
symbol(FUNCTION).nud = function () {
  return this;
};
infix('(', 90, function (this: CallExpression & { value?: string }, left) {
  let callee: Identifier = {
    type: IDENTIFIER,
    name: left.value
  };
  if (left.id !== FUNCTION) {
    // NB: nodes whose own `value` was deleted still expose the symbol id as `value`
    // through their prototype (see advance/symbol), which is what the '#' test reads.
    if (
      left.type === LAMBDA ||
      // Excel only allows calls to "names" and ref functions. Since we don't
      // differentiate between the two (this requires a table of function names)
      // we're overly permissive here:
      left.type === CALL ||
      left.type === LET ||
      left.type === REFERENCE ||
      (left.type === UNARY && left.value === '#') || // Because it's really SINGLE(...)()
      (left.type === ERROR_LITERAL && left.value === '#REF!')
    ) {
      // in the case of REFERENCE, do we want to set the node to Identifier?
      callee = left;
    }
    else {
      halt('Unexpected call', tokenIndex - 1);
    }
  }
  const lcFn = left.value.toLowerCase();
  if (lcFn === 'lambda') {
    return parseLambda.call(this, left);
  }
  if (lcFn === 'let') {
    return parseLet.call(this, left);
  }

  const args = [];
  let lastWasComma = false;
  if (currentNode.id !== ')') {
    // commas separate arguments here, they are not union operators
    const prevState = unionRefs(false);
    while (currentNode.id !== ')') {
      if (isWhitespace(currentNode)) {
        advance();
      }
      if (currentNode.id === ',') {
        // empty argument
        args.push(null);
        lastWasComma = true;
        advance();
      }
      else {
        const arg = expression(0);
        args.push(arg);
        lastWasComma = false;
        if (currentNode.id === ',') {
          advance(',');
          lastWasComma = true;
        }
      }
    }
    unionRefs(prevState);
  }
  if (lastWasComma) {
    // a trailing comma means there is an empty last argument
    args.push(null);
  }

  const closeParen = currentNode;
  delete this.value;
  this.type = CALL;
  this.callee = callee;
  if (left.loc) {
    this.callee.loc = [ ...left.loc ];
  }
  this.arguments = args;
  if (left.loc) {
    this.loc = [ left.loc[0], closeParen.loc[1] ];
  }
  advance(')', this);
  return this;
});

/**
 * Parses the argument list of a LAMBDA call. Invoked with `this` bound to the `(` node.
 * All arguments but the last must be unique names; the last one is the body.
 *
 * @param left The node for the LAMBDA function token.
 * @returns `this`, converted into a lambda expression node.
 */
function parseLambda (this: LambdaExpression & { value?: string }, left) {
  const args = [];
  const argNames = {};
  let body: AstExpression | null = null;
  let done = false;
  const prevState = unionRefs(false);
  if (currentNode.id !== ')') {
    while (!done) {
      if (isWhitespace(currentNode)) {
        advance();
      }
      const argTokenIndex = tokenIndex;
      const arg = expression(0);
      if (currentNode.id === ',') {
        // all but last args must be names
        if (arg.type === REFERENCE && arg.kind === 'name') {
          // names may not be duplicates
          const currName = arg.value.toLowerCase();
          if (currName in argNames) {
            halt('Duplicate name: ' + arg.value);
          }
          argNames[currName] = 1;
          const a: Identifier = { type: IDENTIFIER, name: arg.value };
          if (arg.loc) {
            a.loc = arg.loc;
          }
          args.push(a);
        }
        else {
          // point the error at the start of the offending argument
          tokenIndex = argTokenIndex;
          halt('LAMBDA argument is not a name');
        }
        advance(',');
      }
      else {
        body = arg;
        done = true;
      }
    }
  }
  unionRefs(prevState);

  delete this.value;
  this.type = LAMBDA;
  this.params = args;
  this.body = body || null;

  // Let advance() verify that we are on ")" before its location is read, so that
  // unterminated input yields a parse error rather than a TypeError.
  const closeParen = currentNode;
  advance(')', this);
  if (left.loc) {
    this.loc = [ left.loc[0], closeParen.loc[1] ];
  }
  return this;
}

/**
 * Parses the argument list of a LET call. Invoked with `this` bound to the `(` node.
 * Arguments alternate between names and values, followed by the calculation (body).
 *
 * @param left The node for the LET function token.
 * @returns `this`, converted into a let expression node.
 */
function parseLet (this: LetExpression & { value?: string }, left) {
  const args = [];
  const vals = [];
  const argNames = {};
  // stays undefined until a calculation has been seen (null is a valid, empty, one)
  let body: AstExpression | null | undefined;
  let argCounter = 0;

  const addArgument = (arg, lastArg?) => {
    if (body) {
      halt('Unexpected argument following calculation');
    }
    if (lastArg && argCounter >= 2) {
      body = arg;
    }
    else {
      const wantName = !(argCounter % 2);
      if (wantName) {
        if (arg && (arg.type === REFERENCE && arg.kind === 'name')) {
          // names may not be duplicates
          const currName = arg.value.toLowerCase();
          if (currName in argNames) {
            halt('Duplicate name: ' + arg.value);
          }
          argNames[currName] = 1;
          args.push({ type: IDENTIFIER, name: arg.value, loc: arg.loc });
        }
        else if (argCounter >= 2) {
          body = arg;
        }
        else {
          halt('Argument is not a name');
        }
      }
      else {
        vals.push(arg);
      }
    }
    argCounter++;
  };

  const prevState = unionRefs(false);
  let lastWasComma = false;
  while (currentNode.id !== ')') {
    if (isWhitespace(currentNode)) {
      advance();
    }
    if (currentNode.id === ',') {
      // empty argument
      addArgument(null);
      lastWasComma = true;
      advance();
    }
    else {
      const arg = expression(0);
      addArgument(arg, currentNode.id !== ',');
      lastWasComma = false;
      if (currentNode.id === ',') {
        advance(',');
        lastWasComma = true;
      }
    }
  }
  unionRefs(prevState);

  if (lastWasComma) {
    addArgument(null, true);
  }
  if (body === undefined) {
    halt('Unexpected end of arguments');
  }

  delete this.value;
  this.type = LET;
  this.declarations = [];
  if (!args.length) {
    halt('Unexpected end of arguments');
  }
  for (let i = 0; i < args.length; i++) {
    const id = args[i];
    const init = vals[i];
    // The value may be empty (null) or missing; fall back to the end of the name
    // instead of dereferencing a missing location.
    const endLoc = (init && init.loc) ? init.loc : id.loc;
    const s: LetDeclarator = {
      type: LET_DECL,
      id: id,
      init: init,
      loc: id.loc && [ id.loc[0], endLoc[1] ]
    };
    this.declarations.push(s);
  }
  this.body = body;

  // Let advance() verify that we are on ")" before its location is read.
  const closeParen = currentNode;
  advance(')', this);
  if (left.loc) {
    this.loc = [ left.loc[0], closeParen.loc[1] ];
  }
  return this;
}

// array literal
symbol('}');
symbol(';');
prefix('{', function (this: ArrayExpression & { value?: string }) {
  if (currentNode.id === '}') { // arrays must not be empty
    halt('Unexpected empty array');
  }
  let row = [];
  let done = false;
  const rows = [ row ];
  const prevState = unionRefs(false);
  while (!done) {
    if (isWhitespace(currentNode)) {
      advance();
    }
    // arrays allow only literals, ranges (in GSheets) and ,;: operators.
    // FIXME: if { negativeNumbers: false } we must consume minuses as well.
    // Excel allows ={-1} but not ={(-1)} and ={1%}
    if (isLiteral(currentNode)) {
      row.push(symbolTable[LITERAL].nud.call(currentNode));
      advance();
    }
    else if (permitArrayRanges && isReferenceNode(currentNode)) {
      row.push(symbolTable[REFERENCE].nud.call(currentNode));
      advance();
    }
    else if (permitArrayCalls && isFunction(currentNode)) {
      const arg = expression(0);
      row.push(arg);
    }
    else {
      halt(`Unexpected ${currentNode.type} in array: ${currentNode.value}`);
    }
    if (currentNode.id === ',') {
      // next item
      advance(',');
    }
    else if (currentNode.id === ';') {
      // next row
      advance(';');
      row = [];
      rows.push(row);
    }
    else {
      done = true;
    }
  }
  const closingBrace = currentNode;
  advance('}');
  unionRefs(prevState);
  this.type = ARRAY;
  this.elements = rows;
  if (this.loc) {
    this.loc[1] = closingBrace.loc[1];
  }
  delete this.value;
  return this;
});

/**
 * Options for {@link parse}.
 */
export type OptsParse = {
  /**
   * Ranges are allowed as elements of arrays. This is a feature in Google Sheets while Excel
   * does not allow it.
   * @defaultValue false
   */
  permitArrayRanges?: boolean,
  /**
   * Function calls are allowed as elements of arrays. This is a feature in Google Sheets
   * while Excel does not allow it.
   * @defaultValue false
   */
  permitArrayCalls?: boolean,
  /**
   * Permits any function call where otherwise only functions that return references would
   * be permitted.
   * @defaultValue false
   */
  looseRefCalls?: boolean,
};

/**
 * Parses a list of tokens into an AST.
 *
 * The parser assumes `mergeRefs` and `negativeNumbers` were `true` when the tokens were generated.
 * It does not yet recognize reference context tokens or know how to deal with unary minuses in
 * arrays.
 *
 * The AST (Abstract Syntax Tree) format is documented in
 * [AST_format.md](./AST_format.md).
 *
 * @see {@link OptsParse}
 * @see {@link nodeTypes}
 * @see {@link tokenize}
 * @param tokenlist An array of tokens.
 * @param options Options for the parser's behavior.
 * @returns An AST of nodes.
 * @throws Error when `tokenlist` is not an array, or when the tokens do not form a valid
 *   formula; syntax errors carry `source` and `sourceOffset` properties.
 */
export function parse (
  tokenlist: Token[],
  options: OptsParse = {}
): AstExpression {
  if (!Array.isArray(tokenlist)) {
    throw new Error('Parse requires an array of tokens.');
  }
  // allow ranges in array "literals"?
  permitArrayRanges = options?.permitArrayRanges ?? false;
  // allow calls in arrays "literals"?
  permitArrayCalls = options?.permitArrayCalls ?? false;
  // allow any function call in range operations?
  looseRefCalls = options?.looseRefCalls ?? false;
  // assign the tokenlist and set index to start
  tokens = tokenlist;
  tokenIndex = 0;
  // discard redundant whitespace and = prefix
  while (isWhitespace(tokens[tokenIndex]) || isFxPrefix(tokens[tokenIndex])) {
    tokenIndex++;
  }
  advance();
  // reset the comma operator in case an earlier parse was aborted mid-call
  unionRefs(true);
  const root = expression(0);
  advance(END);
  return root;
}