@haz3y0ne/parsexl
Version:
Parses Excel formulas into a clean, well-typed abstract syntax tree you can analyse or evaluate in TypeScript.
253 lines (252 loc) • 11.3 kB
JavaScript
// ─────────────────────────────────────────────────────────────────────────────
// /parser/prattParser.ts
// ---------------------------------------------------------------------------
// • Pratt / Top-Down Operator Precedence parser dedicated to Excel formulae.
// • Completely generic: precedence table knows nothing about Excel functions;
// function-specific arity / argument naming is handled in later passes
// (normalizeFilters, etc.).
// • Token expectations:
//
// LPAREN "("
// RPAREN ")"
// "," "," Argument separator – matched by token.type === "," (the lexeme itself, not "COMMA")
// IDENT Function name or named range
// all operators: token.type === operator-lexeme ("+", "*", "<=", …)
//
// Make sure your lexer emits those exact spellings (see tokenize.ts).
// ─────────────────────────────────────────────────────────────────────────────
/* -------------------------------------------------------------------------- */
/* Precedence table */
/* Atoms, operators and zero-power stop tokens ("," RPAREN EOF) */
/* -------------------------------------------------------------------------- */
/**
 * Pratt specification table, keyed by `token.type`; `safeSpec` falls back to
 * the raw `token.text` when the type has no entry.
 *
 * Each entry may carry:
 *   • `lbp` – left-binding power; a larger value binds tighter.
 *   • `nud` – handler invoked when the token starts an expression.
 *   • `led` – handler invoked when the token follows a finished left operand.
 *
 * Binding powers follow Excel's documented operator precedence, tightest first:
 *
 *   reference `:`  >  postfix `%`  >  `^`  >  `*` `/`  >  `+` `-`  >  `&`  >  comparisons
 *
 * Operators on the same level group left-to-right. That includes `^`:
 * Excel evaluates `2^3^2` as `(2^3)^2`.
 *
 * Unary `+` / `-` take their operand's binding power inside `prefix` (70), so
 * only entries with `lbp` above 70 (`%` and `:`) stay attached to a unary
 * operand.
 */
const table = {
    /* ── literals / atoms ──────────────────────────────────────────────── */
    NUMBER: { lbp: 0, nud: literal },
    LITERAL: { lbp: 0, nud: literal },
    BOOLEAN: { lbp: 0, nud: literal },
    ERROR: { lbp: 0, nud: literal },
    CELL: { lbp: 0, nud: literal },
    RANGE: { lbp: 0, nud: literal },
    NAMED_RANGE: { lbp: 0, nud: literal },
    ARRAY: { lbp: 0, nud: literal },
    /* ── reference operator  A1:B2  (binds tighter than everything else) ── */
    ":": { lbp: 90, led: infixLeft },
    /* ── postfix  x% ───────────────────────────────────────────────────── */
    "%": { lbp: 80, led: postfix },
    /* ── exponent  x ^ y  (left-assoc, as Excel evaluates it) ──────────── */
    "^": { lbp: 70, led: infixLeft },
    /* ── multiplicative  x * y   x / y ─────────────────────────────────── */
    "*": { lbp: 60, led: infixLeft },
    "/": { lbp: 60, led: infixLeft },
    /* ── prefix ±x  and additive  x ± y ────────────────────────────────── */
    "+": { lbp: 50, nud: prefix, led: infixLeft },
    "-": { lbp: 50, nud: prefix, led: infixLeft },
    /* ── concatenation  "a"&"b"  (tighter than comparisons) ────────────── */
    "&": { lbp: 30, led: infixLeft },
    /* ── comparisons (loosest operators) ───────────────────────────────── */
    "=": { lbp: 20, led: infixLeft },
    "<": { lbp: 20, led: infixLeft },
    ">": { lbp: 20, led: infixLeft },
    ">=": { lbp: 20, led: infixLeft },
    "<=": { lbp: 20, led: infixLeft },
    "<>": { lbp: 20, led: infixLeft },
    /* ── parentheses & identifiers ─────────────────────────────────────── */
    LPAREN: { lbp: 0, nud: parenExpression },
    IDENT: { lbp: 0, nud: identifier },
    /* ── stop / sentinel tokens (lbp 0 ends every operator loop) ───────── */
    ",": { lbp: 0 }, // argument separator
    RPAREN: { lbp: 0 },
    EOF: { lbp: 0 },
};
/* -------------------------------------------------------------------------- */
/* Helpers */
/* -------------------------------------------------------------------------- */
/**
 * Builds an AST node whose `loc` is a copy of the source token's position.
 *
 * @param type   AST node type string.
 * @param value  Decoded literal value, or `undefined` for non-literal nodes.
 * @param args   Child nodes; an omitted / `undefined` argument becomes `[]`.
 * @param tok    Source token; its `pos.start` and `pos.end` are copied into `loc`.
 * @returns      `{ type, value, args, loc: { start, end } }`.
 */
function node(type, value, args = [], tok) {
    const { start, end } = tok.pos;
    const loc = { start, end };
    return { type, value, args, loc };
}
/* ---------- literal nud --------------------------------------------------- */
/**
 * `nud` handler for atom tokens.
 *
 * • LITERAL – the first and last characters (the quotes) are dropped and every
 *             doubled quote `""` inside becomes a single `"`.
 * • NUMBER  – the text is converted to a JavaScript number.
 * • BOOLEAN – the text is upper-cased.
 * • Any other type (CELL, RANGE, ERROR, NAMED_RANGE, ARRAY, …) keeps its token
 *   type as the node type and its raw text as the value.
 *
 * @param tok  The atom token.
 * @returns    A leaf node (no children) located at `tok`.
 */
function literal(tok) {
    switch (tok.type) {
        case "LITERAL": {
            const unquoted = tok.text.slice(1, -1);
            return node("LITERAL", unquoted.replace(/""/g, '"'), [], tok);
        }
        case "NUMBER":
            return node("NUMBER", Number(tok.text), [], tok);
        case "BOOLEAN":
            return node("BOOLEAN", tok.text.toUpperCase(), [], tok);
        default:
            return node(tok.type, tok.text, [], tok);
    }
}
/* ---------- identifier nud (function call or named range) ----------------- */
/**
 * `nud` handler for IDENT tokens.
 *
 * • Identifier followed by `(` → function-call node. The node type is the
 *   upper-cased identifier text, the value is `undefined` and the children are
 *   the parsed arguments.
 * • Otherwise → `NamedRange` node carrying the identifier text unchanged.
 *
 * Omitted arguments are kept as `Missing` placeholder nodes, including one
 * after a trailing comma: `INDEX(a,,4)` has three arguments and `F(a,)` two.
 * `F()` has none.
 *
 * @param tok  The IDENT token (already consumed by the caller).
 * @returns    The call or named-range node, located at `tok`.
 * @throws     Via `expect` when the closing `)` is absent.
 */
function identifier(tok) {
    // No opening parenthesis: a plain workbook / LET name.
    if (!accept("LPAREN")) {
        return node("NamedRange", tok.text, [], tok);
    }
    const callArgs = [];
    if (peek().type !== "RPAREN") {
        let expectMore = true;
        while (expectMore) {
            // A comma in argument position means the argument was left blank;
            // the comma itself is consumed by accept(",") below.
            const upcoming = peek();
            callArgs.push(upcoming.type === "," ? nodeMissing(upcoming) : expression(0));
            if (!accept(",")) {
                expectMore = false;
            }
            else if (peek().type === "RPAREN") {
                // Separator directly before `)` → trailing blank argument.
                callArgs.push(nodeMissing(peek()));
                expectMore = false;
            }
        }
    }
    expect("RPAREN");
    return node(tok.text.toUpperCase(), undefined, callArgs, tok);
}
/**
 * Creates the `Missing` placeholder node used for an omitted call argument.
 *
 * @param tok  Token whose `pos.start` / `pos.end` are copied into `loc`.
 * @returns    `{ type: "Missing", value: undefined, args: [], loc }`.
 */
function nodeMissing(tok) {
    const { start, end } = tok.pos;
    return { type: "Missing", value: undefined, args: [], loc: { start, end } };
}
/* ---------- prefix operator nud ------------------------------------------ */
/**
 * `nud` handler for unary `+` / `-`.
 *
 * The operand is parsed with a right-binding power of 70. `expression` keeps
 * absorbing operators only while that power is strictly below their `lbp`, so:
 *   • `^` (lbp 70) is NOT absorbed: `-2^2` groups as `(-2)^2`.
 *   • postfix `%` (lbp 80) is absorbed: `-5%` groups as `-(5%)`.
 *
 * @param tok  The `+` or `-` token.
 * @returns    Node typed by the operator text, with the operand as only child.
 */
function prefix(tok) {
    const OPERAND_BINDING_POWER = 70;
    const operand = expression(OPERAND_BINDING_POWER);
    return node(tok.text, undefined, [operand], tok);
}
/* ---------- postfix operator led (currently only %) ---------------------- */
/**
 * `led` handler for postfix operators (the table wires only `%` to it).
 * Nothing further is consumed from the stream; the finished left operand is
 * simply wrapped.
 *
 * @param left  Operand that precedes the operator.
 * @param tok   The postfix operator token.
 * @returns     Node typed by the operator text, with `left` as only child.
 */
function postfix(left, tok) {
    const operands = [left];
    return node(tok.text, undefined, operands, tok);
}
/* ---------- left-associative infix led ----------------------------------- */
/**
 * `led` handler for left-associative binary operators.
 *
 * The right operand is parsed with the operator's own `lbp` as threshold, so a
 * following operator of equal power is left for the caller's loop. This is
 * what makes `a-b-c` group as `(a-b)-c`.
 *
 * @param left  Already-parsed left operand.
 * @param tok   The operator token.
 * @returns     Binary node `[left, right]`; typed `"Range"` when the operator
 *              text is `:`, otherwise typed by the operator text.
 */
function infixLeft(left, tok) {
    const { lbp } = safeSpec(tok);
    const rhs = expression(lbp);
    // The range operator gets a dedicated node type instead of ":".
    const kind = tok.text === ":" ? "Range" : tok.text;
    return node(kind, undefined, [left, rhs], tok);
}
/* ---------- right-associative infix led (exponent) ----------------------- */
/**
 * `led` handler for right-associative binary operators.
 *
 * The right operand is parsed with a threshold one below the operator's own
 * `lbp`, so a following operator of equal power is absorbed into the right
 * operand: `a op b op c` groups as `a op (b op c)`.
 *
 * @param left  Already-parsed left operand.
 * @param tok   The operator token.
 * @returns     Binary node typed by the operator text, children `[left, right]`.
 */
function infixRight(left, tok) {
    const ownPower = safeSpec(tok).lbp;
    const rhs = expression(ownPower - 1);
    return node(tok.text, undefined, [left, rhs], tok);
}
/* ---------- parenthesised expression ------------------------------------- */
/**
 * `nud` handler for `(`: parses the enclosed expression, requires the closing
 * `)` and returns the inner node itself. No wrapper node is created for the
 * parentheses.
 *
 * @param _lp  The `(` token; accepted for the `nud` signature but not used.
 * @returns    The node of the enclosed expression.
 * @throws     Via `expect` when the closing `)` is absent.
 */
function parenExpression(_lp) {
    const inner = expression(0);
    expect("RPAREN");
    return inner;
}
/* -------------------------------------------------------------------------- */
/* Pratt engine */
/* -------------------------------------------------------------------------- */
// Module-level parser state shared by peek / advance and, through them, by
// every handler above. Both bindings are undefined until parseTokensPratt
// assigns them, and each call to parseTokensPratt overwrites them.
let tokens; // token array currently being parsed
let idx; // index into `tokens` of the next unread token
/**
 * Looks up the Pratt spec for a token.
 *
 * The table is consulted by `tok.type` first; only when that entry is `null`
 * or `undefined` is `tok.text` tried as the key.
 *
 * @param tok  Token to resolve.
 * @returns    The matching table entry.
 * @throws     Error naming the token type and text when no entry is found.
 */
function safeSpec(tok) {
    const byType = table[tok.type];
    const found = byType === null || byType === undefined ? table[tok.text] : byType;
    if (found) {
        return found;
    }
    throw new Error(`No Pratt spec for token type "${tok.type}" (${tok.text})`);
}
/**
 * Returns the token `offset` places from the cursor without moving the cursor.
 *
 * @param offset  Distance from the cursor; 0 (the default) is the next unread token.
 * @returns       The token at that position, or `EOF_TOKEN` when that slot is
 *                `null` / `undefined` (for example past the end of the array).
 */
function peek(offset = 0) {
    const candidate = tokens[idx + offset];
    if (candidate === null || candidate === undefined) {
        return EOF_TOKEN;
    }
    return candidate;
}
/**
 * Returns the token under the cursor and moves the cursor one step forward.
 * The cursor is incremented even when it is already past the end.
 *
 * @returns  The consumed token, or `EOF_TOKEN` when that slot is `null` /
 *           `undefined`.
 */
function advance() {
    const current = tokens[idx];
    idx += 1;
    if (current === null || current === undefined) {
        return EOF_TOKEN;
    }
    return current;
}
/**
 * Consumes the next token only when its type equals `type`.
 *
 * @param type  Token type to match.
 * @returns     `true` when a token was consumed, `false` when the stream was
 *              left untouched.
 */
function accept(type) {
    const matched = peek().type === type;
    if (matched) {
        advance();
    }
    return matched;
}
/**
 * Consumes the next token, which must be of the given type.
 *
 * @param type  Required token type.
 * @returns     The consumed token.
 * @throws      Error `Expected <type> at char <n>` when the next token has a
 *              different type; `<n>` is that token's `pos.start`. Nothing is
 *              consumed in that case.
 */
function expect(type) {
    const upcoming = peek();
    if (upcoming.type === type) {
        return advance();
    }
    throw new Error(`Expected ${type} at char ${upcoming.pos.start}`);
}
/**
 * Core Pratt loop: parses one expression whose operators all bind tighter
 * than `rbp`.
 *
 * 1. The first token is consumed and must have a `nud` handler; that handler
 *    produces the initial left-hand node.
 * 2. While the next token's `lbp` is strictly greater than `rbp`, that token
 *    is consumed and its `led` handler folds the current node into a larger one.
 *
 * @param rbp  Right-binding power (precedence threshold); defaults to 0.
 * @returns    Parsed AST subtree.
 * @throws     `Unexpected token …` when the first token has no `nud`;
 *             `Token … cannot appear here` when a continuing token has no `led`;
 *             whatever `safeSpec` throws for a token it cannot resolve.
 */
function expression(rbp = 0) {
    const head = advance();
    const headSpec = safeSpec(head);
    if (!headSpec.nud) {
        throw new Error(`Unexpected token ${head.text}`);
    }
    let tree = headSpec.nud(head);
    while (rbp < safeSpec(peek()).lbp) {
        const operator = advance();
        const operatorSpec = safeSpec(operator);
        if (!operatorSpec.led) {
            throw new Error(`Token ${operator.text} cannot appear here`);
        }
        tree = operatorSpec.led(tree, operator);
    }
    return tree;
}
/* -------------------------------------------------------------------------- */
/* Public API */
/* -------------------------------------------------------------------------- */
/**
 * Sentinel handed out by `peek` / `advance` when the cursor points at an empty
 * slot, typically past the end of the token array. Its position is fixed at
 * 0–0, so an error that prints `pos.start` for it reports char 0.
 */
const EOF_TOKEN = {
    type: "EOF",
    text: "",
    pos: { start: 0, end: 0 },
};
/**
 * Public entry point: parses a complete token list into one AST.
 *
 * Resets the module-level cursor state, parses a single expression and then
 * requires that the next token is of type `EOF`. `peek` supplies `EOF_TOKEN`
 * when the list simply ends.
 *
 * @param ts  Token list produced by the lexer.
 * @returns   Root {@link ASTNode}.
 * @throws    `Unexpected trailing token …` when input remains after the
 *            expression, plus any error raised while parsing the expression.
 */
export function parseTokensPratt(ts) {
    tokens = ts;
    idx = 0;
    const root = expression(0);
    const leftover = peek();
    if (leftover.type === "EOF") {
        return root;
    }
    throw new Error(`Unexpected trailing token ${leftover.text}`);
}