@haz3y0ne/parsexl

// src/lexer/tokenize.ts
// -----------------------------------------------------------------------------
// Excel formula → Token[] for the Pratt parser
//   • Splits sheet-qualified ranges (Sheet!$A$1:$B$2) into CELL ":" CELL
//   • Treats plain A1 and A1:B2 via legacy CELL_OR_RANGE
//   • Pass `debug = true` to print a compact token stream
// -----------------------------------------------------------------------------

import { tokenRegexMap, type TokenPattern } from "./patterns";
import type { Token } from "../types";

/* kinds that may hide a colon and therefore need splitting */
const RANGEABLE: readonly TokenPattern[] = [
  "CELL",
  "CELL_OR_RANGE", // ← legacy pattern now handled
  "EXTERNAL_REFERENCE",
  "THREE_D_REFERENCE",
];

/* kinds that should ultimately look like CELL to Pratt */
const CELL_LIKE: readonly TokenPattern[] = [
  "CELL",
  "CELL_OR_RANGE", // ← legacy pattern now mapped
  "EXTERNAL_REFERENCE",
  "THREE_D_REFERENCE",
  "R1C1",
  "STRUCTURED_REFERENCE",
  "STRUCTURED_EXT",
  "TABLE_COLUMN",
];

/* sheet prefix: 'quoted name', [bracketed name] or a bare identifier */
const SHEET_PREFIX = String.raw`(?:'[^']+'|\[[^\]]+]|[A-Za-z_][A-Za-z0-9_]*)`;

/* A1-style cell with optional `$` anchors, e.g. A1, $B$2, AA$10 */
const A1_CELL = String.raw`\$?[A-Za-z]{1,3}\$?\d+`;

/*
 * Sheet-qualified range anchored at the start of the input.
 * Group 1 = "<sheet>!<cell>", group 2 = "<cell>".
 * Built once at module load (no `g`/`y` flag, so `exec` is stateless).
 */
const SHEET_RANGE_RX = new RegExp(
  `^(${SHEET_PREFIX}!${A1_CELL}):(${A1_CELL})`
);

/**
 * Build the CELL ":" CELL triple for a range whose text begins at `start`.
 * The three spans are contiguous and each matches its own text length.
 */
function rangeTokens(left: string, right: string, start: number): Token[] {
  const colonAt = start + left.length;
  const rightAt = colonAt + 1;
  return [
    { type: "CELL", text: left, pos: { start, end: colonAt } },
    { type: ":", text: ":", pos: { start: colonAt, end: rightAt } },
    {
      type: "CELL",
      text: right,
      pos: { start: rightAt, end: rightAt + right.length },
    },
  ];
}

/**
 * Tokenise `formula`. Pass `true` to `debug` to dump the tokens.
 *
 * The formula is trimmed and one leading `=` is removed before scanning, so
 * every `pos` offset is relative to that normalised string rather than to the
 * raw argument.
 *
 * @param formula Formula text, with or without a leading `=`.
 * @param debug   When `true`, writes a one-line token dump via `console.error`.
 * @returns The token list, always terminated by an `EOF` token.
 * @throws {Error} If no pattern in `tokenRegexMap` matches at the current
 *   offset.
 */
export function tokenize(formula: string, debug = false): Token[] {
  const src = formula.trim().replace(/^=/, "");
  const tokens: Token[] = [];

  /** push helper */
  const push = (tok: Token) => tokens.push(tok);

  // Loop-invariant: resolve the pattern table once per call, not per token.
  const patterns = Object.entries(tokenRegexMap) as [TokenPattern, RegExp][];

  let i = 0;
  while (i < src.length) {
    const slice = src.slice(i);

    /* ── fast path: sheet-qualified range ──────────────────────────────── */
    const ranged = SHEET_RANGE_RX.exec(slice);
    if (ranged) {
      const [full, left, right] = ranged;
      tokens.push(...rangeTokens(left, right, i));
      i += full.length;
      continue;
    }

    /* ── longest-match scan ────────────────────────────────────────────── */
    let kind: TokenPattern | null = null;
    let lexeme = "";
    let len = 0;

    for (const [pattern, rx] of patterns) {
      const m = rx.exec(slice);
      // Only matches anchored at the cursor count. The strict `>` means ties
      // go to the earlier pattern and zero-length matches are never accepted.
      if (m?.index === 0 && m[0].length > len) {
        kind = pattern;
        lexeme = m[0];
        len = m[0].length;
      }
    }

    if (!kind) {
      const ctx = src.slice(Math.max(i - 3, 0), i + 3);
      throw new Error(`Unrecognised input at ${i}: '${src[i]}' (…${ctx}…)`);
    }

    const pos = { start: i, end: i + len };

    /* ── re-tag & emit ─────────────────────────────────────────────────── */
    switch (kind) {
      case "FUNCTION": {
        const name = lexeme.slice(0, -1); // drop '('
        // IDENT ends where the parenthesis begins, so the two spans do not
        // overlap and the IDENT span matches its text.
        push({
          type: "IDENT",
          text: name,
          pos: { start: pos.start, end: pos.end - 1 },
        });
        push({
          type: "LPAREN",
          text: "(",
          pos: { start: pos.end - 1, end: pos.end },
        });
        break;
      }

      case "PARENTHESIS":
        push({ type: lexeme === "(" ? "LPAREN" : "RPAREN", text: lexeme, pos });
        break;

      case "ARG_SEPARATOR":
        push({ type: ",", text: ",", pos });
        break;

      case "OPERATOR":
      case "PERCENT_OPERATOR":
        // Operators use their own lexeme as the token type.
        push({ type: lexeme, text: lexeme, pos });
        break;

      default: {
        const colon = lexeme.indexOf(":");
        if (RANGEABLE.includes(kind) && colon !== -1) {
          // Split at the first colon into CELL ":" CELL.
          tokens.push(
            ...rangeTokens(
              lexeme.slice(0, colon),
              lexeme.slice(colon + 1),
              pos.start
            )
          );
        } else {
          push({
            type: CELL_LIKE.includes(kind) ? "CELL" : kind,
            text: lexeme,
            pos,
          });
        }
      }
    }

    i += len;
  }

  push({ type: "EOF", text: "", pos: { start: src.length, end: src.length } });

  /* ── debug dump ─────────────────────────────────────────────────────── */
  if (debug) {
    console.error(
      tokens
        .map(
          (t, idx) =>
            `${idx}:` + (t.type === t.text ? t.type : `${t.type}(${t.text})`)
        )
        .join(" ")
    );
  }

  return tokens;
}