@borgar/fx
Version:
Utilities for working with Excel formulas
355 lines (334 loc) • 10.1 kB
text/typescript
import {
FX_PREFIX,
NEWLINE,
NUMBER,
OPERATOR,
REF_NAMED,
UNKNOWN,
WHITESPACE,
FUNCTION,
OPERATOR_TRIM,
REF_RANGE,
REF_BEAM
} from './constants.ts';
import { mergeRefTokens } from './mergeRefTokens.ts';
import { lexers, type PartLexer } from './lexers/sets.ts';
import type { Token } from './types.ts';
import { isRCTokenValue } from './isRCTokenValue.ts';
// Matches a function name that is exactly "let" or "lambda", ignoring case.
const reLetLambda = /^l(?:ambda|et)$/i;
/**
 * Tests whether a token exists and carries the given type.
 *
 * @param t Token to inspect (may be missing at runtime).
 * @param type Token type to compare against.
 * @returns `true`/`false` for a real token; a falsy `t` is handed back as-is.
 */
const isType = (t: Token, type: string) => {
  if (!t) {
    // same result as `t && …`: the falsy input itself, not `false`
    return t;
  }
  return t.type === type;
};
/**
 * Tells whether a token type is one of the "text" types (REF_NAMED or
 * FUNCTION) that an adjacent UNKNOWN token gets merged with.
 *
 * @param tokenType Token type to classify.
 * @returns `true` for REF_NAMED and FUNCTION, otherwise `false`.
 */
const isTextTokenType = (tokenType: string) => {
  switch (tokenType) {
    case REF_NAMED:
    case FUNCTION:
      return true;
    default:
      return false;
  }
};
/**
 * Decides whether a minus that follows `token` has to be read as a binary
 * operator rather than as the sign of the number after it.
 *
 * @param token The token preceding the minus.
 * @returns `true` when `token` is not an OPERATOR, or is one of the
 *   operators `%`, `}`, `)` or `#`; `false` for any other operator.
 */
const causesBinaryMinus = (token: Token) => {
  if (!isType(token, OPERATOR)) {
    return true;
  }
  switch (token.value) {
    case '%':
    case '}':
    case ')':
    case '#':
      return true;
    default:
      return false;
  }
};
/**
 * Re-types `r`/`c` style tokens as names when they occur inside a LET or
 * LAMBDA call.
 *
 * The token list is walked once while tracking parenthesis depth. While the
 * walk is inside a call whose function name matches `reLetLambda`, every
 * UNKNOWN token (and, in R1C1 mode, every REF_BEAM token) accepted by
 * `isRCTokenValue` has its type changed to REF_NAMED.
 *
 * Tokens are modified in place.
 *
 * @param tokens Token list to scan.
 * @param [r1c1Mode] When truthy, REF_BEAM tokens are converted as well.
 * @returns The same `tokens` array that was passed in.
 */
function fixRCNames (tokens: Token[], r1c1Mode?: boolean): Token[] {
  // Paren depth at which the outermost enclosing LET/LAMBDA call was opened;
  // 0 means we are not inside such a call.
  let withinCall = 0;
  let parenDepth = 0;
  let lastToken: Token | undefined;
  for (const token of tokens) {
    const tokenType = token.type;
    if (tokenType === OPERATOR) {
      if (token.value === '(') {
        parenDepth++;
        // `lastToken` is undefined when the stream starts with "(", so it
        // must be guarded. Only the outermost call is recorded: a nested
        // LET/LAMBDA must not end tracking of the enclosing call when it
        // closes.
        if (
          !withinCall &&
          lastToken?.type === FUNCTION &&
          reLetLambda.test(lastToken.value)
        ) {
          withinCall = parenDepth;
        }
      }
      else if (token.value === ')') {
        // Never go below zero: a stray ")" in malformed input would otherwise
        // make a later call open at depth 0, which reads as "not in a call".
        if (parenDepth > 0) {
          parenDepth--;
        }
        if (parenDepth < withinCall) {
          withinCall = 0;
        }
      }
    }
    else if (
      withinCall &&
      (tokenType === UNKNOWN || (r1c1Mode && tokenType === REF_BEAM)) &&
      isRCTokenValue(token.value)
    ) {
      token.type = REF_NAMED;
    }
    lastToken = token;
  }
  return tokens;
}
/**
 * Options accepted by `getTokens`. Every field is optional; the defaults
 * noted below are the ones `getTokens` applies.
 */
type OptsGetTokens = {
  /** Attach `loc: [ start, end ]` offsets to each token. Defaults to `false`. */
  withLocation?: boolean;
  /** Pass the token list through `mergeRefTokens` before returning. Defaults to `true`. */
  mergeRefs?: boolean;
  /** Fold a unary minus into the number token that follows it. Defaults to `true`. */
  negativeNumbers?: boolean;
  /** Forwarded to the lexers. Defaults to `false`. */
  allowTernary?: boolean;
  /** Forwarded to the lexers and used when fixing up `r`/`c` names. Defaults to `false`. */
  r1c1?: boolean;
  /** Forwarded to the lexers. Defaults to `false`. */
  xlsx?: boolean;
};
/**
* Splits a formula string into tokens using the supplied lexers.
*
* At each position the handlers in `tokenHandlers` are tried in order and the
* first one to return a token wins. When none matches, the single character at
* that position is emitted as an UNKNOWN token. A leading `=` is emitted as an
* FX_PREFIX token before the lexers run.
*
* After lexing, three post-processing steps may run: `r`/`c` names inside
* LET/LAMBDA calls are fixed up, provisional trim operators are resolved to
* OPERATOR or UNKNOWN, and (when `mergeRefs` is on) the list is passed through
* `mergeRefTokens`.
*
* @param fx The formula source text.
* @param tokenHandlers Lexers to try, in priority order.
* @param [options] Tokenizer options; see `OptsGetTokens`.
* @returns The result of `mergeRefTokens` when `mergeRefs` is truthy, otherwise
*   the raw token array.
*/
export function getTokens (fx: string, tokenHandlers: PartLexer[], options: OptsGetTokens = {}) {
const {
withLocation = false,
mergeRefs = true,
negativeNumbers = true
} = options;
// fully-populated options object; this is what every lexer call receives
const opts = {
withLocation: withLocation,
mergeRefs: mergeRefs,
allowTernary: options.allowTernary ?? false,
negativeNumbers: negativeNumbers,
r1c1: options.r1c1 ?? false,
xlsx: options.xlsx ?? false
};
const tokens = [];
// current read offset into `fx`
let pos = 0;
// number of LET/LAMBDA call openings seen so far
let letOrLambda = 0;
// number of single-character tokens accepted by isRCTokenValue seen so far
let unknownRC = 0;
// indexes into `tokens` of OPERATOR_TRIM tokens; resolved after the main loop
const trimOps = [];
let tail0: Token; // last non-whitespace token
let tail1: Token; // penultimate non-whitespace token
let lastToken: Token; // last token
// Appends a token to the output, or merges it into the previous token when
// either of the two is UNKNOWN and the other is UNKNOWN or a text token.
const pushToken = (token: Token) => {
let tokenType = token.type;
const isCurrUnknown = tokenType === UNKNOWN;
const isLastUnknown = lastToken && lastToken.type === UNKNOWN;
if (lastToken && (
(isCurrUnknown && isLastUnknown) ||
(isCurrUnknown && isTextTokenType(lastToken.type)) ||
(isLastUnknown && isTextTokenType(tokenType))
)) {
// UNKNOWN tokens "contaminate" sibling text tokens
lastToken.value += token.value;
lastToken.type = UNKNOWN;
if (withLocation) {
lastToken.loc[1] = token.loc[1];
}
}
else {
if (tokenType === OPERATOR_TRIM) {
// remember where the trim operator lands and treat it as UNKNOWN for now;
// its final type is decided once its neighbours are known
trimOps.push(tokens.length);
tokenType = UNKNOWN;
token.type = UNKNOWN;
}
// push token as normally
tokens[tokens.length] = token;
lastToken = token;
if (tokenType !== WHITESPACE && tokenType !== NEWLINE) {
tail1 = tail0;
tail0 = token;
}
}
};
if (fx.startsWith('=')) {
const token: Token = { type: FX_PREFIX, value: '=' };
if (withLocation) {
token.loc = [ 0, 1 ];
}
pos++;
pushToken(token);
}
const numHandlers = tokenHandlers.length;
while (pos < fx.length) {
const startPos = pos;
let token;
// first lexer to produce a token wins
for (let i = 0; i < numHandlers; i++) {
token = tokenHandlers[i](fx, pos, opts);
if (token) {
pos += token.value.length;
break;
}
}
// no lexer matched: consume one character as an UNKNOWN token
if (!token) {
token = {
type: UNKNOWN,
value: fx[pos]
};
pos++;
}
if (withLocation) {
token.loc = [ startPos, pos ];
}
// make a note if we found a let/lambda call
if (lastToken && token.value === '(' && lastToken.type === FUNCTION) {
if (reLetLambda.test(lastToken.value)) {
letOrLambda++;
}
}
// Make a note if we found a R or C unknown or REF_BEAM token in R1C1 mode.
// It seems unlikely that anyone does `F2 = LET(c,1,c+F:F)` as this is a
// circular reference (and not a very useful one), so we're assuming that
// all "c" or "r" tokens found within the LET are names.
if (token.value.length === 1 && (token.type === UNKNOWN || (opts.r1c1 && token.type === REF_BEAM))) {
unknownRC += isRCTokenValue(token.value) ? 1 : 0;
}
if (negativeNumbers && token.type === NUMBER) {
const last1 = lastToken;
// do we have a number preceded by a minus?
if (last1?.type === OPERATOR && last1.value === '-') {
// missing tail1 means we are at the start of the stream
if (
!tail1 ||
tail1.type === FX_PREFIX ||
!causesBinaryMinus(tail1)
) {
// unary minus: drop the operator token and fold the sign into the number
const minus = tokens.pop();
token.value = '-' + token.value;
if (token.loc) {
// ensure offsets are up to date
token.loc[0] = minus.loc[0];
}
// next step tries to counter the screwing around with the tailing
// it should be correct again once we pushToken()
tail0 = tail1;
lastToken = tokens[tokens.length - 1];
}
}
}
pushToken(token);
}
// if we encountered both a LAMBDA/LET call, and unknown 'r' or 'c' tokens
// we'll turn the unknown tokens into names within the call.
if (unknownRC && letOrLambda) {
fixRCNames(tokens, opts.r1c1);
}
// Any OPERATOR_TRIM tokens have been indexed already, they now need to be
// either turned into OPERATORs or UNKNOWNs. Trim operators are only allowed
// between two REF_RANGE tokens as they are not valid in expressions as full
// operators.
for (const index of trimOps) {
const before = tokens[index - 1];
const after = tokens[index + 1];
tokens[index].type = (before?.type === REF_RANGE && after?.type === REF_RANGE)
? OPERATOR
: UNKNOWN;
}
if (mergeRefs) {
return mergeRefTokens(tokens);
}
return tokens;
}
/**
* Options for {@link tokenize}.
*/
export type OptsTokenize = {
/**
* Tokens will include their source position offsets: `{ loc: [ start, end ] }`
* @defaultValue false
*/
withLocation?: boolean,
/**
* Should ranges be returned as whole references (`Sheet1!A1:B2`) or as separate tokens for each
* part: (`Sheet1`,`!`,`A1`,`:`,`B2`). This is the same as calling [`mergeRefTokens`](#mergeRefTokens)
* @defaultValue true
*/
mergeRefs?: boolean,
/**
* Merges unary minuses with their immediately following number tokens (`-`,`1`) => `-1`
* (alternatively these will be unary operations in the tree).
* @defaultValue true
*/
negativeNumbers?: boolean
/**
* Enables the recognition of ternary ranges in the style of `A1:A` or `A1:1`. These are supported
* by Google Sheets but not Excel. See: [References.md](./References.md).
* @defaultValue false
*/
allowTernary?: boolean
/**
* Ranges are expected to be in the R1C1 style format rather than the more popular A1 style.
* @defaultValue false
*/
r1c1?: boolean
};
/**
 * Tokenizes a formula string.
 *
 * The formula and options are handed unchanged to `getTokens` together with
 * the module's standard lexer set. The result is a flat array of token
 * objects, for example:
 *
 * ```js
 * [
 *   { type: FX_PREFIX, value: '=' },
 *   { type: FUNCTION, value: 'SUM' },
 *   { type: OPERATOR, value: '(' },
 *   { type: REF_RANGE, value: 'A1:B2' },
 *   { type: OPERATOR, value: ')' }
 * ]
 * ```
 *
 * The available token types are collected on the package's {@link tokenTypes}
 * export.
 *
 * _Warning:_ So that formulas can be highlighted while they are being typed,
 * a `STRING` token is allowed to be "unterminated". The incomplete formula
 * `="Hello world` is tokenized as:
 *
 * ```js
 * [
 *   { type: FX_PREFIX, value: '=' },
 *   { type: STRING, value: '"Hello world', unterminated: true },
 * ]
 * ```
 *
 * Parsers consuming the tokens need to cope with this.
 *
 * @see {@link OptsTokenize}
 * @see {@link tokenTypes}
 * @param formula An Excel formula string (an Excel expression).
 * @param [options] Options
 * @returns An array of Tokens
 */
export function tokenize (
  formula: string,
  options: OptsTokenize = {}
): Token[] {
  const tokens: Token[] = getTokens(formula, lexers, options);
  return tokens;
}
/**
 * Tokenizes a formula string with the `xlsx` lexer option switched on.
 *
 * This takes the same options as {@link tokenize}, fills in a default for any
 * that are missing, and always passes `xlsx: true` to `getTokens`. The result
 * is a flat array of token objects, for example:
 *
 * ```js
 * [
 *   { type: FX_PREFIX, value: '=' },
 *   { type: FUNCTION, value: 'SUM' },
 *   { type: OPERATOR, value: '(' },
 *   { type: REF_RANGE, value: 'A1:B2' },
 *   { type: OPERATOR, value: ')' }
 * ]
 * ```
 *
 * The available token types are collected on the package's {@link tokenTypes}
 * export.
 *
 * _Warning:_ So that formulas can be highlighted while they are being typed,
 * a `STRING` token is allowed to be "unterminated". The incomplete formula
 * `="Hello world` is tokenized as:
 *
 * ```js
 * [
 *   { type: FX_PREFIX, value: '=' },
 *   { type: STRING, value: '"Hello world', unterminated: true },
 * ]
 * ```
 *
 * @see {@link OptsTokenize}
 * @see {@link tokenTypes}
 * @param formula An Excel formula string (an Excel expression).
 * @param [options] Options
 * @returns An array of Tokens
 */
export function tokenizeXlsx (
  formula: string,
  options: OptsTokenize = {}
): Token[] {
  const { withLocation, mergeRefs, allowTernary, negativeNumbers, r1c1 } = options;
  return getTokens(formula, lexers, {
    withLocation: withLocation ?? false,
    mergeRefs: mergeRefs ?? true,
    allowTernary: allowTernary ?? false,
    negativeNumbers: negativeNumbers ?? true,
    r1c1: r1c1 ?? false,
    // the one setting callers cannot override
    xlsx: true
  });
}