series-extractor
Version:
A TypeScript library for extracting data series from nested objects and arrays using a custom syntax.
865 lines (789 loc) • 25.7 kB
text/typescript
/**
* Module for extracting data series from nested objects and arrays
*
* @remarks
* TODO:
* - row extractor needs rethinking; it is too ambiguous what constitutes a "row";
* {a: b.c.$x, b: c{d:$y, $e:$f}, c: x.$z} is a great example to demonstrate this
* - have a ? prefix indicate a fixed variable, which could be an alternative syntax to $var=value
* - parser should validate that a variable is not nested within itself
* - Could add feature to filter a dimension variable; e.g. $var in [whitelist of constants];
* syntax might be $var=value; maybe would only support string values to start; for multiple
* maybe use | as separator character? $var=a|b|c? Can be implemented completely inside
* Token.fromBuilder method without affecting other parse logic I think. Maybe a simpler
* alternative would be to not include those in the syntax, but in the config.yml instead:
* series.filter: dict[str,list[str]]
* - map token -> series id + dimension index
* - implement the *reverse* operation, taking a generator of extracted values and building a nested
* json/array object using the same syntax; abstracting Nesting.extractRows to handle any output
* format, not just row-wise
* - simplify syntax: {$key: $} -> {$key} ?
*/
/**
 * Bit flags selecting which optional items are generated during extraction.
 * Members can be combined with bitwise OR.
 */
export enum ExtractableFlags {
  /** nothing extra is generated */
  NONE = 0,
  /** also generate {@link ExtractableStack} markers when a nesting is entered or exited */
  STACK = 1,
  /** also generate values for anonymous dimension variables */
  ANONYMOUS = 2,
}
/**
 * Markers signalling that extraction is entering or leaving the nesting of a dimension.
 * Only dimension variables produce these markers; nestings under constant keys do not.
 */
export enum ExtractableStack {
  /** the previously yielded {@link Value} opens a new nesting */
  PUSH = 'PUSH',
  /** the current nesting is closed */
  POP = 'POP',
}
/**
 * Base error raised for invalid series syntax.
 *
 * Within this module the class shadows the built-in `SyntaxError`.
 */
export class SyntaxError extends Error {
  /**
   * @param message - human readable description of the problem
   */
  constructor(message: string) {
    super(message);
    // When compiled to ES5, calling `Error` as the super constructor returns a plain Error
    // and drops the subclass prototype, which breaks `instanceof` checks on this class and
    // its subclasses. Restoring the prototype explicitly is a no-op on ES2015+ targets.
    Object.setPrototypeOf(this, new.target.prototype);
    this.name = "SyntaxError";
  }
}
/**
 * Syntax error that also records where in the syntax string the problem was found,
 * together with a small rendered excerpt pointing at the offending position.
 */
export class SeriesSyntaxError extends SyntaxError {
  /** 1-indexed line number where error occurred */
  public readonly line: number;
  /** 1-indexed column number where error occurred */
  public readonly col: number;
  /** 1-indexed character position where error occurred */
  public readonly char: number;
  /** The original syntax string being parsed */
  public readonly syntaxInput: string;
  /** Formatted context showing the error location */
  public readonly context: string;
  /** The original error message without formatting */
  public readonly originalMessage: string;

  /**
   * @param message - unformatted description of the problem
   * @param line - 1-indexed line of the error
   * @param col - 1-indexed column of the error
   * @param char - 1-indexed character position of the error
   * @param syntaxInput - the full syntax string that was being parsed
   */
  constructor(
    message: string,
    line: number,
    col: number,
    char: number,
    syntaxInput: string
  ) {
    const excerpt = SeriesSyntaxError.renderContext(message, col, char, syntaxInput);
    super(`Invalid series syntax: line #${line}, col #${col}, char #${char}\n${excerpt}`);
    this.name = "SeriesSyntaxError";
    this.line = line;
    this.col = col;
    this.char = char;
    this.syntaxInput = syntaxInput;
    this.context = excerpt;
    this.originalMessage = message;
  }

  /** Build the two-part context: the syntax up to shortly after the error, then a caret line */
  private static renderContext(
    message: string,
    col: number,
    char: number,
    syntaxInput: string
  ): string {
    // everything before the offending character, then at most the rest of its line
    const head = syntaxInput.substring(0, char - 1);
    let tail = syntaxInput.substring(char - 1, char + 20);
    const eol = tail.indexOf('\n');
    if (eol !== -1) {
      tail = tail.substring(0, Math.max(1, eol));
    }
    // put the message left of the caret when it fits, otherwise to the right of it
    const fitsBeforeCaret = message.length < col - 1;
    const marker = fitsBeforeCaret
      ? `${message} ^`.padStart(col, " ")
      : `${"^".padStart(col, " ")} ${message}`;
    return `${head}${tail}\n${marker}`;
  }
}
/** Base class for anything that can extract values from data */
export abstract class Extractable {
  /**
   * Extracts series values from nested data
   *
   * @param data - data value to extract from
   * @param flags - controls which optional items (e.g. {@link ExtractableStack} markers) are generated
   * @yields extracted values
   */
  public abstract extract(data: any, flags?: ExtractableFlags): Generator<Value | ExtractableStack>;

  /**
   * Structural equality check. The base implementation always throws.
   *
   * @param other - object to compare against
   * @throws Error always, in this base implementation
   */
  public equals(other: any): boolean {
    const reason = "Method 'equals' must be implemented by subclasses.";
    throw new Error(reason);
  }
}
/** Bit flags describing which kinds of token may appear in a given context */
export enum TokenType {
  /** integer array index, interpreted as such automatically because it sits inside [] array nesting */
  IMPLICIT_INTEGER = 0x1,
  /**
   * integer array index, marked explicitly with the # prefix in shortcut syntax;
   * with the current syntax definition a key is never both implicit and explicit integer
   */
  EXPLICIT_INTEGER = 0x2,
  /** implicit or explicit integer (both integer bits) */
  INTEGER = 0x3,
  /** string */
  STRING = 0x4,
  /** integer or string (all type bits) */
  ANY_TYPE = 0x7,
  /** token is a dimension variable */
  DIMENSION = 0x8,
}
/**
 * Token holds a key or value in the data structure. It can be an integer key for arrays, or
 * a string key for objects. The token may be a dimension variable. Values are always dimension
 * variables.
 */
export class Token extends Extractable {
  /** matches a backslash escape and captures the escaped character (newlines included) */
  private static reUnescape = /\\([\s\S])/g;
  /** matches a string consisting solely of decimal digits */
  private static reInteger = /^\d+$/;
  /** short label for each flag bit; used by {@link Token.toString} */
  private static reprMap: { [key: number]: string } = {
    [TokenType.IMPLICIT_INTEGER]: "int",
    [TokenType.EXPLICIT_INTEGER]: "#int",
    [TokenType.STRING]: "str",
    [TokenType.DIMENSION]: "var"
  };
  /** token identifier; integer for array index; null if it is an anonymous dimension variable */
  public name: string | number | null;
  /** type bits of the token, plus the DIMENSION bit when it is a dimension variable */
  public flags: TokenType;

  /**
   * @param name - token identifier (see {@link Token.name})
   * @param flags - token type bits, optionally combined with DIMENSION
   * @throws SyntaxError if the type of `name` does not agree with `flags`, or if `flags`
   *   holds an unsupported combination of type bits
   */
  constructor(name: string | number | null, flags: TokenType) {
    super();
    this.name = name;
    this.flags = flags;
    if (this.flags & TokenType.DIMENSION) {
      if (!(typeof this.name === 'string' || this.name === null)) {
        throw new SyntaxError("variable dimension name must always be string");
      }
    } else if (flags & TokenType.EXPLICIT_INTEGER) {
      if (!(typeof this.name === 'number')) {
        throw new SyntaxError("name should be int for non-variable explicit integer");
      }
    } else if (typeof this.name === 'number') {
      if (!(flags & TokenType.IMPLICIT_INTEGER)) {
        throw new SyntaxError("name type doesn't match flags");
      }
    }
    // flags must be a single type, except for the case where fromBuilder cannot yet resolve
    // what the type might be; finalizeType resolves the type in that scenario
    const types = this.flags & TokenType.ANY_TYPE;
    if (!this.finalized) {
      if (!(types === (TokenType.IMPLICIT_INTEGER | TokenType.STRING))) {
        throw new SyntaxError("flags must be IMPLICIT_INTEGER | STRING when not finalized");
      }
    }
  }

  /** Two tokens are equal when both their name and their flags are identical */
  public equals(other: any): boolean {
    return other instanceof Token && this.name === other.name && this.flags === other.flags;
  }

  /** Debug representation: `<name|label|...>` with one label per flag bit that is set */
  public toString(): string {
    const attrs: (string | number)[] = [String(this.name)];
    for (const k in Token.reprMap) {
      if (this.flags & Number(k)) {
        attrs.push(Token.reprMap[Number(k)]);
      }
    }
    return `<${attrs.join('|')}>`;
  }

  /** Whether token is an anonymous dimension variable */
  public get anonymous(): boolean {
    return this.name === null;
  }

  /**
   * Convert a string name to an integer array index.
   *
   * Names of dimension variables, and names that are not strings, are returned unchanged.
   *
   * @param name - token name to convert
   * @param flags - flags of the token the name belongs to
   * @returns the parsed non-negative integer, or `name` unchanged when no conversion applies
   * @throws SyntaxError if the name is not made up solely of decimal digits
   */
  private static _parseInteger(name: string | number | null, flags: TokenType): number | string | null {
    if ((flags & TokenType.DIMENSION) || typeof name !== 'string') {
      return name;
    }
    // parseInt would silently accept trailing garbage ("12abc" -> 12, "1.5" -> 1), so the
    // whole string is validated first; this also excludes signs and empty input
    if (!Token.reInteger.test(name)) {
      throw new SyntaxError("Expected positive integer key");
    }
    const idx = Number(name);
    // digit strings too long to be represented exactly cannot be used as an index
    if (!Number.isSafeInteger(idx)) {
      throw new SyntaxError("Expected positive integer key");
    }
    return idx;
  }

  /**
   * Parse token from a string builder's contents
   *
   * @param builder - characters collected for the token; escaped characters are single
   *   entries that still carry their leading backslash
   * @param flags - what kind of token we expect; if ANY_TYPE, and result is not
   *   EXPLICIT_INTEGER, you'll need to call finalizeType later based on lookahead
   *   context to set the token's type
   * @throws SyntaxError if `flags` is unsupported, a `#` prefix is used where it is not
   *   allowed, an explicit integer key is malformed, or a non-dimension key is empty
   */
  public static fromBuilder(builder: string[], flags: TokenType): Token {
    if (![
      TokenType.STRING, // object key
      TokenType.STRING | TokenType.EXPLICIT_INTEGER, // shortcut
      TokenType.ANY_TYPE // array key or shortcut
    ].includes(flags)) {
      throw new SyntaxError("invalid flags argument");
    }
    let name: string | number | null = null;
    let currentFlags = flags;
    let chars = builder;
    // # prefix explicitly marks it as an integer token
    const explicit = chars[0] === '#';
    if (explicit) {
      if (!(currentFlags & TokenType.EXPLICIT_INTEGER)) {
        throw new SyntaxError("Explicit integer key shortcut not allowed here");
      }
      currentFlags = TokenType.EXPLICIT_INTEGER;
      chars = chars.slice(1);
    } else {
      currentFlags &= ~TokenType.EXPLICIT_INTEGER;
    }
    // $ prefix indicates dimension variable
    if (chars[0] === '$') {
      currentFlags |= TokenType.DIMENSION;
      chars = chars.slice(1);
    }
    if (chars.length > 0) {
      // join to a string and remove escape backslashes
      const text = chars.join('').replace(Token.reUnescape, "$1");
      // explicit integer keys are converted right away
      name = (currentFlags & TokenType.EXPLICIT_INTEGER)
        ? Token._parseInteger(text, currentFlags)
        : text;
    } else if (!(currentFlags & TokenType.DIMENSION)) {
      // a lone "#", or an empty key (no characters were collected at all)
      throw new SyntaxError(explicit ? "Expected integer key after #" : "Expected key");
    }
    return new Token(name, currentFlags);
  }

  /** Whether token's type has been finalized, i.e. exactly one type bit is set */
  public get finalized(): boolean {
    const types = this.flags & TokenType.ANY_TYPE;
    // non-zero with a single bit set
    return types !== 0 && (types & (types - 1)) === 0;
  }

  /**
   * Ensure that the token is of a certain type. Resolves tokens whose type was undecided
   * between IMPLICIT_INTEGER or STRING in the fromBuilder factory method
   *
   * @param type - one of INTEGER or STRING
   * @throws SyntaxError if the token is already finalized, `type` is not INTEGER or STRING,
   *   or the token resolves to an integer key whose name is not a valid integer
   */
  public finalizeType(type: TokenType): void {
    if (this.finalized) {
      throw new SyntaxError("token type already finalized");
    }
    if (!((type === TokenType.INTEGER) || (type === TokenType.STRING))) {
      throw new SyntaxError("invalid type argument");
    }
    // an undecided token can only become an implicit integer, never an explicit one
    const finalType = type === TokenType.INTEGER ? TokenType.IMPLICIT_INTEGER : type;
    this.flags = (this.flags & ~TokenType.ANY_TYPE) | (finalType & TokenType.ANY_TYPE);
    // parse integer keys
    if (this.flags & TokenType.IMPLICIT_INTEGER) {
      this.name = Token._parseInteger(this.name, this.flags);
    }
  }

  /**
   * Wraps data as a dimension value
   *
   * @param data - value to wrap
   * @param flags - anonymous dimensions only yield a value when ANONYMOUS is set
   * @yields a single {@link Value}, or nothing for an anonymous dimension without ANONYMOUS
   * @throws SyntaxError if the token is not a dimension variable
   */
  public *extract(data: any, flags: ExtractableFlags = ExtractableFlags.NONE): Generator<Value> {
    // always a dimension if called from Nesting
    if (!(this.flags & TokenType.DIMENSION)) {
      throw new SyntaxError("Cannot extract from a non-dimension token");
    }
    // ignore anonymous dimensions; use ExtractableStack flags instead to monitor nesting
    if ((flags & ExtractableFlags.ANONYMOUS) || !this.anonymous) {
      yield new Value(this, data);
    }
  }
}
/** A value extracted for one dimension variable */
export class Value {
  /**
   * @param dimension - dimension the value comes from
   * @param value - extracted value
   */
  constructor(
    public readonly dimension: Token,
    public readonly value: any
  ) {}

  /** Name of the dimension the value belongs to */
  public get name(): string | number | null {
    const { name } = this.dimension;
    return name;
  }

  /** Whether the dimension the value belongs to is anonymous */
  public get anonymous(): boolean {
    const { anonymous } = this.dimension;
    return anonymous;
  }

  /** Render as `name=value` */
  public toString(): string {
    const { name } = this.dimension;
    return `${name}=${this.value}`;
  }
}
/** A key together with the extractable found under it. Used for both object and array nestings */
export class Pair {
  /**
   * @param key - token identifying where to look
   * @param value - extractable applied to what is found under the key
   */
  constructor(
    public readonly key: Token,
    public readonly value: Extractable
  ) {}

  /** Pairs are equal when the other object is a Pair whose key and value are both equal */
  public equals(other: any): boolean {
    if (!(other instanceof Pair)) {
      return false;
    }
    if (!this.key.equals(other.key)) {
      return false;
    }
    return this.value.equals(other.value);
  }

  /** Render as `key → value` */
  public toString(): string {
    const left = this.key.toString();
    const right = this.value.toString();
    return `${left} → ${right}`;
  }
}
/**
 * A collection of key-value pairs describing one level of the data structure.
 * See the ObjectNesting and ArrayNesting subclasses.
 */
export abstract class Nesting extends Extractable {
  /** pairs whose key is a constant */
  protected constant_pairs: Pair[] = [];
  /** pairs whose key is a dimension (possibly anonymous) */
  protected dimension_pairs: Pair[] = [];

  constructor() {
    super();
  }

  /** Total number of pairs, constant and dimension keyed together */
  public get length(): number {
    const { constant_pairs, dimension_pairs } = this;
    return constant_pairs.length + dimension_pairs.length;
  }

  /**
   * Nestings are equal when the other object is an instance of this object's class and
   * both pair lists match element by element, in order
   */
  public equals(other: any): boolean {
    const sameClass = other instanceof (this as any).constructor;
    if (!sameClass) {
      return false;
    }
    const samePairs = (mine: Pair[], theirs: Pair[]): boolean =>
      mine.length === theirs.length && mine.every((pair, i) => pair.equals(theirs[i]));
    return (
      samePairs(this.constant_pairs, other.constant_pairs) &&
      samePairs(this.dimension_pairs, other.dimension_pairs)
    );
  }

  /** Render as the class name followed by the number of pairs */
  public toString(): string {
    const className = this.constructor.name;
    return `${className}(${this.length})`;
  }

  /**
   * Add a key-value pair to the nesting
   *
   * @param key - key token; decides whether the pair is stored as constant or dimension pair
   * @param value - what to extract from the data found under the key
   * @returns self, for chaining purposes
   * @throws SyntaxError if `value` is a token that is not a dimension variable
   */
  public add(key: Token, value: Extractable): Nesting {
    const constantLeaf = value instanceof Token && (value.flags & TokenType.DIMENSION) === 0;
    if (constantLeaf) {
      throw new SyntaxError("Leaf value must be a dimension variable");
    }
    const keyIsDimension = (key.flags & TokenType.DIMENSION) !== 0;
    const bucket = keyIsDimension ? this.dimension_pairs : this.constant_pairs;
    bucket.push(new Pair(key, value));
    return this;
  }

  /**
   * Indicate nesting has closed; e.g. ] or } character
   *
   * @throws SyntaxError if no pairs were added
   */
  public close(): void {
    if (this.length === 0) {
      throw new SyntaxError("Nested structure cannot be empty");
    }
  }

  /** Check if data has some key */
  public abstract has(data: any, key: string | number): boolean;

  /** Iterate key, value pairs */
  public abstract iter(data: any): Generator<[string | number, any]>;

  /**
   * Generator to extract dimension variables
   *
   * Constant keyed pairs are visited first, and only when the data has the key. Afterwards
   * every entry of the data is offered to every dimension keyed pair.
   *
   * @param data - data to extract from
   * @param flags - with STACK, each dimension pair's value is wrapped in PUSH/POP markers
   * @yields extracted values and, if requested, stack markers
   */
  public *extract(data: any, flags: ExtractableFlags = ExtractableFlags.NONE): Generator<Value | ExtractableStack> {
    for (const { key, value } of this.constant_pairs) {
      const name = key.name!;
      if (!this.has(data, name)) {
        continue;
      }
      yield* value.extract(data[name], flags);
    }
    if (this.dimension_pairs.length === 0) {
      return;
    }
    // these traverse the entire object
    const emitStack = (flags & ExtractableFlags.STACK) !== 0;
    for (const [entryKey, entryValue] of this.iter(data)) {
      for (const { key, value } of this.dimension_pairs) {
        yield* key.extract(entryKey, flags);
        if (emitStack) {
          yield ExtractableStack.PUSH;
        }
        yield* value.extract(entryValue, flags);
        if (emitStack) {
          yield ExtractableStack.POP;
        }
      }
    }
  }

  /**
   * Helper to organize extracted values into rows, where rows are implicitly defined
   * by the data's nesting structure
   *
   * @param data - data to extract from
   * @yields objects, where keys are dimension variable names and values are extracted values
   *   for the current row
   * @throws Error if a PUSH marker arrives without a preceding value
   */
  public *extractRows(data: any): Generator<{ [key: string]: any }> {
    /**
     * One frame per open nesting, holding the values seen inside it. A frame opened by PUSH
     * starts with the value that opened it; the initial frame is the implicit root.
     */
    const frames: (Value | undefined)[][] = [[]];
    /** row currently being built; copies of it are yielded */
    const row: { [key: string]: any } = {};
    /** most recently seen {@link Value}; undefined right after a PUSH, or a POP that yielded */
    let pending: Value | undefined = undefined;
    const wanted = ExtractableFlags.STACK | ExtractableFlags.ANONYMOUS;
    for (const item of this.extract(data, wanted)) {
      switch (item) {
        case ExtractableStack.PUSH: {
          // the last seen value becomes the head of a new frame
          if (pending === undefined) {
            throw new Error("stack push should always occur after yielding a Value");
          }
          frames[frames.length - 1].pop();
          frames.push([pending]);
          pending = undefined;
          break;
        }
        case ExtractableStack.POP: {
          // only the first POP after a value finalizes a row
          if (pending !== undefined) {
            yield { ...row };
            pending = undefined;
          }
          // drop the closed frame's named values from the row
          const closed = frames.pop();
          if (closed) {
            for (const held of closed) {
              if (held && !held.anonymous) {
                delete row[held.name as string];
              }
            }
          }
          break;
        }
        default: {
          // a new value: remember it in the current frame and, if named, in the row
          pending = item;
          frames[frames.length - 1].push(pending);
          if (!pending.anonymous) {
            row[pending.name as string] = pending.value;
          }
        }
      }
    }
    // values left over in the implicit root nesting form a final row
    if (pending !== undefined) {
      yield { ...row };
    }
  }
}
/** Data structure nesting for objects */
export class ObjectNesting extends Nesting {
  /**
   * Check whether `obj` is a non-null object with `key` as an own property
   *
   * @param obj - data to inspect; anything that is not a non-null object has no keys
   * @param key - property name to look for
   */
  public has(obj: any, key: string): boolean {
    // go through Object.prototype: the data may have no prototype (Object.create(null)) or
    // may itself contain a "hasOwnProperty" key, and calling obj.hasOwnProperty would then throw
    return (
      typeof obj === 'object' &&
      obj !== null &&
      Object.prototype.hasOwnProperty.call(obj, key)
    );
  }

  /**
   * Iterate the own enumerable string-keyed properties of `obj`
   *
   * @param obj - data to iterate; yields nothing if it is not a non-null object
   * @yields `[key, value]` tuples
   */
  public *iter(obj: { [key: string]: any }): Generator<[string, any]> {
    if (typeof obj !== 'object' || obj === null) {
      return;
    }
    for (const key of Object.keys(obj)) {
      yield [key, obj[key]];
    }
  }
}
/** Data structure nesting for arrays */
export class ArrayNesting extends Nesting {
  /**
   * Check whether `arr` is an array and `key` is an index within its bounds
   *
   * @param arr - data to inspect
   * @param key - array index to look for
   */
  public has(arr: any[], key: number): boolean {
    if (!Array.isArray(arr)) {
      return false;
    }
    return key >= 0 && key < arr.length;
  }

  /**
   * Iterate the elements of `arr` by index
   *
   * @param arr - data to iterate; yields nothing if it is not an array
   * @yields `[index, element]` tuples
   */
  public *iter(arr: any[]): Generator<[number, any]> {
    if (!Array.isArray(arr)) {
      return;
    }
    let index = 0;
    while (index < arr.length) {
      yield [index, arr[index]];
      index += 1;
    }
  }

  /**
   * Add pair where key is current array index, i.e. the number of pairs added so far
   *
   * @param value - what to extract from the element at that index
   * @returns self, for chaining purposes
   */
  public addImplicit(value: Extractable): ArrayNesting {
    const indexKey = new Token(this.length, TokenType.IMPLICIT_INTEGER);
    this.add(indexKey, value);
    return this;
  }
}
/**
 * Recursive parser for the custom series extractor syntax.
 * Only the `parse` method needs to be called externally.
 *
 * Characters are pulled one at a time from a generator ({@link Parser.consume}); a parse
 * method that has read one character too many hands it back with {@link Parser.backtrack}.
 */
export class Parser {
  /** syntax being parsed */
  public syntax: string;
  /** last token generated by parseToken */
  public token: Token | null = null;
  /** generator for characters; handles whitespace, escapes, and backtracking */
  private consumer: Generator<string>;
  /** repeat previously seen character */
  private repeat: boolean = false;
  /** 1-indexed position of last read character; 0 if no characters read yet */
  private char: number = 0;
  /** 1-indexed position in current line of last read character; 0 if no characters read for the current line */
  private col: number = 0;
  /** 1-indexed line of last read character */
  private line: number = 1;

  /**
   * @param syntax - series syntax string to parse
   */
  constructor(syntax: string) {
    this.syntax = syntax;
    this.consumer = this.consume();
  }

  /**
   * Parse the whole syntax string
   *
   * @returns the root nesting
   * @throws SeriesSyntaxError wrapping any SyntaxError raised while parsing, annotated with
   *   the position of the last character read; other errors are re-thrown unchanged
   */
  public parse(): Nesting {
    try {
      const res = this.parseRoot();
      // we don't allow extracting a lone value currently
      if (res instanceof Token) {
        throw new SyntaxError("Expected value after key");
      }
      // nothing may follow the root value
      if (this.next() !== undefined) {
        throw new SyntaxError("Unexpected character after end of syntax");
      }
      return res as Nesting;
    } catch (e: any) {
      if (e instanceof SyntaxError) {
        // Wrap in SeriesSyntaxError with position and context
        throw new SeriesSyntaxError(
          e.message,
          this.line,
          this.col,
          this.char,
          this.syntax
        );
      }
      throw e; // Re-throw other errors
    }
  }

  /**
   * Generator to consume syntax characters. This is doing some extra logic to do some of the
   * job of a tokenizer, so is not strictly iterating over single characters. Use
   * {@link Parser.next} and {@link Parser.backtrack} to interact with the consumer
   *
   * @yields a single character, or a backslash followed by the character it escapes;
   *   unescaped whitespace is never yielded
   * @throws SyntaxError once the input is exhausted if it ended with an unpaired backslash
   */
  private *consume(): Generator<string> {
    let escape = false;
    for (let c of this.syntax) {
      // position for error messages
      this.char++;
      if (c === '\n') {
        this.line++;
        this.col = 0;
      } else {
        this.col++;
      }
      // escaped characters; we leave the backslash to simplify downstream logic, where
      // the backslash ensures the subsequent character is not interpreted as special
      if (escape) {
        c = '\\' + c;
        escape = false;
      } else if (c === '\\') {
        // remember the backslash; it is emitted together with the next character
        escape = true;
        continue;
      } else if (/\s/.test(c)) {
        // ignore whitespace
        continue;
      }
      // emit the same character again for as long as a repeat was requested via backtrack
      while (true) {
        yield c;
        if (!this.repeat) {
          break;
        }
        this.repeat = false;
      }
    }
    if (escape) {
      throw new SyntaxError("Backslash missing subsequent escaped character");
    }
  }

  /**
   * Get next character
   *
   * @returns the next item from the consumer, or undefined once the syntax is exhausted
   */
  public next(): string | undefined {
    const result = this.consumer.next();
    return result.value;
  }

  /** Backtrack and repeat last character returned by {@link Parser.next} */
  public backtrack(): void {
    this.repeat = true;
  }

  /**
   * Consume as many non-special characters as possible and add them to the token builder.
   * The resulting token is stored in {@link Parser.token}.
   *
   * @param include - already consumed character to start the token with, or null for none
   * @param type - token types allowed here; passed on to Token.fromBuilder
   */
  private parseToken(include: string | null, type: TokenType): void {
    const builder: string[] = [];
    if (include !== null) {
      builder.push(include);
    }
    while (true) {
      const c = this.next();
      if (c === undefined) {
        break;
      }
      // a special character ends the token and is left for the caller
      if (".,:{}[]".includes(c)) {
        this.backtrack();
        break;
      }
      builder.push(c);
    }
    this.token = Token.fromBuilder(builder, type);
  }

  /**
   * Parse a value: an array, an object, or a token which may start a shortcut chain
   *
   * @throws SyntaxError if the syntax is exhausted or the next character cannot start a value
   */
  private parseRoot(): Extractable {
    const c = this.next();
    if (c === undefined || '.,:}]'.includes(c)) {
      throw new SyntaxError("Expected value");
    }
    if (c === '[') {
      return this.parseArray();
    }
    if (c === '{') {
      return this.parseObject();
    }
    this.parseToken(c, TokenType.STRING | TokenType.EXPLICIT_INTEGER);
    return this.parseShortcut();
  }

  /**
   * Parse syntax inside array definition: `[...]`. The opening bracket has already been
   * consumed by the caller.
   */
  private parseArray(): ArrayNesting {
    const nesting = new ArrayNesting();
    let state = 0;
    while (true) {
      const c = this.next();
      // 0: look for array key/idx or value
      if (state === 0) {
        if (c === ']') break;
        if (c === undefined || '.,:}'.includes(c)) {
          throw new SyntaxError("Expected array key or value");
        }
        if (c === '[') {
          nesting.addImplicit(this.parseArray());
          state = 1;
        } else if (c === '{') {
          nesting.addImplicit(this.parseObject());
          state = 1;
        } else {
          this.parseToken(c, TokenType.ANY_TYPE);
          // following state will finalize token's type
          state = 2;
        }
      } else if (state === 1) {
        // 1: look for array separator or end of array
        if (c === ']') break;
        if (c !== ',') {
          throw new SyntaxError("Expected comma or end of array");
        }
        state = 0;
      } else if (state === 2) {
        // 2: look for value pair to go with token, or interpret token as value itself
        if (c === undefined) {
          throw new SyntaxError("Unclosed array");
        }
        if (c === '}') {
          throw new SyntaxError("Unexpected closing brace inside array");
        }
        if (c === ']' || c === ',') {
          // token stands alone, so it is the value at the next implicit index; a token
          // that is already finalized here was written with the # prefix
          if (this.token!.finalized) {
            throw new SyntaxError("Missing value after explicit integer shortcut");
          }
          this.token!.finalizeType(TokenType.STRING);
          nesting.addImplicit(this.token!);
          if (c === ']') break;
          state = 0;
        } else if (c === ':') {
          // token is an explicit index for the value that follows
          if (this.token!.finalized) {
            throw new SyntaxError("Do not use # for integer keys inside arrays");
          }
          this.token!.finalizeType(TokenType.INTEGER);
          nesting.add(this.token!, this.parseRoot());
          state = 1;
        } else {
          // remaining characters are one of {[.
          // token starts a shortcut, which becomes the value at the next implicit index
          if (!this.token!.finalized) {
            this.token!.finalizeType(TokenType.STRING);
          }
          this.backtrack();
          nesting.addImplicit(this.parseShortcut());
          state = 1;
        }
      }
    }
    if (!nesting.length) {
      throw new SyntaxError("Empty array");
    }
    return nesting;
  }

  /**
   * Parse syntax inside object: `{...}`. The opening brace has already been consumed by
   * the caller.
   */
  private parseObject(): ObjectNesting {
    const nesting = new ObjectNesting();
    let state = 0;
    while (true) {
      const c = this.next();
      // 0: look for object key
      if (state === 0) {
        if (c === '}') break;
        if (c === undefined || '.,:[]{'.includes(c)) {
          throw new SyntaxError("Expected object key");
        }
        this.parseToken(c, TokenType.STRING);
        state = 1;
      } else if (state === 1) {
        // 1: look for colon
        if (c !== ':') {
          throw new SyntaxError("Expected colon");
        }
        nesting.add(this.token!, this.parseRoot());
        state = 2;
      } else if (state === 2) {
        // 2: look for comma or end of object
        if (c === '}') break;
        if (c !== ',') {
          throw new SyntaxError("Expected comma or end of object");
        }
        state = 0;
      }
    }
    if (!nesting.length) {
      throw new SyntaxError("Empty object");
    }
    return nesting;
  }

  /**
   * Parse syntax for shortcut array or object definition: `a.b.c`. Expects a token to have
   * been previously parsed, and its type finalized to be one of STRING or EXPLICIT_INTEGER
   *
   * @returns the token itself when nothing is chained onto it, otherwise a nesting holding
   *   the token as its only key
   */
  private parseShortcut(): Extractable {
    // an explicit integer key nests into an array, any other key into an object
    const isArray = (this.token!.flags & TokenType.EXPLICIT_INTEGER) !== 0;
    const c = this.next();
    // no well defined closing character for shortcut syntax; parent parser will decide what to do
    if (c === undefined || ',:}]'.includes(c)) {
      if (isArray) {
        throw new SyntaxError("Missing value after explicit integer shortcut");
      }
      this.backtrack();
      return this.token!;
    }
    const nesting = isArray ? new ArrayNesting() : new ObjectNesting();
    if (c === '{') {
      nesting.add(this.token!, this.parseObject());
    } else if (c === '[') {
      nesting.add(this.token!, this.parseArray());
    } else if (c === '.') {
      // remaining character is '.'
      // keep the current token as key; parseToken below overwrites this.token
      const key = this.token!;
      // parse value token
      this.parseToken(null, TokenType.STRING | TokenType.EXPLICIT_INTEGER);
      nesting.add(key, this.parseShortcut());
    } else {
      throw new SyntaxError(`Unexpected character ${c}`);
    }
    return nesting;
  }
}
/**
 * Parse series extraction syntax
 *
 * @param syntax - series syntax string
 * @returns the root nesting produced by a fresh {@link Parser}
 */
export function seriesExtractor(syntax: string): Nesting {
  const parser = new Parser(syntax);
  return parser.parse();
}