UNPKG

series-extractor

Version:

A TypeScript library for extracting data series from nested objects and arrays using a custom syntax.

865 lines (789 loc) 25.7 kB
/**
 * Module for extracting data series from nested objects and arrays
 *
 * @remarks
 * TODO:
 * - row extractor needs rethinking; it is too ambiguous what constitutes a "row";
 *   {a: b.c.$x, b: c{d:$y, $e:$f}, c: x.$z} is a great example to demonstrate this
 * - have a ? prefix indicate a fixed variable, which could be an alternative syntax to $var=value
 * - parser should validate that a variable is not nested within itself
 * - Could add feature to filter a dimension variable; e.g. $var in [whitelist of constants];
 *   syntax might be $var=value; maybe would only support string values to start; for multiple
 *   maybe use | as separator character? $var=a|b|c? Can be implemented completely inside
 *   Token.fromBuilder method without affecting other parse logic I think. Maybe a simpler
 *   alternative would be to not include those in the syntax, but in the config.yml instead:
 *   series.filter: dict[str,list[str]]
 * - map token -> series id + dimension index
 * - implement the *reverse* operation, taking a generator of extracted values and building a nested
 *   json/array object using the same syntax; abstracting Nesting.extractRows to handle any output
 *   format, not just row-wise
 * - simplify syntax: {$key: $} -> {$key} ?
 */

/** Controls values generated during extraction */
export enum ExtractableFlags {
  /** no special flags */
  NONE = 0b0,
  /** include {@link ExtractableStack} flags to indicate entering/exiting a nesting */
  STACK = 0b1,
  /** include anonymous dimension variables */
  ANONYMOUS = 0b10,
}

/**
 * Flags to indicate entering/exiting a dimension's nesting while extracting.
 * Nesting from constant keys do not emit stack flags, only dimension variables.
 */
export enum ExtractableStack {
  /** begin nesting on previously yielded {@link Value} */
  PUSH = "PUSH",
  /** close current nesting */
  POP = "POP",
}

/**
 * Error raised for invalid series syntax.
 *
 * Note: within this module the name shadows the built-in `SyntaxError`.
 */
export class SyntaxError extends Error {
  constructor(message: string) {
    super(message);
    this.name = "SyntaxError";
  }
}

/**
 * Enhanced syntax error with position information and context
 */
export class SeriesSyntaxError extends SyntaxError {
  /** 1-indexed line number where error occurred */
  public readonly line: number;
  /** 1-indexed column number where error occurred */
  public readonly col: number;
  /** 1-indexed character position where error occurred */
  public readonly char: number;
  /** The original syntax string being parsed */
  public readonly syntaxInput: string;
  /** Formatted context showing the error location */
  public readonly context: string;
  /** The original error message without formatting */
  public readonly originalMessage: string;

  /**
   * @param message - bare error description (kept as {@link originalMessage})
   * @param line - 1-indexed line of the offending character
   * @param col - 1-indexed column of the offending character
   * @param char - 1-indexed absolute position of the offending character
   * @param syntaxInput - the full syntax string that was being parsed
   */
  constructor(
    message: string,
    line: number,
    col: number,
    char: number,
    syntaxInput: string
  ) {
    // Generate context: everything before the offending character, plus a short
    // look-ahead that is cut at the end of the current line
    const before = syntaxInput.substring(0, char - 1);
    let after = syntaxInput.substring(char - 1, char + 20);
    const newlineIndex = after.indexOf('\n');
    if (newlineIndex !== -1) {
      after = after.substring(0, Math.max(1, newlineIndex));
    }

    // Format the pointer line; the message goes left of the caret when it fits
    // in front of the error column, otherwise to the right of it
    let pointer = message;
    if (pointer.length < col - 1) {
      pointer += " ^";
      pointer = pointer.padStart(col, " ");
    } else {
      pointer = "^".padStart(col, " ") + " " + pointer;
    }
    const context = `${before}${after}\n${pointer}`;

    // Create the full formatted message
    const fullMessage =
      `Invalid series syntax: line #${line}, col #${col}, char #${char}\n` + context;
    super(fullMessage);

    this.name = "SeriesSyntaxError";
    this.line = line;
    this.col = col;
    this.char = char;
    this.syntaxInput = syntaxInput;
    this.context = context;
    this.originalMessage = message;
  }
}

/** A class which can extract values */
export abstract class Extractable {
  /**
   * Extracts series values from nested data
   *
   * @param data - data value to extract from
   * @param flags - also generate {@link ExtractableStack} flags to indicate entering/exiting a nesting
   * @yields extracted values
   */
  public abstract extract(data: any, flags?: ExtractableFlags): Generator<Value | ExtractableStack>;

  /**
   * Structural equality with another extractable.
   *
   * @throws Error always, unless overridden by the subclass
   */
  public equals(other: any): boolean {
    throw new Error("Method 'equals' must be implemented by subclasses.");
  }
}

/** Token type that can appear in a given context */
export enum TokenType {
  /** integer array index, automatically interpreted so inside [] array nesting */
  IMPLICIT_INTEGER = 0b1,
  /**
   * integer array index, explicitly interpreted via # prefix in shortcut syntax;
   * with current syntax definition, a key is never both implicit and explicit integer
   */
  EXPLICIT_INTEGER = 0b10,
  /** either implicit or explicit integer type */
  INTEGER = 0b11,
  /** string */
  STRING = 0b100,
  /** integer or string */
  ANY_TYPE = 0b111,
  /** token which is a dimension variable */
  DIMENSION = 0b1000,
}

/**
 * Token holds a key or value in the data structure. It can be an integer key for arrays, or
 * string for objects. The token may be a dimension variable. Values are always dimension
 * variables
 */
export class Token extends Extractable {
  /** matches a backslash escape, capturing the escaped character */
  private static reUnescape = /\\(.)/g;
  /** matches a complete, optionally signed, decimal integer */
  private static reInteger = /^[+-]?\d+$/;
  private static reprMap: { [key: number]: string } = {
    [TokenType.IMPLICIT_INTEGER]: "int",
    [TokenType.EXPLICIT_INTEGER]: "#int",
    [TokenType.STRING]: "str",
    [TokenType.DIMENSION]: "var"
  };

  /** token identifier; integer for array index; null if it's an anonymous dimension variable */
  public name: string | number | null;
  /** type bits of the token, plus {@link TokenType.DIMENSION} if it is a dimension variable */
  public flags: TokenType;

  /**
   * @param name - token identifier; null for an anonymous dimension variable
   * @param flags - token type bits
   * @throws SyntaxError if `name` and `flags` are inconsistent with each other
   */
  constructor(name: string | number | null, flags: TokenType) {
    super();
    this.name = name;
    this.flags = flags;

    if (flags & TokenType.DIMENSION) {
      if (!(typeof name === 'string' || name === null)) {
        throw new SyntaxError("variable dimension name must always be string");
      }
    } else if (flags & TokenType.EXPLICIT_INTEGER) {
      if (typeof name !== 'number') {
        throw new SyntaxError("name should be int for non-variable explicit integer");
      }
    } else if (typeof name === 'number') {
      if (!(flags & TokenType.IMPLICIT_INTEGER)) {
        throw new SyntaxError("name type doesn't match flags");
      }
    }

    // flags must be a single type, except for the case where fromBuilder cannot yet resolve
    // what the type might be; finalizeType resolves the type in that scenario
    const types = flags & TokenType.ANY_TYPE;
    if (!this.finalized) {
      if (types !== (TokenType.IMPLICIT_INTEGER | TokenType.STRING)) {
        throw new SyntaxError("flags must be IMPLICIT_INTEGER | STRING when not finalized");
      }
    }
  }

  /** Two tokens are equal when both name and flags match exactly */
  public equals(other: any): boolean {
    return other instanceof Token && this.name === other.name && this.flags === other.flags;
  }

  /** Debug representation: `<name|type...>` */
  public toString(): string {
    const attrs: (string | number)[] = [String(this.name)];
    for (const k in Token.reprMap) {
      if (this.flags & Number(k)) {
        attrs.push(Token.reprMap[Number(k)]);
      }
    }
    return `<${attrs.join('|')}>`;
  }

  /** Whether token is an anonymous dimension variable */
  public get anonymous(): boolean {
    return this.name === null;
  }

  /** Number of set bits in a non-negative integer */
  private static countBits(n: number): number {
    let count = 0;
    while (n > 0) {
      n &= (n - 1);
      count++;
    }
    return count;
  }

  /**
   * Convert name to integer.
   *
   * Dimension variables and names that are not strings are returned untouched. Otherwise the
   * whole string (ignoring surrounding whitespace) must be a decimal integer; a bare
   * `parseInt` would silently accept trailing garbage such as `12abc`.
   *
   * @throws SyntaxError if the name is not a non-negative decimal integer
   */
  private static _parseInteger(name: string | number | null, flags: TokenType): number | string | null {
    if ((flags & TokenType.DIMENSION) || typeof name !== 'string') {
      return name;
    }
    const text = name.trim();
    if (!Token.reInteger.test(text)) {
      throw new SyntaxError("Expected positive integer key");
    }
    const idx = parseInt(text, 10);
    if (idx < 0) {
      throw new SyntaxError("Expected positive integer key");
    }
    return idx;
  }

  /**
   * Parse token from a string builder's contents
   *
   * @param builder - characters of the token; escaped characters keep their backslash
   * @param flags - what kind of token we expect; if ANY_TYPE, and result is not
   *  EXPLICIT_INTEGER, you'll need to call finalizeType later based on lookahead
   *  context to set the token's type
   * @throws SyntaxError for an invalid `flags` argument, a misplaced `#` prefix, an empty
   *  non-dimension token, or a malformed explicit integer
   */
  public static fromBuilder(builder: string[], flags: TokenType): Token {
    if (![
      TokenType.STRING, // object key
      TokenType.STRING | TokenType.EXPLICIT_INTEGER, // shortcut
      TokenType.ANY_TYPE // array key or shortcut
    ].includes(flags)) {
      throw new SyntaxError("invalid flags argument");
    }

    let name: string | number | null = null;
    let currentFlags = flags;

    // # prefix explicitly marks it as an integer token
    const explicit = builder[0] === '#';
    if (explicit) {
      if (!(currentFlags & TokenType.EXPLICIT_INTEGER)) {
        throw new SyntaxError("Explicit integer key shortcut not allowed here");
      }
      currentFlags = TokenType.EXPLICIT_INTEGER;
      builder = builder.slice(1);
    } else {
      currentFlags &= ~TokenType.EXPLICIT_INTEGER;
    }

    // $ prefix indicates dimension variable
    if (builder[0] === '$') {
      currentFlags |= TokenType.DIMENSION;
      builder = builder.slice(1);
    }

    if (builder.length > 0) {
      // convert to string and remove escape backslashes
      name = builder.join('').replace(Token.reUnescape, "$1");
      // parse int if possible
      if (currentFlags & TokenType.EXPLICIT_INTEGER) {
        name = Token._parseInteger(name, currentFlags);
      }
    } else if (!(currentFlags & TokenType.DIMENSION)) {
      // an empty constant token: either a lone "#", or nothing at all (e.g. the
      // segment after the trailing dot in "a.")
      throw new SyntaxError(explicit ? "Expected integer key after #" : "Expected key");
    }
    return new Token(name, currentFlags);
  }

  /** Whether token's type has been finalized, i.e. exactly one type bit is set */
  public get finalized(): boolean {
    return Token.countBits(this.flags & TokenType.ANY_TYPE) === 1;
  }

  /**
   * Ensure that the token is of a certain type. Resolves tokens whose type was undecided
   * between IMPLICIT_INTEGER or STRING in the fromBuilder factory method
   *
   * @param type - one of INTEGER or STRING
   * @throws SyntaxError if already finalized, if `type` is invalid, or if the name is not a
   *  valid integer when finalizing a constant token as INTEGER
   */
  public finalizeType(type: TokenType): void {
    if (this.finalized) {
      throw new SyntaxError("token type already finalized");
    }
    if (!((type === TokenType.INTEGER) || (type === TokenType.STRING))) {
      throw new SyntaxError("invalid type argument");
    }
    // an undecided token can only resolve to the implicit flavor of integer
    const finalType: TokenType = type === TokenType.INTEGER ? TokenType.IMPLICIT_INTEGER : type;
    this.flags = (this.flags & ~TokenType.ANY_TYPE) | (finalType & TokenType.ANY_TYPE);
    // parse integer keys
    if (this.flags & TokenType.IMPLICIT_INTEGER) {
      this.name = Token._parseInteger(this.name, this.flags);
    }
  }

  /**
   * Wraps data as a dimension value
   *
   * @param data - value to wrap
   * @param flags - anonymous dimensions only yield when ANONYMOUS is set
   * @throws SyntaxError if the token is not a dimension variable
   */
  public *extract(data: any, flags: ExtractableFlags = ExtractableFlags.NONE): Generator<Value> {
    // always a dimension if called from Nesting
    if (!(this.flags & TokenType.DIMENSION)) {
      throw new SyntaxError("Cannot extract from a non-dimension token");
    }
    // ignore anonymous dimensions; use ExtractableStack flags instead to monitor nesting
    if ((flags & ExtractableFlags.ANONYMOUS) || !this.anonymous) {
      yield new Value(this, data);
    }
  }
}

/** Extracted value for a dimension variable */
export class Value {
  /** dimension the value comes from */
  public readonly dimension: Token;
  /** extracted value */
  public readonly value: any;

  constructor(dimension: Token, value: any) {
    this.dimension = dimension;
    this.value = value;
  }

  /** Get dimension's name */
  public get name(): string | number | null {
    return this.dimension.name;
  }

  /** Whether dimension is anonymous */
  public get anonymous(): boolean {
    return this.dimension.anonymous;
  }

  public toString(): string {
    return `${this.dimension.name}=${this.value}`;
  }
}
/** Key-value lookup in the data structure. Used for both object and array nestings */
export class Pair {
  public readonly key: Token;
  public readonly value: Extractable;

  constructor(key: Token, value: Extractable) {
    this.key = key;
    this.value = value;
  }

  /** Pairs are equal when both their keys and their values are equal */
  public equals(other: any): boolean {
    return other instanceof Pair && this.key.equals(other.key) && this.value.equals(other.value);
  }

  public toString(): string {
    return `${this.key.toString()} → ${this.value.toString()}`;
  }
}

/** A nesting of key-value pairs in the data structure. See ObjectNesting and ArrayNesting subclasses */
export abstract class Nesting extends Extractable {
  /** pairs whose key is a constant */
  protected constant_pairs: Pair[] = [];
  /** pairs whose key is a dimension (possibly anonymous) */
  protected dimension_pairs: Pair[] = [];

  constructor() {
    super();
  }

  /** Total number of pairs in the nesting */
  public get length(): number {
    return this.constant_pairs.length + this.dimension_pairs.length;
  }

  /** Nestings are equal when they share a class and hold pairwise-equal pairs in the same order */
  public equals(other: any): boolean {
    if (!(other instanceof (this as any).constructor)) {
      return false;
    }
    // Compare constant_pairs
    if (this.constant_pairs.length !== other.constant_pairs.length) {
      return false;
    }
    for (let i = 0; i < this.constant_pairs.length; i++) {
      if (!this.constant_pairs[i].equals(other.constant_pairs[i])) {
        return false;
      }
    }
    // Compare dimension_pairs
    if (this.dimension_pairs.length !== other.dimension_pairs.length) {
      return false;
    }
    for (let i = 0; i < this.dimension_pairs.length; i++) {
      if (!this.dimension_pairs[i].equals(other.dimension_pairs[i])) {
        return false;
      }
    }
    return true;
  }

  public toString(): string {
    return `${this.constructor.name}(${this.length})`;
  }

  /**
   * Add a key-value pair to the nesting
   * @returns self, for chaining purposes
   * @throws SyntaxError if `value` is a token that is not a dimension variable
   */
  public add(key: Token, value: Extractable): Nesting {
    if (value instanceof Token && !(value.flags & TokenType.DIMENSION)) {
      throw new SyntaxError("Leaf value must be a dimension variable");
    }
    const pair = new Pair(key, value);
    if (key.flags & TokenType.DIMENSION) {
      this.dimension_pairs.push(pair);
    } else {
      this.constant_pairs.push(pair);
    }
    return this;
  }

  /**
   * Indicate nesting has closed; e.g. ] or } character
   * @throws SyntaxError if no pair was added
   */
  public close(): void {
    if (!this.length) {
      throw new SyntaxError("Nested structure cannot be empty");
    }
  }

  /** Check if data has some key */
  public abstract has(data: any, key: string | number): boolean;

  /** Iterate key, value pairs */
  public abstract iter(data: any): Generator<[string | number, any]>;

  /**
   * Generator to extract dimension variables
   *
   * Constant keys are looked up directly; afterwards, if there are dimension pairs, every
   * entry of `data` is visited once and offered to each dimension pair in turn.
   */
  public *extract(data: any, flags: ExtractableFlags = ExtractableFlags.NONE): Generator<Value | ExtractableStack> {
    for (const pair of this.constant_pairs) {
      if (this.has(data, pair.key.name!)) {
        const child = data[pair.key.name!];
        yield* pair.value.extract(child, flags);
      }
    }
    // these traverse the entire object
    const stack = (flags & ExtractableFlags.STACK) !== 0;
    if (this.dimension_pairs.length > 0) {
      for (const [key, value] of this.iter(data)) {
        for (const pair of this.dimension_pairs) {
          yield* pair.key.extract(key, flags);
          if (stack) {
            yield ExtractableStack.PUSH;
          }
          yield* pair.value.extract(value, flags);
          if (stack) {
            yield ExtractableStack.POP;
          }
        }
      }
    }
  }

  /**
   * Helper to organize extracted values into rows, where rows are implicitly defined
   * by the data's nesting structure
   *
   * @yields dicts, where keys are dimension variable names and values are extracted values
   *  for the current row
   */
  public *extractRows(data: any): Generator<{ [key: string]: any }> {
    /**
     * Each entry is of the form [nesting, nested_values...]. Initial entry
     * represents the implicit root nesting, where only nested_values are present.
     */
    const stack: (Value | undefined)[][] = [[]];
    /** current row we're building */
    let row: { [key: string]: any } = {};
    /** last seen {@link Value}; undefined after stack push/pop */
    let last: Value | undefined = undefined;

    for (const valueOrStack of this.extract(data, ExtractableFlags.STACK | ExtractableFlags.ANONYMOUS)) {
      if (valueOrStack === ExtractableStack.PUSH) {
        // switch last seen value to be a new nesting
        if (last === undefined) {
          throw new Error("stack push should always occur after yielding a Value");
        }
        stack[stack.length - 1].pop();
        stack.push([last]);
        last = undefined;
      } else if (valueOrStack === ExtractableStack.POP) {
        // finalize row; yield finalized row for only first POP seen
        if (last !== undefined) {
          yield { ...row };
          last = undefined;
        }
        // clear nested values
        const poppedStack = stack.pop();
        if (poppedStack) {
          for (const nested_value of poppedStack) {
            if (nested_value && !nested_value.anonymous) {
              delete row[nested_value.name as string];
            }
          }
        }
      } else {
        // It's a Value: add to current stack nesting
        last = valueOrStack;
        stack[stack.length - 1].push(last);
        if (!last.anonymous) {
          row[last.name as string] = last.value;
        }
      }
    }
    // pop final implicit root nesting
    if (last !== undefined) {
      yield { ...row };
    }
  }
}

/** Data structure nesting for objects */
export class ObjectNesting extends Nesting {
  /**
   * Whether `obj` is a non-null object with `key` as an own property.
   *
   * Uses `Object.prototype.hasOwnProperty.call` rather than `obj.hasOwnProperty`, since the
   * data being extracted may have a null prototype or its own "hasOwnProperty" key.
   */
  public has(obj: any, key: string): boolean {
    return typeof obj === 'object' && obj !== null && Object.prototype.hasOwnProperty.call(obj, key);
  }

  /** Yield own enumerable `[key, value]` entries; yields nothing for non-objects */
  public *iter(obj: { [key: string]: any }): Generator<[string, any]> {
    if (typeof obj === 'object' && obj !== null) {
      for (const key in obj) {
        if (Object.prototype.hasOwnProperty.call(obj, key)) {
          yield [key, obj[key]];
        }
      }
    }
  }
}

/** Data structure nesting for arrays */
export class ArrayNesting extends Nesting {
  /** Whether `arr` is an array and `key` is within its bounds */
  public has(arr: any[], key: number): boolean {
    return Array.isArray(arr) && key >= 0 && key < arr.length;
  }

  /** Yield `[index, element]` entries; yields nothing for non-arrays */
  public *iter(arr: any[]): Generator<[number, any]> {
    if (Array.isArray(arr)) {
      for (let i = 0; i < arr.length; i++) {
        yield [i, arr[i]];
      }
    }
  }

  /** Add pair where key is current array index */
  public addImplicit(value: Extractable): ArrayNesting {
    this.add(new Token(this.length, TokenType.IMPLICIT_INTEGER), value);
    return this;
  }
}

/**
 * Recursive parser for the custom series extractor syntax.
 * Only the `parse` method needs to be called externally
 */
export class Parser {
  /** syntax being parsed */
  public syntax: string;
  /** last token generated by parseToken */
  public token: Token | null = null;
  /** generator for characters; handles whitespace, escapes, and backtracking */
  private consumer: Generator<string>;
  /** repeat previously seen character */
  private repeat: boolean = false;
  /** 1-indexed position of last read character; 0 if no characters read yet */
  private char: number = 0;
  /** 1-indexed position in current line of last read character; 0 if no characters read for the current line */
  private col: number = 0;
  /** 1-indexed line of last read character */
  private line: number = 1;

  constructor(syntax: string) {
    this.syntax = syntax;
    this.consumer = this.consume();
  }

  /**
   * Parse the whole syntax string
   *
   * @returns root nesting of the parsed syntax
   * @throws SeriesSyntaxError wrapping any SyntaxError, annotated with the current position
   */
  public parse(): Nesting {
    try {
      const res = this.parseRoot();
      // we don't allow extracting a lone value currently
      if (res instanceof Token) {
        throw new SyntaxError("Expected value after key");
      }
      if (this.next() !== undefined) {
        throw new SyntaxError("Unexpected character after end of syntax");
      }
      return res as Nesting;
    } catch (e: unknown) {
      if (e instanceof SyntaxError) {
        // Wrap in SeriesSyntaxError with position and context
        throw new SeriesSyntaxError(
          e.message,
          this.line,
          this.col,
          this.char,
          this.syntax
        );
      }
      throw e; // Re-throw other errors
    }
  }

  /**
   * Generator to consume syntax characters. This is doing some extra logic to do some of the
   * job of a tokenizer, so is not strictly iterating over single characters. Use
   * {@link next} and {@link backtrack} to interact with the consumer
   */
  private *consume(): Generator<string> {
    let escape = false;
    for (let c of this.syntax) {
      // position for error messages
      this.char++;
      if (c === '\n') {
        this.line++;
        this.col = 0;
      } else {
        this.col++;
      }
      // escaped characters; we leave the backslash to simplify downstream logic, where
      // the backslash ensures the subsequent character is not interpreted as special
      if (escape) {
        c = '\\' + c;
        escape = false;
      } else if (c === '\\') {
        escape = true;
        continue;
      } else if (/\s/.test(c)) {
        // ignore whitespace
        continue;
      }
      // re-emit the same character for as long as a backtrack was requested
      while (true) {
        yield c;
        if (!this.repeat) {
          break;
        }
        this.repeat = false;
      }
    }
    if (escape) {
      throw new SyntaxError("Backslash missing subsequent escaped character");
    }
  }

  /** Get next character; undefined once the syntax is exhausted */
  public next(): string | undefined {
    const result = this.consumer.next();
    return result.value;
  }

  /** Backtrack and repeat last character returned by {@link next} */
  public backtrack(): void {
    this.repeat = true;
  }

  /**
   * Consume as many non-special characters as possible and add them to the token builder.
   * The resulting token is stored in {@link token}.
   *
   * @param include - already consumed first character of the token, if any
   * @param type - token types allowed in the current context
   */
  private parseToken(include: string | null, type: TokenType): void {
    const builder: string[] = [];
    if (include !== null) {
      builder.push(include);
    }
    while (true) {
      const c = this.next();
      if (c === undefined) {
        break;
      }
      if (".,:{}[]".includes(c)) {
        this.backtrack();
        break;
      }
      builder.push(c);
    }
    this.token = Token.fromBuilder(builder, type);
  }

  /** Parse a value: an array, an object, or a shortcut/leaf token */
  private parseRoot(): Extractable {
    const c = this.next();
    if (c === undefined || '.,:}]'.includes(c)) {
      throw new SyntaxError("Expected value");
    }
    if (c === '[') {
      return this.parseArray();
    }
    if (c === '{') {
      return this.parseObject();
    }
    this.parseToken(c, TokenType.STRING | TokenType.EXPLICIT_INTEGER);
    return this.parseShortcut();
  }

  /** Parse syntax inside array definition: `[...]` */
  private parseArray(): ArrayNesting {
    const nesting = new ArrayNesting();
    let state = 0;
    while (true) {
      const c = this.next();
      if (state === 0) {
        // 0: look for array key/idx or value
        if (c === ']') break;
        if (c === undefined || '.,:}'.includes(c)) {
          throw new SyntaxError("Expected array key or value");
        }
        if (c === '[') {
          nesting.addImplicit(this.parseArray());
          state = 1;
        } else if (c === '{') {
          nesting.addImplicit(this.parseObject());
          state = 1;
        } else {
          this.parseToken(c, TokenType.ANY_TYPE);
          // following state will finalize token's type
          state = 2;
        }
      } else if (state === 1) {
        // 1: look for array separator or end of array
        if (c === ']') break;
        if (c !== ',') {
          throw new SyntaxError("Expected comma or end of array");
        }
        state = 0;
      } else if (state === 2) {
        // 2: look for value pair to go with token, or interpret token as value itself
        if (c === undefined) {
          throw new SyntaxError("Unclosed array");
        }
        if (c === '}') {
          throw new SyntaxError("Unexpected closing brace inside array");
        }
        if (c === ']' || c === ',') {
          if (this.token!.finalized) {
            throw new SyntaxError("Missing value after explicit integer shortcut");
          }
          this.token!.finalizeType(TokenType.STRING);
          nesting.addImplicit(this.token!);
          if (c === ']') break;
          state = 0;
        } else if (c === ':') {
          if (this.token!.finalized) {
            throw new SyntaxError("Do not use # for integer keys inside arrays");
          }
          this.token!.finalizeType(TokenType.INTEGER);
          nesting.add(this.token!, this.parseRoot());
          state = 1;
        } else {
          // remaining characters are one of {[.
          if (!this.token!.finalized) {
            this.token!.finalizeType(TokenType.STRING);
          }
          this.backtrack();
          nesting.addImplicit(this.parseShortcut());
          state = 1;
        }
      }
    }
    if (!nesting.length) {
      throw new SyntaxError("Empty array");
    }
    return nesting;
  }

  /** Parse syntax inside object: `{...}` */
  private parseObject(): ObjectNesting {
    const nesting = new ObjectNesting();
    let state = 0;
    while (true) {
      const c = this.next();
      if (state === 0) {
        // 0: look for object key
        if (c === '}') break;
        if (c === undefined || '.,:[]{'.includes(c)) {
          throw new SyntaxError("Expected object key");
        }
        this.parseToken(c, TokenType.STRING);
        state = 1;
      } else if (state === 1) {
        // 1: look for colon
        if (c !== ':') {
          throw new SyntaxError("Expected colon");
        }
        // the key token is read before parseRoot overwrites this.token
        nesting.add(this.token!, this.parseRoot());
        state = 2;
      } else if (state === 2) {
        // 2: look for comma or end of object
        if (c === '}') break;
        if (c !== ',') {
          throw new SyntaxError("Expected comma or end of object");
        }
        state = 0;
      }
    }
    if (!nesting.length) {
      throw new SyntaxError("Empty object");
    }
    return nesting;
  }

  /**
   * Parse syntax for shortcut array or object definition: `a.b.c`. Expects a token to have
   * been previously parsed, and its type finalized to be one of STRING or EXPLICIT_INTEGER
   */
  private parseShortcut(): Extractable {
    const isArray = (this.token!.flags & TokenType.EXPLICIT_INTEGER) !== 0;
    const c = this.next();
    // no well defined closing character for shortcut syntax; parent parser will decide what to do
    if (c === undefined || ',:}]'.includes(c)) {
      if (isArray) {
        throw new SyntaxError("Missing value after explicit integer shortcut");
      }
      this.backtrack();
      return this.token!;
    }
    const nesting = isArray ? new ArrayNesting() : new ObjectNesting();
    if (c === '{') {
      nesting.add(this.token!, this.parseObject());
    } else if (c === '[') {
      nesting.add(this.token!, this.parseArray());
    } else if (c === '.') {
      // remaining character is '.'
      const key = this.token!;
      // parse value token
      this.parseToken(null, TokenType.STRING | TokenType.EXPLICIT_INTEGER);
      nesting.add(key, this.parseShortcut());
    } else {
      throw new SyntaxError(`Unexpected character ${c}`);
    }
    return nesting;
  }
}

/** Parse series extraction syntax */
export function seriesExtractor(syntax: string): Nesting {
  return new Parser(syntax).parse();
}