UNPKG

@dobesv/parquets

Version:

TypeScript implementation of the Parquet file format, based on parquet.js

90 lines (89 loc) 3.14 kB
import { ParquetBuffer, ParquetColumnData, ParquetRecord, ParquetValueArray } from './declare';
import { ParquetSchema } from './schema';
import { CustomError } from 'ts-custom-error';
/**
 * Error class declared for record shredding, built on `CustomError` from
 * `ts-custom-error` and constructed from a plain message string.
 *
 * NOTE(review): presumably the generic failure raised while shredding; the
 * throw sites are not visible in this declaration file.
 */
export declare class ParquetShredError extends CustomError {
    constructor(message: string);
}
/**
 * Shredding error that carries the affected field in `fieldName`.
 *
 * Per these declarations it extends `CustomError` directly, not
 * `ParquetShredError`, so an `instanceof ParquetShredError` check does not
 * match it.
 *
 * NOTE(review): judging by the name, raised when a required field is absent
 * from a record - confirm against the implementation.
 */
export declare class MissingRequiredFieldShredError extends CustomError {
    /** Name of the field associated with this error. */
    fieldName: string;
    constructor(fieldName: string);
}
/**
 * Shredding error that carries the affected field in `fieldName`.
 *
 * Per these declarations it extends `CustomError` directly, not
 * `ParquetShredError`, so an `instanceof ParquetShredError` check does not
 * match it.
 *
 * NOTE(review): judging by the name, raised when a field receives more values
 * than its repetition type allows - confirm against the implementation.
 */
export declare class TooManyValuesShredError extends CustomError {
    /** Name of the field associated with this error. */
    fieldName: string;
    constructor(fieldName: string);
}
/**
 * Statistics record kept per string key; both `ParquetWriteBuffer.statistics`
 * and the result of `shredStatisticsBuffers` map string keys to this shape.
 *
 * NOTE(review): the member descriptions below are inferred from the member
 * names only - confirm against the implementation.
 */
export interface ColumnStatistics {
    /** Presumably the smallest value observed. */
    min: any;
    /** Presumably the largest value observed. */
    max: any;
    /** Presumably the number of null values observed. */
    null_count: number;
    /** Presumably the set of distinct values observed. */
    distinct_values: Set<any>;
}
/**
 * Shredded data for a single column, as stored in
 * `ParquetWriteBuffer.columnData`.
 */
export interface ParquetWriteColumnData {
    /** Definition levels, one per shredded tuple. */
    dLevels: number[];
    /** Repetition levels, one per shredded tuple. */
    rLevels: number[];
    /** Values of the shredded tuples. */
    values: ParquetValueArray;
    /** NOTE(review): presumably the number of entries buffered - confirm. */
    count: number;
}
/**
 * Buffer that `shredRecord` appends shredded records to. It is constructed
 * from a schema.
 */
export declare class ParquetWriteBuffer {
    /** Row counter (shown as `rowCount: X` in the `shredRecord` layout). */
    rowCount: number;
    /** Shredded data per column, keyed by a string such as `'my_col'`. */
    columnData: Record<string, ParquetWriteColumnData>;
    /** Statistics records, keyed by string. */
    statistics: Record<string, ColumnStatistics>;
    constructor(schema: ParquetSchema);
}
/**
 * Produces a map from string key to `ColumnStatistics` for the given schema.
 *
 * NOTE(review): presumably one fresh entry per column of `schema`, used to
 * populate `ParquetWriteBuffer.statistics` - confirm against the
 * implementation.
 *
 * @param schema - Schema the statistics map is derived from.
 * @returns The statistics map.
 */
export declare const shredStatisticsBuffers: (schema: ParquetSchema) => Record<string, ColumnStatistics>;
/**
 * 'Shred' a record into a list of <value, repetition_level, definition_level>
 * tuples per column using the Google Dremel Algorithm.
 *
 * The buffer argument must point to an object into which the shredded record
 * will be returned. You may re-use the buffer for repeated calls to this
 * function to append to an existing buffer, as long as the schema is
 * unchanged.
 *
 * The format in which the shredded records will be stored in the buffer is as
 * follows (the `count` member of each column entry and the buffer's
 * `statistics` map are omitted here):
 *
 *   buffer = {
 *     columnData: {
 *       'my_col': {
 *         dLevels: [d1, d2, .. dN],
 *         rLevels: [r1, r2, .. rN],
 *         values: [v1, v2, .. vN],
 *       }, ...
 *     },
 *     rowCount: X,
 *   }
 *
 * NOTE(review): the error classes declared in this file are presumably thrown
 * from here; the throw sites are not visible in this declaration file.
 *
 * @param schema - Schema describing the record's columns.
 * @param record - Record to shred.
 * @param buffer - Buffer the shredded tuples are appended to.
 */
export declare function shredRecord(schema: ParquetSchema, record: any, buffer: ParquetWriteBuffer): void;
/**
 * 'Materialize' a list of <value, repetition_level, definition_level>
 * tuples back to nested records (objects/arrays) using the Google Dremel
 * Algorithm.
 *
 * The buffer argument must point to an object with the following structure
 * (i.e. the structure documented for `shredRecord`):
 *
 *   buffer = {
 *     columnData: {
 *       'my_col': {
 *         dlevels: [d1, d2, .. dN],
 *         rlevels: [r1, r2, .. rN],
 *         values: [v1, v2, .. vN],
 *       }, ...
 *     },
 *     rowCount: X,
 *   }
 *
 * NOTE(review): the parameter here is typed `ParquetBuffer`, whereas
 * `shredRecord` fills a `ParquetWriteBuffer`, and the member casing shown
 * above (`dlevels`/`rlevels`) differs from `ParquetWriteColumnData`
 * (`dLevels`/`rLevels`). `ParquetBuffer` is declared in './declare' and is not
 * visible here - confirm the exact layout there.
 *
 * @param schema - Schema describing the columns held in the buffer.
 * @param buffer - Buffer holding the shredded column data.
 * @returns The materialized records.
 */
export declare function materializeRecords(schema: ParquetSchema, buffer: ParquetBuffer): ParquetRecord[];
/**
 * Support iteration over the values in a single column.
 *
 * For a simple column which is not repeated and not nested in a repeated
 * field, this will give one value for each row in the input.
 *
 * If the column is repeated or nested in a repeated column, it will give an
 * array for each row in the input.
 *
 * When there are multiple levels of repetition the iterator will yield
 * nested arrays.
 *
 * @param schema - Schema that contains the column.
 * @param data - Shredded data of the column to iterate over.
 * @param columnPath - Path segments identifying the column
 *   (NOTE(review): presumably resolved against `schema` - confirm).
 * @returns A generator yielding one value, or one (possibly nested) array,
 *   per input row.
 */
export declare function materializeColumn(schema: ParquetSchema, data: ParquetColumnData, columnPath: string[]): Generator<any, void, unknown>;