UNPKG

@dobesv/parquets

Version:

TypeScript implementation of the Parquet file format, based on parquet.js

206 lines (189 loc) 6.49 kB
export type ParquetCodec = 'PLAIN' | 'RLE'; export type ParquetCompression = | 'UNCOMPRESSED' | 'GZIP' | 'SNAPPY' | 'LZO' | 'BROTLI' | 'LZ4'; export type RepetitionType = 'REQUIRED' | 'OPTIONAL' | 'REPEATED'; export type ParquetType = PrimitiveType | OriginalType; export type PrimitiveType = // Base Types | 'BOOLEAN' // 0 | 'INT32' // 1 | 'INT64' // 2 | 'INT96' // 3 | 'FLOAT' // 4 | 'DOUBLE' // 5 | 'BYTE_ARRAY' // 6, | 'FIXED_LEN_BYTE_ARRAY'; // 7 export type OriginalType = // Converted Types | 'UTF8' // 0 // | 'MAP' // 1 // | 'MAP_KEY_VALUE' // 2 // | 'LIST' // 3 // | 'ENUM' // 4 // | 'DECIMAL' // 5 | 'DATE' // 6 | 'TIME_MILLIS' // 7 | 'TIME_MICROS' // 8 | 'TIMESTAMP_MILLIS' // 9 | 'TIMESTAMP_MICROS' // 10 | 'UINT_8' // 11 | 'UINT_16' // 12 | 'UINT_32' // 13 | 'UINT_64' // 14 | 'INT_8' // 15 | 'INT_16' // 16 | 'INT_32' // 17 | 'INT_64' // 18 | 'JSON' // 19 | 'BSON' // 20 | 'INTERVAL'; // 21 export interface SchemaDefinition { [string: string]: FieldDefinition; } export interface FieldDefinition { type?: ParquetType; typeLength?: number; encoding?: ParquetCodec; compression?: ParquetCompression; /** * Optional fields can be null instead of having a value of the given schema type. * * When an optional type is not provided, it is not included into the data. * * Instead there is an array of "dlevels" indicated whether optional values are present at * each row offset into a column chunk. * * Note that fields should not be marked both optional and repeated - the underlying parquet * schema does not have a way to represent fields that are both optional and repeated. */ optional?: boolean; /** * Repeated fields can occur more than once. They represent arrays or lists of values. * * The "rlevels" data is used to indicate whether values in the data are part of a new * array or part of the same array as the prior value. * * Note that fields should not be marked both optional and repeated - the underlying parquet * schema does not have a way to represent fields that are both optional and repeated. */ repeated?: boolean; fields?: SchemaDefinition; } export interface ParquetField { name: string; path: string[]; key: string; primitiveType?: PrimitiveType; originalType?: OriginalType; repetitionType: RepetitionType; typeLength?: number; encoding?: ParquetCodec; compression?: ParquetCompression; /** * The maximum repetition level is a count of repeated fields in this field's * path (e.g. this field and its ancestors). * * When scanning values in the data, the rLevelMax is used to determine whether a REPEATED value * should be added to an existing array or if a new array (or arrays) should be created to * add the value to. * * If the value is not repeated (and neither are any of its ancestor fields) then rLevelMax * should be zero. * * If the rLevelMax is 1 and `repetitionType === 'REPEATED'`, this field is itself repeated * in its parent object (which is the root for a top-level field). * * Note that fields that do not have `repetitionType === 'REPEATED'` can still have `rLevelMax > 0` * if they are in a nested object that is repeated. */ rLevelMax: number; /** * The maximum definition level is a count of optional fields in this field's * path (e.g. this field and its ancestors). * * dLevelMax is used when decoding to determine whether to expect a value to be * present in the output for a given column and row. * * If the dLeveLMax is 0, the field is not optional. * * If the dLevelMax is 1, and the repetitionType === 'OPTIONAL', this field is itself * optional in its parent object. * * If this field is not optional but its parent is an optional value then it will have * a non-zero dLevelMax. */ dLevelMax: number; isNested?: boolean; fieldCount?: number; fields?: Record<string, ParquetField>; } export interface ParquetBuffer { rowCount?: number; columnData?: Record<string, ParquetColumnData>; } export type ParquetValueArray = | boolean[] | number[] | string[] | Buffer[] | (string | Buffer)[] | Int32Array | Float32Array | Float64Array; export interface ParquetColumnData { /** * Definition levels specify how many optional fields in the path for the column are defined. * * If the definition level for a row is less than the number of optional fields in field path * (dLevelMax), there will not be a value or repetition level encoded for that field. * * The field path refers to the field and its parent fields if it is nested. */ dLevels: Int32Array | number[]; /** * For fields which have REPEATED fields in their field path, repetition levels may be specified. * * The repetition level indicates the number of repeated fields in the field path that are being * "repeated" (rather than started anew in their parent record / array / row). * * The field path refers to the field and its parent fields if it is nested. * * A repetition level of zero indicates the first value for all the repeated fields in the path; * they would all get a fresh new array. * * When the repetition level is greater than zero and less than the number of repeated fields * (rLevelMax) in the field path we can keep that number of parent arrays but create new arrays * after that. * * When the repetition level is equal to the number of repeated fields (rLevelMax) we are simply * adding to the end of the most recently created array. * * If the field path has only one repeated field, the repetition level will be 0 for the first * element of a given row and 1 for the ones that follow. * * If the field path has two repeated fields, the repetition level will be 0 for the first element * in a row, 1 for the first element of the second set of values for the first repeated field in * the field path, and 2 for subsequent elements in a single group/array. */ rLevels: Int32Array | number[]; /** * Values read from the column. May be returned as a primitive * array format if it is able to. * * - UInt8Array: BOOLEAN * - Int32Array: INT32 * - Float32Array: FLOAT * - Float64Array: DOUBLE */ values: ParquetValueArray; /** * Value count */ count: number; } export type ParquetRecord = Record<string, any>;