@dobesv/parquets
Version:
TypeScript implementation of the Parquet file format, based on parquet.js
145 lines (144 loc) • 6.45 kB
TypeScript
/// <reference types="node" />
/**
 * Names of the value encodings a field can declare through its `encoding`
 * property.
 */
export declare type ParquetCodec =
  | 'PLAIN'
  | 'RLE';
/**
 * Names of the compression schemes a field can declare through its
 * `compression` property.
 */
export declare type ParquetCompression =
  | 'UNCOMPRESSED'
  | 'GZIP'
  | 'SNAPPY'
  | 'LZO'
  | 'BROTLI'
  | 'LZ4';
/**
 * Repetition kind recorded in the `repetitionType` property of a field:
 * the value is required, optional, or repeated.
 */
export declare type RepetitionType =
  | 'REQUIRED'
  | 'OPTIONAL'
  | 'REPEATED';
/**
 * Primitive type names. A field records one of these in its `primitiveType`
 * property.
 */
export declare type PrimitiveType =
  | 'BOOLEAN'
  | 'INT32'
  | 'INT64'
  | 'INT96'
  | 'FLOAT'
  | 'DOUBLE'
  | 'BYTE_ARRAY'
  | 'FIXED_LEN_BYTE_ARRAY';

/**
 * Original type names. A field records one of these in its `originalType`
 * property.
 */
export declare type OriginalType =
  | 'UTF8'
  | 'DATE'
  | 'TIME_MILLIS'
  | 'TIME_MICROS'
  | 'TIMESTAMP_MILLIS'
  | 'TIMESTAMP_MICROS'
  | 'UINT_8'
  | 'UINT_16'
  | 'UINT_32'
  | 'UINT_64'
  | 'INT_8'
  | 'INT_16'
  | 'INT_32'
  | 'INT_64'
  | 'JSON'
  | 'BSON'
  | 'INTERVAL';

/**
 * Any type name accepted by the `type` property of a field definition:
 * either a primitive type name or an original type name.
 */
export declare type ParquetType = PrimitiveType | OriginalType;
/**
 * Map from a string key to the definition of a field.
 *
 * The same shape is used for the child fields of a nested field definition
 * (its `fields` property).
 */
export interface SchemaDefinition {
  [fieldName: string]: FieldDefinition;
}
/**
 * Declaration of a single field inside a `SchemaDefinition`.
 *
 * Every property is optional at the type level.
 */
export interface FieldDefinition {
  /**
   * Type name of the field's values.
   *
   * NOTE(review): presumably left out for nested fields that declare
   * `fields` instead - confirm against the schema builder.
   */
  type?: ParquetType;

  /**
   * Length associated with the type.
   *
   * NOTE(review): presumably the byte length for FIXED_LEN_BYTE_ARRAY - confirm.
   */
  typeLength?: number;

  /** Encoding declared for the field. */
  encoding?: ParquetCodec;

  /** Compression scheme declared for the field. */
  compression?: ParquetCompression;

  /**
   * Marks the field as optional: it may be null rather than hold a value of
   * the declared schema type.
   *
   * An optional value that is not provided is not included in the data.
   * Instead, an array of "dlevels" indicates whether the optional value is
   * present at each row offset into a column chunk.
   *
   * Do not set this together with `repeated`: the underlying parquet schema
   * has no way to represent a field that is both optional and repeated.
   */
  optional?: boolean;

  /**
   * Marks the field as repeated: it can occur more than once, representing an
   * array or list of values.
   *
   * The "rlevels" data indicates whether a value in the data starts a new
   * array or belongs to the same array as the value before it.
   *
   * Do not set this together with `optional`: the underlying parquet schema
   * has no way to represent a field that is both optional and repeated.
   */
  repeated?: boolean;

  /**
   * Definitions of child fields.
   *
   * NOTE(review): presumably the presence of this property is what makes the
   * field a nested one - confirm.
   */
  fields?: SchemaDefinition;
}
/**
 * Description of one field: its name and path, its type information, and the
 * maximum repetition and definition levels that apply to it.
 *
 * NOTE(review): presumably the resolved counterpart of a `FieldDefinition` -
 * confirm against the schema code.
 */
export interface ParquetField {
  /** Name of the field. */
  name: string;

  /**
   * Path of the field, as a list of strings.
   *
   * NOTE(review): presumably the names of the ancestors followed by this
   * field's own name - confirm.
   */
  path: string[];

  /**
   * String key of the field.
   *
   * NOTE(review): presumably derived from `path` - confirm.
   */
  key: string;

  /** Primitive type name, when one applies. */
  primitiveType?: PrimitiveType;

  /** Original type name, when one applies. */
  originalType?: OriginalType;

  /** Whether the field is required, optional, or repeated. */
  repetitionType: RepetitionType;

  /**
   * Length associated with the type.
   *
   * NOTE(review): presumably the byte length for FIXED_LEN_BYTE_ARRAY - confirm.
   */
  typeLength?: number;

  /** Encoding of the field. */
  encoding?: ParquetCodec;

  /** Compression scheme of the field. */
  compression?: ParquetCompression;

  /**
   * Maximum repetition level: the number of repeated fields in this field's
   * path (i.e. this field and its ancestors).
   *
   * While scanning values in the data, rLevelMax decides whether a REPEATED
   * value is appended to an existing array or whether a new array (or several
   * new arrays) must be created to hold it.
   *
   * The value is zero when neither this field nor any of its ancestors is
   * repeated.
   *
   * When rLevelMax is 1 and `repetitionType === 'REPEATED'`, the field itself
   * is repeated within its parent object (the root, for a top-level field).
   *
   * A field whose `repetitionType` is not `'REPEATED'` can still have
   * `rLevelMax > 0` when it sits inside a nested object that is repeated.
   */
  rLevelMax: number;

  /**
   * Maximum definition level: the number of optional fields in this field's
   * path (i.e. this field and its ancestors).
   *
   * During decoding, dLevelMax determines whether a value is expected in the
   * output for a given column and row.
   *
   * When dLevelMax is 0, the field is not optional.
   *
   * When dLevelMax is 1 and `repetitionType === 'OPTIONAL'`, the field itself
   * is optional within its parent object.
   *
   * A field that is not optional, but whose parent is an optional value, has
   * a non-zero dLevelMax.
   */
  dLevelMax: number;

  /** Flag for nested fields. NOTE(review): exact semantics not visible here - confirm. */
  isNested?: boolean;

  /** Field count. NOTE(review): presumably the number of child fields - confirm. */
  fieldCount?: number;

  /** Child fields, keyed by string. */
  fields?: Record<string, ParquetField>;
}
/**
 * Column data for a set of columns, together with an optional row count.
 */
export interface ParquetBuffer {
  /** Number of rows. */
  rowCount?: number;

  /**
   * Data of each column, keyed by string.
   *
   * NOTE(review): presumably the keys match `ParquetField.key` - confirm.
   */
  columnData?: Record<string, ParquetColumnData>;
}
/**
 * Array shapes in which the values of a column can be held: plain arrays of
 * booleans, numbers, strings and/or Buffers, or one of the listed typed arrays.
 */
export declare type ParquetValueArray =
  | boolean[]
  | number[]
  | string[]
  | Buffer[]
  | (string | Buffer)[]
  | Int32Array
  | Float32Array
  | Float64Array;
/**
 * Data of a single column: definition levels, repetition levels, the values
 * themselves, and a value count.
 */
export interface ParquetColumnData {
  /**
   * Definition levels: how many of the optional fields in the column's field
   * path are defined.
   *
   * When the definition level for a row is lower than the number of optional
   * fields in the field path (dLevelMax), no value and no repetition level is
   * encoded for that field.
   *
   * The field path is the field together with its parent fields, if it is
   * nested.
   */
  dLevels: Int32Array | number[];

  /**
   * Repetition levels, which may be specified for fields that have REPEATED
   * fields in their field path.
   *
   * A repetition level is the number of repeated fields in the field path
   * that are being "repeated" (as opposed to being started anew in their
   * parent record / array / row).
   *
   * The field path is the field together with its parent fields, if it is
   * nested.
   *
   * - Level zero marks the first value for all the repeated fields in the
   *   path; each of them gets a fresh new array.
   * - A level greater than zero but less than the number of repeated fields
   *   in the field path (rLevelMax) means that many parent arrays are kept
   *   and new arrays are created below them.
   * - A level equal to the number of repeated fields (rLevelMax) means the
   *   value is simply appended to the most recently created array.
   *
   * With a single repeated field in the field path, the level is 0 for the
   * first element of a given row and 1 for the elements that follow.
   *
   * With two repeated fields in the field path, the level is 0 for the first
   * element in a row, 1 for the first element of the second set of values
   * for the first repeated field in the path, and 2 for subsequent elements
   * within a single group/array.
   */
  rLevels: Int32Array | number[];

  /**
   * Values read from the column. Where possible they are returned in a
   * primitive array format:
   *
   * - Uint8Array: BOOLEAN
   * - Int32Array: INT32
   * - Float32Array: FLOAT
   * - Float64Array: DOUBLE
   *
   * NOTE(review): `ParquetValueArray` has no `Uint8Array` member (it lists
   * `boolean[]`), so the BOOLEAN entry above and the declared type disagree -
   * confirm which one reflects the decoder.
   */
  values: ParquetValueArray;

  /**
   * Value count.
   *
   * NOTE(review): whether entries without an encoded value are included in
   * this count is not visible here - confirm.
   */
  count: number;
}
/**
 * Object with string keys and values of any type.
 *
 * NOTE(review): going by the alias name this presumably represents one
 * row/record of data - confirm. The values are typed `any`, so consumers get
 * no type checking on them.
 */
export declare type ParquetRecord = Record<string, any>;