@dobesv/parquets
Version:
TypeScript implementation of the Parquet file format, based on parquet.js
206 lines (189 loc) • 6.49 kB
text/typescript
export type ParquetCodec = 'PLAIN' | 'RLE';
export type ParquetCompression =
| 'UNCOMPRESSED'
| 'GZIP'
| 'SNAPPY'
| 'LZO'
| 'BROTLI'
| 'LZ4';
export type RepetitionType = 'REQUIRED' | 'OPTIONAL' | 'REPEATED';
export type ParquetType = PrimitiveType | OriginalType;
export type PrimitiveType =
// Base Types
| 'BOOLEAN' // 0
| 'INT32' // 1
| 'INT64' // 2
| 'INT96' // 3
| 'FLOAT' // 4
| 'DOUBLE' // 5
| 'BYTE_ARRAY' // 6,
| 'FIXED_LEN_BYTE_ARRAY'; // 7
export type OriginalType =
// Converted Types
| 'UTF8' // 0
// | 'MAP' // 1
// | 'MAP_KEY_VALUE' // 2
// | 'LIST' // 3
// | 'ENUM' // 4
// | 'DECIMAL' // 5
| 'DATE' // 6
| 'TIME_MILLIS' // 7
| 'TIME_MICROS' // 8
| 'TIMESTAMP_MILLIS' // 9
| 'TIMESTAMP_MICROS' // 10
| 'UINT_8' // 11
| 'UINT_16' // 12
| 'UINT_32' // 13
| 'UINT_64' // 14
| 'INT_8' // 15
| 'INT_16' // 16
| 'INT_32' // 17
| 'INT_64' // 18
| 'JSON' // 19
| 'BSON' // 20
| 'INTERVAL'; // 21
export interface SchemaDefinition {
[string: string]: FieldDefinition;
}
export interface FieldDefinition {
type?: ParquetType;
typeLength?: number;
encoding?: ParquetCodec;
compression?: ParquetCompression;
/**
* Optional fields can be null instead of having a value of the given schema type.
*
* When an optional type is not provided, it is not included into the data.
*
* Instead there is an array of "dlevels" indicated whether optional values are present at
* each row offset into a column chunk.
*
* Note that fields should not be marked both optional and repeated - the underlying parquet
* schema does not have a way to represent fields that are both optional and repeated.
*/
optional?: boolean;
/**
* Repeated fields can occur more than once. They represent arrays or lists of values.
*
* The "rlevels" data is used to indicate whether values in the data are part of a new
* array or part of the same array as the prior value.
*
* Note that fields should not be marked both optional and repeated - the underlying parquet
* schema does not have a way to represent fields that are both optional and repeated.
*/
repeated?: boolean;
fields?: SchemaDefinition;
}
export interface ParquetField {
name: string;
path: string[];
key: string;
primitiveType?: PrimitiveType;
originalType?: OriginalType;
repetitionType: RepetitionType;
typeLength?: number;
encoding?: ParquetCodec;
compression?: ParquetCompression;
/**
* The maximum repetition level is a count of repeated fields in this field's
* path (e.g. this field and its ancestors).
*
* When scanning values in the data, the rLevelMax is used to determine whether a REPEATED value
* should be added to an existing array or if a new array (or arrays) should be created to
* add the value to.
*
* If the value is not repeated (and neither are any of its ancestor fields) then rLevelMax
* should be zero.
*
* If the rLevelMax is 1 and `repetitionType === 'REPEATED'`, this field is itself repeated
* in its parent object (which is the root for a top-level field).
*
* Note that fields that do not have `repetitionType === 'REPEATED'` can still have `rLevelMax > 0`
* if they are in a nested object that is repeated.
*/
rLevelMax: number;
/**
* The maximum definition level is a count of optional fields in this field's
* path (e.g. this field and its ancestors).
*
* dLevelMax is used when decoding to determine whether to expect a value to be
* present in the output for a given column and row.
*
* If the dLeveLMax is 0, the field is not optional.
*
* If the dLevelMax is 1, and the repetitionType === 'OPTIONAL', this field is itself
* optional in its parent object.
*
* If this field is not optional but its parent is an optional value then it will have
* a non-zero dLevelMax.
*/
dLevelMax: number;
isNested?: boolean;
fieldCount?: number;
fields?: Record<string, ParquetField>;
}
export interface ParquetBuffer {
rowCount?: number;
columnData?: Record<string, ParquetColumnData>;
}
export type ParquetValueArray =
| boolean[]
| number[]
| string[]
| Buffer[]
| (string | Buffer)[]
| Int32Array
| Float32Array
| Float64Array;
export interface ParquetColumnData {
/**
* Definition levels specify how many optional fields in the path for the column are defined.
*
* If the definition level for a row is less than the number of optional fields in field path
* (dLevelMax), there will not be a value or repetition level encoded for that field.
*
* The field path refers to the field and its parent fields if it is nested.
*/
dLevels: Int32Array | number[];
/**
* For fields which have REPEATED fields in their field path, repetition levels may be specified.
*
* The repetition level indicates the number of repeated fields in the field path that are being
* "repeated" (rather than started anew in their parent record / array / row).
*
* The field path refers to the field and its parent fields if it is nested.
*
* A repetition level of zero indicates the first value for all the repeated fields in the path;
* they would all get a fresh new array.
*
* When the repetition level is greater than zero and less than the number of repeated fields
* (rLevelMax) in the field path we can keep that number of parent arrays but create new arrays
* after that.
*
* When the repetition level is equal to the number of repeated fields (rLevelMax) we are simply
* adding to the end of the most recently created array.
*
* If the field path has only one repeated field, the repetition level will be 0 for the first
* element of a given row and 1 for the ones that follow.
*
* If the field path has two repeated fields, the repetition level will be 0 for the first element
* in a row, 1 for the first element of the second set of values for the first repeated field in
* the field path, and 2 for subsequent elements in a single group/array.
*/
rLevels: Int32Array | number[];
/**
* Values read from the column. May be returned as a primitive
* array format if it is able to.
*
* - UInt8Array: BOOLEAN
* - Int32Array: INT32
* - Float32Array: FLOAT
* - Float64Array: DOUBLE
*/
values: ParquetValueArray;
/**
* Value count
*/
count: number;
}
export type ParquetRecord = Record<string, any>;