UNPKG

hyparquet-writer

Version:

Parquet file writer for JavaScript

112 lines (99 loc) 3.58 kB
import type {
  ColumnChunk,
  ColumnIndex,
  CompressionCodec,
  DecodedArray,
  Encoding,
  KeyValue,
  OffsetIndex,
  SchemaElement,
} from 'hyparquet'

/** Function that takes raw bytes and returns the compressed bytes. */
export type Compressor = (input: Uint8Array) => Uint8Array

/** Optional compressor implementation per compression codec. */
export type Compressors = { [K in CompressionCodec]?: Compressor }

/** Superset of parquet types with automatic conversions. */
export type BasicType =
  | 'BOOLEAN'
  | 'INT32'
  | 'INT64'
  | 'FLOAT'
  | 'DOUBLE'
  | 'BYTE_ARRAY'
  | 'STRING'
  | 'JSON'
  | 'TIMESTAMP'
  | 'UUID'
  | 'FLOAT16'
  | 'GEOMETRY'
  | 'GEOGRAPHY'
  | 'VARIANT'

/**
 * Recursive variant shredding config: a scalar BasicType, an array of one
 * element shred type ([elem] = array-of-elem), or an object mapping field names
 * to shred types. The single-element array is a shape template; only index 0 is read.
 */
export type ShredType = BasicType | ShredType[] | { [field: string]: ShredType }

/** Options accepted when writing a parquet file. */
export interface ParquetWriteOptions {
  /** Destination the file bytes are appended to. */
  writer: Writer
  /** Columns to write. */
  columnData: ColumnSource[]
  /** Explicit schema elements (optional). */
  schema?: SchemaElement[]
  /** Global default codec, default 'SNAPPY'. */
  codec?: CompressionCodec
  /** Custom compressors. */
  compressors?: Compressors
  /** Enable column statistics, default true. */
  statistics?: boolean
  /** Number of rows per row group. */
  rowGroupSize?: number | number[]
  /** Target uncompressed page size in bytes, default 1048576. */
  pageSize?: number
  /** Key/value metadata entries (optional). */
  kvMetadata?: KeyValue[]
}

/** One input column: its name, its values, and per-column write settings. */
export interface ColumnSource {
  name: string
  data: DecodedArray
  type?: BasicType
  nullable?: boolean
  encoding?: Encoding
  /** Per-column codec override, default ParquetWriteOptions.codec. */
  codec?: CompressionCodec
  /** Write column indexes, default false. */
  columnIndex?: boolean
  /** Write offset indexes, default true. */
  offsetIndex?: boolean
  /** Variant shredding config (true = auto-detect). */
  shredding?: true | ShredType
  /** Write bloom filter, default false. */
  bloomFilter?: boolean | BloomFilterOptions
}

/** Values of a page together with their definition and repetition levels. */
export interface PageData {
  values: DecodedArray
  definitionLevels: number[]
  repetitionLevels: number[]
  maxDefinitionLevel: number
}

/** Tuning knobs for bloom filter emission. */
export interface BloomFilterOptions {
  /** False positive probability, default 0.01. */
  fpp?: number
  /** Skip emission above this size, default 1 MiB. */
  maxBytes?: number
}

/** Resolved per-column settings used while encoding a column. */
export interface ColumnEncoder {
  columnName: string
  element: SchemaElement
  schemaPath: SchemaElement[]
  codec: CompressionCodec
  compressors: Compressors
  stats: boolean
  pageSize: number
  /** Spec: If ColumnIndex is present, OffsetIndex must also be present. */
  columnIndex: boolean
  offsetIndex: boolean
  /** User-specified encoding. */
  encoding?: Encoding
  bloomFilter?: boolean | BloomFilterOptions
}

/** A column chunk plus the optional index structures associated with it. */
export interface PageIndexes {
  chunk: ColumnChunk
  columnIndex?: ColumnIndex
  offsetIndex?: OffsetIndex
  /** Finalized SBBF blocks. */
  bloomFilter?: Uint32Array
}

/** Append-only binary sink with typed append helpers. */
export interface Writer {
  buffer: ArrayBuffer
  view: DataView
  /** Total bytes written. */
  offset: number
  // NOTE(review): presumably makes room for `size` more bytes — confirm against the implementation
  ensure(size: number): void
  /** Optional; may be synchronous or return a promise. */
  flush?(): void | Promise<void>
  /** May be synchronous or return a promise. */
  finish(): void | Promise<void>
  getBuffer(): ArrayBuffer
  getBytes(): Uint8Array
  appendUint8(value: number): void
  appendUint32(value: number): void
  appendInt32(value: number): void
  appendInt64(value: bigint): void
  appendFloat32(value: number): void
  appendFloat64(value: number): void
  appendBuffer(buffer: ArrayBuffer): void
  appendBytes(value: Uint8Array): void
  appendVarInt(value: number): void
  appendVarBigInt(value: bigint): void
  appendZigZag(value: number | bigint): void
}

/** Object whose keys have the form `field_<number>` and whose values are thrift values. */
export type ThriftObject = { [ key: `field_${number}` ]: ThriftType }

/** Any value that can appear inside a ThriftObject, including nested lists and objects. */
export type ThriftType = boolean | number | bigint | string | Uint8Array | ThriftType[] | ThriftObject | undefined