@severo_tests/hyparquet

Parquet file parser for JavaScript

/**
 * Custom parsers for columns
 */
export interface ParquetParsers {
  timestampFromMilliseconds(millis: bigint): any;
  timestampFromMicroseconds(micros: bigint): any;
  timestampFromNanoseconds(nanos: bigint): any;
  dateFromDays(days: number): any;
}

/**
 * Parquet Metadata options for metadata parsing
 */
export interface MetadataOptions {
  parsers?: ParquetParsers // custom parsers to decode advanced types
}

/**
 * Parquet query options for reading data
 */
export interface ParquetReadOptions {
  file: AsyncBuffer // file-like object containing parquet data
  metadata?: FileMetaData // parquet metadata, will be parsed if not provided
  columns?: string[] // columns to read, all columns if undefined
  rowFormat?: 'object' | 'array' // format of each row passed to the onComplete function
  rowStart?: number // first requested row index (inclusive)
  rowEnd?: number // last requested row index (exclusive)
  onChunk?: (chunk: ColumnData) => void // called when a column chunk is parsed. chunks may contain data outside the requested range.
  onPage?: (chunk: ColumnData) => void // called when a data page is parsed. pages may contain data outside the requested range.
  onComplete?: (rows: any[][]) => void // called when all requested rows and columns are parsed
  compressors?: Compressors // custom decompressors
  utf8?: boolean // decode byte arrays as utf8 strings (default true)
  parsers?: ParquetParsers // custom parsers to decode advanced types
}

/**
 * Parquet query options for filtering data
 */
export type ParquetQueryFilter =
  | ParquetQueryColumnsFilter
  | { $and: ParquetQueryFilter[] }
  | { $or: ParquetQueryFilter[] }
  | { $nor: ParquetQueryFilter[] }

type ParquetQueryColumnsFilter = {
  [key: string]: ParquetQueryOperator
}

export type ParquetQueryValue = string | number | boolean | object | null | undefined

export type ParquetQueryOperator = {
  $gt?: ParquetQueryValue
  $gte?: ParquetQueryValue
  $lt?: ParquetQueryValue
  $lte?: ParquetQueryValue
  $eq?: ParquetQueryValue
  $ne?: ParquetQueryValue
  $in?: ParquetQueryValue[]
  $nin?: ParquetQueryValue[]
  $not?: ParquetQueryOperator
}

/**
 * A run of column data
 */
export interface ColumnData {
  columnName: string
  columnData: DecodedArray
  rowStart: number
  rowEnd: number // exclusive
}
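// Usage sketch (not part of the declarations): filling in a ParquetReadOptions
// object with a row range, custom parsers, and an onComplete callback, using the
// interfaces declared above. The parquetRead entry point, the import path, and
// the column names are assumptions made for illustration; `file` can be any
// object satisfying the AsyncBuffer interface.
import { parquetRead } from '@severo_tests/hyparquet'

async function readSample(file: AsyncBuffer): Promise<void> {
  const options: ParquetReadOptions = {
    file,
    columns: ['id', 'created_at'], // hypothetical column names
    rowStart: 0,
    rowEnd: 100, // rows [0, 100)
    parsers: {
      // decode timestamp and date logical types into JS Date objects
      timestampFromMilliseconds: millis => new Date(Number(millis)),
      timestampFromMicroseconds: micros => new Date(Number(micros / 1_000n)),
      timestampFromNanoseconds: nanos => new Date(Number(nanos / 1_000_000n)),
      dateFromDays: days => new Date(days * 86_400_000),
    },
    onComplete: rows => console.log(`read ${rows.length} rows`),
  }
  await parquetRead(options)
}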
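// Sketch of a ParquetQueryFilter using the MongoDB-style operators declared
// above: match rows where `age` is at least 18 and `country` is one of two
// codes, or rows where `is_admin` is true. The column names are made up, and
// how the filter is passed to a query call is not shown here.
const exampleFilter: ParquetQueryFilter = {
  $or: [
    {
      age: { $gte: 18 },
      country: { $in: ['fr', 'de'] },
    },
    { is_admin: { $eq: true } },
  ],
}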
/**
 * File-like object that can read slices of a file asynchronously.
 */
export interface AsyncBuffer {
  byteLength: number
  slice(start: number, end?: number): Awaitable<ArrayBuffer>
}
export type Awaitable<T> = T | Promise<T>

export interface ByteRange {
  startByte: number
  endByte: number // exclusive
}

export interface DataReader {
  view: DataView
  offset: number
}

// Parquet file metadata types
export interface FileMetaData {
  version: number
  schema: SchemaElement[]
  num_rows: bigint
  row_groups: RowGroup[]
  key_value_metadata?: KeyValue[]
  created_by?: string
  // column_orders?: ColumnOrder[]
  // encryption_algorithm?: EncryptionAlgorithm
  // footer_signing_key_metadata?: Uint8Array
  metadata_length: number
}

export interface SchemaTree {
  children: SchemaTree[]
  count: number
  element: SchemaElement
  path: string[]
}

export interface SchemaElement {
  type?: ParquetType
  type_length?: number
  repetition_type?: FieldRepetitionType
  name: string
  num_children?: number
  converted_type?: ConvertedType
  scale?: number
  precision?: number
  field_id?: number
  logical_type?: LogicalType
}

export type ParquetType =
  'BOOLEAN' |
  'INT32' |
  'INT64' |
  'INT96' | // deprecated
  'FLOAT' |
  'DOUBLE' |
  'BYTE_ARRAY' |
  'FIXED_LEN_BYTE_ARRAY'

export type FieldRepetitionType = 'REQUIRED' | 'OPTIONAL' | 'REPEATED'

export type ConvertedType =
  'UTF8' | 'MAP' | 'MAP_KEY_VALUE' | 'LIST' | 'ENUM' | 'DECIMAL' | 'DATE' |
  'TIME_MILLIS' | 'TIME_MICROS' | 'TIMESTAMP_MILLIS' | 'TIMESTAMP_MICROS' |
  'UINT_8' | 'UINT_16' | 'UINT_32' | 'UINT_64' |
  'INT_8' | 'INT_16' | 'INT_32' | 'INT_64' |
  'JSON' | 'BSON' | 'INTERVAL'

type LogicalDecimalType = {
  type: 'DECIMAL'
  precision: number
  scale: number
}

export type TimeUnit = 'MILLIS' | 'MICROS' | 'NANOS'

type LogicalTimeType = {
  type: 'TIME'
  isAdjustedToUTC: boolean
  unit: TimeUnit
}

type LogicalTimestampType = {
  type: 'TIMESTAMP'
  isAdjustedToUTC: boolean
  unit: TimeUnit
}

type LogicalIntType = {
  type: 'INTEGER'
  bitWidth: number
  isSigned: boolean
}

export type LogicalType =
  { type: LogicalTypeSimple } |
  LogicalDecimalType |
  LogicalTimeType |
  LogicalTimestampType |
  LogicalIntType

type LogicalTypeSimple =
  'STRING' | 'MAP' | 'LIST' | 'ENUM' | 'DATE' | 'INTERVAL' | 'NULL' |
  'JSON' | 'BSON' | 'UUID' | 'FLOAT16' | 'VARIANT' | 'GEOMETRY' | 'GEOGRAPHY'

export type LogicalTypeType = LogicalTypeSimple |
  'TIME' | // convertedType TIME_MILLIS or TIME_MICROS
  'TIMESTAMP' | // convertedType TIMESTAMP_MILLIS or TIMESTAMP_MICROS
  'INTEGER' // convertedType INT or UINT

export interface RowGroup {
  columns: ColumnChunk[]
  total_byte_size: bigint
  num_rows: bigint
  sorting_columns?: SortingColumn[]
  file_offset?: bigint
  total_compressed_size?: bigint
  ordinal?: number
}

export interface ColumnChunk {
  file_path?: string
  file_offset: bigint
  meta_data?: ColumnMetaData
  offset_index_offset?: bigint
  offset_index_length?: number
  column_index_offset?: bigint
  column_index_length?: number
  crypto_metadata?: ColumnCryptoMetaData
  encrypted_column_metadata?: Uint8Array
}

export interface ColumnMetaData {
  type: ParquetType
  encodings: Encoding[]
  path_in_schema: string[]
  codec: CompressionCodec
  num_values: bigint
  total_uncompressed_size: bigint
  total_compressed_size: bigint
  key_value_metadata?: KeyValue[]
  data_page_offset: bigint
  index_page_offset?: bigint
  dictionary_page_offset?: bigint
  statistics?: Statistics
  encoding_stats?: PageEncodingStats[]
  bloom_filter_offset?: bigint
  bloom_filter_length?: number
  size_statistics?: SizeStatistics
}

type ColumnCryptoMetaData = Record<string, never>
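// Sketch of an AsyncBuffer backed by HTTP range requests, so that only the byte
// ranges the reader actually asks for are downloaded. Resolving the file size
// with a HEAD request and the helper name are illustrative choices, not part of
// this package's API; any object with byteLength and slice() works.
async function urlAsyncBuffer(url: string): Promise<AsyncBuffer> {
  // learn the total size up front so byteLength is known synchronously
  const head = await fetch(url, { method: 'HEAD' })
  const byteLength = Number(head.headers.get('Content-Length') ?? 0)
  return {
    byteLength,
    async slice(start: number, end?: number): Promise<ArrayBuffer> {
      // AsyncBuffer.slice end is exclusive; HTTP Range is inclusive on both ends
      const last = (end ?? byteLength) - 1
      const res = await fetch(url, { headers: { Range: `bytes=${start}-${last}` } })
      return res.arrayBuffer()
    },
  }
}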
export type Encoding =
  'PLAIN' |
  'GROUP_VAR_INT' | // deprecated
  'PLAIN_DICTIONARY' |
  'RLE' |
  'BIT_PACKED' | // deprecated
  'DELTA_BINARY_PACKED' |
  'DELTA_LENGTH_BYTE_ARRAY' |
  'DELTA_BYTE_ARRAY' |
  'RLE_DICTIONARY' |
  'BYTE_STREAM_SPLIT'

export type CompressionCodec =
  'UNCOMPRESSED' |
  'SNAPPY' |
  'GZIP' |
  'LZO' |
  'BROTLI' |
  'LZ4' |
  'ZSTD' |
  'LZ4_RAW'

export type Compressors = {
  [K in CompressionCodec]?: (input: Uint8Array, outputLength: number) => Uint8Array
}

export interface KeyValue {
  key: string
  value?: string
}

export type MinMaxType = bigint | boolean | number | string | Date | Uint8Array

export interface Statistics {
  max?: MinMaxType
  min?: MinMaxType
  null_count?: bigint
  distinct_count?: bigint
  max_value?: MinMaxType
  min_value?: MinMaxType
  is_max_value_exact?: boolean
  is_min_value_exact?: boolean
}

interface SizeStatistics {
  unencoded_byte_array_data_bytes?: bigint
  repetition_level_histogram?: bigint[]
  definition_level_histogram?: bigint[]
}

interface PageEncodingStats {
  page_type: PageType
  encoding: Encoding
  count: number
}

export type PageType =
  'DATA_PAGE' |
  'INDEX_PAGE' |
  'DICTIONARY_PAGE' |
  'DATA_PAGE_V2'

interface SortingColumn {
  column_idx: number
  descending: boolean
  nulls_first: boolean
}

// Parquet file header types
export interface PageHeader {
  type: PageType
  uncompressed_page_size: number
  compressed_page_size: number
  crc?: number
  data_page_header?: DataPageHeader
  index_page_header?: IndexPageHeader
  dictionary_page_header?: DictionaryPageHeader
  data_page_header_v2?: DataPageHeaderV2
}

export interface DataPageHeader {
  num_values: number
  encoding: Encoding
  definition_level_encoding: Encoding
  repetition_level_encoding: Encoding
  statistics?: Statistics
}

type IndexPageHeader = Record<string, never>

export interface DictionaryPageHeader {
  num_values: number
  encoding: Encoding
  is_sorted?: boolean
}

export interface DataPageHeaderV2 {
  num_values: number
  num_nulls: number
  num_rows: number
  encoding: Encoding
  definition_levels_byte_length: number
  repetition_levels_byte_length: number
  is_compressed?: boolean
  statistics?: Statistics
}

interface DataPage {
  definitionLevels: number[] | undefined
  repetitionLevels: number[]
  dataPage: DecodedArray
}

export type DecodedArray =
  Uint8Array | Uint32Array | Int32Array | BigInt64Array | BigUint64Array |
  Float32Array | Float64Array | any[]

export interface OffsetIndex {
  page_locations: PageLocation[]
  unencoded_byte_array_data_bytes?: bigint[]
}

interface PageLocation {
  offset: bigint
  compressed_page_size: number
  first_row_index: bigint
}

export interface ColumnIndex {
  null_pages: boolean[]
  min_values: MinMaxType[]
  max_values: MinMaxType[]
  boundary_order: BoundaryOrder
  null_counts?: bigint[]
  repetition_level_histograms?: bigint[]
  definition_level_histograms?: bigint[]
}

export type BoundaryOrder = 'UNORDERED' | 'ASCENDING' | 'DESCENDING'

export type ThriftObject = { [ key: `field_${number}` ]: ThriftType }

export type ThriftType = boolean | number | bigint | Uint8Array | ThriftType[] | ThriftObject
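// Sketch of supplying a custom decompressor through the Compressors map, for
// codecs the reader does not decode on its own. `zstdDecompress` is a
// hypothetical stand-in for whatever ZSTD binding is available (for example a
// WASM build); only the shape of the map comes from the declarations above.
declare function zstdDecompress(input: Uint8Array): Uint8Array // hypothetical binding

const customCompressors: Compressors = {
  ZSTD: (input: Uint8Array, outputLength: number): Uint8Array => {
    const output = zstdDecompress(input)
    // each entry is expected to return outputLength decompressed bytes
    return output.subarray(0, outputLength)
  },
}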
/**
 * Query plan for which byte ranges to read.
 */
export interface QueryPlan {
  metadata: FileMetaData
  rowStart: number
  rowEnd?: number
  columns?: string[] // columns to read
  fetches: ByteRange[] // byte ranges to fetch
  groups: GroupPlan[] // byte ranges by row group
}

// Plan for one group
interface GroupPlan {
  ranges: ByteRange[]
  rowGroup: RowGroup // row group metadata
  groupStart: number // row index of the first row in the group
  selectStart: number // row index in the group to start reading
  selectEnd: number // row index in the group to stop reading
  groupRows: number
}

export interface ColumnDecoder {
  columnName: string
  type: ParquetType
  element: SchemaElement
  schemaPath: SchemaTree[]
  codec: CompressionCodec
  parsers: ParquetParsers
  compressors?: Compressors
  utf8?: boolean
}

export interface RowGroupSelect {
  groupStart: number // row index of the first row in the group
  selectStart: number // row index in the group to start reading
  selectEnd: number // row index in the group to stop reading
  groupRows: number
}

export interface AsyncColumn {
  pathInSchema: string[]
  data: Promise<DecodedArray[]>
}

export interface AsyncRowGroup {
  groupStart: number
  groupRows: number
  asyncColumns: AsyncColumn[]
}
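// Sketch of consuming an AsyncRowGroup: every AsyncColumn resolves to the
// decoded chunks for one column of one row group. How an AsyncRowGroup is
// obtained from the package is not shown; this only illustrates the shape of
// the types declared above.
async function columnsByName(group: AsyncRowGroup): Promise<Record<string, DecodedArray[]>> {
  const out: Record<string, DecodedArray[]> = {}
  for (const column of group.asyncColumns) {
    // pathInSchema is the nested column path; join it into a flat key
    out[column.pathInSchema.join('.')] = await column.data
  }
  return out
}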