@severo_tests/hyparquet
Parquet file parser for JavaScript
import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType, PageType, ParquetType } from './constants.js'
import { DEFAULT_PARSERS, parseDecimal, parseFloat16 } from './convert.js'
import { getSchemaPath } from './schema.js'
import { deserializeTCompactProtocol } from './thrift.js'
export const defaultInitialFetchSize = 1 << 19 // 512kb
/**
* Read parquet metadata from an async buffer.
*
* An AsyncBuffer is like an ArrayBuffer, but the slices are loaded
* asynchronously, possibly over the network.
*
* You must provide the byteLength of the buffer, typically from a HEAD request.
*
 * In theory, you could use a suffix-range request to fetch the end of the file
 * and save a round trip. In practice this doesn't work, because Chrome treats
 * the Range header of a suffix-range request as not safe-listed and requires
 * a CORS preflight. So the byteLength is required.
*
 * To make this efficient, we initially request the last 512kb of the file,
 * which is likely to contain the metadata. If the metadata length exceeds the
 * initial 512kb fetch, we request the rest of the metadata from the AsyncBuffer.
 *
 * This ensures we make at most two requests: the initial 512kb request, and
 * possibly a second request covering the remaining metadata.
*
* @param {AsyncBuffer} asyncBuffer parquet file contents
 * @param {MetadataOptions & { initialFetchSize?: number }} options metadata parsing options; initialFetchSize is the number of bytes fetched initially (default 512kb)
* @returns {Promise<FileMetaData>} parquet metadata object
*/
export async function parquetMetadataAsync(asyncBuffer, { parsers, initialFetchSize = defaultInitialFetchSize } = {}) {
if (!asyncBuffer || !(asyncBuffer.byteLength >= 0)) throw new Error('parquet expected AsyncBuffer')
// fetch last bytes (footer) of the file
const footerOffset = Math.max(0, asyncBuffer.byteLength - initialFetchSize)
const footerBuffer = await asyncBuffer.slice(footerOffset, asyncBuffer.byteLength)
// Check for parquet magic number "PAR1"
const footerView = new DataView(footerBuffer)
if (footerView.getUint32(footerBuffer.byteLength - 4, true) !== 0x31524150) {
throw new Error('parquet file invalid (footer != PAR1)')
}
// Parquet files store metadata at the end of the file
// Metadata length is 4 bytes before the last PAR1
const metadataLength = footerView.getUint32(footerBuffer.byteLength - 8, true)
if (metadataLength > asyncBuffer.byteLength - 8) {
throw new Error(`parquet metadata length ${metadataLength} exceeds available buffer ${asyncBuffer.byteLength - 8}`)
}
// check if metadata size fits inside the initial fetch
if (metadataLength + 8 > initialFetchSize) {
// fetch the rest of the metadata
const metadataOffset = asyncBuffer.byteLength - metadataLength - 8
const metadataBuffer = await asyncBuffer.slice(metadataOffset, footerOffset)
// combine initial fetch with the new slice
const combinedBuffer = new ArrayBuffer(metadataLength + 8)
const combinedView = new Uint8Array(combinedBuffer)
combinedView.set(new Uint8Array(metadataBuffer))
combinedView.set(new Uint8Array(footerBuffer), footerOffset - metadataOffset)
return parquetMetadata(combinedBuffer, { parsers })
} else {
// parse metadata from the footer
return parquetMetadata(footerBuffer, { parsers })
}
}
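// Usage sketch (not part of the original module): a minimal AsyncBuffer backed by
// HTTP Range requests. The URL is a placeholder; only byteLength and
// slice(start, end) are required, matching the calls made above. Reading the
// length from a HEAD request's Content-Length is an assumption about the server.
//
//   const url = 'https://example.com/data.parquet' // hypothetical
//   const head = await fetch(url, { method: 'HEAD' })
//   const byteLength = Number(head.headers.get('Content-Length'))
//   const asyncBuffer = {
//     byteLength,
//     async slice(start, end) {
//       // HTTP ranges are inclusive, so the end offset is end - 1
//       const res = await fetch(url, { headers: { Range: `bytes=${start}-${end - 1}` } })
//       return res.arrayBuffer()
//     },
//   }
//   const metadata = await parquetMetadataAsync(asyncBuffer)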
/**
* Read parquet metadata from a buffer synchronously.
*
* @param {ArrayBuffer} arrayBuffer parquet file footer
* @param {MetadataOptions} options metadata parsing options
* @returns {FileMetaData} parquet metadata object
*/
export function parquetMetadata(arrayBuffer, { parsers } = {}) {
if (!(arrayBuffer instanceof ArrayBuffer)) throw new Error('parquet expected ArrayBuffer')
const view = new DataView(arrayBuffer)
// Use default parsers if not given
parsers = { ...DEFAULT_PARSERS, ...parsers }
// Validate footer magic number "PAR1"
if (view.byteLength < 8) {
throw new Error('parquet file is too short')
}
if (view.getUint32(view.byteLength - 4, true) !== 0x31524150) {
throw new Error('parquet file invalid (footer != PAR1)')
}
// Parquet files store metadata at the end of the file
// Metadata length is 4 bytes before the last PAR1
const metadataLengthOffset = view.byteLength - 8
const metadataLength = view.getUint32(metadataLengthOffset, true)
if (metadataLength > view.byteLength - 8) {
// {metadata}, metadata_length, PAR1
throw new Error(`parquet metadata length ${metadataLength} exceeds available buffer ${view.byteLength - 8}`)
}
const metadataOffset = metadataLengthOffset - metadataLength
const reader = { view, offset: metadataOffset }
const metadata = deserializeTCompactProtocol(reader)
const decoder = new TextDecoder()
function decode(/** @type {Uint8Array} */ value) {
return value && decoder.decode(value)
}
// Parse metadata from thrift data
const version = metadata.field_1
/** @type {SchemaElement[]} */
const schema = metadata.field_2.map((/** @type {any} */ field) => ({
type: ParquetType[field.field_1],
type_length: field.field_2,
repetition_type: FieldRepetitionType[field.field_3],
name: decode(field.field_4),
num_children: field.field_5,
converted_type: ConvertedType[field.field_6],
scale: field.field_7,
precision: field.field_8,
field_id: field.field_9,
logical_type: logicalType(field.field_10),
}))
// schema element per column index
const columnSchema = schema.filter(e => e.type)
const num_rows = metadata.field_3
const row_groups = metadata.field_4.map((/** @type {any} */ rowGroup) => ({
columns: rowGroup.field_1.map((/** @type {any} */ column, /** @type {number} */ columnIndex) => ({
file_path: decode(column.field_1),
file_offset: column.field_2,
meta_data: column.field_3 && {
type: ParquetType[column.field_3.field_1],
encodings: column.field_3.field_2?.map((/** @type {number} */ e) => Encoding[e]),
path_in_schema: column.field_3.field_3.map(decode),
codec: CompressionCodec[column.field_3.field_4],
num_values: column.field_3.field_5,
total_uncompressed_size: column.field_3.field_6,
total_compressed_size: column.field_3.field_7,
key_value_metadata: column.field_3.field_8,
data_page_offset: column.field_3.field_9,
index_page_offset: column.field_3.field_10,
dictionary_page_offset: column.field_3.field_11,
statistics: convertStats(column.field_3.field_12, columnSchema[columnIndex], parsers),
encoding_stats: column.field_3.field_13?.map((/** @type {any} */ encodingStat) => ({
page_type: PageType[encodingStat.field_1],
encoding: Encoding[encodingStat.field_2],
count: encodingStat.field_3,
})),
bloom_filter_offset: column.field_3.field_14,
bloom_filter_length: column.field_3.field_15,
size_statistics: column.field_3.field_16 && {
unencoded_byte_array_data_bytes: column.field_3.field_16.field_1,
repetition_level_histogram: column.field_3.field_16.field_2,
definition_level_histogram: column.field_3.field_16.field_3,
},
},
offset_index_offset: column.field_4,
offset_index_length: column.field_5,
column_index_offset: column.field_6,
column_index_length: column.field_7,
crypto_metadata: column.field_8,
encrypted_column_metadata: column.field_9,
})),
total_byte_size: rowGroup.field_2,
num_rows: rowGroup.field_3,
sorting_columns: rowGroup.field_4?.map((/** @type {any} */ sortingColumn) => ({
column_idx: sortingColumn.field_1,
descending: sortingColumn.field_2,
nulls_first: sortingColumn.field_3,
})),
file_offset: rowGroup.field_5,
total_compressed_size: rowGroup.field_6,
ordinal: rowGroup.field_7,
}))
const key_value_metadata = metadata.field_5?.map((/** @type {any} */ keyValue) => ({
key: decode(keyValue.field_1),
value: decode(keyValue.field_2),
}))
const created_by = decode(metadata.field_6)
return {
version,
schema,
num_rows,
row_groups,
key_value_metadata,
created_by,
metadata_length: metadataLength,
}
}
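// Usage sketch (not part of the original module): parse metadata from a file that
// is already fully in memory, e.g. read with Node's fs (an assumption about the
// caller's environment).
//
//   import { readFileSync } from 'fs'
//   const buf = readFileSync('data.parquet') // hypothetical local file
//   const arrayBuffer = buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength)
//   const { num_rows, schema, created_by } = parquetMetadata(arrayBuffer)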
/**
* Return a tree of schema elements from parquet metadata.
*
* @param {{schema: SchemaElement[]}} metadata parquet metadata object
* @returns {SchemaTree} tree of schema elements
*/
export function parquetSchema({ schema }) {
return getSchemaPath(schema, [])[0]
}
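// Usage sketch (not part of the original module): given metadata from
// parquetMetadata(...), and assuming SchemaTree nodes expose element and children
// (see types.d.ts), list the top-level column names.
//
//   const root = parquetSchema(metadata)
//   for (const child of root.children) console.log(child.element.name)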
/**
* @param {any} logicalType
* @returns {LogicalType | undefined}
*/
function logicalType(logicalType) {
if (logicalType?.field_1) return { type: 'STRING' }
if (logicalType?.field_2) return { type: 'MAP' }
if (logicalType?.field_3) return { type: 'LIST' }
if (logicalType?.field_4) return { type: 'ENUM' }
if (logicalType?.field_5) return {
type: 'DECIMAL',
scale: logicalType.field_5.field_1,
precision: logicalType.field_5.field_2,
}
if (logicalType?.field_6) return { type: 'DATE' }
if (logicalType?.field_7) return {
type: 'TIME',
isAdjustedToUTC: logicalType.field_7.field_1,
unit: timeUnit(logicalType.field_7.field_2),
}
if (logicalType?.field_8) return {
type: 'TIMESTAMP',
isAdjustedToUTC: logicalType.field_8.field_1,
unit: timeUnit(logicalType.field_8.field_2),
}
if (logicalType?.field_10) return {
type: 'INTEGER',
bitWidth: logicalType.field_10.field_1,
isSigned: logicalType.field_10.field_2,
}
if (logicalType?.field_11) return { type: 'NULL' }
if (logicalType?.field_12) return { type: 'JSON' }
if (logicalType?.field_13) return { type: 'BSON' }
if (logicalType?.field_14) return { type: 'UUID' }
if (logicalType?.field_15) return { type: 'FLOAT16' }
return logicalType
}
/**
* @param {any} unit
* @returns {TimeUnit}
*/
function timeUnit(unit) {
if (unit.field_1) return 'MILLIS'
if (unit.field_2) return 'MICROS'
if (unit.field_3) return 'NANOS'
throw new Error('parquet time unit required')
}
/**
* Convert column statistics based on column type.
*
* @import {AsyncBuffer, FileMetaData, LogicalType, MetadataOptions, MinMaxType, ParquetParsers, SchemaElement, SchemaTree, Statistics, TimeUnit} from '../src/types.d.ts'
* @param {any} stats
* @param {SchemaElement} schema
* @param {ParquetParsers} parsers
* @returns {Statistics}
*/
function convertStats(stats, schema, parsers) {
return stats && {
max: convertMetadata(stats.field_1, schema, parsers),
min: convertMetadata(stats.field_2, schema, parsers),
null_count: stats.field_3,
distinct_count: stats.field_4,
max_value: convertMetadata(stats.field_5, schema, parsers),
min_value: convertMetadata(stats.field_6, schema, parsers),
is_max_value_exact: stats.field_7,
is_min_value_exact: stats.field_8,
}
}
/**
* @param {Uint8Array | undefined} value
* @param {SchemaElement} schema
* @param {ParquetParsers} parsers
* @returns {MinMaxType | undefined}
*/
export function convertMetadata(value, schema, parsers) {
const { type, converted_type, logical_type } = schema
if (value === undefined) return value
if (type === 'BOOLEAN') return value[0] === 1
if (type === 'BYTE_ARRAY') return new TextDecoder().decode(value)
const view = new DataView(value.buffer, value.byteOffset, value.byteLength)
if (type === 'FLOAT' && view.byteLength === 4) return view.getFloat32(0, true)
if (type === 'DOUBLE' && view.byteLength === 8) return view.getFloat64(0, true)
if (type === 'INT32' && converted_type === 'DATE') return parsers.dateFromDays(view.getInt32(0, true))
if (type === 'INT64' && converted_type === 'TIMESTAMP_MILLIS') return parsers.timestampFromMilliseconds(view.getBigInt64(0, true))
if (type === 'INT64' && converted_type === 'TIMESTAMP_MICROS') return parsers.timestampFromMicroseconds(view.getBigInt64(0, true))
if (type === 'INT64' && logical_type?.type === 'TIMESTAMP' && logical_type?.unit === 'NANOS') return parsers.timestampFromNanoseconds(view.getBigInt64(0, true))
if (type === 'INT64' && logical_type?.type === 'TIMESTAMP' && logical_type?.unit === 'MICROS') return parsers.timestampFromMicroseconds(view.getBigInt64(0, true))
if (type === 'INT64' && logical_type?.type === 'TIMESTAMP') return parsers.timestampFromMilliseconds(view.getBigInt64(0, true))
if (type === 'INT32' && view.byteLength === 4) return view.getInt32(0, true)
if (type === 'INT64' && view.byteLength === 8) return view.getBigInt64(0, true)
if (converted_type === 'DECIMAL') return parseDecimal(value) * 10 ** -(schema.scale || 0)
if (logical_type?.type === 'FLOAT16') return parseFloat16(value)
if (type === 'FIXED_LEN_BYTE_ARRAY') return value
// assert(false)
return value
}
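// Example sketch (not part of the original module): convertMetadata decodes the raw
// little-endian statistics bytes according to the column's schema. For instance, an
// INT32 column with converted_type DATE stores days since the Unix epoch; the schema
// object below is a partial stand-in for a full SchemaElement.
//
//   const raw = new Uint8Array([0x6f, 0x4d, 0x00, 0x00]) // 19823 days = 2024-04-10
//   convertMetadata(raw, { type: 'INT32', converted_type: 'DATE' }, DEFAULT_PARSERS)
//   // => DEFAULT_PARSERS.dateFromDays(19823)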