UNPKG

hyparquet

Version:

Parquet file parser for JavaScript

358 lines (325 loc) 10.7 kB
import { DEFAULT_PARSERS } from './convert.js'

/**
 * @import {DataReader, ParquetParsers, VariantMetadata} from './types.d.ts'
 */

const decoder = new TextDecoder()

/**
 * Parsed metadata dictionaries, keyed first by the underlying buffer and then
 * by "byteOffset:byteLength" of the metadata bytes within that buffer.
 *
 * @type {WeakMap<object, Map<string, VariantMetadata>>}
 */
const metadataCache = new WeakMap()

/**
 * Recursively decode variant structs into native values.
 *
 * Arrays are decoded element by element. An object with a `metadata` key is
 * treated as a variant group: its `typed_value` (shredded) and `value`
 * (binary) parts are decoded and merged. Anything else is returned unchanged.
 *
 * @param {any} value
 * @param {ParquetParsers} [parsers]
 * @returns {any}
 */
export function decodeVariantColumn(value, parsers = DEFAULT_PARSERS) {
  if (Array.isArray(value)) {
    return value.map(entry => decodeVariantColumn(entry, parsers))
  }
  // typeof null === 'object', so null must be excluded before using `in`
  if (value === null || typeof value !== 'object') return value

  if ('metadata' in value) {
    const metadata = parseVariantMetadata(value.metadata)

    // Decode shredded fields from typed_value
    const shreddedFields = value.typed_value && decodeTypedValue(value.typed_value, metadata, parsers)

    // Decode binary value (may contain additional fields for partially shredded objects)
    const binaryValue = value.value && readVariant(makeReader(value.value), metadata, parsers)

    // Merge shredded and binary values for partially shredded objects
    if (shreddedFields && binaryValue) {
      return { ...binaryValue, ...shreddedFields }
    }
    return shreddedFields ?? binaryValue
  }
  return value
}

/**
 * Check for a plain record: an object whose prototype is Object.prototype or null.
 * Arrays, typed arrays, Dates and other class instances are not plain objects.
 *
 * @param {any} value
 * @returns {boolean}
 */
function isPlainObject(value) {
  if (value === null || typeof value !== 'object') return false
  const proto = Object.getPrototypeOf(value)
  return proto === Object.prototype || proto === null
}

/**
 * Decode a shredded variant typed_value field.
 *
 * - Uint8Array: decoded as a binary variant.
 * - Array: each element is decoded recursively.
 * - `{typed_value, value}` wrapper: unwrapped (see inline comments).
 * - Other plain object: treated as a shredded object, each field decoded.
 * - Anything else (primitives, Date, ...): returned unchanged.
 *
 * @param {any} typedValue
 * @param {VariantMetadata} metadata
 * @param {ParquetParsers} parsers
 * @returns {any}
 */
function decodeTypedValue(typedValue, metadata, parsers) {
  // Uint8Array: decode as binary variant
  if (typedValue instanceof Uint8Array) {
    return readVariant(makeReader(typedValue), metadata, parsers)
  }

  // Arrays
  if (Array.isArray(typedValue)) {
    return typedValue.map(element => decodeTypedValue(element, metadata, parsers))
  }

  // Primitives and non-plain objects (e.g. Date) are already final values.
  // Iterating their entries would turn them into an empty object.
  if (!isPlainObject(typedValue)) return typedValue

  // Handle {typed_value, value} wrapper
  if ('typed_value' in typedValue) {
    const { typed_value: typed, value: binary } = typedValue
    const hasBinary = binary instanceof Uint8Array

    // Field was not shredded: the binary value (if any) holds the variant
    if (typed === null || typed === undefined) {
      return hasBinary ? readVariant(makeReader(binary), metadata, parsers) : typed
    }

    const shredded = decodeTypedValue(typed, metadata, parsers)
    // Partially shredded object: remaining fields live in the binary value,
    // merged the same way as at the top level (shredded fields win)
    if (hasBinary && isPlainObject(shredded)) {
      const residual = readVariant(makeReader(binary), metadata, parsers)
      if (isPlainObject(residual)) return { ...residual, ...shredded }
    }
    return shredded
  }
  if ('value' in typedValue && typedValue.value instanceof Uint8Array) {
    return readVariant(makeReader(typedValue.value), metadata, parsers)
  }

  // Shredded object: each field value gets decoded
  /** @type {Record<string, any>} */
  const result = {}
  for (const [key, field] of Object.entries(typedValue)) {
    result[key] = decodeTypedValue(field, metadata, parsers)
  }
  return result
}

/**
 * Wrap bytes in a DataReader positioned at the first byte.
 *
 * @param {Uint8Array} bytes
 * @returns {DataReader}
 */
function makeReader(bytes) {
  return {
    view: new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength),
    offset: 0,
  }
}

/**
 * Parse and cache variant metadata dictionary.
 *
 * Header byte layout as read here: bits 0-3 version (must be 1), bit 4 sorted
 * flag, bits 6-7 offset size minus one. The header is followed by the
 * dictionary size, dictionarySize + 1 offsets, and then the string bytes.
 *
 * @param {Uint8Array} bytes
 * @returns {VariantMetadata}
 * @throws {Error} if the metadata version is not 1
 */
function parseVariantMetadata(bytes) {
  let bufferCache = metadataCache.get(bytes.buffer)
  if (!bufferCache) {
    bufferCache = new Map()
    metadataCache.set(bytes.buffer, bufferCache)
  }
  const key = `${bytes.byteOffset}:${bytes.byteLength}`
  const cached = bufferCache.get(key)
  if (cached) return cached

  const reader = makeReader(bytes)
  const header = reader.view.getUint8(reader.offset++)
  const version = header & 0x0f
  if (version !== 1) throw new Error(`parquet unsupported variant metadata version: ${version}`)
  const sorted = (header >> 4 & 0x1) === 1
  const offsetSize = (header >> 6 & 0x3) + 1
  const dictionarySize = readUnsigned(reader, offsetSize)

  const offsets = new Array(dictionarySize + 1)
  for (let i = 0; i < offsets.length; i++) {
    offsets[i] = readUnsigned(reader, offsetSize)
  }

  // String offsets are relative to the first byte after the offset table
  const base = reader.offset
  const dictionary = new Array(dictionarySize)
  for (let i = 0; i < dictionarySize; i++) {
    const start = offsets[i]
    const end = offsets[i + 1]
    const strBytes = new Uint8Array(bytes.buffer, bytes.byteOffset + base + start, end - start)
    dictionary[i] = decoder.decode(strBytes)
  }

  const metadata = { dictionary, sorted }
  bufferCache.set(key, metadata)
  return metadata
}

/**
 * Read a little-endian unsigned integer of 1 to 4 bytes and advance the reader.
 *
 * @param {DataReader} reader
 * @param {number} byteWidth
 * @returns {number}
 */
function readUnsigned(reader, byteWidth) {
  let value = 0
  for (let i = 0; i < byteWidth; i++) {
    value |= reader.view.getUint8(reader.offset + i) << i * 8
  }
  reader.offset += byteWidth
  // Bitwise operators produce a signed int32; convert back to unsigned so a
  // 4-byte value with the top bit set is not returned as a negative number
  return value >>> 0
}

/**
 * Read one variant value at the reader position and advance past it.
 *
 * The low 2 bits of the first byte select the basic type
 * (0 primitive, 1 short string, 2 object, 3 array); the upper 6 bits are a
 * type-specific header.
 *
 * @param {DataReader} reader
 * @param {VariantMetadata} metadata
 * @param {ParquetParsers} parsers
 * @returns {any}
 */
function readVariant(reader, metadata, parsers) {
  const typeByte = reader.view.getUint8(reader.offset++)
  const basicType = typeByte & 0x3
  const header = typeByte >> 2

  if (basicType === 0) return readVariantPrimitive(reader, header, parsers)
  if (basicType === 2) return readVariantObject(reader, header, metadata, parsers)
  if (basicType === 3) return readVariantArray(reader, header, metadata, parsers)

  // else short string: header is the byte length
  const bytes = new Uint8Array(reader.view.buffer, reader.view.byteOffset + reader.offset, header)
  reader.offset += header
  return decoder.decode(bytes)
}

/**
 * Read a primitive variant value of the given type id and advance the reader.
 *
 * @param {DataReader} reader
 * @param {number} typeId
 * @param {ParquetParsers} parsers
 * @returns {any}
 * @throws {Error} if the type id is not supported
 */
function readVariantPrimitive(reader, typeId, parsers) {
  switch (typeId) {
  case 0:
    return null
  case 1:
    return true
  case 2:
    return false
  case 3: { // int8
    const value = reader.view.getInt8(reader.offset)
    reader.offset += 1
    return value
  }
  case 4: { // int16
    const value = reader.view.getInt16(reader.offset, true)
    reader.offset += 2
    return value
  }
  case 5: { // int32
    const value = reader.view.getInt32(reader.offset, true)
    reader.offset += 4
    return value
  }
  case 6: { // int64, returned as bigint
    const value = reader.view.getBigInt64(reader.offset, true)
    reader.offset += 8
    return value
  }
  case 7: { // float64
    const value = reader.view.getFloat64(reader.offset, true)
    reader.offset += 8
    return value
  }
  case 8:
    return readVariantDecimal(reader, 4)
  case 9:
    return readVariantDecimal(reader, 8)
  case 10:
    return readVariantDecimal(reader, 16)
  case 11: { // date: int32 day count handed to parsers.dateFromDays
    const value = reader.view.getInt32(reader.offset, true)
    reader.offset += 4
    return parsers.dateFromDays(value)
  }
  case 12: // timestamp_micros (utc)
  case 13: { // timestamp_micros_ntz (no timezone)
    const value = reader.view.getBigInt64(reader.offset, true)
    reader.offset += 8
    return parsers.timestampFromMicroseconds(value)
  }
  case 14: { // float32
    const value = reader.view.getFloat32(reader.offset, true)
    reader.offset += 4
    return value
  }
  case 15:
    return readVariantBinary(reader)
  case 16: { // length-prefixed string
    const bytes = readVariantBinary(reader)
    return decoder.decode(bytes)
  }
  case 17: { // time: microseconds since midnight
    const value = reader.view.getBigInt64(reader.offset, true)
    reader.offset += 8
    return value
  }
  case 18: // timestamp_nanos (utc)
  case 19: { // timestamp_nanos_ntz (no timezone)
    const value = reader.view.getBigInt64(reader.offset, true)
    reader.offset += 8
    return parsers.timestampFromNanoseconds(value)
  }
  case 20: { // 16 bytes formatted as an 8-4-4-4-12 hex string
    const bytes = new Uint8Array(reader.view.buffer, reader.view.byteOffset + reader.offset, 16)
    reader.offset += 16
    const hex = Array.from(bytes, b => b.toString(16).padStart(2, '0')).join('')
    return `${hex.slice(0, 8)}-${hex.slice(8, 12)}-${hex.slice(12, 16)}-${hex.slice(16, 20)}-${hex.slice(20)}`
  }
  default:
    throw new Error(`parquet unsupported variant primitive type: ${typeId}`)
  }
}

/**
 * Read a variant object and advance the reader past its values.
 *
 * Header bits as read here: bits 0-1 offset width minus one, bits 2-3 field
 * id width minus one, bit 4 set when the element count is 4 bytes wide.
 *
 * @param {DataReader} reader
 * @param {number} header
 * @param {VariantMetadata} metadata
 * @param {ParquetParsers} parsers
 * @returns {Record<string, any>}
 */
function readVariantObject(reader, header, metadata, parsers) {
  const offsetWidth = (header & 0x3) + 1
  const idWidth = (header >> 2 & 0x3) + 1
  const isLarge = header >> 4 & 0x1
  const numElements = isLarge ? readUnsigned(reader, 4) : reader.view.getUint8(reader.offset++)

  /** @type {number[]} */
  const fieldIds = new Array(numElements)
  for (let i = 0; i < numElements; i++) {
    fieldIds[i] = readUnsigned(reader, idWidth)
  }

  const offsets = new Array(numElements + 1)
  for (let i = 0; i < offsets.length; i++) {
    offsets[i] = readUnsigned(reader, offsetWidth)
  }

  /** @type {Record<string, any>} */
  const out = {}
  for (let i = 0; i < numElements; i++) {
    // Field ids index into the metadata dictionary of key names
    const key = metadata.dictionary[fieldIds[i]]
    // Read value at the given offset, relative to the end of the offset table
    const valueReader = {
      view: reader.view,
      offset: reader.offset + offsets[i],
    }
    out[key] = readVariant(valueReader, metadata, parsers)
  }

  // The final offset is used as the total byte length of the values
  reader.offset += offsets[offsets.length - 1]
  return out
}

/**
 * Read a variant array and advance the reader past its values.
 *
 * Header bits as read here: bits 0-1 offset width minus one, bit 2 set when
 * the element count is 4 bytes wide (otherwise 1 byte).
 *
 * @param {DataReader} reader
 * @param {number} header
 * @param {VariantMetadata} metadata
 * @param {ParquetParsers} parsers
 * @returns {any[]}
 */
function readVariantArray(reader, header, metadata, parsers) {
  const fieldOffsetSize = header & 0x3
  const isLarge = header >> 2 & 0x1
  const offsetWidth = fieldOffsetSize + 1
  const numElements = readUnsigned(reader, isLarge ? 4 : 1)

  const offsets = new Array(numElements + 1)
  for (let i = 0; i < offsets.length; i++) {
    offsets[i] = readUnsigned(reader, offsetWidth)
  }

  // Element offsets are relative to the first byte after the offset table
  const valuesStart = reader.offset
  const result = new Array(numElements)
  for (let i = 0; i < numElements; i++) {
    const valueReader = {
      view: reader.view,
      offset: valuesStart + offsets[i],
    }
    result[i] = readVariant(valueReader, metadata, parsers)
  }

  reader.offset = valuesStart + offsets[offsets.length - 1]
  return result
}

/**
 * Read a decimal: one scale byte followed by a little-endian signed unscaled
 * integer of `width` bytes. The result is converted to a JavaScript number,
 * so values beyond double precision are approximated.
 *
 * @param {DataReader} reader
 * @param {number} width byte width of the unscaled integer: 4, 8, or 16
 * @returns {number}
 */
function readVariantDecimal(reader, width) {
  const scale = reader.view.getUint8(reader.offset)
  reader.offset += 1
  let unscaled
  if (width === 4) {
    unscaled = BigInt(reader.view.getInt32(reader.offset, true))
    reader.offset += 4
  } else if (width === 8) {
    unscaled = reader.view.getBigInt64(reader.offset, true)
    reader.offset += 8
  } else {
    // 128-bit: unsigned low half, signed high half
    const low = reader.view.getBigUint64(reader.offset, true)
    const high = reader.view.getBigInt64(reader.offset + 8, true)
    unscaled = high << 64n | low
    reader.offset += 16
  }
  // Divide by 10^scale rather than multiply by the inexact 10^-scale:
  // 3 / 10 is 0.3, whereas 3 * 10 ** -1 is 0.30000000000000004
  return Number(unscaled) / 10 ** scale
}

/**
 * Read a binary value with a 4-byte little-endian length prefix.
 * The returned array is a view over the reader's buffer, not a copy.
 *
 * @param {DataReader} reader
 * @returns {Uint8Array}
 */
function readVariantBinary(reader) {
  const length = reader.view.getUint32(reader.offset, true)
  reader.offset += 4
  const bytes = new Uint8Array(reader.view.buffer, reader.view.byteOffset + reader.offset, length)
  reader.offset += length
  return bytes
}