hyparquet-writer
Version:
Parquet file writer for JavaScript
634 lines (585 loc) • 21.4 kB
JavaScript
import { ByteWriter } from './bytewriter.js'
// Shared UTF-8 encoder, used for dictionary keys and variant string values.
const encoder = new TextEncoder()
// Signed 64-bit bounds used to range-check bigint values.
const INT64_MIN = -(2n ** 63n)
const INT64_MAX = 2n ** 63n - 1n
// Binary encoding of a variant null: a single 0x00 byte.
const VARIANT_NULL = new Uint8Array([0x00])
// Field names that normalizeShreddingConfig strips from object shred configs.
const RESERVED_SHREDDING_FIELDS = new Set(['value', 'typed_value'])
// Shared key index returned by getVariantRowMetadata for rows without object keys.
/** @type {Map<string, number>} */
const EMPTY_KEY_INDEX = new Map()
// Metadata for an empty dictionary. writeVariantMetadata is a function
// declaration (hoisted), so it is callable at this point of module evaluation.
const EMPTY_METADATA = writeVariantMetadata([])
/**
 * Turn a column of arbitrary JS values into variant rows.
 *
 * Every present row becomes `{ metadata, value }`; when a usable shredding
 * config is supplied the row is `{ metadata, value, typed_value }` instead.
 * `undefined` marks a missing row and yields `null`, while a JS `null` is a
 * present Variant null and is encoded like any other value.
 *
 * Without shredding, one dictionary is built from the whole column and shared
 * by all rows. With shredding, each row gets a dictionary built from its own
 * keys; rows with the same key set share one metadata buffer and key index.
 *
 * @import {BasicType, ShredType} from '../src/types.js'
 * @param {any[]} values
 * @param {ShredType | undefined} shredding
 * @param {{ name: string, required: boolean }} [column]
 * @returns {Array<Record<string, any> | null>}
 * @throws {Error} if the column is required and a value is undefined
 */
export function encodeVariantColumn(values, shredding, column) {
  if (column?.required) {
    const missingAt = values.findIndex(row => row === undefined)
    if (missingAt !== -1) {
      throw new Error(`required variant column ${column.name} has undefined value at index ${missingAt}`)
    }
  }

  const shreddingConfig = shredding && normalizeShreddingConfig(shredding)

  if (!shreddingConfig) {
    // Unshredded: a single column-wide dictionary serves every row.
    const dictionary = buildVariantDictionary(values)
    const metadata = writeVariantMetadata(dictionary)
    /** @type {Map<string, number>} */
    const keyIndex = new Map()
    dictionary.forEach((key, id) => {
      keyIndex.set(key, id)
    })
    // Only undefined is a missing row; null stays a present Variant null (0x00).
    return values.map(row => row === undefined ? null : { metadata, value: writeVariantValue(row, keyIndex) })
  }

  // Shredded: the reader decides field presence from dictionary membership,
  // so each row's dictionary holds exactly the keys present in that row
  // (shredded or not). The cache lets rows with equal key sets share it.
  /** @type {Map<string, { metadata: Uint8Array, keyIndex: Map<string, number> }>} */
  const metadataCache = new Map()
  return values.map(row => {
    if (row === undefined) return null
    /** @type {Set<string>} */
    const rowKeys = new Set()
    collectKeys(row, rowKeys)
    const rowMeta = getVariantRowMetadata(rowKeys, metadataCache)
    const shredded = encodeShredded(row, shreddingConfig, rowMeta.keyIndex, true)
    return { metadata: rowMeta.metadata, ...shredded }
  })
}
/**
 * Recursively encode a value against a shred type into a { value, typed_value }
 * shredded group (the metadata wrapper is added by the caller at the top level).
 *
 * Shape rules (per the Variant shredding spec):
 * - null/undefined: value holds the variant null byte, typed_value is null.
 * - scalar: matches the type -> typed_value holds the value, value is null;
 *   otherwise fall back to a binary variant in value.
 * - object: shredded fields go into the typed_value struct (absent fields are
 *   omitted), remaining fields are packed into a binary value. The whole
 *   object falls back to a binary value when it has non-shredded fields and
 *   allowPartialObjects is false, or when a shredded field is absent from the
 *   object while its name is present in keyIndex.
 * - array: each element is recursively shredded into the typed_value LIST, value
 *   is null. A non-array value falls back to a binary value.
 *
 * Shred-type membership is tested with own-property checks only. Using the
 * `in` operator here would also match names inherited from Object.prototype
 * (`constructor`, `toString`, `__proto__`, ...), and a value field with such
 * a name would be dropped from both typed_value and the residual value.
 *
 * @param {any} value
 * @param {ShredType} shredType
 * @param {Map<string, number>} keyIndex
 * @param {boolean} allowPartialObjects
 * @returns {{ value: Uint8Array | null, typed_value: any }}
 */
function encodeShredded(value, shredType, keyIndex, allowPartialObjects) {
  // Present Variant null: value holds variant null, typed_value is null.
  if (value === null || value === undefined) {
    return { value: VARIANT_NULL, typed_value: null }
  }

  // Array shred type
  if (Array.isArray(shredType)) {
    if (!Array.isArray(value)) {
      // Not an array: typed_value must be null, store the value as binary.
      return { value: writeVariantValue(value, keyIndex), typed_value: null }
    }
    const elemShred = shredType[0]
    return { value: null, typed_value: value.map(el => encodeShredded(el, elemShred, keyIndex, false)) }
  }

  // Object shred type
  if (typeof shredType === 'object') {
    // Not a plain object: fall back to a binary value.
    if (typeof value !== 'object' || Array.isArray(value) || value instanceof Date || value instanceof Uint8Array) {
      return { value: writeVariantValue(value, keyIndex), typed_value: null }
    }

    const hasOwn = Object.prototype.hasOwnProperty

    // Remaining (non-shredded) fields are packed into a binary value.
    // A null-prototype object makes `remaining['__proto__'] = x` create an own
    // property instead of replacing the prototype, so such a key is kept.
    /** @type {Record<string, any>} */
    const remaining = Object.create(null)
    let hasRemaining = false
    for (const k of Object.keys(value)) {
      if (hasOwn.call(shredType, k) || value[k] === undefined) continue
      remaining[k] = value[k]
      hasRemaining = true
    }
    if (hasRemaining && !allowPartialObjects) {
      return { value: writeVariantValue(value, keyIndex), typed_value: null }
    }

    const fieldNames = Object.keys(shredType)
    /**
     * @param {string} fieldName
     * @returns {boolean} true when the value has no usable own field of that name
     */
    const isAbsent = fieldName => !hasOwn.call(value, fieldName) || value[fieldName] === undefined

    // A shredded field that this object lacks but whose name is in the row
    // dictionary (keyIndex): store the whole object as binary instead.
    const hasMissingFieldConflict = fieldNames.some(fieldName => isAbsent(fieldName) && keyIndex.has(fieldName))
    if (hasMissingFieldConflict) {
      return { value: writeVariantValue(value, keyIndex), typed_value: null }
    }

    /** @type {Record<string, any>} */
    const typedValue = {}
    for (const fieldName of fieldNames) {
      // missing field: omit the optional field wrapper entirely
      if (isAbsent(fieldName)) continue
      typedValue[fieldName] = encodeShredded(value[fieldName], shredType[fieldName], keyIndex, false)
    }
    const binaryValue = hasRemaining ? writeVariantValue(remaining, keyIndex) : null
    return { value: binaryValue, typed_value: typedValue }
  }

  // Scalar shred type
  if (matchesType(value, shredType)) {
    return { value: null, typed_value: value }
  }
  return { value: writeVariantValue(value, keyIndex), typed_value: null }
}
/**
 * Build metadata and keyIndex for one row, sharing both across rows that have
 * the same dictionary.
 *
 * Rows without any key get the module-level empty metadata and key index and
 * are not added to the cache.
 *
 * The cache key is the JSON serialization of the sorted dictionary. Joining
 * the keys with a separator character is ambiguous when a key itself contains
 * that character (e.g. the single key "a\0b" vs. the keys "a" and "b" joined
 * by "\0"), which would hand a row the metadata of a different key set.
 * JSON.stringify quotes and escapes every key, so distinct dictionaries always
 * produce distinct cache keys.
 *
 * @param {Set<string>} keys
 * @param {Map<string, { metadata: Uint8Array, keyIndex: Map<string, number> }>} metadataCache
 * @returns {{ metadata: Uint8Array, keyIndex: Map<string, number> }}
 */
function getVariantRowMetadata(keys, metadataCache) {
  if (keys.size === 0) {
    return { metadata: EMPTY_METADATA, keyIndex: EMPTY_KEY_INDEX }
  }
  const dictionary = [...keys].sort()
  const cacheKey = JSON.stringify(dictionary)
  const cached = metadataCache.get(cacheKey)
  if (cached) {
    return cached
  }
  const metadata = writeVariantMetadata(dictionary)
  /** @type {Map<string, number>} */
  const keyIndex = new Map()
  for (let i = 0; i < dictionary.length; i++) keyIndex.set(dictionary[i], i)
  const rowMetadata = { metadata, keyIndex }
  metadataCache.set(cacheKey, rowMetadata)
  return rowMetadata
}
/**
 * Check if a JS value matches a BasicType for shredding, i.e. whether it can
 * be stored in a typed_value column of that type.
 *
 * null and undefined never match. Unknown type names never match.
 *
 * FLOAT only matches numbers that are exactly representable as a 32-bit float
 * (or NaN). Other numbers would be silently rounded when narrowed to float32,
 * so they are reported as non-matching and the caller keeps them in the
 * binary fallback value instead.
 *
 * @param {any} value
 * @param {BasicType} type
 * @returns {boolean}
 */
function matchesType(value, type) {
  if (value === null || value === undefined) return false
  switch (type) {
    case 'BOOLEAN': return typeof value === 'boolean'
    case 'INT32': return typeof value === 'number' && Number.isInteger(value) && value >= -2147483648 && value <= 2147483647
    case 'INT64': return typeof value === 'bigint' && value >= INT64_MIN && value <= INT64_MAX
    case 'FLOAT':
      // NaN !== NaN, so it needs its own check; everything else must round-trip through float32.
      return typeof value === 'number' && (Number.isNaN(value) || Math.fround(value) === value)
    case 'DOUBLE': return typeof value === 'number'
    case 'STRING': return typeof value === 'string'
    case 'TIMESTAMP': return value instanceof Date
    default: return false
  }
}
// Conservative defaults for auto-detected shredding. Shredding is a query
// optimization, not a compression one: each shredded leaf is a full column
// (page headers, dictionary page, offset index), so deep/wide structures
// auto-shred into hundreds of columns and bloat the file. Auto-detect only
// descends a couple of container levels and bails on very wide schemas,
// leaving the rest in the binary value fallback. Explicit configs are unbounded.
// Container (object/array) nesting depth at which detectShred stops descending.
const MAX_SHRED_DEPTH = 3
// autoDetectShredding gives up when countShredLeaves exceeds this many leaves.
const MAX_SHRED_LEAVES = 256
/**
 * Derive a shredding config from sample values.
 *
 * The values are analyzed recursively for a consistent structure (scalar
 * fields, nested objects, arrays). A config is only returned when the top
 * level is structured (object or array); a column of bare scalars stays
 * unshredded. Reserved wrapper field names are stripped, and schemas with more
 * than MAX_SHRED_LEAVES typed leaves are rejected so that very wide variants
 * stay a single binary value. Descent depth is bounded by MAX_SHRED_DEPTH.
 *
 * @param {any[]} values
 * @returns {ShredType | undefined} the detected config, or undefined to leave the column unshredded
 */
export function autoDetectShredding(values) {
  const candidate = detectShred(values, 0)

  // Scalars are reported as strings; only object/array shapes are kept.
  const isStructured = candidate !== undefined && typeof candidate === 'object'
  if (!isStructured) return undefined

  const config = normalizeShreddingConfig(candidate)
  if (config === undefined) return undefined

  // Too many leaf columns: not worth shredding.
  return countShredLeaves(config) > MAX_SHRED_LEAVES ? undefined : config
}
/**
 * Number of typed leaf columns a shred type expands into.
 *
 * A scalar counts as one leaf, an object as the sum of its fields, and an
 * array as its element type (so the count is per array element). An empty
 * array contributes nothing.
 *
 * @param {ShredType} shredType
 * @returns {number}
 */
function countShredLeaves(shredType) {
  if (Array.isArray(shredType)) {
    if (shredType.length === 0) return 0
    return countShredLeaves(shredType[0])
  }

  const isGroup = shredType !== null && typeof shredType === 'object'
  if (!isGroup) return 1 // scalar leaf

  return Object.values(shredType).reduce((total, fieldType) => total + countShredLeaves(fieldType), 0)
}
/**
 * Infer a shred type from all sample values observed at one position.
 *
 * null/undefined samples are ignored. The remaining samples are classified:
 * - at least one plain object: an object shred is built from the fields of
 *   the plain objects (other samples are ignored and fall back to binary at
 *   encode time); fields without a detectable type are left out.
 * - all arrays: the elements of every array are pooled and analyzed as one
 *   position; the result is wrapped in a one-element array.
 * - otherwise: a scalar type, provided every sample has the same JS type.
 *
 * Returns undefined when nothing consistent can be detected, or when an
 * object/array would be nested at MAX_SHRED_DEPTH or deeper.
 *
 * @param {any[]} values
 * @param {number} depth container nesting levels already descended
 * @returns {ShredType | undefined}
 */
function detectShred(values, depth) {
  const present = values.filter(sample => sample !== null && sample !== undefined)
  if (present.length === 0) return undefined

  // Object shred
  const objects = present.filter(sample => isPlainObject(sample))
  if (objects.length > 0) {
    if (depth >= MAX_SHRED_DEPTH) return undefined

    /** @type {Map<string, any[]>} field name -> samples seen for that field */
    const samplesByField = new Map()
    for (const obj of objects) {
      Object.entries(obj).forEach(([fieldName, sample]) => {
        if (sample === undefined) return
        const bucket = samplesByField.get(fieldName)
        if (bucket) {
          bucket.push(sample)
        } else {
          samplesByField.set(fieldName, [sample])
        }
      })
    }

    /** @type {Record<string, ShredType>} */
    const shredding = {}
    let detectedFields = 0
    samplesByField.forEach((samples, fieldName) => {
      const fieldShred = detectShred(samples, depth + 1)
      if (fieldShred === undefined) return
      shredding[fieldName] = fieldShred
    })
    detectedFields = Object.keys(shredding).length
    return detectedFields > 0 ? shredding : undefined
  }

  // Array shred
  const allArrays = present.every(sample => Array.isArray(sample))
  if (allArrays) {
    if (depth >= MAX_SHRED_DEPTH) return undefined
    /** @type {any[]} */
    const pooled = []
    present.forEach(list => {
      for (const element of list) pooled.push(element)
    })
    const elementShred = detectShred(pooled, depth + 1)
    if (elementShred === undefined) return undefined
    return [elementShred]
  }

  // Scalar shred: bail out on the first array or the first second distinct type.
  /** @type {Set<string>} */
  const seenTypes = new Set()
  for (const sample of present) {
    if (Array.isArray(sample)) return undefined // mixed array + scalar
    seenTypes.add(sample instanceof Date ? 'date' : typeof sample)
    if (seenTypes.size > 1) return undefined
  }
  const [onlyType] = seenTypes
  return onlyType ? jsTypeToBasicType(onlyType) : undefined
}
/**
 * Whether a value is treated as a variant object: any non-null object that is
 * not an array, a Date, or a Uint8Array.
 *
 * @param {any} v
 * @returns {boolean}
 */
function isPlainObject(v) {
  if (v === null || typeof v !== 'object') return false
  if (Array.isArray(v)) return false
  return !(v instanceof Date || v instanceof Uint8Array)
}
/**
 * Recursively strip field names reserved by the shredded variant wrapper layout
 * (`value`, `typed_value`). Returns undefined when an object level empties out
 * or when an array has no usable element type.
 *
 * A null or undefined entry is treated as "no shred type" and dropped.
 * `typeof null` is 'object', so without this guard a null entry would reach
 * Object.entries and throw a TypeError.
 *
 * @param {ShredType} shredding
 * @returns {ShredType | undefined}
 */
export function normalizeShreddingConfig(shredding) {
  if (shredding === null || shredding === undefined) return undefined
  if (Array.isArray(shredding)) {
    // Only the first entry is the element type; an empty array has none.
    const elem = shredding.length ? normalizeShreddingConfig(shredding[0]) : undefined
    return elem === undefined ? undefined : [elem]
  }
  if (typeof shredding === 'object') {
    /** @type {Record<string, ShredType>} */
    const normalized = {}
    for (const [key, type] of Object.entries(shredding)) {
      if (RESERVED_SHREDDING_FIELDS.has(key)) continue
      const norm = normalizeShreddingConfig(type)
      if (norm !== undefined) normalized[key] = norm
    }
    return Object.keys(normalized).length > 0 ? normalized : undefined
  }
  // scalar
  return shredding
}
// Lookup from a JS type tag (`typeof` result, or 'date' for Date instances)
// to the BasicType used for shredding. A Map is used so that only these
// exact tags resolve; anything else yields undefined.
/** @type {Map<string, BasicType>} */
const JS_TYPE_TO_BASIC_TYPE = new Map([
  ['boolean', 'BOOLEAN'],
  ['string', 'STRING'],
  ['number', 'DOUBLE'],
  ['bigint', 'INT64'],
  ['date', 'TIMESTAMP'],
])

/**
 * Translate a JS type tag into the BasicType it shreds to.
 *
 * @param {string} jsType
 * @returns {BasicType | undefined} undefined for tags with no shreddable type
 */
function jsTypeToBasicType(jsType) {
  return JS_TYPE_TO_BASIC_TYPE.get(jsType)
}
/**
 * Gather every distinct object key found anywhere in the column values
 * (recursing through nested objects and arrays) and return them sorted with
 * the default string ordering.
 *
 * @param {any[]} values
 * @returns {string[]}
 */
function buildVariantDictionary(values) {
  /** @type {Set<string>} */
  const seen = new Set()
  collectKeys(values, seen)
  const dictionary = Array.from(seen)
  dictionary.sort()
  return dictionary
}
/**
 * Add every object key reachable from a value to the given set.
 *
 * Arrays are traversed element by element; Date and Uint8Array instances are
 * leaves; any other object contributes its own enumerable keys (including
 * keys whose value is undefined) and is then traversed recursively.
 * Primitives, null and undefined contribute nothing.
 *
 * @param {any} value
 * @param {Set<string>} keys set that receives the keys (mutated in place)
 */
function collectKeys(value, keys) {
  if (value === null || typeof value !== 'object') return

  if (Array.isArray(value)) {
    value.forEach(item => collectKeys(item, keys))
    return
  }

  if (value instanceof Date || value instanceof Uint8Array) return

  for (const [key, child] of Object.entries(value)) {
    keys.add(key)
    collectKeys(child, keys)
  }
}
/**
 * Serialize a key dictionary into variant metadata bytes.
 *
 * Layout: one header byte, the dictionary size, one offset per key plus a
 * final end offset, then the UTF-8 bytes of all keys back to back. The size
 * and every offset are little-endian integers of `offsetSize` bytes, where
 * `offsetSize` is the smallest width that can hold the total string length.
 *
 * @param {string[]} dictionary sorted array of unique keys
 * @returns {Uint8Array}
 */
function writeVariantMetadata(dictionary) {
  const encodedKeys = dictionary.map(key => encoder.encode(key))
  const keyCount = encodedKeys.length
  const stringBytes = encodedKeys.reduce((sum, keyBytes) => sum + keyBytes.length, 0)

  // The largest offset written is the total string length.
  const offsetSize = byteWidth(stringBytes)

  // header + dictionary size + (keyCount + 1) offsets + string data
  const out = new Uint8Array(1 + offsetSize * (keyCount + 2) + stringBytes)
  let pos = 0

  /**
   * Write an integer as `offsetSize` little-endian bytes at the cursor.
   * @param {number} n
   */
  const putOffset = n => {
    for (let shift = 0; shift < offsetSize * 8; shift += 8) {
      out[pos++] = n >> shift & 0xff
    }
  }

  // Header: version=1 (low bits), sorted flag (bit 4), offsetSize - 1 (bits 6-7)
  out[pos++] = 1 | 1 << 4 | offsetSize - 1 << 6
  putOffset(keyCount)

  // Start offset of each key, followed by the end offset of the last one.
  let stringOffset = 0
  for (const keyBytes of encodedKeys) {
    putOffset(stringOffset)
    stringOffset += keyBytes.length
  }
  putOffset(stringOffset)

  for (const keyBytes of encodedKeys) {
    out.set(keyBytes, pos)
    pos += keyBytes.length
  }
  return out
}
/**
 * Serialize one JS value into a standalone variant binary value.
 *
 * The value is written into a fresh ByteWriter and the written bytes are
 * returned.
 *
 * @param {any} value
 * @param {Map<string, number>} keyIndex map from key string to dictionary index
 * @returns {Uint8Array}
 */
function writeVariantValue(value, keyIndex) {
  const out = new ByteWriter(8)
  writeValue(value, out, keyIndex)
  const encoded = out.getBytes()
  return encoded
}
/**
 * Append the variant encoding of a single JS value to a writer.
 *
 * Each value starts with a header byte. For primitives the header is
 * `typeId << 2`; short strings (at most 63 UTF-8 bytes) use basic type 1 with
 * the length stored in the header. Arrays and objects are delegated to
 * writeVariantArray / writeVariantObject.
 *
 * Integers are stored in the narrowest of int8/int16/int32 that fits; every
 * other number is stored as a double. Dates are stored as microseconds.
 *
 * @param {any} val
 * @param {ByteWriter} writer
 * @param {Map<string, number>} keyIndex
 * @throws {RangeError} if a bigint does not fit in a signed 64-bit integer
 * @throws {Error} if the value has a type that cannot be encoded
 */
function writeValue(val, writer, keyIndex) {
  switch (typeof val) {
    case 'undefined':
      writer.appendUint8(0x00) // basicType=0, typeId=0 (null)
      return

    case 'boolean':
      writer.appendUint8(val ? 0x04 : 0x08) // typeId=1 (true) / typeId=2 (false)
      return

    case 'bigint':
      if (val < INT64_MIN || val > INT64_MAX) {
        throw new RangeError(`variant bigint out of int64 range: ${val}`)
      }
      writer.appendUint8(6 << 2) // int64
      writer.appendInt64(val)
      return

    case 'number': {
      const isInt = Number.isInteger(val)
      if (isInt && val >= -128 && val <= 127) {
        writer.appendUint8(3 << 2) // int8
        writer.appendUint8(val & 0xff)
      } else if (isInt && val >= -32768 && val <= 32767) {
        writer.appendUint8(4 << 2) // int16
        appendUnsignedLE(writer, val, 2)
      } else if (isInt && val >= -2147483648 && val <= 2147483647) {
        writer.appendUint8(5 << 2) // int32
        writer.appendInt32(val)
      } else {
        writer.appendUint8(7 << 2) // double
        writer.appendFloat64(val)
      }
      return
    }

    case 'string': {
      const utf8 = encoder.encode(val)
      if (utf8.length > 63) {
        // long string: primitive typeId=16 with a 4-byte length
        writer.appendUint8(16 << 2)
        writer.appendUint32(utf8.length)
      } else {
        // short string: basicType=1, length in header
        writer.appendUint8(utf8.length << 2 | 1)
      }
      writer.appendBytes(utf8)
      return
    }

    case 'object':
      if (val === null) {
        writer.appendUint8(0x00) // basicType=0, typeId=0 (null)
      } else if (val instanceof Date) {
        writer.appendUint8(13 << 2) // timestamp_micros_ntz
        writer.appendInt64(BigInt(val.getTime()) * 1000n) // milliseconds -> microseconds
      } else if (val instanceof Uint8Array) {
        writer.appendUint8(15 << 2) // binary
        writer.appendUint32(val.length)
        writer.appendBytes(val)
      } else if (Array.isArray(val)) {
        writeVariantArray(val, writer, keyIndex)
      } else {
        writeVariantObject(val, writer, keyIndex)
      }
      return

    default:
      throw new Error(`variant cannot encode value: ${val}`)
  }
}
/**
 * Append the variant encoding of an object to a writer.
 *
 * Fields whose value is undefined are skipped. The remaining fields are
 * ordered by dictionary id and written as: header byte, element count (one
 * byte, or four bytes when there are more than 255 fields), the field ids,
 * numElements + 1 offsets into the value data, then the encoded field values.
 * Ids and offsets are little-endian and use the smallest byte width that fits
 * the largest id / the total value length.
 *
 * @param {Record<string, any>} obj
 * @param {ByteWriter} writer
 * @param {Map<string, number>} keyIndex
 * @throws {Error} if a field name is missing from keyIndex
 */
function writeVariantObject(obj, writer, keyIndex) {
  /** @type {{ id: number, key: string }[]} */
  const fields = []
  for (const key of Object.keys(obj)) {
    if (obj[key] === undefined) continue
    const id = keyIndex.get(key)
    if (id === undefined) throw new Error(`variant key not in dictionary: ${key}`)
    fields.push({ id, key })
  }
  // Sort by field ID for spec compliance
  fields.sort((left, right) => left.id - right.id)

  const fieldCount = fields.length
  // After sorting the last field carries the largest id.
  const largestId = fieldCount === 0 ? 0 : fields[fieldCount - 1].id
  const idWidth = byteWidth(largestId)

  // Values are encoded into a separate writer first so that their offsets
  // (and the width needed to store them) are known before the header is written.
  const payload = new ByteWriter(8)
  /** @type {number[]} */
  const offsets = [0]
  for (const { key } of fields) {
    writeValue(obj[key], payload, keyIndex)
    offsets.push(payload.index)
  }
  const offsetWidth = byteWidth(offsets[fieldCount])
  const largeBit = fieldCount > 255 ? 1 : 0

  // Header: basicType=2 in the low two bits; above it offsetWidth-1, idWidth-1, isLarge
  const valueHeader = largeBit << 4 | idWidth - 1 << 2 | offsetWidth - 1
  writer.appendUint8(valueHeader << 2 | 2)
  if (largeBit) {
    writer.appendUint32(fieldCount)
  } else {
    writer.appendUint8(fieldCount)
  }
  fields.forEach(({ id }) => appendUnsignedLE(writer, id, idWidth))
  offsets.forEach(offset => appendUnsignedLE(writer, offset, offsetWidth))
  writer.appendBytes(payload.getBytes())
}
/**
 * Append the variant encoding of an array to a writer.
 *
 * Layout: header byte, element count (one byte, or four bytes when there are
 * more than 255 elements), numElements + 1 little-endian offsets into the
 * value data, then the encoded elements. Offsets use the smallest byte width
 * that fits the total length of the encoded elements.
 *
 * @param {any[]} arr
 * @param {ByteWriter} writer
 * @param {Map<string, number>} keyIndex
 */
function writeVariantArray(arr, writer, keyIndex) {
  const elementCount = arr.length

  // Elements are encoded into a separate writer first so the offsets (and
  // their width) are known before the header is written.
  const payload = new ByteWriter(8)
  /** @type {number[]} */
  const offsets = [0]
  for (const element of arr) {
    writeValue(element, payload, keyIndex)
    offsets.push(payload.index)
  }
  const offsetWidth = byteWidth(offsets[elementCount])
  const largeBit = elementCount > 255 ? 1 : 0

  // Header: basicType=3 in the low two bits; above it offsetWidth-1 and isLarge
  writer.appendUint8((largeBit << 2 | offsetWidth - 1) << 2 | 3)
  if (largeBit) {
    writer.appendUint32(elementCount)
  } else {
    writer.appendUint8(elementCount)
  }
  offsets.forEach(offset => appendUnsignedLE(writer, offset, offsetWidth))
  writer.appendBytes(payload.getBytes())
}
/**
 * Smallest number of bytes (1 to 4) whose unsigned range contains a value.
 * Anything above the 3-byte maximum is reported as 4.
 *
 * @param {number} maxValue
 * @returns {number} 1, 2, 3, or 4
 */
function byteWidth(maxValue) {
  // Largest value representable in 1, 2 and 3 bytes respectively.
  const limits = [0xff, 0xffff, 0xffffff]
  const fit = limits.findIndex(limit => maxValue <= limit)
  return fit === -1 ? 4 : fit + 1
}
/**
 * Append the low `width` bytes of an integer to a ByteWriter, least
 * significant byte first.
 *
 * @param {ByteWriter} writer
 * @param {number} value
 * @param {number} width byte width (1-4)
 */
function appendUnsignedLE(writer, value, width) {
  const totalBits = width * 8
  let shift = 0
  while (shift < totalBits) {
    writer.appendUint8(value >> shift & 0xff)
    shift += 8
  }
}