hyparquet-writer
Version:
Parquet file writer for JavaScript
258 lines (243 loc) • 9.55 kB
JavaScript
import { normalizeShreddingConfig } from './variant.js'
/**
* @import {ConvertedType, DecodedArray, FieldRepetitionType, ParquetType, SchemaElement} from 'hyparquet'
* @import {BasicType, ColumnSource, ShredType} from '../src/types.js'
*/
/**
* Infer a schema from column data.
* Accepts optional schemaOverrides to override the type of columns by name.
*
* @param {object} options
* @param {ColumnSource[]} options.columnData
* @param {Record<string, SchemaElement>} [options.schemaOverrides]
* @returns {SchemaElement[]}
*/
export function schemaFromColumnData({ columnData, schemaOverrides }) {
/** @type {SchemaElement[]} */
const schema = [{
name: 'root',
num_children: columnData.length,
}]
for (const { name, data, type, nullable, shredding } of columnData) {
if (schemaOverrides?.[name]) {
// use schema override
const override = schemaOverrides[name]
if (type || nullable !== undefined) {
throw new Error(`cannot provide both type and schema override for column ${name}`)
}
if (override.name !== name) {
throw new Error(`schema override for column ${name} must have matching name, got ${override.name}`)
}
if (override.type === 'FIXED_LEN_BYTE_ARRAY' && !override.type_length) {
throw new Error('schema override for FIXED_LEN_BYTE_ARRAY must include type_length')
}
// TODO: support nested schema overrides
if (override.num_children) {
throw new Error('schema override does not support nested types')
}
schema.push(override)
} else if (type === 'VARIANT') {
// variant group with metadata and value children
const repetition_type = nullable === false ? 'REQUIRED' : 'OPTIONAL'
const shreddingConfig = shredding && shredding !== true ? normalizeShreddingConfig(shredding) : undefined
if (shreddingConfig) {
schema.push(
{ name, repetition_type, num_children: 3, logical_type: { type: 'VARIANT' } },
{ name: 'metadata', type: 'BYTE_ARRAY', repetition_type: 'REQUIRED' },
{ name: 'value', type: 'BYTE_ARRAY', repetition_type: 'OPTIONAL' },
...buildVariantTypedValue(shreddingConfig)
)
} else {
schema.push(
{ name, repetition_type, num_children: 2, logical_type: { type: 'VARIANT' } },
{ name: 'metadata', type: 'BYTE_ARRAY', repetition_type: 'REQUIRED' },
{ name: 'value', type: 'BYTE_ARRAY', repetition_type: 'OPTIONAL' }
)
}
} else if (type) {
// use provided type
schema.push(basicTypeToSchemaElement(name, type, nullable))
} else {
// auto-detect type from first 1000 values
schema.push(autoSchemaElement(name, data.slice(0, 1000)))
}
}
return schema
}
/**
* Build the flat preorder SchemaElement subtree for a shredded variant
* `typed_value` node, recursively, from a shred type. The first returned element
* is always named `typed_value`. Supports scalars, object structs, and LIST arrays.
*
* @param {ShredType} shredType
* @returns {SchemaElement[]}
*/
function buildVariantTypedValue(shredType) {
// Array shred type: single-element array template -> 3-level LIST
if (Array.isArray(shredType)) {
return [
{ name: 'typed_value', repetition_type: 'OPTIONAL', converted_type: 'LIST', num_children: 1 },
{ name: 'list', repetition_type: 'REPEATED', num_children: 1 },
{ name: 'element', repetition_type: 'REQUIRED', num_children: 2 },
{ name: 'value', type: 'BYTE_ARRAY', repetition_type: 'OPTIONAL' },
...buildVariantTypedValue(shredType[0]),
]
}
// Object shred type: struct with one optional group per field
if (typeof shredType === 'object') {
const fieldNames = Object.keys(shredType)
/** @type {SchemaElement[]} */
const elements = [
{ name: 'typed_value', repetition_type: 'OPTIONAL', num_children: fieldNames.length },
]
for (const fieldName of fieldNames) {
elements.push(
{ name: fieldName, repetition_type: 'OPTIONAL', num_children: 2 },
{ name: 'value', type: 'BYTE_ARRAY', repetition_type: 'OPTIONAL' },
...buildVariantTypedValue(shredType[fieldName])
)
}
return elements
}
// Scalar shred type: typed leaf
return [shreddedLeafElement(shredType)]
}
/**
* Map a BasicType to the typed_value leaf SchemaElement for shredded scalars.
*
* @param {BasicType} type
* @returns {SchemaElement}
*/
function shreddedLeafElement(type) {
switch (type) {
case 'STRING':
return { name: 'typed_value', type: 'BYTE_ARRAY', converted_type: 'UTF8', repetition_type: 'OPTIONAL' }
case 'INT32':
return { name: 'typed_value', type: 'INT32', repetition_type: 'OPTIONAL' }
case 'INT64':
return { name: 'typed_value', type: 'INT64', repetition_type: 'OPTIONAL' }
case 'DOUBLE':
return { name: 'typed_value', type: 'DOUBLE', repetition_type: 'OPTIONAL' }
case 'FLOAT':
return { name: 'typed_value', type: 'FLOAT', repetition_type: 'OPTIONAL' }
case 'BOOLEAN':
return { name: 'typed_value', type: 'BOOLEAN', repetition_type: 'OPTIONAL' }
case 'TIMESTAMP':
return { name: 'typed_value', type: 'INT64', converted_type: 'TIMESTAMP_MICROS', repetition_type: 'OPTIONAL' }
default:
throw new Error(`unsupported shredded field type: ${type}`)
}
}
/**
* @param {string} name
* @param {Exclude<BasicType, 'VARIANT'>} type
* @param {boolean} [nullable]
* @returns {SchemaElement}
*/
function basicTypeToSchemaElement(name, type, nullable) {
const repetition_type = nullable === false ? 'REQUIRED' : 'OPTIONAL'
if (type === 'STRING') {
return { name, type: 'BYTE_ARRAY', converted_type: 'UTF8', repetition_type }
}
if (type === 'JSON') {
return { name, type: 'BYTE_ARRAY', converted_type: 'JSON', repetition_type }
}
if (type === 'TIMESTAMP') {
return { name, type: 'INT64', converted_type: 'TIMESTAMP_MILLIS', repetition_type }
}
if (type === 'UUID') {
return { name, type: 'FIXED_LEN_BYTE_ARRAY', type_length: 16, logical_type: { type: 'UUID' }, repetition_type }
}
if (type === 'FLOAT16') {
return { name, type: 'FIXED_LEN_BYTE_ARRAY', type_length: 2, logical_type: { type: 'FLOAT16' }, repetition_type }
}
if (type === 'GEOMETRY') {
return { name, type: 'BYTE_ARRAY', logical_type: { type: 'GEOMETRY' }, repetition_type }
}
if (type === 'GEOGRAPHY') {
return { name, type: 'BYTE_ARRAY', logical_type: { type: 'GEOGRAPHY' }, repetition_type }
}
return { name, type, repetition_type }
}
/**
* Automatically determine a SchemaElement from an array of values.
*
* @param {string} name the column name
* @param {DecodedArray} values the column values
* @returns {SchemaElement}
*/
export function autoSchemaElement(name, values) {
/** @type {ParquetType | undefined} */
let type
/** @type {FieldRepetitionType} */
let repetition_type = 'REQUIRED'
/** @type {ConvertedType | undefined} */
let converted_type
if (values instanceof Int32Array) return { name, type: 'INT32', repetition_type }
if (values instanceof BigInt64Array) return { name, type: 'INT64', repetition_type }
if (values instanceof Float32Array) return { name, type: 'FLOAT', repetition_type }
if (values instanceof Float64Array) return { name, type: 'DOUBLE', repetition_type }
for (const value of values) {
if (value === null || value === undefined) {
repetition_type = 'OPTIONAL'
} else {
// value is defined, infer type
/** @type {ParquetType} */
let valueType
/** @type {ConvertedType | undefined} */
let valueConvertedType
if (typeof value === 'boolean') valueType = 'BOOLEAN'
else if (typeof value === 'bigint') valueType = 'INT64'
else if (Number.isInteger(value)) valueType = 'INT32'
else if (typeof value === 'number') valueType = 'DOUBLE'
else if (value instanceof Uint8Array) valueType = 'BYTE_ARRAY'
else if (typeof value === 'string') {
valueType = 'BYTE_ARRAY'
valueConvertedType = 'UTF8'
}
else if (value instanceof Date) {
valueType = 'INT64'
valueConvertedType = 'TIMESTAMP_MILLIS'
}
else if (typeof value === 'object') {
// use json (TODO: native list and object types)
valueType = 'BYTE_ARRAY'
valueConvertedType = 'JSON'
}
else throw new Error(`cannot determine parquet type for: ${value}`)
// expand type if necessary
if (type === undefined) {
type = valueType
converted_type = valueConvertedType
} else if (type === 'INT32' && valueType === 'DOUBLE') {
type = 'DOUBLE'
} else if (type === 'DOUBLE' && valueType === 'INT32') {
valueType = 'DOUBLE'
} else if (type !== valueType || converted_type !== valueConvertedType) {
throw new Error(`parquet cannot write mixed types: ${converted_type ?? type} and ${valueConvertedType ?? valueType}`)
}
}
}
if (!type) {
// fallback to nullable BYTE_ARRAY
// TODO: logical_type: 'NULL'
type = 'BYTE_ARRAY'
repetition_type = 'OPTIONAL'
}
return { name, type, repetition_type, converted_type }
}
/**
* Get the max repetition level for a given schema path.
*
* @param {SchemaElement[]} schemaPath
* @returns {number} max repetition level
*/
export function getMaxRepetitionLevel(schemaPath) {
let maxLevel = 0
for (const element of schemaPath) {
if (element.repetition_type === 'REPEATED') {
maxLevel++
}
}
return maxLevel
}