UNPKG

hyparquet-writer

Version:

Parquet file writer for JavaScript

227 lines (205 loc) 7.49 kB
import { getSchemaPath } from 'hyparquet/src/schema.js' import { writeBlooms } from './bloom.js' import { writeColumn } from './column.js' import { encodeNestedValues } from './dremel.js' import { writeIndexes } from './indexes.js' import { writeMetadata } from './metadata.js' import { snappyCompress } from './snappy.js' import { encodeVariantColumn } from './variant.js' /** * @import {ColumnChunk, CompressionCodec, FileMetaData, KeyValue, RowGroup, SchemaElement, SchemaTree} from 'hyparquet' * @import {ColumnEncoder, ColumnSource, Compressors, PageIndexes, Writer} from '../src/types.js' */ /** * ParquetWriter class allows incremental writing of parquet files. * * @param {object} options * @param {Writer} options.writer * @param {SchemaElement[]} options.schema * @param {CompressionCodec} [options.codec] * @param {Compressors} [options.compressors] * @param {boolean} [options.statistics] * @param {KeyValue[]} [options.kvMetadata] */ export function ParquetWriter({ writer, schema, codec = 'SNAPPY', compressors, statistics = true, kvMetadata }) { this.writer = writer this.schema = schema this.codec = codec // Include built-in snappy as fallback this.compressors = { SNAPPY: snappyCompress, ...compressors } this.statistics = statistics this.kvMetadata = kvMetadata /** @type {RowGroup[]} */ this.row_groups = [] this.num_rows = 0n /** @type {PageIndexes[]} */ this.pendingIndexes = [] // write header PAR1 this.writer.appendUint32(0x31524150) } /** * Write data to the file. * Will split data into row groups of the specified size. * Calls writer.flush() (if defined) after each row group; if it returns a * Promise, subsequent row groups await it before encoding more data. * * @param {object} options * @param {ColumnSource[]} options.columnData * @param {number | number[]} [options.rowGroupSize] * @param {number} [options.pageSize] * @returns {void | Promise<void>} */ ParquetWriter.prototype.write = function({ columnData, rowGroupSize = [1000, 100000], pageSize = 1048576 }) { const columnDataRows = columnData[0]?.data?.length || 0 /** @type {Promise<void> | undefined} */ let pending for (const { groupStartIndex, groupSize } of groupIterator({ columnDataRows, rowGroupSize })) { const writeGroup = () => { const groupStartOffset = this.writer.offset /** @type {ColumnChunk[]} */ const columns = [] // write columns for (let j = 0; j < columnData.length; j++) { const { name, data, encoding, codec = this.codec, columnIndex = false, offsetIndex = true, shredding, bloomFilter } = columnData[j] // Spec: if ColumnIndex is present, OffsetIndex must also be present if (columnIndex && !offsetIndex) { throw new Error('parquet ColumnIndex cannot be present without OffsetIndex') } if (data.length !== columnDataRows) { throw new Error('parquet columns must have the same length') } const groupData = data.slice(groupStartIndex, groupStartIndex + groupSize) const columnPath = getSchemaPath(this.schema, [name]) const leafPaths = getLeafSchemaPaths(columnPath) // For VARIANT logical type, encode JS values into {metadata, value} structs const columnElement = columnPath.at(-1)?.element const shreddingConfig = shredding && shredding !== true ? shredding : undefined const isVariant = columnElement?.logical_type?.type === 'VARIANT' const isRequired = columnElement?.repetition_type === 'REQUIRED' const rows = isVariant ? encodeVariantColumn(Array.from(groupData), shreddingConfig, { name, required: isRequired }) : groupData for (const leafPath of leafPaths) { const schemaPath = leafPath.map(node => node.element) /** @type {ColumnEncoder} */ const column = { columnName: schemaPath.slice(1).map(s => s.name).join('.'), element: schemaPath[schemaPath.length - 1], schemaPath, codec, compressors: this.compressors, stats: this.statistics, pageSize, columnIndex, offsetIndex, encoding, bloomFilter, } const pageData = encodeNestedValues(leafPath, rows) const result = writeColumn({ writer: this.writer, column, pageData, }) columns.push(result.chunk) this.pendingIndexes.push(result) } } this.num_rows += BigInt(groupSize) this.row_groups.push({ columns, total_byte_size: BigInt(this.writer.offset - groupStartOffset), num_rows: BigInt(groupSize), }) return this.writer.flush?.() } if (pending) { pending = pending.then(writeGroup) } else { const r = writeGroup() if (r) pending = Promise.resolve(r) } } return pending } /** * Finish writing the file. * * @returns {void | Promise<void>} */ ParquetWriter.prototype.finish = function() { // Write all indexes at end of file writeIndexes(this.writer, this.pendingIndexes) // Bloom filters cluster after indexes so pushdown readers fetch them in one range writeBlooms(this.writer, this.pendingIndexes) // write metadata /** @type {FileMetaData} */ const metadata = { version: 2, created_by: 'hyparquet', schema: this.schema, num_rows: this.num_rows, row_groups: this.row_groups, metadata_length: 0, key_value_metadata: this.kvMetadata, } // @ts-ignore don't want to actually serialize metadata_length delete metadata.metadata_length writeMetadata(this.writer, metadata) // write footer PAR1 this.writer.appendUint32(0x31524150) return this.writer.finish() } /** * Create an iterator for row groups based on the specified row group size. * If rowGroupSize is an array, it will return groups based on the sizes in the array. * When the array runs out, it will continue with the last size. * * @param {object} options * @param {number} options.columnDataRows - Total number of rows in the column data * @param {number | number[]} options.rowGroupSize - Size of each row group or an array of sizes * @returns {Array<{groupStartIndex: number, groupSize: number}>} */ function groupIterator({ columnDataRows, rowGroupSize }) { if (Array.isArray(rowGroupSize) && !rowGroupSize.length) { throw new Error('rowGroupSize array cannot be empty') } const groups = [] let groupIndex = 0 let groupStartIndex = 0 while (groupStartIndex < columnDataRows) { const size = Array.isArray(rowGroupSize) ? rowGroupSize[Math.min(groupIndex, rowGroupSize.length - 1)] : rowGroupSize const groupSize = Math.min(size, columnDataRows - groupStartIndex) groups.push({ groupStartIndex, groupSize }) groupStartIndex += size groupIndex++ } return groups } /** * Expand a schema path to all primitive leaf nodes under the column. * * @param {SchemaTree[]} schemaPath * @returns {SchemaTree[][]} */ function getLeafSchemaPaths(schemaPath) { /** @type {SchemaTree[][]} */ const leaves = [] dfs(schemaPath) return leaves /** * @param {SchemaTree[]} path */ function dfs(path) { const node = path[path.length - 1] if (!node.children.length) { leaves.push(path) return } for (const child of node.children) { dfs([...path, child]) } } }