UNPKG

hyparquet-writer

Version:

Parquet file writer for JavaScript

hyparam/hyparquet-writer

124 lines (112 loc) • 3.95 kB

JavaScript

import { writeColumn } from './column.js' import { writeMetadata } from './metadata.js' /** * ParquetWriter class allows incremental writing of parquet files. * * @import {ColumnChunk, FileMetaData, KeyValue, RowGroup, SchemaElement} from 'hyparquet' * @import {ColumnSource, Writer} from '../src/types.js' * @param {object} options * @param {Writer} options.writer * @param {SchemaElement[]} options.schema * @param {boolean} [options.compressed] * @param {boolean} [options.statistics] * @param {KeyValue[]} [options.kvMetadata] */ export function ParquetWriter({ writer, schema, compressed = true, statistics = true, kvMetadata }) { this.writer = writer this.schema = schema this.compressed = compressed this.statistics = statistics this.kvMetadata = kvMetadata /** @type {RowGroup[]} */ this.row_groups = [] this.num_rows = 0n // write header PAR1 this.writer.appendUint32(0x31524150) } /** * Write data to the file. * Will split data into row groups of the specified size. * * @param {object} options * @param {ColumnSource[]} options.columnData * @param {number | number[]} [options.rowGroupSize] */ ParquetWriter.prototype.write = function({ columnData, rowGroupSize = 100000 }) { const columnDataRows = columnData[0]?.data?.length || 0 for (const { groupStartIndex, groupSize } of groupIterator({ columnDataRows, rowGroupSize })) { const groupStartOffset = this.writer.offset // row group columns /** @type {ColumnChunk[]} */ const columns = [] // write columns for (let j = 0; j < columnData.length; j++) { const { data } = columnData[j] const schemaPath = [this.schema[0], this.schema[j + 1]] const groupData = data.slice(groupStartIndex, groupStartIndex + groupSize) const file_offset = BigInt(this.writer.offset) const meta_data = writeColumn(this.writer, schemaPath, groupData, this.compressed, this.statistics) // save column chunk metadata columns.push({ file_offset, meta_data, }) } this.num_rows += BigInt(groupSize) this.row_groups.push({ columns, total_byte_size: BigInt(this.writer.offset - groupStartOffset), num_rows: BigInt(groupSize), }) } } /** * Finish writing the file. */ ParquetWriter.prototype.finish = function() { // write metadata /** @type {FileMetaData} */ const metadata = { version: 2, created_by: 'hyparquet', schema: this.schema, num_rows: this.num_rows, row_groups: this.row_groups, metadata_length: 0, key_value_metadata: this.kvMetadata, } // @ts-ignore don't want to actually serialize metadata_length delete metadata.metadata_length writeMetadata(this.writer, metadata) // write footer PAR1 this.writer.appendUint32(0x31524150) this.writer.finish() } /** * Create an iterator for row groups based on the specified row group size. * If rowGroupSize is an array, it will return groups based on the sizes in the array. * When the array runs out, it will continue with the last size. * * @param {object} options * @param {number} options.columnDataRows - Total number of rows in the column data * @param {number | number[]} options.rowGroupSize - Size of each row group or an array of sizes * @returns {Array<{groupStartIndex: number, groupSize: number}>} */ function groupIterator({ columnDataRows, rowGroupSize }) { if (Array.isArray(rowGroupSize) && !rowGroupSize.length) { throw new Error('rowGroupSize array cannot be empty') } const groups = [] let groupIndex = 0 let groupStartIndex = 0 while (groupStartIndex < columnDataRows) { const size = Array.isArray(rowGroupSize) ? rowGroupSize[Math.min(groupIndex, rowGroupSize.length - 1)] : rowGroupSize const groupSize = Math.min(size, columnDataRows - groupStartIndex) groups.push({ groupStartIndex, groupSize }) groupStartIndex += size groupIndex++ } return groups }