UNPKG

@dobesv/parquets

Version:

TypeScript implementation of the Parquet file format, based on parquet.js

360 lines (323 loc) 10.4 kB
import { Transform, TransformCallback, Writable } from 'stream'; import { ParquetSchema } from './schema'; import { RowGroup } from './thrift'; import { PARQUET_DEFAULT_PAGE_SIZE, PARQUET_DEFAULT_ROW_GROUP_SIZE, PARQUET_MAGIC, encodeFooter, encodeRowGroup, ParquetWriterOptions, } from './encoding'; import * as Util from './util'; import { ParquetWriteBuffer, shredRecord, } from './shred'; export { ParquetWriterOptions }; /** * Write a parquet file to an output stream. The ParquetWriter will perform * buffering/batching for performance, so close() must be called after all rows * are written. */ export class ParquetWriter<T> { /** * Convenience method to create a new buffered parquet writer that writes to * the specified file */ static async openFile<T>( schema: ParquetSchema, path: string, opts?: ParquetWriterOptions ): Promise<ParquetWriter<T>> { const outputStream = await Util.osopen(path, opts); return ParquetWriter.openStream(schema, outputStream, opts); } /** * Convenience method to create a new buffered parquet writer that writes to * the specified stream */ static async openStream<T>( schema: ParquetSchema, outputStream: Writable, opts?: ParquetWriterOptions ): Promise<ParquetWriter<T>> { if (!opts) { // tslint:disable-next-line:no-parameter-reassignment opts = {}; } const envelopeWriter = await ParquetEnvelopeWriter.openStream( schema, outputStream, opts ); return new ParquetWriter(schema, envelopeWriter, opts); } public schema: ParquetSchema; public envelopeWriter: ParquetEnvelopeWriter; public rowBuffer: ParquetWriteBuffer; public rowGroupSize: number; public closed: boolean; public headerWritten: boolean; public userMetadata: Record<string, string>; /** * Create a new buffered parquet writer for a given envelope writer */ constructor( schema: ParquetSchema, envelopeWriter: ParquetEnvelopeWriter, opts: ParquetWriterOptions ) { this.schema = schema; this.envelopeWriter = envelopeWriter; this.rowBuffer = new ParquetWriteBuffer(schema); this.rowGroupSize = opts.rowGroupSize || PARQUET_DEFAULT_ROW_GROUP_SIZE; this.closed = false; this.headerWritten = false; this.userMetadata = {}; } /** * Write the header if it was not already written */ async ensureHeaderWritten(): Promise<void> { if(!this.headerWritten) { try { // Set the flag before making the call so that a concurrent call while the header // is being written will not write the header a second time this.headerWritten = true; // Go ahead and write the header await this.envelopeWriter.writeHeader(); } catch (err) { this.envelopeWriter.close(); throw err; } } } /** * Append a single row to the parquet file. Rows are buffered in memory until * rowGroupSize rows are in the buffer or close() is called */ async appendRow<T>(row: T): Promise<void> { if (this.closed) { throw new Error('writer was closed'); } shredRecord(this.schema, row, this.rowBuffer); if (this.rowBuffer.rowCount >= this.rowGroupSize) { await this.ensureHeaderWritten(); await this.envelopeWriter.writeRowGroup(this.rowBuffer); this.rowBuffer = new ParquetWriteBuffer(this.schema); } } /** * Finish writing the parquet file and commit the footer to disk. This method * MUST be called after you are finished adding rows. You must not call this * method twice on the same object or add any rows after the close() method has * been called */ async close(callback?: () => void): Promise<void> { if (this.closed) { throw new Error('writer was closed'); } this.closed = true; // Make sure we have written the header even if the file is empty await this.ensureHeaderWritten(); if ( this.rowBuffer.rowCount > 0 || this.rowBuffer.rowCount >= this.rowGroupSize ) { await this.envelopeWriter.writeRowGroup(this.rowBuffer); this.rowBuffer = new ParquetWriteBuffer(this.schema); } await this.envelopeWriter.writeFooter(this.userMetadata); await this.envelopeWriter.close(); this.envelopeWriter = null; if (callback) { callback(); } } /** * Add key<>value metadata to the file */ setMetadata(key: string, value: string): void { // TODO: value to be any, obj -> JSON this.userMetadata[String(key)] = String(value); } /** * Set the parquet row group size. This values controls the maximum number * of rows that are buffered in memory at any given time as well as the number * of rows that are co-located on disk. A higher value is generally better for * read-time I/O performance at the tradeoff of write-time memory usage. */ setRowGroupSize(cnt: number): void { this.rowGroupSize = cnt; } /** * Set the parquet data page size. The data page size controls the maximum * number of column values that are written to disk as a consecutive array */ setPageSize(cnt: number): void { this.envelopeWriter.setPageSize(cnt); } } /** * Create a parquet file from a schema and a number of row groups. This class * performs direct, unbuffered writes to the underlying output stream and is * intendend for advanced and internal users; the writeXXX methods must be * called in the correct order to produce a valid file. */ export class ParquetEnvelopeWriter { /** * Create a new parquet envelope writer that writes to the specified stream */ static async openStream( schema: ParquetSchema, outputStream: Writable, opts: ParquetWriterOptions ): Promise<ParquetEnvelopeWriter> { const writeFn = Util.oswrite.bind(undefined, outputStream); const closeFn = Util.osclose.bind(undefined, outputStream); return new ParquetEnvelopeWriter(schema, writeFn, closeFn, 0, opts); } public schema: ParquetSchema; public write: (buf: Buffer) => Promise<void>; public close: () => Promise<void>; public offset: number; public rowCount: number; public rowGroups: RowGroup[]; public pageSize: number; public useDataPageV2: boolean; constructor( schema: ParquetSchema, writeFn: (buf: Buffer) => Promise<void>, closeFn: () => Promise<void>, fileOffset: number, opts: ParquetWriterOptions ) { this.schema = schema; this.write = writeFn; this.close = closeFn; this.offset = fileOffset; this.rowCount = 0; this.rowGroups = []; this.pageSize = opts.pageSize || PARQUET_DEFAULT_PAGE_SIZE; this.useDataPageV2 = 'useDataPageV2' in opts ? opts.useDataPageV2 : false; } writeSection(buf: Buffer): Promise<void> { this.offset += buf.length; return this.write(buf); } /** * Encode the parquet file header */ writeHeader(): Promise<void> { return this.writeSection(Buffer.from(PARQUET_MAGIC)); } /** * Encode a parquet row group. The records object should be created using the * shredRecord method */ writeRowGroup(records: ParquetWriteBuffer): Promise<void> { const rowGroup = encodeRowGroup(this.schema, records, { baseOffset: this.offset, pageSize: this.pageSize, useDataPageV2: this.useDataPageV2, }); this.rowCount += records.rowCount; this.rowGroups.push(rowGroup.metadata); return this.writeSection(rowGroup.body); } /** * Write the parquet file footer */ writeFooter(userMetadata: Record<string, string>): Promise<void> { if (!userMetadata) { // tslint:disable-next-line:no-parameter-reassignment userMetadata = {}; } return this.writeSection( encodeFooter(this.schema, this.rowCount, this.rowGroups, userMetadata) ); } /** * Set the parquet data page size. The data page size controls the maximum * number of column values that are written to disk as a consecutive array */ setPageSize(cnt: number): void { this.pageSize = cnt; } } /** * Create a parquet transform stream */ export class ParquetTransformer<T> extends Transform { public writer: ParquetWriter<T>; waiting: [() => void, (reason?: any) => void][] = []; constructor(schema: ParquetSchema, opts: ParquetWriterOptions = {}) { super({ objectMode: true }); const writeFn = (function (t: ParquetTransformer<any>) { return function (b: any): Promise<void> { if (!t.push(b)) { // stop writing until the readable is ready again return new Promise((resolve, reject) => { t.waiting.push([resolve, reject]); }); } return Promise.resolve(); }; })(this); const closeFn = (function (t: ParquetTransformer<any>) { return function (): Promise<void> { t.push(null); return Promise.resolve(); }; })(this); this.writer = new ParquetWriter( schema, new ParquetEnvelopeWriter(schema, writeFn, closeFn, 0, opts), opts ); } // If I/O was delayed due to backpressure and then the stream is destroyed, // propagate an error back to the callee of the I/O operation(s) // tslint:disable-next-line:function-name _destroy(error: Error | null, callback: (error: Error | null) => void): void { try { if (this.waiting.length) { const waiting = this.waiting; this.waiting = []; waiting.forEach(([resolve, reject]) => error ? reject(error) : resolve() ); } callback(null); } catch (err) { callback(err); } } // If we get backpressure we will delay returning from a call to write until // the next call to _read // tslint:disable-next-line:function-name _read(arg?: any) { if (this.waiting.length) { const waiting = this.waiting; this.waiting = []; waiting.forEach(([resolve]) => resolve()); } return super._read(arg); } // tslint:disable-next-line:function-name _transform(row: any, encoding: string, callback: TransformCallback) { if (row) { this.writer.appendRow(row).then( () => callback(), err => callback(err) ); } else { callback(); } } // tslint:disable-next-line:function-name _flush(callback: (val?: any) => void) { this.writer.close(callback); } }