UNPKG

@dobesv/parquets

Version:

TypeScript implementation of the Parquet file format, based on parquet.js

130 lines (129 loc) 5.07 kB
/// <reference types="node" />
import { Transform, TransformCallback, Writable } from 'stream';
import { ParquetSchema } from './schema';
import { RowGroup } from './thrift';
import { ParquetWriteBuffer } from './shred';

/**
 * Options accepted by the writer classes declared in this file.
 *
 * Every field is optional. The declarations here do not show default values;
 * see the implementation for those.
 */
export interface ParquetWriterOptions {
    /** NOTE(review): presumably the initial byte offset used by the envelope writer — confirm. */
    baseOffset?: number;
    /** Row group size; see {@link ParquetWriter.setRowGroupSize}. */
    rowGroupSize?: number;
    /** Data page size; see {@link ParquetWriter.setPageSize}. */
    pageSize?: number;
    /** NOTE(review): presumably selects the V2 data page layout — confirm. */
    useDataPageV2?: boolean;
    // NOTE(review): the remaining fields look like Node.js file write-stream
    // options, presumably forwarded when opening a file — confirm against the
    // implementation.
    flags?: string;
    encoding?: string;
    fd?: number;
    mode?: number;
    autoClose?: boolean;
    start?: number;
}

/**
 * Write a parquet file to an output stream. The ParquetWriter will perform
 * buffering/batching for performance, so close() must be called after all rows
 * are written.
 */
export declare class ParquetWriter<T> {
    /**
     * Convenience method to create a new buffered parquet writer that writes to
     * the specified file
     */
    static openFile<T>(schema: ParquetSchema, path: string, opts?: ParquetWriterOptions): Promise<ParquetWriter<T>>;

    /**
     * Convenience method to create a new buffered parquet writer that writes to
     * the specified stream
     */
    static openStream<T>(schema: ParquetSchema, outputStream: Writable, opts?: ParquetWriterOptions): Promise<ParquetWriter<T>>;

    /** Schema passed to this writer. */
    schema: ParquetSchema;
    /** Envelope writer this buffered writer was created for. */
    envelopeWriter: ParquetEnvelopeWriter;
    /** Buffer holding rows that have been appended in memory. */
    rowBuffer: ParquetWriteBuffer;
    /** Row group size; see {@link ParquetWriter.setRowGroupSize}. */
    rowGroupSize: number;
    /** NOTE(review): presumably set once close() has been called — confirm. */
    closed: boolean;
    /** NOTE(review): presumably tracks the work done by ensureHeaderWritten() — confirm. */
    headerWritten: boolean;
    /** Key/value metadata; see {@link ParquetWriter.setMetadata}. */
    userMetadata: Record<string, string>;

    /**
     * Create a new buffered parquet writer for a given envelope writer
     */
    constructor(schema: ParquetSchema, envelopeWriter: ParquetEnvelopeWriter, opts: ParquetWriterOptions);

    /**
     * Write the header if it was not already written
     */
    ensureHeaderWritten(): Promise<void>;

    /**
     * Append a single row to the parquet file. Rows are buffered in memory until
     * rowGroupSize rows are in the buffer or close() is called
     *
     * NOTE(review): the method-level `<T>` shadows the class type parameter, so
     * `row` is not checked against the writer's own row type — confirm whether
     * this is intentional.
     */
    appendRow<T>(row: T): Promise<void>;

    /**
     * Finish writing the parquet file and commit the footer to disk. This method
     * MUST be called after you are finished adding rows. You must not call this
     * method twice on the same object or add any rows after the close() method has
     * been called
     */
    close(callback?: () => void): Promise<void>;

    /**
     * Add key/value metadata to the file
     */
    setMetadata(key: string, value: string): void;

    /**
     * Set the parquet row group size. This value controls the maximum number
     * of rows that are buffered in memory at any given time as well as the number
     * of rows that are co-located on disk. A higher value is generally better for
     * read-time I/O performance at the tradeoff of write-time memory usage.
     */
    setRowGroupSize(cnt: number): void;

    /**
     * Set the parquet data page size. The data page size controls the maximum
     * number of column values that are written to disk as a consecutive array
     */
    setPageSize(cnt: number): void;
}

/**
 * Create a parquet file from a schema and a number of row groups. This class
 * performs direct, unbuffered writes to the underlying output stream and is
 * intended for advanced and internal users; the writeXXX methods must be
 * called in the correct order to produce a valid file.
 */
export declare class ParquetEnvelopeWriter {
    /**
     * Create a new parquet envelope writer that writes to the specified stream
     */
    static openStream(schema: ParquetSchema, outputStream: Writable, opts: ParquetWriterOptions): Promise<ParquetEnvelopeWriter>;

    /** Schema passed to this writer. */
    schema: ParquetSchema;
    /** Function that takes a buffer to write; resolves with no value. */
    write: (buf: Buffer) => Promise<void>;
    /** Function taking no arguments; resolves with no value. */
    close: () => Promise<void>;
    /** NOTE(review): presumably the current byte offset in the output — confirm. */
    offset: number;
    /** NOTE(review): presumably the number of rows written so far — confirm. */
    rowCount: number;
    /** Row group records (thrift `RowGroup` type). */
    rowGroups: RowGroup[];
    /** Data page size; see {@link ParquetEnvelopeWriter.setPageSize}. */
    pageSize: number;
    /** NOTE(review): presumably mirrors `ParquetWriterOptions.useDataPageV2` — confirm. */
    useDataPageV2: boolean;

    /**
     * Create an envelope writer from a write function, a close function, a
     * starting file offset and writer options.
     */
    constructor(schema: ParquetSchema, writeFn: (buf: Buffer) => Promise<void>, closeFn: () => Promise<void>, fileOffset: number, opts: ParquetWriterOptions);

    /**
     * Write a buffer to the output.
     *
     * NOTE(review): presumably also advances `offset` — confirm.
     */
    writeSection(buf: Buffer): Promise<void>;

    /**
     * Encode the parquet file header
     */
    writeHeader(): Promise<void>;

    /**
     * Encode a parquet row group. The records object should be created using the
     * shredRecord method
     */
    writeRowGroup(records: ParquetWriteBuffer): Promise<void>;

    /**
     * Write the parquet file footer
     */
    writeFooter(userMetadata: Record<string, string>): Promise<void>;

    /**
     * Set the parquet data page size. The data page size controls the maximum
     * number of column values that are written to disk as a consecutive array
     */
    setPageSize(cnt: number): void;
}

/**
 * Create a parquet transform stream
 */
export declare class ParquetTransformer<T> extends Transform {
    /** Parquet writer held by this transform stream. */
    writer: ParquetWriter<T>;
    /**
     * List of callback pairs.
     *
     * NOTE(review): these look like promise resolve/reject pairs — confirm.
     */
    waiting: [() => void, (reason?: any) => void][];

    /**
     * Create a transform stream for the given schema and optional writer options.
     */
    constructor(schema: ParquetSchema, opts?: ParquetWriterOptions);

    // Stream hooks; these are normally invoked by the Node.js stream machinery
    // rather than called directly.
    _destroy(error: Error | null, callback: (error: Error | null) => void): void;
    _read(arg?: any): void;
    _transform(row: any, encoding: string, callback: TransformCallback): void;
    _flush(callback: (val?: any) => void): void;
}