parquets

TypeScript implementation of the Parquet file format, based on parquet.js

/// <reference types="node" />
import { Transform, Writable } from 'stream';
import { ParquetBuffer } from './declare';
import { ParquetSchema } from './schema';
import { RowGroup } from './thrift';

export interface ParquetWriterOptions {
  baseOffset?: number;
  rowGroupSize?: number;
  pageSize?: number;
  useDataPageV2?: boolean;
  flags?: string;
  encoding?: string;
  fd?: number;
  mode?: number;
  autoClose?: boolean;
  start?: number;
}

/**
 * Write a parquet file to an output stream. The ParquetWriter will perform
 * buffering/batching for performance, so close() must be called after all rows
 * are written.
 */
export declare class ParquetWriter<T> {
  /**
   * Convenience method to create a new buffered parquet writer that writes to
   * the specified file
   */
  static openFile<T>(schema: ParquetSchema, path: string, opts?: ParquetWriterOptions): Promise<ParquetWriter<T>>;
  /**
   * Convenience method to create a new buffered parquet writer that writes to
   * the specified stream
   */
  static openStream<T>(schema: ParquetSchema, outputStream: Writable, opts?: ParquetWriterOptions): Promise<ParquetWriter<T>>;
  schema: ParquetSchema;
  envelopeWriter: ParquetEnvelopeWriter;
  rowBuffer: ParquetBuffer;
  rowGroupSize: number;
  closed: boolean;
  userMetadata: Record<string, string>;
  /**
   * Create a new buffered parquet writer for a given envelope writer
   */
  constructor(schema: ParquetSchema, envelopeWriter: ParquetEnvelopeWriter, opts: ParquetWriterOptions);
  /**
   * Append a single row to the parquet file. Rows are buffered in memory until
   * rowGroupSize rows are in the buffer or close() is called
   */
  appendRow<T>(row: T): Promise<void>;
  /**
   * Finish writing the parquet file and commit the footer to disk. This method
   * MUST be called after you are finished adding rows. You must not call this
   * method twice on the same object or add any rows after the close() method
   * has been called
   */
  close(callback?: () => void): Promise<void>;
  /**
   * Add key<>value metadata to the file
   */
  setMetadata(key: string, value: string): void;
  /**
   * Set the parquet row group size. This value controls the maximum number
   * of rows that are buffered in memory at any given time as well as the number
   * of rows that are co-located on disk. A higher value is generally better for
   * read-time I/O performance at the tradeoff of write-time memory usage.
   */
  setRowGroupSize(cnt: number): void;
  /**
   * Set the parquet data page size. The data page size controls the maximum
   * number of column values that are written to disk as a consecutive array
   */
  setPageSize(cnt: number): void;
}

/**
 * Create a parquet file from a schema and a number of row groups. This class
 * performs direct, unbuffered writes to the underlying output stream and is
 * intended for advanced and internal users; the writeXXX methods must be
 * called in the correct order to produce a valid file.
 */
export declare class ParquetEnvelopeWriter {
  /**
   * Create a new parquet envelope writer that writes to the specified stream
   */
  static openStream(schema: ParquetSchema, outputStream: Writable, opts: ParquetWriterOptions): Promise<ParquetEnvelopeWriter>;
  schema: ParquetSchema;
  write: (buf: Buffer) => void;
  close: () => void;
  offset: number;
  rowCount: number;
  rowGroups: RowGroup[];
  pageSize: number;
  useDataPageV2: boolean;
  constructor(schema: ParquetSchema, writeFn: (buf: Buffer) => void, closeFn: () => void, fileOffset: number, opts: ParquetWriterOptions);
  writeSection(buf: Buffer): void;
  /**
   * Encode the parquet file header
   */
  writeHeader(): void;
  /**
   * Encode a parquet row group. The records object should be created using the
   * shredRecord method
   */
  writeRowGroup(records: ParquetBuffer): void;
  /**
   * Write the parquet file footer
   */
  writeFooter(userMetadata: Record<string, string>): void;
  /**
   * Set the parquet data page size. The data page size controls the maximum
   * number of column values that are written to disk as a consecutive array
   */
  setPageSize(cnt: number): void;
}

/**
 * Create a parquet transform stream
 */
export declare class ParquetTransformer<T> extends Transform {
  writer: ParquetWriter<T>;
  constructor(schema: ParquetSchema, opts?: ParquetWriterOptions);
  _transform(row: any, encoding: string, callback: (val?: any) => void): void;
  _flush(callback: (val?: any) => void): void;
}
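For orientation, here is a minimal usage sketch written against the declarations above. It assumes the package index re-exports ParquetWriter and ParquetSchema, and that ParquetSchema accepts parquet.js-style field definitions; the field names, field types, and file path are illustrative and are not part of this declaration file.

// Minimal sketch, assuming parquet.js-style schema definitions re-exported
// from the package root; all field names/types below are illustrative.
import { ParquetSchema, ParquetWriter } from 'parquets';

interface Reading {
  sensor: string;
  value: number;
  timestamp: Date;
}

async function writeReadings(rows: Reading[]): Promise<void> {
  const schema = new ParquetSchema({
    sensor: { type: 'UTF8' },
    value: { type: 'DOUBLE' },
    timestamp: { type: 'TIMESTAMP_MILLIS' },
  });

  // openFile() returns a buffered writer: rows accumulate in memory until
  // rowGroupSize rows are reached, then a row group is flushed to disk.
  const writer = await ParquetWriter.openFile<Reading>(schema, 'readings.parquet');
  writer.setRowGroupSize(4096); // larger groups trade write-time memory for read-time I/O

  for (const row of rows) {
    await writer.appendRow(row);
  }

  // close() must be called exactly once to flush remaining rows and write the footer.
  await writer.close();
}

For stream pipelines, ParquetTransformer wraps the same buffered writer behind a standard stream.Transform, so object-mode rows can be piped into it instead of calling appendRow() directly.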