@dobesv/parquets
Version:
TypeScript implementation of the Parquet file format, based on parquet.js
284 lines (283 loc) • 11.4 kB
TypeScript
/// <reference types="node" />
/// <reference types="node" />
import { ParquetRecord, ParquetValueArray } from './declare';
import { ParquetSchema } from './schema';
import { ColumnChunk, FileMetaData, RowGroup, ColumnMetaData } from './thrift';
/**
* Variation of ParquetData which always has Int32Array for dLevels and rLevels.
*
* This is the type returned by the readColumnChunk() methods of the envelope
* readers, and the value type of ParquetReadBuffer.columnData.
*/
export interface ParquetReadData {
/** NOTE(review): presumably the Parquet definition levels - confirm against the decoder. */
dLevels: Int32Array;
/** NOTE(review): presumably the Parquet repetition levels - confirm against the decoder. */
rLevels: Int32Array;
values: ParquetValueArray;
/** NOTE(review): presumably the number of entries held for this column - confirm. */
count: number;
}
/**
* Variation of ParquetBuffer which always has Int32Array for dLevels and rLevels.
*
* This is the type returned by the readRowGroup() methods of the envelope
* readers.
*/
export interface ParquetReadBuffer {
rowCount: number;
/**
* Per-column data, keyed by a string.
* NOTE(review): the key is presumably derived from the column path - confirm the exact format.
*/
columnData: Record<string, ParquetReadData>;
}
/**
* A parquet cursor is used to retrieve rows from a parquet file in order.
* It is asynchronous: next() returns a Promise and the class implements
* AsyncIterable.
*/
export declare class ParquetCursor<T> implements AsyncIterable<T> {
metadata: FileMetaData;
envelopeReader: ParquetEnvelopeReader;
schema: ParquetSchema;
columnList: string[][];
// NOTE(review): rowGroup / rowGroupIndex / cursorIndex look like the cursor's
// current read position (buffered records and indexes into them) - confirm
// against the implementation before relying on them.
rowGroup: ParquetRecord[];
rowGroupIndex: number;
cursorIndex: number;
/**
* Create a new parquet cursor from the file metadata, an envelope reader,
* a schema and a list of column paths.
* It is usually not recommended to call this constructor directly except for
* advanced and internal use cases. Consider using getCursor() on the
* ParquetReader instead
*/
constructor(metadata: FileMetaData, envelopeReader: ParquetEnvelopeReader, schema: ParquetSchema, columnList: string[][]);
/**
* Retrieve the next row from the cursor. Returns a row or NULL if the end
* of the file was reached
*
* NOTE(review): the method-level type parameter T (default any) shadows the
* class-level T, so the result is not tied to the cursor's row type. The
* declared return type also does not include null even though null is
* documented as the end-of-file result.
*/
next<T = any>(): Promise<T>;
/**
* Rewind the cursor to the beginning of the file
*/
rewind(): void;
/**
* Implement AsyncIterable
*/
[Symbol.asyncIterator](): AsyncIterator<T>;
}
/**
* A parquet reader allows retrieving the rows from a parquet file in order.
* The basic usage is to create a reader and then retrieve a cursor/iterator
* which allows you to consume row after row until all rows have been read. It is
* important that you call close() after you are finished reading the file to
* avoid leaking file descriptors.
*/
export declare class ParquetReader<T> implements AsyncIterable<T> {
/**
* Open the parquet file pointed to by the specified path and return a new
* parquet reader
*/
static openFile<T>(filePath: string): Promise<ParquetReader<T>>;
/**
* Return a new parquet reader for the given in-memory buffer.
*/
static openBuffer<T>(buffer: Buffer): Promise<ParquetReader<T>>;
metadata: FileMetaData;
envelopeReader: ParquetEnvelopeReader;
schema: ParquetSchema;
/**
* Create a new parquet reader from the file metadata and an envelope reader.
* It is not recommended to call this constructor directly except for advanced
* and internal use cases. Consider using one of the open{File,Buffer} methods
* instead
*/
constructor(metadata: FileMetaData, envelopeReader: ParquetEnvelopeReader);
/**
* Return a cursor to the file. You may open more than one cursor and use
* them concurrently. All cursors become invalid once close() is called on
* the reader object.
*
* The columnList parameter controls which columns are actually read
* from disk. An empty array or no value implies all columns. A list of column
* names means that only those columns should be loaded from disk.
*
* Each entry of columnList may be a single name or an array of names.
*/
getCursor(): ParquetCursor<T>;
getCursor<K extends keyof T>(columnList: (K | K[])[]): ParquetCursor<Pick<T, K>>;
getCursor(columnList: (string | string[])[]): ParquetCursor<Partial<T>>;
/**
* Get an iterable over a single column. The column is specified as an array of
* strings in order to support nested records.
*
* The path should not reference a nested record column.
*
* When a column is repeated the iterable will produce an array for each row.
*
* When a column is optional the iterable will produce null for any row missing
* the value.
*
* If a column is repeated and also nested inside another repeated object, then an array of arrays
* is returned for each row in the dataset.
*
* If a column is optional and also nested inside a repeated nested object, then it will be in an array
* where the array elements may be null.
*
* This means you can iterate multiple of these in parallel to walk multiple
* columns at once and they will stay in sync as long as the calls to next()
* are made in sync.
*
* @param columnPath path to the column, one string per nesting level
*/
getColumnValues(columnPath: string[]): AsyncIterable<any>;
/**
* Return the number of rows in this file. Note that the number of rows is
* not necessarily equal to the number of rows in each column.
*/
getRowCount(): number;
/**
* Returns the ParquetSchema for this file
*/
getSchema(): ParquetSchema;
/**
* Returns the user (key/value) metadata for this file
*/
getMetadata(): Record<string, string>;
/**
* Returns the column metadata for all columns.
*
* NOTE(review): each key maps to an array of ColumnMetaData, presumably one
* entry per row group - confirm against the implementation.
*/
getColumnMetadata(): Record<string, ColumnMetaData[]>;
/**
* Close this parquet reader. You MUST call this method once you're finished
* reading rows
*/
close(): Promise<void>;
/**
* Implement AsyncIterable
*/
[Symbol.asyncIterator](): AsyncIterator<T>;
}
/**
* The parquet envelope reader allows direct, unbuffered access to the individual
* sections of the parquet file, namely the header, footer and the row groups.
* This class is intended for advanced/internal users; if you just want to retrieve
* rows from a parquet file use the ParquetReader instead
*/
export declare class ParquetEnvelopeReader {
/**
* Asynchronous read callback supplied to the constructor.
* NOTE(review): presumably resolves to `length` bytes starting at byte offset `position` - confirm.
*/
read: (position: number, length: number) => Promise<Buffer>;
/** Asynchronous close callback supplied to the constructor. */
close: () => Promise<void>;
/** NOTE(review): presumably the total size of the underlying file/buffer in bytes - confirm. */
fileSize: number;
/**
* Return an envelope reader for the file at the given path.
*/
static openFile(filePath: string): Promise<ParquetEnvelopeReader>;
/**
* Read parquet data from an in-memory buffer. This provides an asynchronous
* interface compatible with reading from a file.
*
* Note that you can also use ParquetEnvelopeBufferReader if you don't need your code to be able
* to handle files and buffers both. It may offer some performance benefit because it does not yield
* to the event loop in between operations.
*/
static openBuffer(buffer: Buffer): Promise<ParquetEnvelopeReader>;
/**
* Create an envelope reader from a read callback, a close callback and the
* file size.
*/
constructor(read: (position: number, length: number) => Promise<Buffer>, close: () => Promise<void>, fileSize: number);
/**
* Read the file header.
* NOTE(review): resolves to no value; presumably rejects when the header is not valid - confirm.
*/
readHeader(): Promise<void>;
/**
* Read the columns named in columnList from one row group.
*/
readRowGroup(schema: ParquetSchema, rowGroup: RowGroup, columnList: string[][]): Promise<ParquetReadBuffer>;
/**
* Read a single column chunk.
*/
readColumnChunk(schema: ParquetSchema, colChunk: ColumnChunk): Promise<ParquetReadData>;
/**
* Read the file footer and resolve to the file metadata.
*/
readFooter(): Promise<FileMetaData>;
}
/**
* A parquet buffer cursor is used to retrieve rows in order. It is the
* synchronous counterpart of ParquetCursor: next() returns the row directly
* and the class implements Iterable rather than AsyncIterable.
*/
export declare class ParquetBufferCursor<T> implements Iterable<T> {
metadata: FileMetaData;
envelopeReader: ParquetEnvelopeBufferReader;
schema: ParquetSchema;
columnList: string[][];
// NOTE(review): rows / rowsIndex / cursorIndex look like the cursor's current
// read position (buffered records and indexes into them) - confirm against
// the implementation before relying on them.
rows: ParquetRecord[];
rowsIndex: number;
cursorIndex: number;
/**
* Create a new parquet buffer cursor from the file metadata, a buffer
* envelope reader, a schema and a list of column paths.
* It is usually not recommended to call this constructor directly except for
* advanced and internal use cases. Consider using getCursor() on the
* ParquetBufferReader instead
*/
constructor(metadata: FileMetaData, envelopeReader: ParquetEnvelopeBufferReader, schema: ParquetSchema, columnList: string[][]);
/**
* Retrieve the next row from the cursor. Returns a row or NULL if the end
* of the file was reached
*
* NOTE(review): the method-level type parameter T (default any) shadows the
* class-level T, so the result is not tied to the cursor's row type. The
* declared return type also does not include null even though null is
* documented as the end-of-file result.
*/
next<T = any>(): T;
/**
* Rewind the cursor to the beginning of the file
*/
rewind(): void;
/**
* Implement Iterable
*/
[Symbol.iterator](): Iterator<T>;
}
/**
* A parquet buffer reader allows retrieving the rows from parquet data held in
* a Buffer, in order.
* The basic usage is to create a reader and then retrieve a cursor/iterator
* which allows you to consume row after row until all rows have been read.
*
* Unlike ParquetReader, this class is synchronous and declares no close()
* method.
*/
export declare class ParquetBufferReader<T> implements Iterable<T> {
buffer: Buffer;
/**
* Return a new parquet buffer reader for the given buffer.
*/
static openBuffer<T>(buffer: Buffer): ParquetBufferReader<T>;
metadata: FileMetaData;
envelopeReader: ParquetEnvelopeBufferReader;
schema: ParquetSchema;
/**
* Create a new parquet reader from a buffer. This version of ParquetReader
* runs synchronously so it may be more efficient when reading from a Buffer.
*
* However, it doesn't have a compatible API with ParquetReader.
*/
constructor(buffer: Buffer);
/**
* Return a cursor to the buffer. You may open more than one cursor and use
* them concurrently.
*
* The columnList parameter controls which columns are actually read
* from the buffer. An empty array or no value implies all columns. A list of
* column names means that only those columns should be loaded.
*
* When the schema has nested records, you will need to specify each column as an array
* of strings specifying the "path" to the actual leaf column to fetch.
*/
getCursor(): ParquetBufferCursor<T>;
getCursor<K extends keyof T>(columnList: (K | K[])[]): ParquetBufferCursor<Pick<T, K>>;
getCursor(columnList: (string | string[])[]): ParquetBufferCursor<Partial<T>>;
/**
* Get an iterable over a single column. The column is specified as an array of
* strings in order to support nested records.
*
* The path should not reference a nested record column.
*
* When a column is repeated the iterable will produce an array for each row.
*
* When a column is optional the iterable will produce null for any row missing
* the value.
*
* If a column is repeated and also nested inside another repeated object, then an array of arrays
* is returned for each row in the dataset.
*
* If a column is optional and also nested inside a repeated nested object, then it will be in an array
* where the array elements may be null.
*
* This means you can iterate multiple of these in parallel to walk multiple
* columns at once and they will stay in sync as long as the calls to next()
* are made in sync.
*
* @param columnPath path to the column, one string per nesting level
*/
getColumnValues(columnPath: string[]): Iterable<any>;
/**
* Return the number of rows in this file. Note that the number of rows is
* not necessarily equal to the number of rows in each column.
*/
getRowCount(): number;
/**
* Returns the ParquetSchema for this file
*/
getSchema(): ParquetSchema;
/**
* Returns the user (key/value) metadata for this file
*/
getMetadata(): Record<string, string>;
/**
* Implement Iterable
*/
[Symbol.iterator](): Iterator<T>;
}
/**
* Synchronous counterpart of ParquetEnvelopeReader that works on an in-memory
* Buffer. It declares the same readHeader / readRowGroup / readColumnChunk /
* readFooter methods, but each returns its result directly instead of a
* Promise.
*/
export declare class ParquetEnvelopeBufferReader {
/** The buffer passed to the constructor. */
buffer: Buffer;
/**
* Create an envelope reader over the given buffer.
*/
constructor(buffer: Buffer);
/**
* Read from the buffer.
* NOTE(review): presumably returns `length` bytes starting at byte `offset` - confirm
* whether the result is a copy or a view of the underlying buffer.
*/
read(offset: number, length: number): Buffer;
/**
* Read the header.
* NOTE(review): returns no value; presumably throws when the header is not valid - confirm.
*/
readHeader(): void;
/**
* Read the columns named in columnList from one row group.
*/
readRowGroup(schema: ParquetSchema, rowGroup: RowGroup, columnList: string[][]): ParquetReadBuffer;
/**
* Read a single column chunk.
*/
readColumnChunk(schema: ParquetSchema, colChunk: ColumnChunk): ParquetReadData;
/**
* Read the footer and return the file metadata.
*/
readFooter(): FileMetaData;
}