parquets
Version:
TypeScript implementation of the Parquet file format, based on parquet.js
113 lines (112 loc) • 4.73 kB
TypeScript
/// <reference types="node" />
import { ParquetBuffer, ParquetData, ParquetRecord } from './declare';
import { ParquetSchema } from './schema';
import { ColumnChunk, FileMetaData, RowGroup } from './thrift';
/**
* A parquet cursor is used to retrieve rows from a parquet file in order.
* The class is declared as AsyncIterable<T>, so besides calling next()
* directly it can be consumed with `for await (const row of cursor)`.
*/
export declare class ParquetCursor<T> implements AsyncIterable<T> {
// The first four fields have the same names and types as the constructor
// arguments; presumably they are stored as-is (confirm against the
// implementation, which is not part of this declaration file).
metadata: FileMetaData;
envelopeReader: ParquetEnvelopeReader;
schema: ParquetSchema;
columnList: string[][];
// NOTE(review): rowGroup / rowGroupIndex look like internal iteration state
// (records of the row group being consumed and a position among the file's
// row groups) - TODO confirm before relying on them from outside.
rowGroup: ParquetRecord[];
rowGroupIndex: number;
/**
* Create a new parquet cursor from the file metadata and an envelope reader.
* It is usually not recommended to call this constructor directly except for
* advanced and internal use cases. Consider using getCursor() on the
* ParquetReader instead
*
* @param metadata file metadata of the parquet file
* @param envelopeReader envelope reader giving access to the file contents
* @param schema schema of the file
* @param columnList columns to read; each entry is an array of strings
*/
constructor(metadata: FileMetaData, envelopeReader: ParquetEnvelopeReader, schema: ParquetSchema, columnList: string[][]);
/**
* Retrieve the next row from the cursor. Returns a row or NULL if the end
* of the file was reached
*
* NOTE(review): this method declares its own type parameter `T` (defaulting
* to `any`), which shadows the class-level `T`. The resolved type is therefore
* NOT tied to the row type of the cursor unless the caller passes it
* explicitly, e.g. `cursor.next<Row>()`. The declared return type also does
* not include `null`, even though the end of the file is signalled that way,
* so callers should check the result for null themselves.
*/
next<T = any>(): Promise<T>;
/**
* Rewind the cursor to the beginning of the file
*/
rewind(): void;
/**
* Implement AsyncIterable: returns an async iterator over rows of type T
*/
[Symbol.asyncIterator](): AsyncIterator<T>;
}
/**
* A parquet reader allows retrieving the rows from a parquet file in order.
* The basic usage is to create a reader and then retrieve a cursor/iterator
* which allows you to consume row after row until all rows have been read. It is
* important that you call close() after you are finished reading the file to
* avoid leaking file descriptors.
*
* The type parameter T is the row type exposed by cursors and by the async
* iterator of this reader.
*/
export declare class ParquetReader<T> implements AsyncIterable<T> {
/**
* Open the parquet file pointed to by the specified path and return a new
* parquet reader
*/
static openFile<T>(filePath: string): Promise<ParquetReader<T>>;
/**
* Return a new parquet reader that reads from the given in-memory buffer
* instead of a file path (presumably the buffer holds the complete contents
* of a parquet file - confirm against the implementation)
*/
static openBuffer<T>(buffer: Buffer): Promise<ParquetReader<T>>;
// These fields share names and types with the constructor arguments
// (metadata, envelopeReader); how `schema` is populated is not visible in
// this declaration file.
metadata: FileMetaData;
envelopeReader: ParquetEnvelopeReader;
schema: ParquetSchema;
/**
* Create a new parquet reader from the file metadata and an envelope reader.
* It is not recommended to call this constructor directly except for advanced
* and internal use cases. Consider using one of the open{File,Buffer} methods
* instead
*/
constructor(metadata: FileMetaData, envelopeReader: ParquetEnvelopeReader);
/**
* Return a cursor to the file. You may open more than one cursor and use
* them concurrently. All cursors become invalid once close() is called on
* the reader object.
*
* The columnList parameter controls which columns are actually read
* from disk. An empty array or no value implies all columns. A list of column
* names means that only those columns should be loaded from disk.
*
* Overloads:
* - no argument: the cursor yields full rows of type T
* - entries that are keys of T (or arrays of such keys): the cursor's row
*   type is narrowed to Pick<T, K>
* - arbitrary strings (or arrays of strings): the cursor's row type is
*   Partial<T>, since the selected columns cannot be checked statically
*/
getCursor(): ParquetCursor<T>;
getCursor<K extends keyof T>(columnList: (K | K[])[]): ParquetCursor<Pick<T, K>>;
getCursor(columnList: (string | string[])[]): ParquetCursor<Partial<T>>;
/**
* Return the number of rows in this file. Note that the number of rows is
* not necessarily equal to the number of rows in each column.
*/
getRowCount(): number;
/**
* Returns the ParquetSchema for this file
*/
getSchema(): ParquetSchema;
/**
* Returns the user (key/value) metadata for this file
*/
getMetadata(): Record<string, string>;
/**
* Close this parquet reader. You MUST call this method once you're finished
* reading rows
*/
close(): Promise<void>;
/**
* Implement AsyncIterable: returns an async iterator over rows of type T
*/
[Symbol.asyncIterator](): AsyncIterator<T>;
}
/**
* The parquet envelope reader allows direct, unbuffered access to the individual
* sections of the parquet file, namely the header, footer and the row groups.
* This class is intended for advanced/internal users; if you just want to retrieve
* rows from a parquet file use the ParquetReader instead
*/
export declare class ParquetEnvelopeReader {
/**
* Callback that resolves to a Buffer for the given position and length
* (presumably a byte offset and byte count within the file - confirm)
*/
read: (position: number, length: number) => Promise<Buffer>;
/**
* Callback that closes the underlying source; resolves with no value
*/
close: () => Promise<void>;
/**
* Size of the file (presumably in bytes - confirm against the implementation)
*/
fileSize: number;
/**
* Create an envelope reader for the file at the given path
*/
static openFile(filePath: string): Promise<ParquetEnvelopeReader>;
/**
* Create an envelope reader on top of the given in-memory buffer
*/
static openBuffer(buffer: Buffer): Promise<ParquetEnvelopeReader>;
/**
* Create an envelope reader from custom read/close callbacks and a file
* size. The parameters have the same names and types as the `read`, `close`
* and `fileSize` members of this class.
*/
constructor(read: (position: number, length: number) => Promise<Buffer>, close: () => Promise<void>, fileSize: number);
/**
* Read the header section of the file. Resolves with no value
* (presumably rejects if the header is not valid - TODO confirm)
*/
readHeader(): Promise<void>;
/**
* Read one row group of the file.
*
* @param schema schema of the file
* @param rowGroup row group descriptor (thrift RowGroup)
* @param columnList columns to read; each entry is an array of strings
* @returns a promise for the row group contents as a ParquetBuffer
*/
readRowGroup(schema: ParquetSchema, rowGroup: RowGroup, columnList: string[][]): Promise<ParquetBuffer>;
/**
* Read a single column chunk.
*
* @param schema schema of the file
* @param colChunk column chunk descriptor (thrift ColumnChunk)
* @returns a promise for the column chunk contents as ParquetData
*/
readColumnChunk(schema: ParquetSchema, colChunk: ColumnChunk): Promise<ParquetData>;
/**
* Read the footer section of the file and resolve to its FileMetaData
*/
readFooter(): Promise<FileMetaData>;
}