UNPKG

parquets

Version:

TypeScript implementation of the Parquet file format, based on parquet.js

113 lines (112 loc) 4.73 kB
/// <reference types="node" />
import { ParquetBuffer, ParquetData, ParquetRecord } from './declare';
import { ParquetSchema } from './schema';
import { ColumnChunk, FileMetaData, RowGroup } from './thrift';
/**
 * A parquet cursor is used to retrieve rows from a parquet file in order.
 *
 * The class is declared as an AsyncIterable, so a cursor can be consumed with
 * `for await (const row of cursor)`.
 */
export declare class ParquetCursor<T> implements AsyncIterable<T> {
    // NOTE(review): the first four fields share names and types with the
    // constructor parameters; presumably they hold those arguments — confirm
    // against the implementation.
    metadata: FileMetaData;
    envelopeReader: ParquetEnvelopeReader;
    schema: ParquetSchema;
    columnList: string[][];
    // NOTE(review): presumably the records of the row group currently being
    // iterated and that group's position within the file — confirm.
    rowGroup: ParquetRecord[];
    rowGroupIndex: number;
    /**
     * Create a new parquet cursor from the file metadata and an envelope reader.
     * It is usually not recommended to call this constructor directly except for
     * advanced and internal use cases. Consider using getCursor() on the
     * ParquetReader instead.
     *
     * @param metadata file metadata of the parquet file
     * @param envelopeReader envelope reader giving access to the file
     * @param schema schema of the parquet file
     * @param columnList columns to read, each given as an array of strings
     */
    constructor(metadata: FileMetaData, envelopeReader: ParquetEnvelopeReader, schema: ParquetSchema, columnList: string[][]);
    /**
     * Retrieve the next row from the cursor. Returns a row or null if the end
     * of the file was reached.
     *
     * NOTE(review): the method-level type parameter `T` shadows the class-level
     * `T` and defaults to `any`, so `next()` without a type argument resolves to
     * `any`, not to the cursor's row type. The declared return type also does
     * not include `null` even though null is documented as the end-of-file
     * result — confirm against the implementation before tightening.
     */
    next<T = any>(): Promise<T>;
    /**
     * Rewind the cursor to the beginning of the file.
     */
    rewind(): void;
    /**
     * Implement AsyncIterable.
     */
    [Symbol.asyncIterator](): AsyncIterator<T>;
}
/**
 * A parquet reader allows retrieving the rows from a parquet file in order.
 * The basic usage is to create a reader and then retrieve a cursor/iterator
 * which allows you to consume row after row until all rows have been read. It is
 * important that you call close() after you are finished reading the file to
 * avoid leaking file descriptors.
 */
export declare class ParquetReader<T> implements AsyncIterable<T> {
    /**
     * Open the parquet file pointed to by the specified path and return a new
     * parquet reader.
     *
     * @param filePath path of the parquet file to open
     */
    static openFile<T>(filePath: string): Promise<ParquetReader<T>>;
    /**
     * Buffer counterpart of openFile(): resolves to a new parquet reader for the
     * given buffer.
     *
     * @param buffer buffer to read from (presumably the complete contents of a
     *        parquet file — confirm)
     */
    static openBuffer<T>(buffer: Buffer): Promise<ParquetReader<T>>;
    // NOTE(review): `metadata` and `envelopeReader` share names and types with
    // the constructor parameters; `schema` has no constructor counterpart and is
    // presumably derived from the metadata — confirm.
    metadata: FileMetaData;
    envelopeReader: ParquetEnvelopeReader;
    schema: ParquetSchema;
    /**
     * Create a new parquet reader from the file metadata and an envelope reader.
     * It is not recommended to call this constructor directly except for advanced
     * and internal use cases. Consider using one of the open{File,Buffer} methods
     * instead.
     *
     * @param metadata file metadata of the parquet file
     * @param envelopeReader envelope reader giving access to the file
     */
    constructor(metadata: FileMetaData, envelopeReader: ParquetEnvelopeReader);
    /**
     * Return a cursor to the file. You may open more than one cursor and use
     * them concurrently. All cursors become invalid once close() is called on
     * the reader object.
     *
     * The columnList parameter controls which columns are actually read
     * from disk. An empty array or no value implies all columns. A list of column
     * names means that only those columns should be loaded from disk.
     *
     * Overloads, in declaration order:
     *  - no argument: the cursor is typed with the full row type `T`;
     *  - keys of `T` (or arrays of keys): the cursor is typed `Pick<T, K>`;
     *  - arbitrary strings (or arrays of strings): the cursor is typed
     *    `Partial<T>`.
     */
    getCursor(): ParquetCursor<T>;
    getCursor<K extends keyof T>(columnList: (K | K[])[]): ParquetCursor<Pick<T, K>>;
    getCursor(columnList: (string | string[])[]): ParquetCursor<Partial<T>>;
    /**
     * Return the number of rows in this file. Note that the number of rows is
     * not necessarily equal to the number of rows in each column.
     */
    getRowCount(): number;
    /**
     * Returns the ParquetSchema for this file.
     */
    getSchema(): ParquetSchema;
    /**
     * Returns the user (key/value) metadata for this file.
     */
    getMetadata(): Record<string, string>;
    /**
     * Close this parquet reader. You MUST call this method once you're finished
     * reading rows.
     */
    close(): Promise<void>;
    /**
     * Implement AsyncIterable.
     */
    [Symbol.asyncIterator](): AsyncIterator<T>;
}
/**
 * The parquet envelope reader allows direct, unbuffered access to the individual
 * sections of the parquet file, namely the header, footer and the row groups.
 * This class is intended for advanced/internal users; if you just want to retrieve
 * rows from a parquet file use the ParquetReader instead.
 */
export declare class ParquetEnvelopeReader {
    // NOTE(review): the three fields below share names and types with the
    // constructor parameters; presumably they hold those arguments — confirm.
    // `read` takes a position and a length and resolves to a Buffer; the units
    // (presumably bytes) are not visible here.
    read: (position: number, length: number) => Promise<Buffer>;
    close: () => Promise<void>;
    fileSize: number;
    /**
     * Resolve to an envelope reader for the file at the given path.
     *
     * @param filePath path of the parquet file
     */
    static openFile(filePath: string): Promise<ParquetEnvelopeReader>;
    /**
     * Resolve to an envelope reader for the given buffer.
     *
     * @param buffer buffer to read from
     */
    static openBuffer(buffer: Buffer): Promise<ParquetEnvelopeReader>;
    /**
     * Create an envelope reader from low-level access callbacks.
     *
     * @param read callback taking a position and a length, resolving to a Buffer
     * @param close callback taking no arguments, resolving when done
     * @param fileSize size of the underlying file (presumably in bytes — confirm)
     */
    constructor(read: (position: number, length: number) => Promise<Buffer>, close: () => Promise<void>, fileSize: number);
    /**
     * Read the header section of the file. Resolves with no value.
     */
    readHeader(): Promise<void>;
    /**
     * Read one row group of the file.
     *
     * @param schema schema of the file
     * @param rowGroup row group descriptor to read
     * @param columnList columns to read, each given as an array of strings
     * @returns a promise resolving to a ParquetBuffer
     */
    readRowGroup(schema: ParquetSchema, rowGroup: RowGroup, columnList: string[][]): Promise<ParquetBuffer>;
    /**
     * Read a single column chunk.
     *
     * @param schema schema of the file
     * @param colChunk column chunk descriptor to read
     * @returns a promise resolving to ParquetData
     */
    readColumnChunk(schema: ParquetSchema, colChunk: ColumnChunk): Promise<ParquetData>;
    /**
     * Read the footer section of the file.
     *
     * @returns a promise resolving to the file's FileMetaData
     */
    readFooter(): Promise<FileMetaData>;
}