UNPKG

@dobesv/parquets

Version:

TypeScript implementation of the Parquet file format, based on parquet.js

284 lines (283 loc) 11.4 kB
/// <reference types="node" />
import { ParquetRecord, ParquetValueArray } from './declare';
import { ParquetSchema } from './schema';
import { ColumnChunk, FileMetaData, RowGroup, ColumnMetaData } from './thrift';

/**
 * Variation of ParquetData which always has Int32Array for dLevels and rLevels.
 */
export interface ParquetReadData {
    dLevels: Int32Array;
    rLevels: Int32Array;
    values: ParquetValueArray;
    count: number;
}

/**
 * Variation of ParquetBuffer which always has Int32Array for dLevels and rLevels.
 */
export interface ParquetReadBuffer {
    rowCount: number;
    /** Column data keyed by string. */
    columnData: Record<string, ParquetReadData>;
}

/**
 * A parquet cursor is used to retrieve rows from a parquet file in order.
 * This is the asynchronous variant: rows are delivered through promises and
 * the cursor can be consumed with `for await`.
 */
export declare class ParquetCursor<T> implements AsyncIterable<T> {
    metadata: FileMetaData;
    envelopeReader: ParquetEnvelopeReader;
    schema: ParquetSchema;
    columnList: string[][];
    rowGroup: ParquetRecord[];
    rowGroupIndex: number;
    cursorIndex: number;

    /**
     * Create a new parquet cursor from the file metadata, an envelope reader,
     * a schema and a list of column paths.
     * It is usually not recommended to call this constructor directly except for
     * advanced and internal use cases. Consider using getCursor() on the
     * ParquetReader instead.
     */
    constructor(metadata: FileMetaData, envelopeReader: ParquetEnvelopeReader, schema: ParquetSchema, columnList: string[][]);

    /**
     * Retrieve the next row from the cursor. Returns a row or NULL if the end
     * of the file was reached.
     *
     * NOTE(review): the declared return type does not include `null`, so the
     * compiler will not force callers to handle the end-of-file case described
     * above; check the result explicitly.
     *
     * The method-level type parameter is independent of the class-level `T`
     * and defaults to `any`.
     */
    next<R = any>(): Promise<R>;

    /**
     * Rewind the cursor to the beginning of the file.
     */
    rewind(): void;

    /**
     * Implement AsyncIterable
     */
    [Symbol.asyncIterator](): AsyncIterator<T>;
}

/**
 * A parquet reader allows retrieving the rows from a parquet file in order.
 * The basic usage is to create a reader and then retrieve a cursor/iterator
 * which allows you to consume row after row until all rows have been read. It is
 * important that you call close() after you are finished reading the file to
 * avoid leaking file descriptors.
 */
export declare class ParquetReader<T> implements AsyncIterable<T> {
    /**
     * Open the parquet file pointed to by the specified path and return a new
     * parquet reader.
     */
    static openFile<T>(filePath: string): Promise<ParquetReader<T>>;

    /**
     * Return a new parquet reader for parquet data held in the given buffer.
     */
    static openBuffer<T>(buffer: Buffer): Promise<ParquetReader<T>>;

    metadata: FileMetaData;
    envelopeReader: ParquetEnvelopeReader;
    schema: ParquetSchema;

    /**
     * Create a new parquet reader from the file metadata and an envelope reader.
     * It is not recommended to call this constructor directly except for advanced
     * and internal use cases. Consider using one of the open{File,Buffer} methods
     * instead.
     */
    constructor(metadata: FileMetaData, envelopeReader: ParquetEnvelopeReader);

    /**
     * Return a cursor to the file. You may open more than one cursor and use
     * them concurrently. All cursors become invalid once close() is called on
     * the reader object.
     *
     * The columnList parameter controls which columns are actually read
     * from disk. An empty array or no value implies all columns. A list of column
     * names means that only those columns should be loaded from disk.
     */
    getCursor(): ParquetCursor<T>;
    getCursor<K extends keyof T>(columnList: (K | K[])[]): ParquetCursor<Pick<T, K>>;
    getCursor(columnList: (string | string[])[]): ParquetCursor<Partial<T>>;

    /**
     * Get an iterable over a single column. The column is specified as an array of
     * strings in order to support nested records.
     *
     * The path should not reference a nested record column.
     *
     * When a column is repeated the iterable will produce an array for each row.
     *
     * When a column is optional the iterable will produce null for any row missing
     * the value.
     *
     * If a column is repeated and also nested inside another repeated object, then an array of arrays
     * is returned for each row in the dataset.
     *
     * If a column is optional and also nested inside a repeated nested object, then it will be in an array
     * where the array elements may be null.
     *
     * This means you can iterate multiple of these in parallel to walk multiple
     * columns at once and they will stay in sync as long as the calls to next()
     * are made in sync.
     *
     * @param columnPath path to the leaf column, one string per nesting level
     */
    getColumnValues(columnPath: string[]): AsyncIterable<any>;

    /**
     * Return the number of rows in this file. Note that the number of rows is
     * not necessarily equal to the number of rows in each column.
     */
    getRowCount(): number;

    /**
     * Returns the ParquetSchema for this file
     */
    getSchema(): ParquetSchema;

    /**
     * Returns the user (key/value) metadata for this file
     */
    getMetadata(): Record<string, string>;

    /**
     * Returns the column metadata for all columns.
     */
    getColumnMetadata(): Record<string, ColumnMetaData[]>;

    /**
     * Close this parquet reader. You MUST call this method once you're finished
     * reading rows
     */
    close(): Promise<void>;

    /**
     * Implement AsyncIterable
     */
    [Symbol.asyncIterator](): AsyncIterator<T>;
}

/**
 * The parquet envelope reader allows direct, unbuffered access to the individual
 * sections of the parquet file, namely the header, footer and the row groups.
 * This class is intended for advanced/internal users; if you just want to retrieve
 * rows from a parquet file use the ParquetReader instead
 */
export declare class ParquetEnvelopeReader {
    /** Reads `length` bytes starting at `position`. */
    read: (position: number, length: number) => Promise<Buffer>;
    close: () => Promise<void>;
    fileSize: number;

    static openFile(filePath: string): Promise<ParquetEnvelopeReader>;

    /**
     * Read parquet data from an in-memory buffer. This provides an asynchronous
     * interface compatible with reading from a file.
     *
     * Note that you can also use ParquetEnvelopeBufferReader if you don't need your code to be able
     * to handle files and buffers both. It may offer some performance benefit because it does not yield
     * to the event loop in between operations.
     */
    static openBuffer(buffer: Buffer): Promise<ParquetEnvelopeReader>;

    constructor(read: (position: number, length: number) => Promise<Buffer>, close: () => Promise<void>, fileSize: number);

    readHeader(): Promise<void>;
    readRowGroup(schema: ParquetSchema, rowGroup: RowGroup, columnList: string[][]): Promise<ParquetReadBuffer>;
    readColumnChunk(schema: ParquetSchema, colChunk: ColumnChunk): Promise<ParquetReadData>;
    readFooter(): Promise<FileMetaData>;
}

/**
 * A parquet buffer cursor is used to retrieve rows in order through a
 * ParquetEnvelopeBufferReader. Unlike ParquetCursor, next() returns the row
 * directly rather than a promise, and the cursor implements Iterable rather
 * than AsyncIterable.
 */
export declare class ParquetBufferCursor<T> implements Iterable<T> {
    metadata: FileMetaData;
    envelopeReader: ParquetEnvelopeBufferReader;
    schema: ParquetSchema;
    columnList: string[][];
    rows: ParquetRecord[];
    rowsIndex: number;
    cursorIndex: number;

    /**
     * Create a new parquet buffer cursor from the file metadata, an envelope
     * buffer reader, a schema and a list of column paths.
     * It is usually not recommended to call this constructor directly except for
     * advanced and internal use cases. Consider using getCursor() on the
     * ParquetBufferReader instead.
     */
    constructor(metadata: FileMetaData, envelopeReader: ParquetEnvelopeBufferReader, schema: ParquetSchema, columnList: string[][]);

    /**
     * Retrieve the next row from the cursor. Returns a row or NULL if the end
     * of the file was reached.
     *
     * NOTE(review): the declared return type does not include `null`, so the
     * compiler will not force callers to handle the end-of-file case described
     * above; check the result explicitly.
     *
     * The method-level type parameter is independent of the class-level `T`
     * and defaults to `any`.
     */
    next<R = any>(): R;

    /**
     * Rewind the cursor to the beginning of the file.
     */
    rewind(): void;

    /**
     * Implement Iterable
     */
    [Symbol.iterator](): Iterator<T>;
}

/**
 * A parquet buffer reader allows retrieving the rows from parquet data held in
 * a Buffer, in order. The basic usage is to create a reader and then retrieve a
 * cursor/iterator which allows you to consume row after row until all rows have
 * been read.
 *
 * Unlike ParquetReader, this class declares no close() method.
 */
export declare class ParquetBufferReader<T> implements Iterable<T> {
    buffer: Buffer;

    /**
     * Return a new parquet buffer reader for parquet data held in the given buffer.
     */
    static openBuffer<T>(buffer: Buffer): ParquetBufferReader<T>;

    metadata: FileMetaData;
    envelopeReader: ParquetEnvelopeBufferReader;
    schema: ParquetSchema;

    /**
     * Create a new parquet reader from a buffer. This version of ParquetReader
     * runs synchronously so it may be more efficient when reading from a Buffer.
     *
     * However, it doesn't have a compatible API with ParquetReader.
     */
    constructor(buffer: Buffer);

    /**
     * Return a cursor to the buffer. You may open more than one cursor and use
     * them concurrently.
     *
     * The columnList parameter controls which columns are actually read.
     * An empty array or no value implies all columns. A list of column
     * names means that only those columns should be loaded.
     *
     * When the schema has nested records, you will need to specify each column as an array
     * of strings specifying the "path" to the actual leaf column to fetch.
     */
    getCursor(): ParquetBufferCursor<T>;
    getCursor<K extends keyof T>(columnList: (K | K[])[]): ParquetBufferCursor<Pick<T, K>>;
    getCursor(columnList: (string | string[])[]): ParquetBufferCursor<Partial<T>>;

    /**
     * Get an iterable over a single column. The column is specified as an array of
     * strings in order to support nested records.
     *
     * The path should not reference a nested record column.
     *
     * When a column is repeated the iterable will produce an array for each row.
     *
     * When a column is optional the iterable will produce null for any row missing
     * the value.
     *
     * If a column is repeated and also nested inside another repeated object, then an array of arrays
     * is returned for each row in the dataset.
     *
     * If a column is optional and also nested inside a repeated nested object, then it will be in an array
     * where the array elements may be null.
     *
     * This means you can iterate multiple of these in parallel to walk multiple
     * columns at once and they will stay in sync as long as the calls to next()
     * are made in sync.
     *
     * @param columnPath path to the leaf column, one string per nesting level
     */
    getColumnValues(columnPath: string[]): Iterable<any>;

    /**
     * Return the number of rows in this file. Note that the number of rows is
     * not necessarily equal to the number of rows in each column.
     */
    getRowCount(): number;

    /**
     * Returns the ParquetSchema for this file
     */
    getSchema(): ParquetSchema;

    /**
     * Returns the user (key/value) metadata for this file
     */
    getMetadata(): Record<string, string>;

    /**
     * Implement Iterable
     */
    [Symbol.iterator](): Iterator<T>;
}

/**
 * Counterpart of ParquetEnvelopeReader that works on an in-memory Buffer.
 * It declares the same read/readHeader/readRowGroup/readColumnChunk/readFooter
 * methods, but each returns its result directly instead of a promise.
 */
export declare class ParquetEnvelopeBufferReader {
    buffer: Buffer;

    constructor(buffer: Buffer);

    read(offset: number, length: number): Buffer;
    readHeader(): void;
    readRowGroup(schema: ParquetSchema, rowGroup: RowGroup, columnList: string[][]): ParquetReadBuffer;
    readColumnChunk(schema: ParquetSchema, colChunk: ColumnChunk): ParquetReadData;
    readFooter(): FileMetaData;
}