// nodejs-polars: Polars, blazingly fast DataFrames in Rust, Python, Node.js, R and SQL
import { type Readable } from "node:stream";
import { type DataFrame } from "./dataframe";
import type { DataType } from "./datatypes";
import { type LazyDataFrame } from "./lazy/dataframe";
import type { RowCount, ScanParquetOptions } from "./types";
export interface ReadCsvOptions {
inferSchemaLength: number | null;
nRows: number;
hasHeader: boolean;
ignoreErrors: boolean;
endRows: number;
startRows: number;
projection: number;
sep: string;
schema: Record<string, DataType>;
columns: string[];
rechunk: boolean;
encoding: "utf8" | "utf8-lossy";
numThreads: number;
dtypes: Record<string, DataType>;
lowMemory: boolean;
commentChar: string;
quoteChar: string;
eolChar: string;
nullValues: string | Array<string> | Record<string, string>;
chunkSize: number;
skipRows: number;
tryParseDates: boolean;
skipRowsAfterHeader: number;
rowCount: RowCount;
raiseIfEmpty: boolean;
truncateRaggedLines: boolean;
missingIsNull: boolean;
}
export interface ReadJsonOptions {
batchSize: number;
inferSchemaLength: number | null;
format: "lines" | "json";
}
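/**
 * __Read an array of plain JavaScript objects (records) into a DataFrame, one object per row.__
 *
 * @example
 * A minimal usage sketch; the inline records below are illustrative, not from the original docs:
 * ```
 * > const df = pl.readRecords([{ a: 1, b: "foo" }, { a: 2, b: "bar" }]);
 * ```
 */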
export declare function readRecords(records: Record<string, any>[], options?: {
schema: Record<string, DataType>;
}): DataFrame;
export declare function readRecords(records: Record<string, any>[], options?: {
inferSchemaLength?: number;
}): DataFrame;
/**
* __Read a CSV file or string into a DataFrame.__
* ___
* @param pathOrBody - Path, string, or buffer
*  - path: Path to a file. Any valid filepath can be used, for example `file.csv`.
*  - body: String or buffer to be read as CSV.
* @param options
* @param options.inferSchemaLength - Maximum number of lines to read to infer the schema. If set to 0, all columns will be read as pl.Utf8.
*    If set to `null`, a full table scan will be done (slow).
* @param options.nRows - Stop reading after `nRows` rows. During multi-threaded parsing, an upper bound of `nRows` rows cannot be guaranteed.
* @param options.batchSize - Number of lines to read into the buffer at once. Modify this to change performance.
* @param options.hasHeader - Indicate whether the first row of the dataset is a header. If set to false, the first row will be named `column_x`,
*    `x` being an enumeration over every column in the dataset.
* @param options.ignoreErrors - Try to keep reading lines if some lines yield errors.
* @param options.endRows - Stop reading after `endRows` rows. During multi-threaded parsing, an upper bound of `endRows` rows cannot be guaranteed.
* @param options.startRows - Start reading after `startRows` rows.
* @param options.projection - Indices of columns to select. Note that column indices start at zero.
* @param options.sep - Character to use as the delimiter in the file.
* @param options.columns - Columns to select.
* @param options.rechunk - Make sure that all columns are contiguous in memory by aggregating the chunks into a single array.
* @param options.encoding - Allowed encodings: `utf8`, `utf8-lossy`. Lossy means that invalid utf8 values are replaced with the `�` character.
* @param options.numThreads - Number of threads to use during CSV parsing. Defaults to the number of physical CPUs of your system.
* @param options.dtypes - Overwrite dtypes during schema inference.
* @param options.schema - Set the CSV file's schema. This only accepts datatypes that are implemented in the CSV parser and expects a complete schema.
* @param options.lowMemory - Reduce memory usage at the expense of performance.
* @param options.commentChar - Character that indicates the start of a comment line, for instance `#`.
* @param options.quoteChar - Character used for CSV quoting, default `"`. Set to null to turn off special handling and escaping of quotes.
* @param options.nullValues - Values to interpret as null values. You can provide:
*  - `string` -> all values equal to this string will be read as null
*  - `Array<string>` -> one null value per column
*  - `Record<string, string>` -> an object or map that maps column names to a null value string, e.g. `{"column_1": "0"}`
* @param options.tryParseDates - Whether to attempt to parse dates.
* @returns DataFrame
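* @example
* A minimal usage sketch; the inline CSV body is illustrative and `./file.csv` is a hypothetical path:
* ```
* > // read from an inline CSV body
* > const df = pl.readCSV("foo,bar\n1,2\n3,4");
* > // read from a file, overriding a couple of options
* > const df2 = pl.readCSV("./file.csv", { sep: ";", nRows: 100 });
* ```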
*/
export declare function readCSV(pathOrBody: string | Buffer, options?: Partial<ReadCsvOptions>): DataFrame;
export interface ScanCsvOptions {
hasHeader: boolean;
sep: string;
commentChar: string;
quoteChar: string;
skipRows: number;
nullValues: string | Array<string> | Record<string, string>;
ignoreErrors: boolean;
cache: boolean;
inferSchemaLength: number | null;
rechunk: boolean;
nRows: number;
encoding: string;
lowMemory: boolean;
parseDates: boolean;
skipRowsAfterHeader: number;
eolChar: string;
missingUtf8IsEmptyString: boolean;
raiseIfEmpty: boolean;
truncateRaggedLines: boolean;
schema: Record<string, DataType>;
}
/**
* __Lazily read from a CSV file or multiple files via glob patterns.__
*
* This allows the query optimizer to push down predicates and
* projections to the scan level, thereby potentially reducing
* memory overhead.
* ___
* @param path - Path to a file (or a glob pattern matching multiple files).
* @param options.hasHeader - Indicate whether the first row of the dataset is a header. If set to false, the first row will be named `column_x`,
*    `x` being an enumeration over every column in the dataset.
* @param options.sep - Character to use as the delimiter in the file.
* @param options.commentChar - Character that indicates the start of a comment line, for instance `#`.
* @param options.quoteChar - Character used for CSV quoting, default `"`. Set to null to turn off special handling and escaping of quotes.
* @param options.skipRows - Start reading after `skipRows` rows.
* @param options.nullValues - Values to interpret as null values. You can provide:
*  - `string` -> all values equal to this string will be read as null
*  - `Array<string>` -> one null value per column
*  - `Record<string, string>` -> an object or map that maps column names to a null value string, e.g. `{"column_1": "0"}`
* @param options.ignoreErrors - Try to keep reading lines if some lines yield errors.
* @param options.cache - Cache the result after reading.
* @param options.inferSchemaLength - Maximum number of lines to read to infer the schema. If set to 0, all columns will be read as pl.Utf8.
*    If set to `null`, a full table scan will be done (slow).
* @param options.nRows - Stop reading after `nRows` rows. During multi-threaded parsing, an upper bound of `nRows` rows cannot be guaranteed.
* @param options.rechunk - Make sure that all columns are contiguous in memory by aggregating the chunks into a single array.
* @param options.lowMemory - Reduce memory usage at the expense of performance.
* ___
*
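* @example
* A minimal usage sketch; `./file.csv` is a hypothetical path:
* ```
* > const df = pl.scanCSV("./file.csv")      // nothing is read yet
* >   .select("a", "b")                      // projection pushed down to the scan
* >   .filter(pl.col("a").gt(1))             // predicate pushed down to the scan
* >   .collectSync();                        // the file is read here
* ```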
*/
export declare function scanCSV(path: string, options?: Partial<ScanCsvOptions>): LazyDataFrame;
/**
* __Read a JSON file or string into a DataFrame.__
*
* @param pathOrBody - Path, string, or buffer
*  - path: Path to a file. Any valid filepath can be used, for example `file.json`.
*  - body: String or buffer to be read as JSON.
* @param options
* @param options.inferSchemaLength - Maximum number of lines to read to infer the schema. If set to 0, all columns will be read as pl.Utf8.
*    If set to `null`, a full table scan will be done (slow).
* @param options.format - Either "lines" (newline-delimited JSON) or "json".
* @param options.batchSize - Number of lines to read into the buffer at once. Modify this to change performance.
* @returns ({@link DataFrame})
* @example
* ```
* const jsonString = `
* {"a", 1, "b", "foo", "c": 3}
* {"a": 2, "b": "bar", "c": 6}
* `
* > const df = pl.readJSON(jsonString)
* > console.log(df)
* shape: (2, 3)
* ╭─────┬─────┬─────╮
* │ a ┆ b ┆ c │
* │ --- ┆ --- ┆ --- │
* │ i64 ┆ str ┆ i64 │
* ╞═════╪═════╪═════╡
* │ 1 ┆ foo ┆ 3 │
* ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
* │ 2 ┆ bar ┆ 6 │
* ╰─────┴─────┴─────╯
* ```
*/
export declare function readJSON(pathOrBody: string | Buffer, options?: Partial<ReadJsonOptions>): DataFrame;
interface ScanJsonOptions {
inferSchemaLength: number | null;
nThreads: number;
batchSize: number;
lowMemory: boolean;
numRows: number;
skipRows: number;
rowCount: RowCount;
}
/**
* __Lazily read from a JSON file into a LazyDataFrame.__
*
* _Note: Currently only newline-delimited JSON is supported._
* @param path - Path to a JSON file. Any valid filepath can be used, for example `./file.json`.
* @param options
* @param options.inferSchemaLength - Maximum number of lines to read to infer the schema. If set to 0, all columns will be read as pl.Utf8.
*    If set to `null`, a full table scan will be done (slow).
* @param options.nThreads - Maximum number of threads to use when reading JSON.
* @param options.lowMemory - Reduce memory usage at the expense of performance.
* @param options.batchSize - Number of lines to read into the buffer at once. Modify this to change performance.
* @param options.numRows - Stop reading from the JSON file after reading `numRows` rows.
* @param options.skipRows - Start reading after `skipRows` rows.
* @param options.rowCount - Add a row count as a column.
* @returns ({@link LazyDataFrame})
* @example
* ```
* > const df = pl.scanJson('path/to/file.json', {numRows: 2}).collectSync()
* > console.log(df)
* shape: (2, 3)
* ╭─────┬─────┬─────╮
* │ a ┆ b ┆ c │
* │ --- ┆ --- ┆ --- │
* │ i64 ┆ str ┆ i64 │
* ╞═════╪═════╪═════╡
* │ 1 ┆ foo ┆ 3 │
* ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
* │ 2 ┆ bar ┆ 6 │
* ╰─────┴─────┴─────╯
* ```
*/
export declare function scanJson(path: string, options?: Partial<ScanJsonOptions>): LazyDataFrame;
interface ReadParquetOptions {
columns: string[] | number[];
numRows: number;
parallel: "auto" | "columns" | "row_groups" | "none";
rowCount: RowCount;
}
/**
* Read into a DataFrame from a parquet file.
* @param pathOrBody
*   Path to a file, list of files, or a file-like object. If the path is a directory, that directory will be used
*   for a partition-aware scan.
* @param options.columns - Columns to select. Accepts a list of column indices (starting at zero) or a list of column names.
* @param options.numRows - Stop reading from the parquet file after reading `numRows` rows.
* @param options.parallel - Any of 'auto' | 'columns' | 'row_groups' | 'none'.
*    This determines the direction of parallelism. 'auto' will try to determine the optimal direction.
*    Defaults to 'auto'.
* @param options.rowCount - Add a row count as a column.
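* @example
* A minimal usage sketch; `./file.parquet` is a hypothetical path:
* ```
* > const df = pl.readParquet("./file.parquet", { numRows: 100, parallel: "auto" });
* ```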
*/
export declare function readParquet(pathOrBody: string | Buffer, options?: Partial<ReadParquetOptions>): DataFrame;
export interface ReadAvroOptions {
columns: string[] | number[];
projection: number;
nRows: number;
}
/**
* Read into a DataFrame from an avro file.
* @param pathOrBody
*   Path to a file, list of files, or a file-like object. If the path is a directory, that directory will be used
*   for a partition-aware scan.
* @param options.columns - Columns to select. Accepts a list of column names.
* @param options.projection - Indices of columns to select. Note that column indices start at zero.
* @param options.nRows - Stop reading from the avro file after reading `nRows` rows.
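* @example
* A minimal usage sketch; `./file.avro` is a hypothetical path:
* ```
* > const df = pl.readAvro("./file.avro", { columns: ["a", "b"], nRows: 10 });
* ```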
*/
export declare function readAvro(pathOrBody: string | Buffer, options?: Partial<ReadAvroOptions>): DataFrame;
/**
* Lazily read from a local or cloud-hosted parquet file (or files).
*
* This allows the query optimizer to push down predicates and projections to the scan level,
* typically increasing performance and reducing memory overhead.
* @param source - Path(s) to a file. If a single path is given, it can be a globbing pattern.
* @param options.nRows - Stop reading from the parquet file after reading `nRows` rows.
* @param options.rowIndexName - If not null, this will insert a row index column with the given name into the DataFrame.
* @param options.rowIndexOffset - Offset to start the row index column (only used if the name is set).
* @param options.parallel - Any of 'auto' | 'columns' | 'row_groups' | 'none'.
*    This determines the direction of parallelism. 'auto' will try to determine the optimal direction.
* @param options.useStatistics - Use statistics in the parquet file to determine if pages can be skipped from reading.
* @param options.hivePartitioning - Infer statistics and schema from Hive-partitioned URLs and use them to prune reads.
* @param options.glob - Expand the path given via globbing rules.
* @param options.hiveSchema - The column names and data types of the columns by which the data is partitioned.
*    If set to `null` (default), the schema of the Hive partitions is inferred.
* @param options.tryParseHiveDates - Whether to try parsing Hive values as date/datetime types.
* @param options.rechunk - In case of reading multiple files via a glob pattern, rechunk the final DataFrame into contiguous memory chunks.
* @param options.lowMemory - Reduce memory pressure at the expense of performance.
* @param options.cache - Cache the result after reading.
* @param options.cloudOptions - Options that indicate how to connect to a cloud provider.
*    The cloud providers currently supported are AWS, GCP, and Azure.
*    See the supported keys here:
*    - aws: https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html
*    - gcp: https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html
*    - azure: https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html
*    If `cloudOptions` is not provided, Polars will try to infer the information from environment variables.
* @param options.retries - Number of retries if accessing a cloud instance fails.
* @param options.includeFilePaths - Include the path of the source file(s) as a column with this name.
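* @example
* A minimal usage sketch; the glob pattern is hypothetical:
* ```
* > // only the selected column and the first `nRows` rows are materialized
* > const df = pl.scanParquet("./data/*.parquet", { nRows: 1000 })
* >   .select("a")
* >   .collectSync();
* ```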
*/
export declare function scanParquet(source: string, options?: ScanParquetOptions): LazyDataFrame;
export interface ReadIPCOptions {
columns: string[] | number[];
nRows: number;
}
/**
* __Read into a DataFrame from an Arrow IPC file (Feather v2).__
* ___
* @param pathOrBody - path or buffer or string
* - path: Path to a file or a file like string. Any valid filepath can be used. Example: `file.ipc`.
* - body: String or buffer to be read as Arrow IPC
* @param options.columns Columns to select. Accepts a list of column names.
* @param options.nRows - Stop reading from the IPC file after reading `nRows` rows.
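* @example
* A minimal usage sketch; `./file.ipc` is a hypothetical path:
* ```
* > const df = pl.readIPC("./file.ipc", { nRows: 100 });
* ```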
*/
export declare function readIPC(pathOrBody: string | Buffer, options?: Partial<ReadIPCOptions>): DataFrame;
/**
* __Read into a DataFrame from an Arrow IPC stream.__
* ___
* @param pathOrBody - path or buffer or string
* - path: Path to a file or a file like string. Any valid filepath can be used. Example: `file.ipc`.
* - body: String or buffer to be read as Arrow IPC
* @param options.columns Columns to select. Accepts a list of column names.
* @param options.nRows - Stop reading from the IPC stream after reading `nRows` rows.
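* @example
* A minimal usage sketch; `./file.arrows` is a hypothetical path to an Arrow IPC stream:
* ```
* > const df = pl.readIPCStream("./file.arrows", { columns: ["a", "b"] });
* ```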
*/
export declare function readIPCStream(pathOrBody: string | Buffer, options?: Partial<ReadIPCOptions>): DataFrame;
export interface ScanIPCOptions {
nRows: number;
cache: boolean;
rechunk: boolean;
}
/**
* __Lazily read from an Arrow IPC file (Feather v2) or multiple files via glob patterns.__
* ___
* @param path Path to an IPC file.
* @param options.nRows Stop reading from the IPC file after reading `nRows` rows.
* @param options.cache Cache the result after reading.
* @param options.rechunk Reallocate to contiguous memory when all chunks/files are parsed.
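* @example
* A minimal usage sketch; `./file.ipc` is a hypothetical path:
* ```
* > const df = pl.scanIPC("./file.ipc", { nRows: 100 })
* >   .select("a")
* >   .collectSync();
* ```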
*/
export declare function scanIPC(path: string, options?: Partial<ScanIPCOptions>): LazyDataFrame;
/**
* __Read a stream into a Dataframe.__
*
* **Warning:** this is much slower than `scanCSV` or `readCSV`
*
* This will consume the entire stream into a single buffer and then call `readCSV`.
* Only use it when you must consume from a stream, or when performance is not a major consideration.
*
* ___
* @param stream - readable stream containing csv data
* @param options
* @param options.inferSchemaLength - Maximum number of lines to read to infer the schema. If set to 0, all columns will be read as pl.Utf8.
*    If set to `null`, a full table scan will be done (slow).
* @param options.batchSize - Number of lines to read into the buffer at once. Modify this to change performance.
* @param options.hasHeader - Indicate whether the first row of the dataset is a header. If set to false, the first row will be named `column_x`,
*    `x` being an enumeration over every column in the dataset.
* @param options.ignoreErrors - Try to keep reading lines if some lines yield errors.
* @param options.endRows - Stop reading after `endRows` rows. During multi-threaded parsing, an upper bound of `endRows` rows cannot be guaranteed.
* @param options.startRows - Start reading after `startRows` rows.
* @param options.projection - Indices of columns to select. Note that column indices start at zero.
* @param options.sep - Character to use as the delimiter in the file.
* @param options.columns - Columns to select.
* @param options.rechunk - Make sure that all columns are contiguous in memory by aggregating the chunks into a single array.
* @param options.encoding - Allowed encodings: `utf8`, `utf8-lossy`. Lossy means that invalid utf8 values are replaced with the `�` character.
* @param options.numThreads - Number of threads to use during CSV parsing. Defaults to the number of physical CPUs of your system.
* @param options.dtypes - Overwrite dtypes during schema inference.
* @param options.lowMemory - Reduce memory usage at the expense of performance.
* @param options.commentChar - Character that indicates the start of a comment line, for instance `#`.
* @param options.quoteChar - Character used for CSV quoting, default `"`. Set to null to turn off special handling and escaping of quotes.
* @param options.nullValues - Values to interpret as null values. You can provide:
*  - `string` -> all values equal to this string will be read as null
*  - `Array<string>` -> one null value per column
*  - `Record<string, string>` -> an object or map that maps column names to a null value string, e.g. `{"column_1": "0"}`
* @param options.tryParseDates - Whether to attempt to parse dates.
* @returns Promise<DataFrame>
*
* @example
* ```
* >>> const readStream = new Stream.Readable({read(){}});
* >>> readStream.push(`a,b\n`);
* >>> readStream.push(`1,2\n`);
* >>> readStream.push(`2,2\n`);
* >>> readStream.push(`3,2\n`);
* >>> readStream.push(`4,2\n`);
* >>> readStream.push(null);
*
* >>> pl.readCSVStream(readStream).then(df => console.log(df));
* shape: (4, 2)
* ┌─────┬─────┐
* │ a ┆ b │
* │ --- ┆ --- │
* │ i64 ┆ i64 │
* ╞═════╪═════╡
* │ 1 ┆ 2 │
* ├╌╌╌╌╌┼╌╌╌╌╌┤
* │ 2 ┆ 2 │
* ├╌╌╌╌╌┼╌╌╌╌╌┤
* │ 3 ┆ 2 │
* ├╌╌╌╌╌┼╌╌╌╌╌┤
* │ 4 ┆ 2 │
* └─────┴─────┘
* ```
*/
export declare function readCSVStream(stream: Readable, options?: Partial<ReadCsvOptions>): Promise<DataFrame>;
/**
* __Read a newline delimited JSON stream into a DataFrame.__
*
* @param stream - readable stream containing json data
* @param options
* @param options.inferSchemaLength - Maximum number of lines to read to infer the schema. If set to 0, all columns will be read as pl.Utf8.
* If set to `null`, a full table scan will be done (slow).
* Note: this is done per batch
* @param options.batchSize - Number of lines to read into the buffer at once. Modify this to change performance.
* @example
* ```
* >>> const readStream = new Stream.Readable({read(){}});
* >>> readStream.push(`${JSON.stringify({a: 1, b: 2})} \n`);
* >>> readStream.push(`${JSON.stringify({a: 2, b: 2})} \n`);
* >>> readStream.push(`${JSON.stringify({a: 3, b: 2})} \n`);
* >>> readStream.push(`${JSON.stringify({a: 4, b: 2})} \n`);
* >>> readStream.push(null);
*
* >>> pl.readJSONStream(readStream, { format: "lines" }).then(df => console.log(df));
* shape: (4, 2)
* ┌─────┬─────┐
* │ a ┆ b │
* │ --- ┆ --- │
* │ i64 ┆ i64 │
* ╞═════╪═════╡
* │ 1 ┆ 2 │
* ├╌╌╌╌╌┼╌╌╌╌╌┤
* │ 2 ┆ 2 │
* ├╌╌╌╌╌┼╌╌╌╌╌┤
* │ 3 ┆ 2 │
* ├╌╌╌╌╌┼╌╌╌╌╌┤
* │ 4 ┆ 2 │
* └─────┴─────┘
* ```
*/
export declare function readJSONStream(stream: Readable, options?: Partial<ReadJsonOptions>): Promise<DataFrame>;
export {};