nodejs-polars

Polars: Blazingly fast DataFrames in Rust, Python, Node.js, R and SQL

import { type Readable } from "node:stream";
import { type DataFrame } from "./dataframe";
import type { DataType } from "./datatypes";
import { type LazyDataFrame } from "./lazy/dataframe";
import type { RowCount, ScanParquetOptions } from "./types";
export interface ReadCsvOptions {
    inferSchemaLength: number | null;
    nRows: number;
    hasHeader: boolean;
    ignoreErrors: boolean;
    endRows: number;
    startRows: number;
    projection: number;
    sep: string;
    schema: Record<string, DataType>;
    columns: string[];
    rechunk: boolean;
    encoding: "utf8" | "utf8-lossy";
    numThreads: number;
    dtypes: Record<string, DataType>;
    lowMemory: boolean;
    commentChar: string;
    quoteChar: string;
    eolChar: string;
    nullValues: string | Array<string> | Record<string, string>;
    chunkSize: number;
    skipRows: number;
    tryParseDates: boolean;
    skipRowsAfterHeader: number;
    rowCount: RowCount;
    raiseIfEmpty: boolean;
    truncateRaggedLines: boolean;
    missingIsNull: boolean;
}
export interface ReadJsonOptions {
    batchSize: number;
    inferSchemaLength: number | null;
    format: "lines" | "json";
}
export declare function readRecords(records: Record<string, any>[], options?: {
    schema: Record<string, DataType>;
}): DataFrame;
export declare function readRecords(records: Record<string, any>[], options?: {
    inferSchemaLength?: number;
}): DataFrame;
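/*
 * A minimal usage sketch of `readRecords` (not part of the published typings;
 * the records below are invented sample data). It builds a DataFrame directly
 * from an array of plain objects, with `pl` being the nodejs-polars module as
 * in the examples further down.
 *
 *   > const df = pl.readRecords([{ a: 1, b: "foo" }, { a: 2, b: "bar" }]);
 *   > console.log(df)
 *   shape: (2, 2)
 */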
/**
 * __Read a CSV file or string into a DataFrame.__
 * ___
 * @param pathOrBody - path or buffer or string
 *   - path: Path to a file or a file like string. Any valid filepath can be used. Example: `file.csv`.
 *   - body: String or buffer to be read as a CSV
 * @param options
 * @param options.inferSchemaLength - Maximum number of lines to read to infer schema. If set to 0, all columns will be read as pl.Utf8.
 *    If set to `null`, a full table scan will be done (slow).
 * @param options.nRows - After n rows are read from the CSV, it stops reading.
 *    During multi-threaded parsing, an upper bound of `n` rows cannot be guaranteed.
 * @param options.batchSize - Number of lines to read into the buffer at once. Modify this to change performance.
 * @param options.hasHeader - Indicate if the first row of the dataset is a header or not. If set to `false`, the first row will be set to `column_x`,
 *    `x` being an enumeration over every column in the dataset.
 * @param options.ignoreErrors - Try to keep reading lines if some lines yield errors.
 * @param options.endRows - After n rows are read from the CSV, it stops reading.
 *    During multi-threaded parsing, an upper bound of `n` rows cannot be guaranteed.
 * @param options.startRows - Start reading after `startRows` rows.
 * @param options.projection - Indices of columns to select. Note that column indices start at zero.
 * @param options.sep - Character to use as the delimiter in the file.
 * @param options.columns - Columns to select.
 * @param options.rechunk - Make sure that all columns are contiguous in memory by aggregating the chunks into a single array.
 * @param options.encoding - Allowed encodings: `utf8`, `utf8-lossy`. Lossy means that invalid utf8 values are replaced with the `�` character.
 * @param options.numThreads - Number of threads to use in CSV parsing. Defaults to the number of physical CPUs of your system.
 * @param options.dtypes - Overwrite the dtypes during inference.
 * @param options.schema - Set the CSV file's schema. This only accepts datatypes that are implemented in the CSV parser and expects a complete schema.
 * @param options.lowMemory - Reduce memory usage at the expense of performance.
 * @param options.commentChar - Character that indicates the start of a comment line, for instance `#`.
 * @param options.quoteChar - Character that is used for CSV quoting, default = ''. Set to null to turn off special handling and escaping of quotes.
 * @param options.nullValues - Values to interpret as null values. You can provide a:
 *   - `string` -> all values encountered equal to this string will be null
 *   - `Array<string>` -> a null value per column.
 *   - `Record<string, string>` -> an object or map that maps column name to a null value string. Ex. `{"column_1": "0"}`
 * @param options.tryParseDates - Whether or not to attempt to parse dates.
 * @returns DataFrame
 */
export declare function readCSV(pathOrBody: string | Buffer, options?: Partial<ReadCsvOptions>): DataFrame;
export interface ScanCsvOptions {
    hasHeader: boolean;
    sep: string;
    commentChar: string;
    quoteChar: string;
    skipRows: number;
    nullValues: string | Array<string> | Record<string, string>;
    ignoreErrors: boolean;
    cache: boolean;
    inferSchemaLength: number | null;
    rechunk: boolean;
    nRows: number;
    encoding: string;
    lowMemory: boolean;
    parseDates: boolean;
    skipRowsAfterHeader: number;
    eolChar: string;
    missingUtf8IsEmptyString: boolean;
    raiseIfEmpty: boolean;
    truncateRaggedLines: boolean;
    schema: Record<string, DataType>;
}
/**
 * __Lazily read from a CSV file or multiple files via glob patterns.__
 *
 * This allows the query optimizer to push down predicates and
 * projections to the scan level, thereby potentially reducing
 * memory overhead.
 * ___
 * @param path - Path to a file.
 * @param options.hasHeader - Indicate if the first row of the dataset is a header or not. If set to `false`, the first row will be set to `column_x`,
 *    `x` being an enumeration over every column in the dataset.
 * @param options.sep - Character to use as the delimiter in the file.
 * @param options.commentChar - Character that indicates the start of a comment line, for instance `#`.
 * @param options.quoteChar - Character that is used for CSV quoting, default = ''. Set to null to turn off special handling and escaping of quotes.
 * @param options.skipRows - Start reading after `skipRows` rows.
 * @param options.nullValues - Values to interpret as null values. You can provide a:
 *   - `string` -> all values encountered equal to this string will be null
 *   - `Array<string>` -> a null value per column.
 *   - `Record<string, string>` -> an object or map that maps column name to a null value string. Ex. `{"column_1": "0"}`
 * @param options.ignoreErrors - Try to keep reading lines if some lines yield errors.
 * @param options.cache - Cache the result after reading.
 * @param options.inferSchemaLength - Maximum number of lines to read to infer schema. If set to 0, all columns will be read as pl.Utf8.
 *    If set to `null`, a full table scan will be done (slow).
 * @param options.nRows - After n rows are read from the CSV, it stops reading.
 *    During multi-threaded parsing, an upper bound of `n` rows cannot be guaranteed.
 * @param options.rechunk - Make sure that all columns are contiguous in memory by aggregating the chunks into a single array.
 * @param options.lowMemory - Reduce memory usage at the expense of performance.
 * ___
 */
export declare function scanCSV(path: string, options?: Partial<ScanCsvOptions>): LazyDataFrame;
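/*
 * A minimal usage sketch contrasting the two CSV entry points (illustration
 * only; the file path and option values are assumptions). `readCSV` parses
 * eagerly into a DataFrame, while `scanCSV` returns a LazyDataFrame that is
 * only evaluated when collected.
 *
 *   > const df = pl.readCSV("./data.csv", { sep: ",", hasHeader: true });
 *   > const lazy = pl.scanCSV("./data.csv", { nRows: 100 });
 *   > const firstHundred = lazy.collectSync();
 */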
/**
 * __Read a JSON file or string into a DataFrame.__
 *
 * @param pathOrBody - path or buffer or string
 *   - path: Path to a file or a file like string. Any valid filepath can be used. Example: `file.json`.
 *   - body: String or buffer to be read as JSON
 * @param options
 * @param options.inferSchemaLength - Maximum number of lines to read to infer schema. If set to 0, all columns will be read as pl.Utf8.
 *    If set to `null`, a full table scan will be done (slow).
 * @param options.format - Either "lines" or "json"
 * @param options.batchSize - Number of lines to read into the buffer at once. Modify this to change performance.
 * @returns ({@link DataFrame})
 * @example
 * ```
 * const jsonString = `
 * {"a": 1, "b": "foo", "c": 3}
 * {"a": 2, "b": "bar", "c": 6}
 * `
 * > const df = pl.readJSON(jsonString)
 * > console.log(df)
 * shape: (2, 3)
 * ╭─────┬─────┬─────╮
 * │ a   ┆ b   ┆ c   │
 * │ --- ┆ --- ┆ --- │
 * │ i64 ┆ str ┆ i64 │
 * ╞═════╪═════╪═════╡
 * │ 1   ┆ foo ┆ 3   │
 * ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
 * │ 2   ┆ bar ┆ 6   │
 * ╰─────┴─────┴─────╯
 * ```
 */
export declare function readJSON(pathOrBody: string | Buffer, options?: Partial<ReadJsonOptions>): DataFrame;
interface ScanJsonOptions {
    inferSchemaLength: number | null;
    nThreads: number;
    batchSize: number;
    lowMemory: boolean;
    numRows: number;
    skipRows: number;
    rowCount: RowCount;
}
/**
 * __Lazily read a JSON file into a DataFrame.__
 *
 * _Note: Currently only newline delimited JSON is supported_
 * @param path - Path to a JSON file
 *   - path: Path to a file or a file like string. Any valid filepath can be used. Example: `./file.json`.
 * @param options
 * @param options.inferSchemaLength - Maximum number of lines to read to infer schema. If set to 0, all columns will be read as pl.Utf8.
 *    If set to `null`, a full table scan will be done (slow).
 * @param options.nThreads - Maximum number of threads to use when reading JSON.
 * @param options.lowMemory - Reduce memory usage at the expense of performance.
 * @param options.batchSize - Number of lines to read into the buffer at once. Modify this to change performance.
 * @param options.numRows - Stop reading from the JSON file after reading `numRows`.
 * @param options.skipRows - Start reading after `skipRows` rows.
 * @param options.rowCount - Add row count as column.
 * @returns ({@link LazyDataFrame})
 * @example
 * ```
 * > const df = pl.scanJson('path/to/file.json', {numRows: 2}).collectSync()
 * > console.log(df)
 * shape: (2, 3)
 * ╭─────┬─────┬─────╮
 * │ a   ┆ b   ┆ c   │
 * │ --- ┆ --- ┆ --- │
 * │ i64 ┆ str ┆ i64 │
 * ╞═════╪═════╪═════╡
 * │ 1   ┆ foo ┆ 3   │
 * ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
 * │ 2   ┆ bar ┆ 6   │
 * ╰─────┴─────┴─────╯
 * ```
 */
export declare function scanJson(path: string, options?: Partial<ScanJsonOptions>): LazyDataFrame;
interface ReadParquetOptions {
    columns: string[] | number[];
    numRows: number;
    parallel: "auto" | "columns" | "row_groups" | "none";
    rowCount: RowCount;
}
/**
 * Read into a DataFrame from a parquet file.
 * @param pathOrBody
 *    Path to a file, list of files, or a file like object. If the path is a directory, that directory will be used
 *    as partition aware scan.
 * @param options.columns - Columns to select. Accepts a list of column indices (starting at zero) or a list of column names.
 * @param options.numRows - Stop reading from the parquet file after reading `numRows`.
 * @param options.parallel
 *    Any of 'auto' | 'columns' | 'row_groups' | 'none'. This determines the direction of parallelism. 'auto' will try to determine the optimal direction. Defaults to 'auto'.
 * @param options.rowCount - Add row count as column.
 */
export declare function readParquet(pathOrBody: string | Buffer, options?: Partial<ReadParquetOptions>): DataFrame;
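/*
 * A minimal usage sketch of `readParquet` (illustration only; the file name,
 * column names and row limit are assumptions). Column projection plus a row
 * limit keeps the eager read small.
 *
 *   > const df = pl.readParquet("./data.parquet", { columns: ["a", "b"], numRows: 100 });
 *   > console.log(df)
 */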
export interface ReadAvroOptions {
    columns: string[] | Array<string> | number[];
    projection: number;
    nRows: number;
}
/**
 * Read into a DataFrame from an avro file.
 * @param pathOrBody
 *    Path to a file, list of files, or a file like object. If the path is a directory, that directory will be used
 *    as partition aware scan.
 * @param options.columns - Columns to select. Accepts a list of column names.
 * @param options.projection - Indices of columns to select. Note that column indices start at zero.
 * @param options.nRows - Stop reading from the avro file after reading `nRows`.
 */
export declare function readAvro(pathOrBody: string | Buffer, options?: Partial<ReadAvroOptions>): DataFrame;
/**
 * Lazily read from a local or cloud-hosted parquet file (or files).
 *
 * This allows the query optimizer to push down predicates and projections to the scan level,
 * typically increasing performance and reducing memory overhead.
 * @param source - Path(s) to a file. If a single path is given, it can be a globbing pattern.
 * @param options.nRows - Stop reading from the parquet file after reading `nRows`.
 * @param options.rowIndexName - If not null, this will insert a row index column with the given name into the DataFrame.
 * @param options.rowIndexOffset - Offset to start the row index column (only used if the name is set).
 * @param options.parallel : {'auto', 'columns', 'row_groups', 'none'}
 *    This determines the direction of parallelism. 'auto' will try to determine the optimal direction.
 * @param options.useStatistics - Use statistics in the parquet file to determine if pages can be skipped from reading.
 * @param options.hivePartitioning - Infer statistics and schema from Hive-partitioned URLs and use them to prune reads.
 * @param options.glob - Expand the path given via globbing rules.
 * @param options.hiveSchema - The column names and data types of the columns by which the data is partitioned.
 *    If set to `null` (default), the schema of the Hive partitions is inferred.
 * @param options.tryParseHiveDates - Whether to try parsing Hive values as date/datetime types.
 * @param options.rechunk - In case of reading multiple files via a glob pattern, rechunk the final DataFrame into contiguous memory chunks.
 * @param options.lowMemory - Reduce memory pressure at the expense of performance.
 * @param options.cache - Cache the result after reading.
 * @param options.cloudOptions - Options that indicate how to connect to a cloud provider.
 *    If the cloud provider is not supported by Polars, the storage options are passed to `fsspec.open()`.
 *    The cloud providers currently supported are AWS, GCP, and Azure. See supported keys here:
 *    - aws: https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html
 *    - gcp: https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html
 *    - azure: https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html
 *    If `cloudOptions` is not provided, Polars will try to infer the information from environment variables.
 * @param options.retries - Number of retries if accessing a cloud instance fails.
 * @param options.includeFilePaths - Include the path of the source file(s) as a column with this name.
 */
export declare function scanParquet(source: string, options?: ScanParquetOptions): LazyDataFrame;
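/*
 * A minimal usage sketch of `scanParquet` (illustration only; the glob
 * pattern and option values are assumptions). The scan is lazy, so nothing is
 * read until the frame is collected.
 *
 *   > const lf = pl.scanParquet("./data/*.parquet", { nRows: 1000, parallel: "auto" });
 *   > const df = lf.collectSync();
 */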
export interface ReadIPCOptions {
    columns: string[] | number[];
    nRows: number;
}
/**
 * __Read into a DataFrame from an Arrow IPC file (Feather v2).__
 * ___
 * @param pathOrBody - path or buffer or string
 *   - path: Path to a file or a file like string. Any valid filepath can be used. Example: `file.ipc`.
 *   - body: String or buffer to be read as Arrow IPC
 * @param options.columns - Columns to select. Accepts a list of column names.
 * @param options.nRows - Stop reading from the IPC file after reading `nRows`.
 */
export declare function readIPC(pathOrBody: string | Buffer, options?: Partial<ReadIPCOptions>): DataFrame;
/**
 * __Read into a DataFrame from an Arrow IPC stream.__
 * ___
 * @param pathOrBody - path or buffer or string
 *   - path: Path to a file or a file like string. Any valid filepath can be used. Example: `file.ipc`.
 *   - body: String or buffer to be read as Arrow IPC
 * @param options.columns - Columns to select. Accepts a list of column names.
 * @param options.nRows - Stop reading from the IPC stream after reading `nRows`.
 */
export declare function readIPCStream(pathOrBody: string | Buffer, options?: Partial<ReadIPCOptions>): DataFrame;
export interface ScanIPCOptions {
    nRows: number;
    cache: boolean;
    rechunk: boolean;
}
/**
 * __Lazily read from an Arrow IPC file (Feather v2) or multiple files via glob patterns.__
 * ___
 * @param path - Path to an IPC file.
 * @param options.nRows - Stop reading from the IPC file after reading `nRows`.
 * @param options.cache - Cache the result after reading.
 * @param options.rechunk - Reallocate to contiguous memory when all chunks/files are parsed.
 */
export declare function scanIPC(path: string, options?: Partial<ScanIPCOptions>): LazyDataFrame;
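/*
 * A minimal usage sketch of the IPC readers (illustration only; the file name
 * and options are assumptions). `readIPC` loads a Feather v2 file eagerly,
 * while `scanIPC` defers the work until the lazy frame is collected.
 *
 *   > const df = pl.readIPC("./data.ipc", { nRows: 10 });
 *   > const lf = pl.scanIPC("./data.ipc", { cache: true });
 *   > const all = lf.collectSync();
 */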
/**
 * __Read a stream into a DataFrame.__
 *
 * **Warning:** this is much slower than `scanCSV` or `readCSV`.
 *
 * This will consume the entire stream into a single buffer and then call `readCSV`.
 * Only use it when you must consume from a stream, or when performance is not a major consideration.
 *
 * ___
 * @param stream - Readable stream containing CSV data
 * @param options
 * @param options.inferSchemaLength - Maximum number of lines to read to infer schema. If set to 0, all columns will be read as pl.Utf8.
 *    If set to `null`, a full table scan will be done (slow).
 * @param options.batchSize - Number of lines to read into the buffer at once. Modify this to change performance.
 * @param options.hasHeader - Indicate if the first row of the dataset is a header or not. If set to `false`, the first row will be set to `column_x`,
 *    `x` being an enumeration over every column in the dataset.
 * @param options.ignoreErrors - Try to keep reading lines if some lines yield errors.
 * @param options.endRows - After n rows are read from the CSV, it stops reading.
 *    During multi-threaded parsing, an upper bound of `n` rows cannot be guaranteed.
 * @param options.startRows - Start reading after `startRows` rows.
 * @param options.projection - Indices of columns to select. Note that column indices start at zero.
 * @param options.sep - Character to use as the delimiter in the file.
 * @param options.columns - Columns to select.
 * @param options.rechunk - Make sure that all columns are contiguous in memory by aggregating the chunks into a single array.
 * @param options.encoding - Allowed encodings: `utf8`, `utf8-lossy`. Lossy means that invalid utf8 values are replaced with the `�` character.
 * @param options.numThreads - Number of threads to use in CSV parsing. Defaults to the number of physical CPUs of your system.
 * @param options.dtypes - Overwrite the dtypes during inference.
 * @param options.lowMemory - Reduce memory usage at the expense of performance.
 * @param options.commentChar - Character that indicates the start of a comment line, for instance `#`.
 * @param options.quoteChar - Character that is used for CSV quoting, default = ''. Set to null to turn off special handling and escaping of quotes.
 * @param options.nullValues - Values to interpret as null values. You can provide a:
 *   - `string` -> all values encountered equal to this string will be null
 *   - `Array<string>` -> a null value per column.
 *   - `Record<string, string>` -> an object or map that maps column name to a null value string. Ex. `{"column_1": "0"}`
 * @param options.tryParseDates - Whether or not to attempt to parse dates.
 * @returns Promise<DataFrame>
 *
 * @example
 * ```
 * >>> const readStream = new Stream.Readable({read(){}});
 * >>> readStream.push(`a,b\n`);
 * >>> readStream.push(`1,2\n`);
 * >>> readStream.push(`2,2\n`);
 * >>> readStream.push(`3,2\n`);
 * >>> readStream.push(`4,2\n`);
 * >>> readStream.push(null);
 *
 * >>> pl.readCSVStream(readStream).then(df => console.log(df));
 * shape: (4, 2)
 * ┌─────┬─────┐
 * │ a   ┆ b   │
 * │ --- ┆ --- │
 * │ i64 ┆ i64 │
 * ╞═════╪═════╡
 * │ 1   ┆ 2   │
 * ├╌╌╌╌╌┼╌╌╌╌╌┤
 * │ 2   ┆ 2   │
 * ├╌╌╌╌╌┼╌╌╌╌╌┤
 * │ 3   ┆ 2   │
 * ├╌╌╌╌╌┼╌╌╌╌╌┤
 * │ 4   ┆ 2   │
 * └─────┴─────┘
 * ```
 */
export declare function readCSVStream(stream: Readable, options?: Partial<ReadCsvOptions>): Promise<DataFrame>;
/**
 * __Read a newline delimited JSON stream into a DataFrame.__
 *
 * @param stream - Readable stream containing JSON data
 * @param options
 * @param options.inferSchemaLength - Maximum number of lines to read to infer schema. If set to 0, all columns will be read as pl.Utf8.
 *    If set to `null`, a full table scan will be done (slow).
 *    Note: this is done per batch.
 * @param options.batchSize - Number of lines to read into the buffer at once. Modify this to change performance.
 * @example
 * ```
 * >>> const readStream = new Stream.Readable({read(){}});
 * >>> readStream.push(`${JSON.stringify({a: 1, b: 2})} \n`);
 * >>> readStream.push(`${JSON.stringify({a: 2, b: 2})} \n`);
 * >>> readStream.push(`${JSON.stringify({a: 3, b: 2})} \n`);
 * >>> readStream.push(`${JSON.stringify({a: 4, b: 2})} \n`);
 * >>> readStream.push(null);
 *
 * >>> pl.readJSONStream(readStream, { format: "lines" }).then(df => console.log(df));
 * shape: (4, 2)
 * ┌─────┬─────┐
 * │ a   ┆ b   │
 * │ --- ┆ --- │
 * │ i64 ┆ i64 │
 * ╞═════╪═════╡
 * │ 1   ┆ 2   │
 * ├╌╌╌╌╌┼╌╌╌╌╌┤
 * │ 2   ┆ 2   │
 * ├╌╌╌╌╌┼╌╌╌╌╌┤
 * │ 3   ┆ 2   │
 * ├╌╌╌╌╌┼╌╌╌╌╌┤
 * │ 4   ┆ 2   │
 * └─────┴─────┘
 * ```
 */
export declare function readJSONStream(stream: Readable, options?: Partial<ReadJsonOptions>): Promise<DataFrame>;
export {};