UNPKG

nodejs-polars

Version:

Polars: Blazingly fast DataFrames in Rust, Python, Node.js, R and SQL

328 lines (322 loc) 14 kB
"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.readRecords = readRecords; exports.readCSV = readCSV; exports.scanCSV = scanCSV; exports.readJSON = readJSON; exports.scanJson = scanJson; exports.readParquet = readParquet; exports.readAvro = readAvro; exports.scanParquet = scanParquet; exports.readIPC = readIPC; exports.readIPCStream = readIPCStream; exports.scanIPC = scanIPC; exports.readCSVStream = readCSVStream; exports.readJSONStream = readJSONStream; const node_stream_1 = require("node:stream"); const dataframe_1 = require("./dataframe"); const functions_1 = require("./functions"); const polars_internal_1 = __importDefault(require("./internals/polars_internal")); const dataframe_2 = require("./lazy/dataframe"); const utils_1 = require("./utils"); const readCsvDefaultOptions = { inferSchemaLength: 100, hasHeader: true, ignoreErrors: true, chunkSize: 10000, skipRows: 0, sep: ",", rechunk: false, encoding: "utf8", lowMemory: false, tryParseDates: false, skipRowsAfterHeader: 0, raiseIfEmpty: true, truncateRaggedLines: true, missingIsNull: true, eolChar: "\n", }; const readJsonDefaultOptions = { batchSize: 10000, inferSchemaLength: 50, format: "json", }; // utility to read streams as lines. class LineBatcher extends node_stream_1.Stream.Transform { #lines; #accumulatedLines; #batchSize; constructor(options) { super(options); this.#lines = []; this.#accumulatedLines = 0; this.#batchSize = options.batchSize; } _transform(chunk, _encoding, done) { let begin = 0; let i = 0; while (i < chunk.length) { if (chunk[i] === 10) { // '\n' this.#accumulatedLines++; if (this.#accumulatedLines === this.#batchSize) { this.#lines.push(chunk.subarray(begin, i + 1)); this.push(Buffer.concat(this.#lines)); this.#lines = []; this.#accumulatedLines = 0; begin = i + 1; } } i++; } this.#lines.push(chunk.subarray(begin)); done(); } _flush(done) { this.push(Buffer.concat(this.#lines)); done(); } } // helper functions function readCSVBuffer(buff, options) { return (0, dataframe_1._DataFrame)(polars_internal_1.default.readCsv(buff, { ...readCsvDefaultOptions, ...options })); } function readRecords(records, options) { if (options?.schema) { return (0, dataframe_1._DataFrame)(polars_internal_1.default.fromRows(records, options.schema)); } return (0, dataframe_1._DataFrame)(polars_internal_1.default.fromRows(records, undefined, options?.inferSchemaLength)); } function readCSV(pathOrBody, options) { options = { ...readCsvDefaultOptions, ...options }; const extensions = [".tsv", ".csv"]; // Handle If set to `null` case options.inferSchemaLength = options.inferSchemaLength ?? 0; if (Buffer.isBuffer(pathOrBody)) { return (0, dataframe_1._DataFrame)(polars_internal_1.default.readCsv(pathOrBody, options)); } if (typeof pathOrBody === "string") { const inline = !(0, utils_1.isPath)(pathOrBody, extensions); if (inline) { const buf = Buffer.from(pathOrBody, "utf-8"); return (0, dataframe_1._DataFrame)(polars_internal_1.default.readCsv(buf, options)); } return (0, dataframe_1._DataFrame)(polars_internal_1.default.readCsv(pathOrBody, options)); } throw new Error("must supply either a path or body"); } const scanCsvDefaultOptions = { inferSchemaLength: 100, cache: true, hasHeader: true, ignoreErrors: true, skipRows: 0, sep: ",", rechunk: false, encoding: "utf8", lowMemory: false, parseDates: false, skipRowsAfterHeader: 0, }; function scanCSV(path, options) { options = { ...scanCsvDefaultOptions, ...options }; // Handle If set to `null` case options.inferSchemaLength = options.inferSchemaLength ?? 0; return (0, dataframe_2._LazyDataFrame)(polars_internal_1.default.scanCsv(path, options)); } function readJSON(pathOrBody, options = readJsonDefaultOptions) { options = { ...readJsonDefaultOptions, ...options }; const method = options.format === "lines" ? polars_internal_1.default.readJsonLines : polars_internal_1.default.readJson; const extensions = [".ndjson", ".json", ".jsonl"]; // Handle If set to `null` case options.inferSchemaLength = options.inferSchemaLength ?? 0; if (Buffer.isBuffer(pathOrBody)) { return (0, dataframe_1._DataFrame)(polars_internal_1.default.readJson(pathOrBody, options)); } if (typeof pathOrBody === "string") { const inline = !(0, utils_1.isPath)(pathOrBody, extensions); if (inline) { return (0, dataframe_1._DataFrame)(method(Buffer.from(pathOrBody, "utf-8"), options)); } return (0, dataframe_1._DataFrame)(method(pathOrBody, options)); } throw new Error("must supply either a path or body"); } function scanJson(path, options) { options = { ...readJsonDefaultOptions, ...options }; // Handle If set to `null` case options.inferSchemaLength = options.inferSchemaLength ?? 0; return (0, dataframe_2._LazyDataFrame)(polars_internal_1.default.scanJson(path, options)); } /** * Read into a DataFrame from a parquet file. * @param pathOrBuffer * Path to a file, list of files, or a file like object. If the path is a directory, that directory will be used * as partition aware scan. * @param options.columns Columns to select. Accepts a list of column indices (starting at zero) or a list of column names. * @param options.numRows Stop reading from parquet file after reading ``numRows``. * @param options.parallel * Any of 'auto' | 'columns' | 'row_groups' | 'none' This determines the direction of parallelism. 'auto' will try to determine the optimal direction. Defaults to 'auto' * @param options.rowCount Add row count as column */ function readParquet(pathOrBody, options) { const pliOptions = {}; if (typeof options?.columns?.[0] === "number") { pliOptions.projection = options?.columns; } else { pliOptions.columns = options?.columns; } pliOptions.nRows = options?.numRows; pliOptions.rowCount = options?.rowCount; const parallel = options?.parallel ?? "auto"; if (Buffer.isBuffer(pathOrBody)) { return (0, dataframe_1._DataFrame)(polars_internal_1.default.readParquet(pathOrBody, pliOptions, parallel)); } if (typeof pathOrBody === "string") { const inline = !(0, utils_1.isPath)(pathOrBody, [".parquet"]); if (inline) { return (0, dataframe_1._DataFrame)(polars_internal_1.default.readParquet(Buffer.from(pathOrBody), pliOptions, parallel)); } return (0, dataframe_1._DataFrame)(polars_internal_1.default.readParquet(pathOrBody, pliOptions, parallel)); } throw new Error("must supply either a path or body"); } function readAvro(pathOrBody, options = {}) { if (Buffer.isBuffer(pathOrBody)) { return (0, dataframe_1._DataFrame)(polars_internal_1.default.readAvro(pathOrBody, options)); } if (typeof pathOrBody === "string") { const inline = !(0, utils_1.isPath)(pathOrBody, [".avro"]); if (inline) { return (0, dataframe_1._DataFrame)(polars_internal_1.default.readAvro(Buffer.from(pathOrBody), options)); } return (0, dataframe_1._DataFrame)(polars_internal_1.default.readAvro(pathOrBody, options)); } throw new Error("must supply either a path or body"); } /** * Lazily read from a local or cloud-hosted parquet file (or files). This function allows the query optimizer to push down predicates and projections to the scan level, typically increasing performance and reducing memory overhead. * This allows the query optimizer to push down predicates and projections to the scan level, * thereby potentially reducing memory overhead. * @param source - Path(s) to a file. If a single path is given, it can be a globbing pattern. @param options.nRows - Stop reading from parquet file after reading `n_rows`. @param options.rowIndexName - If not None, this will insert a row index column with the given name into the DataFrame @param options.rowIndexOffset - Offset to start the row index column (only used if the name is set) @param options.parallel : {'auto', 'columns', 'row_groups', 'none'} This determines the direction of parallelism. 'auto' will try to determine the optimal direction. @param options.useStatistics - Use statistics in the parquet to determine if pages can be skipped from reading. @param options.hivePartitioning - Infer statistics and schema from hive partitioned URL and use them to prune reads. @param options.glob - Expand path given via globbing rules. @param options.hiveSchema - The column names and data types of the columns by which the data is partitioned. If set to `None` (default), the schema of the Hive partitions is inferred. @param options.tryParseHiveDates - Whether to try parsing hive values as date/datetime types. @param options.rechunk - In case of reading multiple files via a glob pattern rechunk the final DataFrame into contiguous memory chunks. @param options.lowMemory - Reduce memory pressure at the expense of performance. @param options.cache - Cache the result after reading. @param options.cloudOptions - Options that indicate how to connect to a cloud provider. If the cloud provider is not supported by Polars, the storage options are passed to `fsspec.open()`. The cloud providers currently supported are AWS, GCP, and Azure. See supported keys here: * `aws <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html>`_ * `gcp <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html>`_ * `azure <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html>`_ If `cloudOptions` is not provided, Polars will try to infer the information from environment variables. @param retries - Number of retries if accessing a cloud instance fails. @param includeFilePaths - Include the path of the source file(s) as a column with this name. */ function scanParquet(source, options = {}) { const defaultOptions = { parallel: "auto" }; const pliOptions = { ...defaultOptions, ...options }; return (0, dataframe_2._LazyDataFrame)(polars_internal_1.default.scanParquet(source, pliOptions)); } function readIPC(pathOrBody, options = {}) { if (Buffer.isBuffer(pathOrBody)) { return (0, dataframe_1._DataFrame)(polars_internal_1.default.readIpc(pathOrBody, options)); } if (typeof pathOrBody === "string") { const inline = !(0, utils_1.isPath)(pathOrBody, [".ipc"]); if (inline) { return (0, dataframe_1._DataFrame)(polars_internal_1.default.readIpc(Buffer.from(pathOrBody, "utf-8"), options)); } return (0, dataframe_1._DataFrame)(polars_internal_1.default.readIpc(pathOrBody, options)); } throw new Error("must supply either a path or body"); } function readIPCStream(pathOrBody, options = {}) { if (Buffer.isBuffer(pathOrBody)) { return (0, dataframe_1._DataFrame)(polars_internal_1.default.readIpcStream(pathOrBody, options)); } if (typeof pathOrBody === "string") { const inline = !(0, utils_1.isPath)(pathOrBody, [".ipc"]); if (inline) { return (0, dataframe_1._DataFrame)(polars_internal_1.default.readIpcStream(Buffer.from(pathOrBody, "utf-8"), options)); } return (0, dataframe_1._DataFrame)(polars_internal_1.default.readIpcStream(pathOrBody, options)); } throw new Error("must supply either a path or body"); } function scanIPC(path, options = {}) { return (0, dataframe_2._LazyDataFrame)(polars_internal_1.default.scanIpc(path, options)); } function readCSVStream(stream, options) { const batchSize = options?.batchSize ?? 10000; let count = 0; const end = options?.endRows ?? Number.POSITIVE_INFINITY; return new Promise((resolve, reject) => { const s = stream.pipe(new LineBatcher({ batchSize })); const chunks = []; s.on("data", (chunk) => { // early abort if 'end rows' is specified if (count <= end) { chunks.push(chunk); } else { s.end(); } count += batchSize; }).on("end", () => { try { const buff = Buffer.concat(chunks); const df = readCSVBuffer(buff, options); resolve(df); } catch (err) { reject(err); } }); }); } function readJSONStream(stream, options = readJsonDefaultOptions) { options = { ...readJsonDefaultOptions, ...options }; return new Promise((resolve, reject) => { const chunks = []; stream .pipe(new LineBatcher({ batchSize: options.batchSize })) .on("data", (chunk) => { try { const df = (0, dataframe_1._DataFrame)(polars_internal_1.default.readJson(chunk, options)); chunks.push(df); } catch (err) { reject(err); } }) .on("end", () => { try { const df = (0, functions_1.concat)(chunks); resolve(df); } catch (err) { reject(err); } }); }); }