nodejs-polars
Polars: Blazingly fast DataFrames in Rust, Python, Node.js, R and SQL
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.readRecords = readRecords;
exports.readCSV = readCSV;
exports.scanCSV = scanCSV;
exports.readJSON = readJSON;
exports.scanJson = scanJson;
exports.readParquet = readParquet;
exports.readAvro = readAvro;
exports.scanParquet = scanParquet;
exports.readIPC = readIPC;
exports.readIPCStream = readIPCStream;
exports.scanIPC = scanIPC;
exports.readCSVStream = readCSVStream;
exports.readJSONStream = readJSONStream;
const node_stream_1 = require("node:stream");
const dataframe_1 = require("./dataframe");
const functions_1 = require("./functions");
const polars_internal_1 = __importDefault(require("./internals/polars_internal"));
const dataframe_2 = require("./lazy/dataframe");
const utils_1 = require("./utils");
const readCsvDefaultOptions = {
inferSchemaLength: 100,
hasHeader: true,
ignoreErrors: true,
chunkSize: 10000,
skipRows: 0,
sep: ",",
rechunk: false,
encoding: "utf8",
lowMemory: false,
tryParseDates: false,
skipRowsAfterHeader: 0,
raiseIfEmpty: true,
truncateRaggedLines: true,
missingIsNull: true,
eolChar: "\n",
};
const readJsonDefaultOptions = {
batchSize: 10000,
inferSchemaLength: 50,
format: "json",
};
// Utility Transform stream that batches incoming data into groups of complete lines.
class LineBatcher extends node_stream_1.Stream.Transform {
#lines;
#accumulatedLines;
#batchSize;
constructor(options) {
super(options);
this.#lines = [];
this.#accumulatedLines = 0;
this.#batchSize = options.batchSize;
}
    _transform(chunk, _encoding, done) {
        let begin = 0;
        let i = 0;
        while (i < chunk.length) {
            if (chunk[i] === 10) {
                // newline byte ('\n'): one more complete line seen in this chunk
                this.#accumulatedLines++;
                if (this.#accumulatedLines === this.#batchSize) {
                    // a full batch: emit the accumulated lines as one buffer and reset
                    this.#lines.push(chunk.subarray(begin, i + 1));
                    this.push(Buffer.concat(this.#lines));
                    this.#lines = [];
                    this.#accumulatedLines = 0;
                    begin = i + 1;
                }
            }
            i++;
        }
        // keep any trailing partial batch for the next chunk (flushed in _flush)
        this.#lines.push(chunk.subarray(begin));
        done();
    }
_flush(done) {
this.push(Buffer.concat(this.#lines));
done();
}
}
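// A minimal usage sketch of this internal helper (it is not exported; the stream readers
// below pipe through it). The file name is a placeholder:
//
//   const fs = require("node:fs");
//   fs.createReadStream("records.ndjson")
//     .pipe(new LineBatcher({ batchSize: 1000 }))
//     .on("data", (batch) => {
//       // `batch` is a Buffer containing up to 1000 complete lines
//     });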
// helper functions
function readCSVBuffer(buff, options) {
return (0, dataframe_1._DataFrame)(polars_internal_1.default.readCsv(buff, { ...readCsvDefaultOptions, ...options }));
}
function readRecords(records, options) {
if (options?.schema) {
return (0, dataframe_1._DataFrame)(polars_internal_1.default.fromRows(records, options.schema));
}
return (0, dataframe_1._DataFrame)(polars_internal_1.default.fromRows(records, undefined, options?.inferSchemaLength));
}
function readCSV(pathOrBody, options) {
options = { ...readCsvDefaultOptions, ...options };
const extensions = [".tsv", ".csv"];
    // If `inferSchemaLength` is explicitly set to `null`, coerce it to 0 for the native binding
options.inferSchemaLength = options.inferSchemaLength ?? 0;
if (Buffer.isBuffer(pathOrBody)) {
return (0, dataframe_1._DataFrame)(polars_internal_1.default.readCsv(pathOrBody, options));
}
if (typeof pathOrBody === "string") {
const inline = !(0, utils_1.isPath)(pathOrBody, extensions);
if (inline) {
const buf = Buffer.from(pathOrBody, "utf-8");
return (0, dataframe_1._DataFrame)(polars_internal_1.default.readCsv(buf, options));
}
return (0, dataframe_1._DataFrame)(polars_internal_1.default.readCsv(pathOrBody, options));
}
throw new Error("must supply either a path or body");
}
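// Usage sketch (illustrative, not part of this module). readCSV accepts a file path,
// an inline CSV body, or a Buffer; "data.csv" and the option values are placeholders:
//
//   const fromFile = readCSV("data.csv", { sep: ",", hasHeader: true });
//   const fromBody = readCSV("a,b\n1,2\n3,4");          // inline CSV body
//   const fromBuf = readCSV(Buffer.from("a,b\n1,2"));   // raw buffer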
const scanCsvDefaultOptions = {
inferSchemaLength: 100,
cache: true,
hasHeader: true,
ignoreErrors: true,
skipRows: 0,
sep: ",",
rechunk: false,
encoding: "utf8",
lowMemory: false,
parseDates: false,
skipRowsAfterHeader: 0,
};
function scanCSV(path, options) {
options = { ...scanCsvDefaultOptions, ...options };
    // If `inferSchemaLength` is explicitly set to `null`, coerce it to 0 for the native binding
options.inferSchemaLength = options.inferSchemaLength ?? 0;
return (0, dataframe_2._LazyDataFrame)(polars_internal_1.default.scanCsv(path, options));
}
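// Usage sketch (illustrative). scanCSV returns a LazyDataFrame; `collectSync()` is
// assumed from the lazy API and "data.csv" is a placeholder path:
//
//   const lazy = scanCSV("data.csv", { skipRows: 1 });
//   const collected = lazy.collectSync();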
function readJSON(pathOrBody, options = readJsonDefaultOptions) {
options = { ...readJsonDefaultOptions, ...options };
const method = options.format === "lines" ? polars_internal_1.default.readJsonLines : polars_internal_1.default.readJson;
const extensions = [".ndjson", ".json", ".jsonl"];
    // If `inferSchemaLength` is explicitly set to `null`, coerce it to 0 for the native binding
options.inferSchemaLength = options.inferSchemaLength ?? 0;
if (Buffer.isBuffer(pathOrBody)) {
        // honor the `format` option for buffers as well ("lines" -> readJsonLines)
        return (0, dataframe_1._DataFrame)(method(pathOrBody, options));
}
if (typeof pathOrBody === "string") {
const inline = !(0, utils_1.isPath)(pathOrBody, extensions);
if (inline) {
return (0, dataframe_1._DataFrame)(method(Buffer.from(pathOrBody, "utf-8"), options));
}
return (0, dataframe_1._DataFrame)(method(pathOrBody, options));
}
throw new Error("must supply either a path or body");
}
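// Usage sketch (illustrative). The `format` option selects between a JSON document and
// newline-delimited JSON ("lines"); the inline body and file name are placeholders:
//
//   const fromLines = readJSON('{"a":1}\n{"a":2}', { format: "lines" });
//   const fromFile = readJSON("records.json");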
function scanJson(path, options) {
options = { ...readJsonDefaultOptions, ...options };
    // If `inferSchemaLength` is explicitly set to `null`, coerce it to 0 for the native binding
options.inferSchemaLength = options.inferSchemaLength ?? 0;
return (0, dataframe_2._LazyDataFrame)(polars_internal_1.default.scanJson(path, options));
}
/**
 * Read into a DataFrame from a parquet file.
 * @param pathOrBody
 *    Path to a file, list of files, or a file-like object. If the path is a directory, that directory will be used
 *    for a partition-aware scan.
 * @param options.columns Columns to select. Accepts a list of column indices (starting at zero) or a list of column names.
 * @param options.numRows Stop reading from the parquet file after reading ``numRows``.
 * @param options.parallel
 *    Any of 'auto' | 'columns' | 'row_groups' | 'none'.
 *    This determines the direction of parallelism. 'auto' will try to determine the optimal direction.
 *    Defaults to 'auto'.
 * @param options.rowCount Add a row count column.
 */
function readParquet(pathOrBody, options) {
const pliOptions = {};
if (typeof options?.columns?.[0] === "number") {
pliOptions.projection = options?.columns;
}
else {
pliOptions.columns = options?.columns;
}
pliOptions.nRows = options?.numRows;
pliOptions.rowCount = options?.rowCount;
const parallel = options?.parallel ?? "auto";
if (Buffer.isBuffer(pathOrBody)) {
return (0, dataframe_1._DataFrame)(polars_internal_1.default.readParquet(pathOrBody, pliOptions, parallel));
}
if (typeof pathOrBody === "string") {
const inline = !(0, utils_1.isPath)(pathOrBody, [".parquet"]);
if (inline) {
return (0, dataframe_1._DataFrame)(polars_internal_1.default.readParquet(Buffer.from(pathOrBody), pliOptions, parallel));
}
return (0, dataframe_1._DataFrame)(polars_internal_1.default.readParquet(pathOrBody, pliOptions, parallel));
}
throw new Error("must supply either a path or body");
}
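// Usage sketch for the options documented above (illustrative; "data.parquet" and the
// column names are placeholders):
//
//   const df = readParquet("data.parquet", {
//     columns: ["foo", "bar"],   // or column indices, e.g. [0, 1]
//     numRows: 100,
//     parallel: "auto",
//   });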
function readAvro(pathOrBody, options = {}) {
if (Buffer.isBuffer(pathOrBody)) {
return (0, dataframe_1._DataFrame)(polars_internal_1.default.readAvro(pathOrBody, options));
}
if (typeof pathOrBody === "string") {
const inline = !(0, utils_1.isPath)(pathOrBody, [".avro"]);
if (inline) {
return (0, dataframe_1._DataFrame)(polars_internal_1.default.readAvro(Buffer.from(pathOrBody), options));
}
return (0, dataframe_1._DataFrame)(polars_internal_1.default.readAvro(pathOrBody, options));
}
throw new Error("must supply either a path or body");
}
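// Usage sketch (illustrative; "data.avro" is a placeholder path):
//
//   const avroDf = readAvro("data.avro");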
/**
 * Lazily read from a local or cloud-hosted parquet file (or files).
 *
 * This allows the query optimizer to push down predicates and projections to the scan level,
 * typically increasing performance and reducing memory overhead.
 * @param source - Path(s) to a file. If a single path is given, it can be a globbing pattern.
 * @param options.nRows - Stop reading from the parquet file after reading `nRows`.
 * @param options.rowIndexName - If not null, this will insert a row index column with the given name into the DataFrame.
 * @param options.rowIndexOffset - Offset to start the row index column (only used if the name is set).
 * @param options.parallel - One of 'auto' | 'columns' | 'row_groups' | 'none'.
 *    This determines the direction of parallelism. 'auto' will try to determine the optimal direction.
 * @param options.useStatistics - Use statistics in the parquet file to determine if pages can be skipped from reading.
 * @param options.hivePartitioning - Infer statistics and schema from Hive-partitioned URLs and use them to prune reads.
 * @param options.glob - Expand paths given via globbing rules.
 * @param options.hiveSchema - The column names and data types of the columns by which the data is partitioned.
 *    If set to `null` (default), the schema of the Hive partitions is inferred.
 * @param options.tryParseHiveDates - Whether to try parsing Hive values as date/datetime types.
 * @param options.rechunk - When reading multiple files via a glob pattern, rechunk the final DataFrame into contiguous memory chunks.
 * @param options.lowMemory - Reduce memory pressure at the expense of performance.
 * @param options.cache - Cache the result after reading.
 * @param options.cloudOptions - Options that indicate how to connect to a cloud provider.
 *    The cloud providers currently supported are AWS, GCP, and Azure.
 *    See supported keys here:
 *    - aws: https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html
 *    - gcp: https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html
 *    - azure: https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html
 *    If `cloudOptions` is not provided, Polars will try to infer the information from environment variables.
 * @param options.retries - Number of retries if accessing a cloud instance fails.
 * @param options.includeFilePaths - Include the path of the source file(s) as a column with this name.
 */
function scanParquet(source, options = {}) {
const defaultOptions = { parallel: "auto" };
const pliOptions = { ...defaultOptions, ...options };
return (0, dataframe_2._LazyDataFrame)(polars_internal_1.default.scanParquet(source, pliOptions));
}
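// Usage sketch for the options documented above (illustrative; the glob pattern is a
// placeholder and `collect()` resolving to a DataFrame is assumed from the lazy API):
//
//   scanParquet("data/*.parquet", { parallel: "columns" })
//     .collect()
//     .then((df) => console.log(df.shape));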
function readIPC(pathOrBody, options = {}) {
if (Buffer.isBuffer(pathOrBody)) {
return (0, dataframe_1._DataFrame)(polars_internal_1.default.readIpc(pathOrBody, options));
}
if (typeof pathOrBody === "string") {
const inline = !(0, utils_1.isPath)(pathOrBody, [".ipc"]);
if (inline) {
return (0, dataframe_1._DataFrame)(polars_internal_1.default.readIpc(Buffer.from(pathOrBody, "utf-8"), options));
}
return (0, dataframe_1._DataFrame)(polars_internal_1.default.readIpc(pathOrBody, options));
}
throw new Error("must supply either a path or body");
}
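// Usage sketch (illustrative; "data.ipc" is a placeholder path):
//
//   const ipcDf = readIPC("data.ipc");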
function readIPCStream(pathOrBody, options = {}) {
if (Buffer.isBuffer(pathOrBody)) {
return (0, dataframe_1._DataFrame)(polars_internal_1.default.readIpcStream(pathOrBody, options));
}
if (typeof pathOrBody === "string") {
const inline = !(0, utils_1.isPath)(pathOrBody, [".ipc"]);
if (inline) {
return (0, dataframe_1._DataFrame)(polars_internal_1.default.readIpcStream(Buffer.from(pathOrBody, "utf-8"), options));
}
return (0, dataframe_1._DataFrame)(polars_internal_1.default.readIpcStream(pathOrBody, options));
}
throw new Error("must supply either a path or body");
}
function scanIPC(path, options = {}) {
return (0, dataframe_2._LazyDataFrame)(polars_internal_1.default.scanIpc(path, options));
}
function readCSVStream(stream, options) {
const batchSize = options?.batchSize ?? 10000;
let count = 0;
const end = options?.endRows ?? Number.POSITIVE_INFINITY;
return new Promise((resolve, reject) => {
const s = stream.pipe(new LineBatcher({ batchSize }));
const chunks = [];
s.on("data", (chunk) => {
// early abort if 'end rows' is specified
if (count <= end) {
chunks.push(chunk);
}
else {
s.end();
}
count += batchSize;
}).on("end", () => {
try {
const buff = Buffer.concat(chunks);
const df = readCSVBuffer(buff, options);
resolve(df);
}
catch (err) {
reject(err);
}
});
});
}
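// Usage sketch (illustrative; "data.csv" and the option values are placeholders).
// The returned promise resolves once the whole stream has been batched and parsed:
//
//   const fs = require("node:fs");
//   readCSVStream(fs.createReadStream("data.csv"), { batchSize: 5000 })
//     .then((df) => console.log(df.head()));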
function readJSONStream(stream, options = readJsonDefaultOptions) {
options = { ...readJsonDefaultOptions, ...options };
return new Promise((resolve, reject) => {
const chunks = [];
stream
.pipe(new LineBatcher({ batchSize: options.batchSize }))
.on("data", (chunk) => {
try {
const df = (0, dataframe_1._DataFrame)(polars_internal_1.default.readJson(chunk, options));
chunks.push(df);
}
catch (err) {
reject(err);
}
})
.on("end", () => {
try {
const df = (0, functions_1.concat)(chunks);
resolve(df);
}
catch (err) {
reject(err);
}
});
});
}
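// Usage sketch (illustrative; "records.ndjson" is a placeholder and `format: "lines"`
// is an assumption for newline-delimited input). Each batch of lines is parsed into a
// DataFrame and the results are concatenated:
//
//   const fs = require("node:fs");
//   readJSONStream(fs.createReadStream("records.ndjson"), { format: "lines" })
//     .then((df) => console.log(df.shape));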