/*
 * nodejs-polars
 * Version:
 * Polars: Blazingly fast DataFrames in Rust, Python, Node.js, R and SQL
 * 760 lines (759 loc) • 29.3 kB
 * JavaScript
 */
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.DataFrame = exports._DataFrame = void 0;
const node_stream_1 = require("node:stream");
const functions_1 = require("./functions");
const groupby_1 = require("./groupby");
const construction_1 = require("./internals/construction");
const polars_internal_1 = __importDefault(require("./internals/polars_internal"));
const dataframe_1 = require("./lazy/dataframe");
const expr_1 = require("./lazy/expr");
const series_1 = require("./series");
const datatypes_1 = require("./datatypes");
const utils_1 = require("./utils");
const html_1 = require("./html");
const functions_2 = require("./lazy/functions");
const inspect = Symbol.for("nodejs.util.inspect.custom");
const jupyterDisplay = Symbol.for("Jupyter.display");
/**
 * Normalises the right-hand operand of an arithmetic call: a Series is
 * returned untouched, any other value is wrapped in a one-element Series.
 */
function prepareOtherArg(anyValue) {
    const alreadySeries = series_1.Series.isSeries(anyValue);
    return alreadySeries ? anyValue : (0, series_1.Series)([anyValue]);
}
/**
 * Runs `fn` over the array returned by `df.rows()` (via Array.prototype.map)
 * and returns the collected results.
 */
function map(df, fn) {
    const allRows = df.rows();
    return allRows.map(fn);
}
/**
 * Translates a column data type into the type name emitted in the `fields`
 * list of the data-resource schema.
 *
 * @param colType - data type object; only its `variant` property is read.
 * @returns the mapped type name, or "string" for any variant without an entry.
 */
function mapPolarsTypeToJSONSchema(colType) {
    const typeMapping = {
        Null: "null",
        Bool: "boolean",
        Int8: "integer",
        Int16: "integer",
        Int32: "integer",
        Int64: "integer",
        UInt8: "integer",
        UInt16: "integer",
        UInt32: "integer",
        UInt64: "integer",
        Float32: "number",
        Float64: "number",
        Date: "string",
        Datetime: "string",
        Utf8: "string",
        Categorical: "string",
        List: "array",
        Struct: "object",
    };
    const dataType = colType.variant;
    // Own-property check: a plain `typeMapping[dataType]` lookup would resolve
    // names such as "toString" or "constructor" to inherited functions.
    if (Object.prototype.hasOwnProperty.call(typeMapping, dataType)) {
        return typeMapping[dataType];
    }
    return "string";
}
/**
* @ignore
*/
const _DataFrame = (_df) => {
// Calls the named method on the underlying `_df` handle, forwarding all
// arguments, and hands back whatever that method returns.
const unwrap = (method, ...args) => _df[method](...args);
// Like `unwrap`, but passes the result through `_DataFrame` before returning.
const wrap = (method, ...args) => {
    const result = unwrap(method, ...args);
    return (0, exports._DataFrame)(result);
};
const df = {
/** @ignore */
_df,
[inspect]() {
return _df.toString();
},
*[Symbol.iterator]() {
let start = 0;
const len = this.width;
while (start < len) {
const s = this.toSeries(start);
start++;
yield s;
}
},
get [Symbol.toStringTag]() {
return "DataFrame";
},
get dtypes() {
return _df.dtypes().map(datatypes_1.DataType.deserialize);
},
get height() {
return _df.height;
},
get width() {
return _df.width;
},
get shape() {
return _df.shape;
},
get columns() {
return _df.columns;
},
set columns(names) {
_df.columns = names;
},
/**
* Return back text/html and application/vnd.dataresource+json representations
* of the DataFrame. This is intended to be a simple view of the DataFrame
* inside of notebooks.
*
* @returns Media bundle / mimetype keys for Jupyter frontends
*/
[jupyterDisplay]() {
let rows = 50;
if (process.env.POLARS_FMT_MAX_ROWS) {
rows = Number.parseInt(process.env.POLARS_FMT_MAX_ROWS);
}
const limited = this.limit(rows);
return {
"application/vnd.dataresource+json": limited.toDataResource(),
"text/html": limited.toHTML(),
};
},
get schema() {
return this.getColumns().reduce((acc, curr) => {
acc[curr.name] = curr.dtype;
return acc;
}, {});
},
clone() {
return wrap("clone");
},
describe() {
const describeCast = (df) => {
return (0, exports.DataFrame)(df.getColumns().map((s) => {
if (s.isNumeric() || s.isBoolean()) {
return s.cast(datatypes_1.DataType.Float64);
}
return s;
}));
};
const summary = (0, functions_1.concat)([
describeCast(this.mean()),
describeCast(this.std()),
describeCast(this.min()),
describeCast(this.max()),
describeCast(this.median()),
]);
summary.insertAtIdx(0, (0, series_1.Series)("describe", ["mean", "std", "min", "max", "median"]));
return summary;
},
inner() {
return _df;
},
drop(...names) {
if (!Array.isArray(names[0]) && names.length === 1) {
return wrap("drop", names[0]);
}
const df = this.clone();
for (const name of names.flat(2)) {
df.inner().dropInPlace(name);
}
return df;
},
dropNulls(...subset) {
if (subset.length) {
return wrap("dropNulls", subset.flat(2));
}
return wrap("dropNulls");
},
unique(opts = false, subset, keep = "first") {
const defaultOptions = {
maintainOrder: false,
keep,
};
if (typeof opts === "boolean") {
return wrap("unique", opts, subset, keep);
}
if (opts.subset) {
opts.subset = [opts.subset].flat(3);
}
const o = { ...defaultOptions, ...opts };
return wrap("unique", o.maintainOrder, o.subset, o.keep);
},
explode(...columns) {
return (0, exports._DataFrame)(_df)
.lazy()
.explode(columns)
.collectSync({ noOptimization: true });
},
extend(other) {
return wrap("extend", other.inner());
},
filter(predicate) {
return this.lazy().filter(predicate).collectSync();
},
fillNull(strategy) {
return wrap("fillNull", strategy);
},
findIdxByName(name) {
return unwrap("findIdxByName", name);
},
fold(fn) {
if (this.width === 1) {
return this.toSeries(0);
}
return this.getColumns().reduce((acc, curr) => fn(acc, curr));
},
frameEqual(other, nullEqual = true) {
return unwrap("frameEqual", other._df, nullEqual);
},
getColumn(name) {
return (0, series_1._Series)(_df.column(name));
},
getColumns() {
return _df.getColumns().map(series_1._Series);
},
groupBy(...by) {
return (0, groupby_1._GroupBy)(_df, (0, utils_1.columnOrColumnsStrict)(by));
},
groupByRolling(opts) {
return (0, groupby_1.RollingGroupBy)((0, exports._DataFrame)(_df), opts.indexColumn, opts.period, opts.offset, opts.closed, opts.by, opts.check_sorted);
},
groupByDynamic({ indexColumn, every, period, offset, includeBoundaries, closed, by, }) {
return (0, groupby_1.DynamicGroupBy)((0, exports._DataFrame)(_df), indexColumn, every, period, offset, includeBoundaries, closed, by);
},
upsample(opts, every, by, maintainOrder) {
let timeColumn;
if (typeof opts === "string") {
timeColumn = opts;
}
else {
timeColumn = opts.timeColumn;
by = opts.by;
every = opts.every;
maintainOrder = opts.maintainOrder ?? false;
}
if (typeof by === "string") {
by = [by];
}
else {
by = by ?? [];
}
return (0, exports._DataFrame)(_df.upsample(by, timeColumn, every, maintainOrder));
},
hashRows(obj = 0n, k1 = 1n, k2 = 2n, k3 = 3n) {
if (typeof obj === "number" || typeof obj === "bigint") {
return (0, series_1._Series)(_df.hashRows(BigInt(obj), BigInt(k1), BigInt(k2), BigInt(k3)));
}
const o = { k0: obj, k1: k1, k2: k2, k3: k3, ...obj };
return (0, series_1._Series)(_df.hashRows(BigInt(o.k0), BigInt(o.k1), BigInt(o.k2), BigInt(o.k3)));
},
head(length = 5) {
return wrap("head", length);
},
hstack(columns, inPlace = false) {
if (!Array.isArray(columns)) {
columns = columns.getColumns();
}
const method = inPlace ? "hstackMut" : "hstack";
return wrap(method, columns.map((col) => col.inner()));
},
insertAtIdx(idx, series) {
_df.insertAtIdx(idx, series.inner());
},
interpolate() {
return this.select((0, functions_2.col)("*").interpolate());
},
isDuplicated: () => (0, series_1._Series)(_df.isDuplicated()),
isEmpty: () => _df.height === 0,
isUnique: () => (0, series_1._Series)(_df.isUnique()),
join(other, options) {
options = { how: "inner", ...options };
const on = (0, utils_1.columnOrColumns)(options.on);
const how = options.how;
const suffix = options.suffix;
if (how === "cross") {
return (0, exports._DataFrame)(_df.join(other._df, [], [], how, suffix));
}
let leftOn = (0, utils_1.columnOrColumns)(options.leftOn);
let rightOn = (0, utils_1.columnOrColumns)(options.rightOn);
if (on) {
leftOn = on;
rightOn = on;
}
if ((leftOn && !rightOn) || (rightOn && !leftOn)) {
throw new TypeError("You should pass the column to join on as an argument.");
}
return wrap("join", other._df, leftOn, rightOn, how, suffix);
},
joinAsof(other, options) {
return this.lazy()
.joinAsof(other.lazy(), options)
.collectSync();
},
lazy: () => (0, dataframe_1._LazyDataFrame)(_df.lazy()),
limit: (length = 5) => wrap("head", length),
max(axis = 0) {
if (axis === 1) {
return (0, series_1._Series)(_df.hmax());
}
return this.lazy().max().collectSync();
},
mean(axis = 0, nullStrategy = "ignore") {
if (axis === 1) {
return (0, series_1._Series)(_df.hmean(nullStrategy));
}
return this.lazy().mean().collectSync();
},
median() {
return this.lazy().median().collectSync();
},
unpivot(ids, values) {
return wrap("unpivot", (0, utils_1.columnOrColumns)(ids), (0, utils_1.columnOrColumns)(values));
},
min(axis = 0) {
if (axis === 1) {
return (0, series_1._Series)(_df.hmin());
}
return this.lazy().min().collectSync();
},
nChunks() {
return _df.nChunks();
},
nullCount() {
return wrap("nullCount");
},
partitionBy(by, strict = false, includeKey = true, mapFn = (df) => df) {
by = Array.isArray(by) ? by : [by];
return _df
.partitionBy(by, strict, includeKey)
.map((d) => mapFn((0, exports._DataFrame)(d)));
},
pivot(values, options) {
let { values: values_, index, on, maintainOrder = true, sortColumns = false, aggregateFunc = "first", separator, } = options;
values = values_ ?? values;
values = typeof values === "string" ? [values] : values;
index = typeof index === "string" ? [index] : index;
on = typeof on === "string" ? [on] : on;
let fn;
if (expr_1.Expr.isExpr(aggregateFunc)) {
fn = aggregateFunc;
}
else {
fn =
{
first: (0, functions_2.element)().first(),
sum: (0, functions_2.element)().sum(),
max: (0, functions_2.element)().max(),
min: (0, functions_2.element)().min(),
mean: (0, functions_2.element)().mean(),
median: (0, functions_2.element)().median(),
last: (0, functions_2.element)().last(),
count: (0, functions_2.element)().count(),
}[aggregateFunc] ??
new Error(`Unknown aggregate function ${aggregateFunc}`);
if (fn instanceof Error) {
throw fn;
}
}
return (0, exports._DataFrame)(_df.pivotExpr(values, on, index, fn, maintainOrder, sortColumns, separator));
},
quantile(quantile) {
return this.lazy().quantile(quantile).collectSync();
},
rechunk() {
return wrap("rechunk");
},
rename(mapping) {
const df = this.clone();
for (const [column, new_col] of Object.entries(mapping)) {
df.inner().rename(column, new_col);
}
return df;
},
replaceAtIdx(index, newColumn) {
_df.replaceAtIdx(index, newColumn.inner());
return this;
},
rows(callback) {
if (callback) {
return _df.toRowsCb(callback);
}
return _df.toRows();
},
sample(opts, frac, withReplacement = false, seed) {
// biome-ignore lint/style/noArguments: <explanation>
if (arguments.length === 0) {
return wrap("sampleN", (0, series_1.Series)("", [1]).inner(), withReplacement, false, seed);
}
if (opts?.n !== undefined || opts?.frac !== undefined) {
return this.sample(opts.n, opts.frac, opts.withReplacement, seed);
}
if (typeof opts === "number") {
return wrap("sampleN", (0, series_1.Series)("", [opts]).inner(), withReplacement, false, seed);
}
if (typeof frac === "number") {
return wrap("sampleFrac", (0, series_1.Series)("", [frac]).inner(), withReplacement, false, seed);
}
throw new TypeError("must specify either 'frac' or 'n'");
},
select(...selection) {
const hasExpr = selection.flat().some((s) => expr_1.Expr.isExpr(s));
if (hasExpr) {
return (0, exports._DataFrame)(_df).lazy().select(selection).collectSync();
}
return wrap("select", (0, utils_1.columnOrColumnsStrict)(selection));
},
shift: (opt) => wrap("shift", opt?.periods ?? opt),
shiftAndFill(n, fillValue) {
if (typeof n === "number" && fillValue) {
return (0, exports._DataFrame)(_df).lazy().shiftAndFill(n, fillValue).collectSync();
}
return (0, exports._DataFrame)(_df)
.lazy()
.shiftAndFill(n.n, n.fillValue)
.collectSync();
},
shrinkToFit(inPlace = false) {
if (inPlace) {
_df.shrinkToFit();
}
else {
const d = this.clone();
d.inner().shrinkToFit();
return d;
}
},
slice(opts, length) {
if (typeof opts === "number") {
return wrap("slice", opts, length);
}
return wrap("slice", opts.offset, opts.length);
},
sort(arg, descending = false, nullsLast = false, maintainOrder = false) {
if (arg?.by !== undefined) {
return this.sort(arg.by, arg.descending ?? arg.reverse ?? false, arg.nullsLast, arg.maintainOrder);
}
if (Array.isArray(arg) || expr_1.Expr.isExpr(arg)) {
return (0, exports._DataFrame)(_df)
.lazy()
.sort(arg, descending, nullsLast, maintainOrder)
.collectSync({ noOptimization: true });
}
return wrap("sort", arg, descending, nullsLast, maintainOrder);
},
std() {
return this.lazy().std().collectSync();
},
sum(axis = 0, nullStrategy = "ignore") {
if (axis === 1) {
return (0, series_1._Series)(_df.hsum(nullStrategy));
}
return this.lazy().sum().collectSync();
},
tail: (length = 5) => wrap("tail", length),
serialize(format) {
return _df.serialize(format);
},
writeCSV(dest, options = {}) {
if (dest instanceof node_stream_1.Writable || typeof dest === "string") {
return _df.writeCsv(dest, options);
}
const buffers = [];
const writeStream = new node_stream_1.Stream.Writable({
write(chunk, _encoding, callback) {
buffers.push(chunk);
callback(null);
},
});
_df.writeCsv(writeStream, dest ?? options);
writeStream.end("");
return Buffer.concat(buffers);
},
toRecords() {
return _df.toObjects();
},
toJSON(...args) {
// this is passed by `JSON.stringify` when calling `toJSON()`
if (args[0] === "") {
return _df.toJs();
}
return _df.serialize("json").toString();
},
toHTML() {
let htmlTable = "<table>";
// Add table headers
htmlTable += "<thead><tr>";
for (const field of this.getColumns()) {
htmlTable += `<th>${(0, html_1.escapeHTML)(field.name)}</th>`;
}
htmlTable += "</tr></thead>";
// Add table data
htmlTable += "<tbody>";
for (const row of this.toRecords()) {
htmlTable += "<tr>";
for (const field of this.getColumns()) {
htmlTable += `<td>${(0, html_1.escapeHTML)(String(row[field.name]))}</td>`;
}
htmlTable += "</tr>";
}
htmlTable += "</tbody></table>";
return htmlTable;
},
toDataResource() {
const data = this.toRecords();
const fields = this.getColumns().map((column) => ({
name: column.name,
type: mapPolarsTypeToJSONSchema(column.dtype),
}));
return { data, schema: { fields } };
},
toObject() {
return this.getColumns().reduce((acc, curr) => {
acc[curr.name] = curr.toArray();
return acc;
}, {});
},
writeJSON(dest, options = { format: "lines" }) {
if (dest instanceof node_stream_1.Writable || typeof dest === "string") {
return _df.writeJson(dest, options);
}
const buffers = [];
const writeStream = new node_stream_1.Stream.Writable({
write(chunk, _encoding, callback) {
buffers.push(chunk);
callback(null);
},
});
_df.writeJson(writeStream, { ...options, ...dest });
writeStream.end("");
return Buffer.concat(buffers);
},
writeParquet(dest, options = { compression: "uncompressed" }) {
if (dest instanceof node_stream_1.Writable || typeof dest === "string") {
return _df.writeParquet(dest, options.compression);
}
const buffers = [];
const writeStream = new node_stream_1.Stream.Writable({
write(chunk, _encoding, callback) {
buffers.push(chunk);
callback(null);
},
});
_df.writeParquet(writeStream, dest?.compression ?? options?.compression);
writeStream.end("");
return Buffer.concat(buffers);
},
writeAvro(dest, options = { compression: "uncompressed" }) {
if (dest instanceof node_stream_1.Writable || typeof dest === "string") {
return _df.writeAvro(dest, options.compression);
}
const buffers = [];
const writeStream = new node_stream_1.Stream.Writable({
write(chunk, _encoding, callback) {
buffers.push(chunk);
callback(null);
},
});
_df.writeAvro(writeStream, dest?.compression ?? options?.compression);
writeStream.end("");
return Buffer.concat(buffers);
},
writeIPC(dest, options = { compression: "uncompressed" }) {
if (dest instanceof node_stream_1.Writable || typeof dest === "string") {
return _df.writeIpc(dest, options.compression);
}
const buffers = [];
const writeStream = new node_stream_1.Stream.Writable({
write(chunk, _encoding, callback) {
buffers.push(chunk);
callback(null);
},
});
_df.writeIpc(writeStream, dest?.compression ?? options?.compression);
writeStream.end("");
return Buffer.concat(buffers);
},
writeIPCStream(dest, options = { compression: "uncompressed" }) {
if (dest instanceof node_stream_1.Writable || typeof dest === "string") {
return _df.writeIpcStream(dest, options.compression);
}
const buffers = [];
const writeStream = new node_stream_1.Stream.Writable({
write(chunk, _encoding, callback) {
buffers.push(chunk);
callback(null);
},
});
_df.writeIpcStream(writeStream, dest?.compression ?? options?.compression);
writeStream.end("");
return Buffer.concat(buffers);
},
toSeries: (index = 0) => (0, series_1._Series)(_df.selectAtIdx(index)),
toStruct(name) {
return (0, series_1._Series)(_df.toStruct(name));
},
toString() {
return _df.toString();
},
transpose(options) {
const includeHeader = options?.includeHeader ?? false;
const headeName = options?.headerName ?? "column";
const keep_names_as = includeHeader ? headeName : undefined;
if (options?.columnNames) {
function takeNItems(iterable, n) {
const result = [];
let i = 0;
for (const item of iterable) {
if (i >= n) {
break;
}
result.push(item);
i++;
}
return result;
}
options.columnNames = Array.isArray(options.columnNames)
? options.columnNames.slice(0, this.height)
: takeNItems(options.columnNames, this.height);
}
if (!options?.columnNames) {
return wrap("transpose", keep_names_as, undefined);
}
return wrap("transpose", keep_names_as, options.columnNames);
},
unnest(names) {
names = Array.isArray(names) ? names : [names];
return (0, exports._DataFrame)(_df.unnest(names));
},
var() {
return this.lazy().var().collectSync();
},
map: (fn) => map((0, exports._DataFrame)(_df), fn),
row(idx) {
return _df.toRow(idx);
},
vstack: (other) => wrap("vstack", other.inner()),
withColumn(column) {
if (series_1.Series.isSeries(column)) {
return wrap("withColumn", column.inner());
}
return this.withColumns(column);
},
withColumns(...columns) {
if ((0, utils_1.isSeriesArray)(columns)) {
return columns.reduce((acc, curr) => acc.withColumn(curr), (0, exports._DataFrame)(_df));
}
return this.lazy()
.withColumns(columns)
.collectSync({ noOptimization: true });
},
withColumnRenamed(opt, replacement) {
if (typeof opt === "string") {
return this.rename({ [opt]: replacement });
}
return this.rename({ [opt.existing]: opt.replacement });
},
withRowCount(name = "row_nr") {
return wrap("withRowCount", name);
},
where(predicate) {
return this.filter(predicate);
},
add: (other) => wrap("add", prepareOtherArg(other).inner()),
sub: (other) => wrap("sub", prepareOtherArg(other).inner()),
div: (other) => wrap("div", prepareOtherArg(other).inner()),
mul: (other) => wrap("mul", prepareOtherArg(other).inner()),
rem: (other) => wrap("rem", prepareOtherArg(other).inner()),
plus: (other) => wrap("add", prepareOtherArg(other).inner()),
minus: (other) => wrap("sub", prepareOtherArg(other).inner()),
divideBy: (other) => wrap("div", prepareOtherArg(other).inner()),
multiplyBy: (other) => wrap("mul", prepareOtherArg(other).inner()),
modulo: (other) => wrap("rem", prepareOtherArg(other).inner()),
};
return new Proxy(df, {
get(target, prop, receiver) {
if (typeof prop === "string" && target.columns.includes(prop)) {
return target.getColumn(prop);
}
if (typeof prop !== "symbol" && !Number.isNaN(Number(prop))) {
return target.row(Number(prop));
}
return Reflect.get(target, prop, receiver);
},
set(target, prop, receiver) {
if (series_1.Series.isSeries(receiver)) {
if (typeof prop === "string" && target.columns.includes(prop)) {
const idx = target.columns.indexOf(prop);
target.replaceAtIdx(idx, receiver.alias(prop));
return true;
}
}
Reflect.set(target, prop, receiver);
return true;
},
has(target, p) {
if (p === jupyterDisplay) {
return true;
}
return target.columns.includes(p);
},
ownKeys(target) {
return target.columns;
},
getOwnPropertyDescriptor(target, prop) {
return {
configurable: true,
enumerable: true,
value: target.getColumn(prop),
};
},
});
};
exports._DataFrame = _DataFrame;
/**
 * Builds a DataFrame from user data.
 *
 * - falsy `data`: an empty frame;
 * - array `data`: converted with `arrayToJsDataFrame(data, options)`;
 * - anything else: treated as a column-name to values object.
 */
function DataFrameConstructor(data, options) {
    let handle;
    if (!data) {
        handle = objToDF({});
    }
    else if (Array.isArray(data)) {
        handle = (0, construction_1.arrayToJsDataFrame)(data, options);
    }
    else {
        handle = objToDF(data, options);
    }
    return (0, exports._DataFrame)(handle);
}
/**
 * Converts a `{ columnName: values }` object into a JsDataFrame handle.
 *
 * - `options.schema`: its keys (in order) name the columns and its values
 *   give the dtypes; columns are matched to `obj`'s values by position, so
 *   the two must have the same number of entries.
 * - `options.schemaOverrides`: dtype overrides looked up by `obj`'s own keys.
 *   Values that are already Series are renamed to the key and used as-is.
 *
 * @throws Error when both `schema` and `schemaOverrides` are supplied, or
 *   when `schema` and `obj` differ in column count.
 */
function objToDF(obj, options) {
    const schema = options?.schema;
    const overrides = options?.schemaOverrides;
    if (schema && overrides) {
        throw new Error("Cannot use both 'schema' and 'schemaOverrides'");
    }
    let columns;
    if (schema) {
        // explicit schema
        const names = Object.keys(schema);
        const data = Object.values(obj);
        if (names.length !== data.length) {
            throw new Error("The number of columns in the schema does not match the number of columns in the data");
        }
        columns = data.map((columnValues, position) => {
            const name = names[position];
            return (0, series_1.Series)(name, columnValues, schema[name]).inner();
        });
    }
    else {
        columns = Object.entries(obj).map(([name, columnValues]) => {
            if (series_1.Series.isSeries(columnValues)) {
                return columnValues.rename(name).inner();
            }
            // schema overrides
            const dtype = overrides ? overrides[name] : undefined;
            return dtype
                ? (0, series_1.Series)(name, columnValues, dtype).inner()
                : (0, series_1.Series)(name, columnValues).inner();
        });
    }
    return new polars_internal_1.default.JsDataFrame(columns);
}
// True when the value's Symbol.toStringTag is "DataFrame"; null/undefined are never frames.
const isDataFrame = (anyVal) => {
    if (anyVal === null || anyVal === undefined) {
        return false;
    }
    return anyVal[Symbol.toStringTag] === "DataFrame";
};
exports.DataFrame = Object.assign(DataFrameConstructor, {
isDataFrame,
deserialize: (buf, fmt) => (0, exports._DataFrame)(polars_internal_1.default.JsDataFrame.deserialize(buf, fmt)),
});