UNPKG

nodejs-polars

Version:

Polars: Blazingly fast DataFrames in Rust, Python, Node.js, R and SQL

760 lines (759 loc) 29.3 kB
"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.DataFrame = exports._DataFrame = void 0; const node_stream_1 = require("node:stream"); const functions_1 = require("./functions"); const groupby_1 = require("./groupby"); const construction_1 = require("./internals/construction"); const polars_internal_1 = __importDefault(require("./internals/polars_internal")); const dataframe_1 = require("./lazy/dataframe"); const expr_1 = require("./lazy/expr"); const series_1 = require("./series"); const datatypes_1 = require("./datatypes"); const utils_1 = require("./utils"); const html_1 = require("./html"); const functions_2 = require("./lazy/functions"); const inspect = Symbol.for("nodejs.util.inspect.custom"); const jupyterDisplay = Symbol.for("Jupyter.display"); function prepareOtherArg(anyValue) { if (series_1.Series.isSeries(anyValue)) { return anyValue; } return (0, series_1.Series)([anyValue]); } function map(df, fn) { return df.rows().map(fn); } function mapPolarsTypeToJSONSchema(colType) { const typeMapping = { Null: "null", Bool: "boolean", Int8: "integer", Int16: "integer", Int32: "integer", Int64: "integer", UInt8: "integer", UInt16: "integer", UInt32: "integer", UInt64: "integer", Float32: "number", Float64: "number", Date: "string", Datetime: "string", Utf8: "string", Categorical: "string", List: "array", Struct: "object", }; const dataType = colType.variant; return typeMapping[dataType] || "string"; } /** * @ignore */ const _DataFrame = (_df) => { const unwrap = (method, ...args) => { return _df[method](...args); }; const wrap = (method, ...args) => { return (0, exports._DataFrame)(unwrap(method, ...args)); }; const df = { /** @ignore */ _df, [inspect]() { return _df.toString(); }, *[Symbol.iterator]() { let start = 0; const len = this.width; while (start < len) { const s = this.toSeries(start); start++; yield s; } }, get [Symbol.toStringTag]() { return "DataFrame"; }, get dtypes() { return _df.dtypes().map(datatypes_1.DataType.deserialize); }, get height() { return _df.height; }, get width() { return _df.width; }, get shape() { return _df.shape; }, get columns() { return _df.columns; }, set columns(names) { _df.columns = names; }, /** * Return back text/html and application/vnd.dataresource+json representations * of the DataFrame. This is intended to be a simple view of the DataFrame * inside of notebooks. * * @returns Media bundle / mimetype keys for Jupyter frontends */ [jupyterDisplay]() { let rows = 50; if (process.env.POLARS_FMT_MAX_ROWS) { rows = Number.parseInt(process.env.POLARS_FMT_MAX_ROWS); } const limited = this.limit(rows); return { "application/vnd.dataresource+json": limited.toDataResource(), "text/html": limited.toHTML(), }; }, get schema() { return this.getColumns().reduce((acc, curr) => { acc[curr.name] = curr.dtype; return acc; }, {}); }, clone() { return wrap("clone"); }, describe() { const describeCast = (df) => { return (0, exports.DataFrame)(df.getColumns().map((s) => { if (s.isNumeric() || s.isBoolean()) { return s.cast(datatypes_1.DataType.Float64); } return s; })); }; const summary = (0, functions_1.concat)([ describeCast(this.mean()), describeCast(this.std()), describeCast(this.min()), describeCast(this.max()), describeCast(this.median()), ]); summary.insertAtIdx(0, (0, series_1.Series)("describe", ["mean", "std", "min", "max", "median"])); return summary; }, inner() { return _df; }, drop(...names) { if (!Array.isArray(names[0]) && names.length === 1) { return wrap("drop", names[0]); } const df = this.clone(); for (const name of names.flat(2)) { df.inner().dropInPlace(name); } return df; }, dropNulls(...subset) { if (subset.length) { return wrap("dropNulls", subset.flat(2)); } return wrap("dropNulls"); }, unique(opts = false, subset, keep = "first") { const defaultOptions = { maintainOrder: false, keep, }; if (typeof opts === "boolean") { return wrap("unique", opts, subset, keep); } if (opts.subset) { opts.subset = [opts.subset].flat(3); } const o = { ...defaultOptions, ...opts }; return wrap("unique", o.maintainOrder, o.subset, o.keep); }, explode(...columns) { return (0, exports._DataFrame)(_df) .lazy() .explode(columns) .collectSync({ noOptimization: true }); }, extend(other) { return wrap("extend", other.inner()); }, filter(predicate) { return this.lazy().filter(predicate).collectSync(); }, fillNull(strategy) { return wrap("fillNull", strategy); }, findIdxByName(name) { return unwrap("findIdxByName", name); }, fold(fn) { if (this.width === 1) { return this.toSeries(0); } return this.getColumns().reduce((acc, curr) => fn(acc, curr)); }, frameEqual(other, nullEqual = true) { return unwrap("frameEqual", other._df, nullEqual); }, getColumn(name) { return (0, series_1._Series)(_df.column(name)); }, getColumns() { return _df.getColumns().map(series_1._Series); }, groupBy(...by) { return (0, groupby_1._GroupBy)(_df, (0, utils_1.columnOrColumnsStrict)(by)); }, groupByRolling(opts) { return (0, groupby_1.RollingGroupBy)((0, exports._DataFrame)(_df), opts.indexColumn, opts.period, opts.offset, opts.closed, opts.by, opts.check_sorted); }, groupByDynamic({ indexColumn, every, period, offset, includeBoundaries, closed, by, }) { return (0, groupby_1.DynamicGroupBy)((0, exports._DataFrame)(_df), indexColumn, every, period, offset, includeBoundaries, closed, by); }, upsample(opts, every, by, maintainOrder) { let timeColumn; if (typeof opts === "string") { timeColumn = opts; } else { timeColumn = opts.timeColumn; by = opts.by; every = opts.every; maintainOrder = opts.maintainOrder ?? false; } if (typeof by === "string") { by = [by]; } else { by = by ?? []; } return (0, exports._DataFrame)(_df.upsample(by, timeColumn, every, maintainOrder)); }, hashRows(obj = 0n, k1 = 1n, k2 = 2n, k3 = 3n) { if (typeof obj === "number" || typeof obj === "bigint") { return (0, series_1._Series)(_df.hashRows(BigInt(obj), BigInt(k1), BigInt(k2), BigInt(k3))); } const o = { k0: obj, k1: k1, k2: k2, k3: k3, ...obj }; return (0, series_1._Series)(_df.hashRows(BigInt(o.k0), BigInt(o.k1), BigInt(o.k2), BigInt(o.k3))); }, head(length = 5) { return wrap("head", length); }, hstack(columns, inPlace = false) { if (!Array.isArray(columns)) { columns = columns.getColumns(); } const method = inPlace ? "hstackMut" : "hstack"; return wrap(method, columns.map((col) => col.inner())); }, insertAtIdx(idx, series) { _df.insertAtIdx(idx, series.inner()); }, interpolate() { return this.select((0, functions_2.col)("*").interpolate()); }, isDuplicated: () => (0, series_1._Series)(_df.isDuplicated()), isEmpty: () => _df.height === 0, isUnique: () => (0, series_1._Series)(_df.isUnique()), join(other, options) { options = { how: "inner", ...options }; const on = (0, utils_1.columnOrColumns)(options.on); const how = options.how; const suffix = options.suffix; if (how === "cross") { return (0, exports._DataFrame)(_df.join(other._df, [], [], how, suffix)); } let leftOn = (0, utils_1.columnOrColumns)(options.leftOn); let rightOn = (0, utils_1.columnOrColumns)(options.rightOn); if (on) { leftOn = on; rightOn = on; } if ((leftOn && !rightOn) || (rightOn && !leftOn)) { throw new TypeError("You should pass the column to join on as an argument."); } return wrap("join", other._df, leftOn, rightOn, how, suffix); }, joinAsof(other, options) { return this.lazy() .joinAsof(other.lazy(), options) .collectSync(); }, lazy: () => (0, dataframe_1._LazyDataFrame)(_df.lazy()), limit: (length = 5) => wrap("head", length), max(axis = 0) { if (axis === 1) { return (0, series_1._Series)(_df.hmax()); } return this.lazy().max().collectSync(); }, mean(axis = 0, nullStrategy = "ignore") { if (axis === 1) { return (0, series_1._Series)(_df.hmean(nullStrategy)); } return this.lazy().mean().collectSync(); }, median() { return this.lazy().median().collectSync(); }, unpivot(ids, values) { return wrap("unpivot", (0, utils_1.columnOrColumns)(ids), (0, utils_1.columnOrColumns)(values)); }, min(axis = 0) { if (axis === 1) { return (0, series_1._Series)(_df.hmin()); } return this.lazy().min().collectSync(); }, nChunks() { return _df.nChunks(); }, nullCount() { return wrap("nullCount"); }, partitionBy(by, strict = false, includeKey = true, mapFn = (df) => df) { by = Array.isArray(by) ? by : [by]; return _df .partitionBy(by, strict, includeKey) .map((d) => mapFn((0, exports._DataFrame)(d))); }, pivot(values, options) { let { values: values_, index, on, maintainOrder = true, sortColumns = false, aggregateFunc = "first", separator, } = options; values = values_ ?? values; values = typeof values === "string" ? [values] : values; index = typeof index === "string" ? [index] : index; on = typeof on === "string" ? [on] : on; let fn; if (expr_1.Expr.isExpr(aggregateFunc)) { fn = aggregateFunc; } else { fn = { first: (0, functions_2.element)().first(), sum: (0, functions_2.element)().sum(), max: (0, functions_2.element)().max(), min: (0, functions_2.element)().min(), mean: (0, functions_2.element)().mean(), median: (0, functions_2.element)().median(), last: (0, functions_2.element)().last(), count: (0, functions_2.element)().count(), }[aggregateFunc] ?? new Error(`Unknown aggregate function ${aggregateFunc}`); if (fn instanceof Error) { throw fn; } } return (0, exports._DataFrame)(_df.pivotExpr(values, on, index, fn, maintainOrder, sortColumns, separator)); }, quantile(quantile) { return this.lazy().quantile(quantile).collectSync(); }, rechunk() { return wrap("rechunk"); }, rename(mapping) { const df = this.clone(); for (const [column, new_col] of Object.entries(mapping)) { df.inner().rename(column, new_col); } return df; }, replaceAtIdx(index, newColumn) { _df.replaceAtIdx(index, newColumn.inner()); return this; }, rows(callback) { if (callback) { return _df.toRowsCb(callback); } return _df.toRows(); }, sample(opts, frac, withReplacement = false, seed) { // biome-ignore lint/style/noArguments: <explanation> if (arguments.length === 0) { return wrap("sampleN", (0, series_1.Series)("", [1]).inner(), withReplacement, false, seed); } if (opts?.n !== undefined || opts?.frac !== undefined) { return this.sample(opts.n, opts.frac, opts.withReplacement, seed); } if (typeof opts === "number") { return wrap("sampleN", (0, series_1.Series)("", [opts]).inner(), withReplacement, false, seed); } if (typeof frac === "number") { return wrap("sampleFrac", (0, series_1.Series)("", [frac]).inner(), withReplacement, false, seed); } throw new TypeError("must specify either 'frac' or 'n'"); }, select(...selection) { const hasExpr = selection.flat().some((s) => expr_1.Expr.isExpr(s)); if (hasExpr) { return (0, exports._DataFrame)(_df).lazy().select(selection).collectSync(); } return wrap("select", (0, utils_1.columnOrColumnsStrict)(selection)); }, shift: (opt) => wrap("shift", opt?.periods ?? opt), shiftAndFill(n, fillValue) { if (typeof n === "number" && fillValue) { return (0, exports._DataFrame)(_df).lazy().shiftAndFill(n, fillValue).collectSync(); } return (0, exports._DataFrame)(_df) .lazy() .shiftAndFill(n.n, n.fillValue) .collectSync(); }, shrinkToFit(inPlace = false) { if (inPlace) { _df.shrinkToFit(); } else { const d = this.clone(); d.inner().shrinkToFit(); return d; } }, slice(opts, length) { if (typeof opts === "number") { return wrap("slice", opts, length); } return wrap("slice", opts.offset, opts.length); }, sort(arg, descending = false, nullsLast = false, maintainOrder = false) { if (arg?.by !== undefined) { return this.sort(arg.by, arg.descending ?? arg.reverse ?? false, arg.nullsLast, arg.maintainOrder); } if (Array.isArray(arg) || expr_1.Expr.isExpr(arg)) { return (0, exports._DataFrame)(_df) .lazy() .sort(arg, descending, nullsLast, maintainOrder) .collectSync({ noOptimization: true }); } return wrap("sort", arg, descending, nullsLast, maintainOrder); }, std() { return this.lazy().std().collectSync(); }, sum(axis = 0, nullStrategy = "ignore") { if (axis === 1) { return (0, series_1._Series)(_df.hsum(nullStrategy)); } return this.lazy().sum().collectSync(); }, tail: (length = 5) => wrap("tail", length), serialize(format) { return _df.serialize(format); }, writeCSV(dest, options = {}) { if (dest instanceof node_stream_1.Writable || typeof dest === "string") { return _df.writeCsv(dest, options); } const buffers = []; const writeStream = new node_stream_1.Stream.Writable({ write(chunk, _encoding, callback) { buffers.push(chunk); callback(null); }, }); _df.writeCsv(writeStream, dest ?? options); writeStream.end(""); return Buffer.concat(buffers); }, toRecords() { return _df.toObjects(); }, toJSON(...args) { // this is passed by `JSON.stringify` when calling `toJSON()` if (args[0] === "") { return _df.toJs(); } return _df.serialize("json").toString(); }, toHTML() { let htmlTable = "<table>"; // Add table headers htmlTable += "<thead><tr>"; for (const field of this.getColumns()) { htmlTable += `<th>${(0, html_1.escapeHTML)(field.name)}</th>`; } htmlTable += "</tr></thead>"; // Add table data htmlTable += "<tbody>"; for (const row of this.toRecords()) { htmlTable += "<tr>"; for (const field of this.getColumns()) { htmlTable += `<td>${(0, html_1.escapeHTML)(String(row[field.name]))}</td>`; } htmlTable += "</tr>"; } htmlTable += "</tbody></table>"; return htmlTable; }, toDataResource() { const data = this.toRecords(); const fields = this.getColumns().map((column) => ({ name: column.name, type: mapPolarsTypeToJSONSchema(column.dtype), })); return { data, schema: { fields } }; }, toObject() { return this.getColumns().reduce((acc, curr) => { acc[curr.name] = curr.toArray(); return acc; }, {}); }, writeJSON(dest, options = { format: "lines" }) { if (dest instanceof node_stream_1.Writable || typeof dest === "string") { return _df.writeJson(dest, options); } const buffers = []; const writeStream = new node_stream_1.Stream.Writable({ write(chunk, _encoding, callback) { buffers.push(chunk); callback(null); }, }); _df.writeJson(writeStream, { ...options, ...dest }); writeStream.end(""); return Buffer.concat(buffers); }, writeParquet(dest, options = { compression: "uncompressed" }) { if (dest instanceof node_stream_1.Writable || typeof dest === "string") { return _df.writeParquet(dest, options.compression); } const buffers = []; const writeStream = new node_stream_1.Stream.Writable({ write(chunk, _encoding, callback) { buffers.push(chunk); callback(null); }, }); _df.writeParquet(writeStream, dest?.compression ?? options?.compression); writeStream.end(""); return Buffer.concat(buffers); }, writeAvro(dest, options = { compression: "uncompressed" }) { if (dest instanceof node_stream_1.Writable || typeof dest === "string") { return _df.writeAvro(dest, options.compression); } const buffers = []; const writeStream = new node_stream_1.Stream.Writable({ write(chunk, _encoding, callback) { buffers.push(chunk); callback(null); }, }); _df.writeAvro(writeStream, dest?.compression ?? options?.compression); writeStream.end(""); return Buffer.concat(buffers); }, writeIPC(dest, options = { compression: "uncompressed" }) { if (dest instanceof node_stream_1.Writable || typeof dest === "string") { return _df.writeIpc(dest, options.compression); } const buffers = []; const writeStream = new node_stream_1.Stream.Writable({ write(chunk, _encoding, callback) { buffers.push(chunk); callback(null); }, }); _df.writeIpc(writeStream, dest?.compression ?? options?.compression); writeStream.end(""); return Buffer.concat(buffers); }, writeIPCStream(dest, options = { compression: "uncompressed" }) { if (dest instanceof node_stream_1.Writable || typeof dest === "string") { return _df.writeIpcStream(dest, options.compression); } const buffers = []; const writeStream = new node_stream_1.Stream.Writable({ write(chunk, _encoding, callback) { buffers.push(chunk); callback(null); }, }); _df.writeIpcStream(writeStream, dest?.compression ?? options?.compression); writeStream.end(""); return Buffer.concat(buffers); }, toSeries: (index = 0) => (0, series_1._Series)(_df.selectAtIdx(index)), toStruct(name) { return (0, series_1._Series)(_df.toStruct(name)); }, toString() { return _df.toString(); }, transpose(options) { const includeHeader = options?.includeHeader ?? false; const headeName = options?.headerName ?? "column"; const keep_names_as = includeHeader ? headeName : undefined; if (options?.columnNames) { function takeNItems(iterable, n) { const result = []; let i = 0; for (const item of iterable) { if (i >= n) { break; } result.push(item); i++; } return result; } options.columnNames = Array.isArray(options.columnNames) ? options.columnNames.slice(0, this.height) : takeNItems(options.columnNames, this.height); } if (!options?.columnNames) { return wrap("transpose", keep_names_as, undefined); } return wrap("transpose", keep_names_as, options.columnNames); }, unnest(names) { names = Array.isArray(names) ? names : [names]; return (0, exports._DataFrame)(_df.unnest(names)); }, var() { return this.lazy().var().collectSync(); }, map: (fn) => map((0, exports._DataFrame)(_df), fn), row(idx) { return _df.toRow(idx); }, vstack: (other) => wrap("vstack", other.inner()), withColumn(column) { if (series_1.Series.isSeries(column)) { return wrap("withColumn", column.inner()); } return this.withColumns(column); }, withColumns(...columns) { if ((0, utils_1.isSeriesArray)(columns)) { return columns.reduce((acc, curr) => acc.withColumn(curr), (0, exports._DataFrame)(_df)); } return this.lazy() .withColumns(columns) .collectSync({ noOptimization: true }); }, withColumnRenamed(opt, replacement) { if (typeof opt === "string") { return this.rename({ [opt]: replacement }); } return this.rename({ [opt.existing]: opt.replacement }); }, withRowCount(name = "row_nr") { return wrap("withRowCount", name); }, where(predicate) { return this.filter(predicate); }, add: (other) => wrap("add", prepareOtherArg(other).inner()), sub: (other) => wrap("sub", prepareOtherArg(other).inner()), div: (other) => wrap("div", prepareOtherArg(other).inner()), mul: (other) => wrap("mul", prepareOtherArg(other).inner()), rem: (other) => wrap("rem", prepareOtherArg(other).inner()), plus: (other) => wrap("add", prepareOtherArg(other).inner()), minus: (other) => wrap("sub", prepareOtherArg(other).inner()), divideBy: (other) => wrap("div", prepareOtherArg(other).inner()), multiplyBy: (other) => wrap("mul", prepareOtherArg(other).inner()), modulo: (other) => wrap("rem", prepareOtherArg(other).inner()), }; return new Proxy(df, { get(target, prop, receiver) { if (typeof prop === "string" && target.columns.includes(prop)) { return target.getColumn(prop); } if (typeof prop !== "symbol" && !Number.isNaN(Number(prop))) { return target.row(Number(prop)); } return Reflect.get(target, prop, receiver); }, set(target, prop, receiver) { if (series_1.Series.isSeries(receiver)) { if (typeof prop === "string" && target.columns.includes(prop)) { const idx = target.columns.indexOf(prop); target.replaceAtIdx(idx, receiver.alias(prop)); return true; } } Reflect.set(target, prop, receiver); return true; }, has(target, p) { if (p === jupyterDisplay) { return true; } return target.columns.includes(p); }, ownKeys(target) { return target.columns; }, getOwnPropertyDescriptor(target, prop) { return { configurable: true, enumerable: true, value: target.getColumn(prop), }; }, }); }; exports._DataFrame = _DataFrame; function DataFrameConstructor(data, options) { if (!data) { return (0, exports._DataFrame)(objToDF({})); } if (Array.isArray(data)) { return (0, exports._DataFrame)((0, construction_1.arrayToJsDataFrame)(data, options)); } return (0, exports._DataFrame)(objToDF(data, options)); } function objToDF(obj, options) { let columns; if (options?.schema && options?.schemaOverrides) { throw new Error("Cannot use both 'schema' and 'schemaOverrides'"); } // explicit schema if (options?.schema) { const schema = options.schema; const schemaKeys = Object.keys(options.schema); const values = Object.values(obj); if (schemaKeys.length !== values.length) { throw new Error("The number of columns in the schema does not match the number of columns in the data"); } columns = values.map((values, idx) => { const name = schemaKeys[idx]; const dtype = schema[name]; return (0, series_1.Series)(name, values, dtype).inner(); }); } else { columns = Object.entries(obj).map(([name, values]) => { if (series_1.Series.isSeries(values)) { return values.rename(name).inner(); } // schema overrides if (options?.schemaOverrides) { const dtype = options.schemaOverrides[name]; if (dtype) { return (0, series_1.Series)(name, values, dtype).inner(); } } return (0, series_1.Series)(name, values).inner(); }); } return new polars_internal_1.default.JsDataFrame(columns); } const isDataFrame = (anyVal) => anyVal?.[Symbol.toStringTag] === "DataFrame"; exports.DataFrame = Object.assign(DataFrameConstructor, { isDataFrame, deserialize: (buf, fmt) => (0, exports._DataFrame)(polars_internal_1.default.JsDataFrame.deserialize(buf, fmt)), });