lorix
Version:
Javascript dataframe API
391 lines (328 loc) • 15.1 kB
JavaScript
import lodash from 'lodash';
import lorix from './lorix.js';
import { currentRow } from './src/window.js';
import {
verySmallDataFrame1,
verySmallDataFrame2,
verySmallDataFrame3,
verySmallDataFrame4,
verySmallValidObjArray,
verySmallInvalidObjArray,
verySmallDataFrameInnerJoinResult,
smallDataFrame1,
smallDataFrame2,
smallDataFrame3,
smallDataFrame4,
smallDataFrame5,
smallDataFrame6,
iris
} from './test/sample_data.js'
// loading data data
// let df1 = await lorix.readCsv("./test/data/iris.csv");
// let df2 = await lorix.readCsv('./data/test2.csv');
// let df3 = await lorix.readDsv('test.psv'); // Error
// let df3 = await lorix.readDsv('test.psv', ""); // Error
// let df3 = await lorix.readDsv('test.psv', {}); // Error
// let df3 = await lorix.readDsv('test.psv', "|");
let vsDf1 = verySmallDataFrame1;
let vsDf2 = verySmallDataFrame2;
let vsDf3 = verySmallDataFrame3;
let vsDf4 = verySmallDataFrame4;
let sDf1 = smallDataFrame1;
let sDf2 = smallDataFrame2;
let sDf3 = smallDataFrame3;
let sDf4 = smallDataFrame4;
let sDf5 = smallDataFrame5;
let sDf6 = smallDataFrame6;
let irisDf = iris;
// let arr1 = verySmallValidObjArray;
// let arr2 = verySmallInvalidObjArray;
console.log("lorix.DataFrame.fromArray()");
// console.log("valid data");
// let df3 = lorix.DataFrame.fromArray(arr1);
// df3.head();
// console.log("invalid data");
// let df4 = lorix.DataFrame.fromArray(arr2);
// df4.head();
// selecting columns
console.log("df.select()");
// console.log(df);
// const df1 = df.select('id', 'name')
// console.log(df1);
// const df2 = df1.select('id');
// console.log(df2);
// df.head();
// dropping columns
console.log("df.drop()");
// let df3 = df.drop('name');
// head
console.log("df.head()");
// console.log("df1");
// df1.head();
// console.log("df2");
// df2.head();
// console.log("df3");
// df3.head();
// console.log("sDf4");
// sDf4.head();
// console.log("sDf5");
// sDf5.head();
// console.log("df5s");
// df5.head();
// console.log("df6s");
// df6.head();
// console.log("iris");
// irisDf.head();
// sort data
console.log("df.orderBy()");
// df3.orderBy(["invalidColumn"]).head();
// console.log(df3.orderBy(["id"]).toArray());
// console.log(df3.orderBy(["name"]).toArray());
// console.log(df3.orderBy(["id", "weight"]).toArray());
// console.log(df3.orderBy(["id", "weight"], ["desc", "asc"]).toArray());
// console.log(df3.orderBy(["id"], "test").toArray());
// console.log(df3.orderBy(["id"], ["wog"]).toArray());
// array iteration
// console.log(df1.columns);
// console.log([...df1]);
// console.log(df1.toArray());
console.log("df.withColumn()");
// df1 = df1.withColumn("newCol");
// df1.withColumn("newCol", df1.col("id") + df1.col("age")); // Attempt at cleaner syntax
// df1 = df1.withColumn("newCol", (row) => row["id"] + row["age"]);
// df1 = df1.withColumn("newCol", () => 1 + 2);
// df1 = df1.withColumn("newCol", () => new Date());
// df1 = df1.withColumn("newCol", (row) => row["somerandomprop"]); // Error - reference to non-ex
// df1 = df1.withColumn("1newCol", (row) => row["id"] + row["age"]); // Error - invalid column name
// df1.head();
// df2 = (
// df2
// .withColumn("newCol", () => 1)
// .withColumn("newCol1", () => 2)
// )
// Error - needs a hotfix because expr is run on a dummy proxy dataframe
// sDf5.withColumn("new_date", (row) => row["latest_date"].toISOString()).head();
console.log("df1.filter()");
// df3.filter((r) => r["id"] == 100).head();
// (
// df3
// .filter((r) => r["weight"] < 80)
// .filter((r) => r["id"] > 20)
// .head()
// )
// df3.filter((r) => r["nonExistantCol"] == 100).head(); // Error - invalid column
// df3.filter("test").head(); // Error - needs a function
// Error - needs a hotfix because expr is run on a dummy proxy dataframe
// sDf5.filter((row) => row["latest_date"].toISOString().split("T")[0] == "2023-02-06").head();
console.log("df.distinct()");
// (
// df4
// .select("id")
// .distinct()
// .filter((r) => r["id"] > 50)
// .orderBy(["id"], ["desc"])
// .head(df4.rows.length)
// )
console.log("df1.groupBy()");
// console.log(df1.groupBy(["id", "age"]));
// df1.groupBy(["id", "age"], {"newCol": "sum"});
// df1.groupBy(["id", "age"], {"newCol": "mean"});
// df1.groupBy(["id", "age"], {"newCol": "count"});
// df1.groupBy(
// ["id", "age"],
// {
// "newCol": ["sum", "mean", "count"],
// "newCol2": "sum"
// }
// ).head();
// df1.groupBy(["id", "age"], {"newColz": "count"}); // Error - invalid column reference
// console.log(
// irisDf
// .groupBy(
// ["species"],
// {
// "sepal_length": ["min", "max", "mean", "count", "sum"]
// }
// )
// .toArray()
// );
// (
// irisDf
// .groupBy(
// ["species"],
// {
// "sepal_length": ["min", "max", "mean", "count", "sum"]
// }
// )
// .head()
// )
console.log("df.window()");
// (
// df5
// .withColumn("sum", lorix.window(lorix.sum("salary"), ["dept"], [["salary"], ["desc"]]))
// .withColumn("lag", lorix.window(lorix.lag("salary", 1), ["dept"], [["salary"], ["desc"]]))
// .withColumn("lead", lorix.window(lorix.lead("salary", 1), ["dept"], [["salary"], ["desc"]]))
// .withColumn("stddev", lorix.window(lorix.stddev("salary"), ["dept"], [["salary"], ["desc"]]))
// .withColumn("rownum", lorix.window(lorix.rownumber(), ["dept"], [["salary"], ["desc"]]))
// // .withColumn("sum", lorix.window(lorix.sum("salary"), ["dept"], [], [1, lorix.currentRow]))
// // .withColumn("sum", lorix.window(lorix.sum("salary"), ["dept"], [], [lorix.unboundedPreceding, lorix.unboundedProceding]))
// .withColumn("stddev", (r) => Math.round(r["stddev"], 0))
// // .orderBy(["dept", "salary"], ["asc", "desc"])
// // .filter((r) => r["row"] == 1)
// .head(100)
// );
// (
// df5
// .withColumn("sum", lorix.window(lorix.sum("caca")))
// .withColumn("sum", lorix.window(lorix.sum("salary"), ["dept"], [["salary"], ["desc"]], [1, lorix.currentRow] ))
// .withColumn("lag", lorix.window(lorix.lag("salary", 1), ["dept"], [["salary"], ["desc"]], [1, lorix.currentRow] ))
// .withColumn("lead", lorix.window(lorix.lead("salary", 1), ["dept"], [["salary"], ["desc"]], [1, lorix.currentRow] ))
// .withColumn("stddev", lorix.window(lorix.stddev("salary"), ["dept"], [["salary"], ["desc"]], [1, lorix.currentRow] ))
// .withColumn("variance", lorix.window(lorix.variance("salary"),["dept"], [["salary"], ["desc"]], [1, lorix.currentRow] ))
// .withColumn("mean", lorix.window(lorix.mean("salary"), ["dept"], [["salary"], ["desc"]], [1, lorix.currentRow] ))
// .withColumn("median", lorix.window(lorix.median("salary"), ["dept"], [["salary"], ["desc"]], [1, lorix.currentRow] ))
// .withColumn("min", lorix.window(lorix.min("salary"), ["dept"], [["salary"], ["desc"]], [1, lorix.currentRow] ))
// .withColumn("max", lorix.window(lorix.max("salary"), ["dept"], [["salary"], ["desc"]], [1, lorix.currentRow] ))
// .withColumn("rownum", lorix.window(lorix.rownumber(), ["dept"], [["salary"], ["desc"]], [1, lorix.currentRow] ))
// .withColumn("stddev", (r) => r["stddev"] !== null ? Math.round(r["stddev"], 0) : null)
// .withColumn("variance", (r) => r["variance"] !== null ? Math.round(r["variance"], 0) : null)
// .withColumn("mean", (r) => r["mean"] !== null ? Math.round(r["mean"], 0) : null)
// .withColumn("median", (r) => r["median"] !== null ? Math.round(r["median"], 0) : null)
// .head(100)
// )
// (
// df5
// .withColumn("mean", lorix.window(lorix.mean("salary"), ["dept"], []))
// .withColumn("mean", (r) => Math.round(r["mean"], 0))
// .orderBy(["dept", "salary"], ["asc", "desc"])
// .withColumn("row_num", lorix.window(lorix.rowNumber(), ["dept"], ["salary"]))
// .head(100)
// );
// console.log(df3.withColumn("quantile", lorix.window(lorix.quantile("age"), ["id"], [["weight"], ["desc"]])));
// df3.withColumn("quantile", lorix.window(lorix.quantile("age"), ["id"], [["weight"], ["desc"]]));
// df5.withColumn("mean", lorix.window(lorix.mean("salary"), [], [])).head(100);
// df3.withColumn("max", lorix.window(lorix.max("weight"), ["id"], [["weight"], ["desc"]])).head(100);
// df3.withColumn("median", lorix.window(lorix.median("weight"), ["id"], [["weight"], ["desc"]])).head(100);
// df3.withColumn("quantile", lorix.window(lorix.quantile("weight"), ["id"], [["weight"], ["desc"]])).head(100);
// df3.withColumn("firstQuartile", lorix.window(lorix.quantile("weight", 0.25), ["id"], [["weight"], ["desc"]])).head(100);
// df3.withColumn("thirdQuartile", lorix.window(lorix.quantile("weight", 0.75), ["id"], [["weight"], ["desc"]])).head(100);
// df3.withColumn("variance", lorix.window(lorix.variance("weight"), ["id"], [["weight"], ["desc"]])).head(100);
// df3.withColumn("stdev", lorix.window(lorix.stdev("weight"), ["id"], [["weight"], ["desc"]])).head(100);
// df.withColumn("quantile", lorix.window(lorix.sum("age"), ["id"], [], [lorix.unboundedPreceeding, lorix.currentRow]));
// df1.head();
// df1.window(
// ["id"],
// [["age"], ["desc"]],
// {
// "min": lorix.min("age"),
// "max": lorix.max("age"),
// "median": lorix.median("age"),
// "quantile": lorix.quantile("age"), // Default is 0.5 (median)
// "first_qrtl": lorix.quantile("age", 0.25), // First quartile
// "third_qrtl": lorix.quantile("age", 0.75), // Third quartile
// "variance": lorix.variance("age"),
// "stdev": lorix.stdev("age")
// }
// ).head();
console.log("df1.orderBy()");
// df1.orderBy(["age"]).head();
// df1.orderBy(["id", "age"], ["asc", "desc"]).head();
// df1.orderBy("id").head(); // Error - need an array of columns
console.log("df1 for...of iteration");
// for (let row of df1) {
// console.log(row);
// }
// joins
console.log('cross join');
// (df1.crossJoin(df2, (l, r) => l.id == r.id)).head(); // Error - number of arguments
// df1.crossJoin(df2).head(); // Cross join
console.log('inner join');
// (df1.innerJoin(df2, ["id", "testicle", "testiculae"])).head();
// (df1.innerJoin(df2, ["id"])).head();
// df1 = (
// df1
// .withColumn("name", (row) => {if (row.id == 3) { return "gary" } else return row.name })
// )
// df1.head();
// (df1.innerJoin(df2, (l, r) => (l.id == r.id) || (r.name == "gary") )).head();
// (df1.innerJoin(df2, (l, r) => (l.id == r.id) )).head();
// console.log((df1.innerJoin(df2, ["id"], ["id"])).toArray());
// console.log(verySmallDataFrameInnerJoinResult.toArray());
// console.log((df1.innerJoin(df2, ["id"])).toArray());
// (df1.innerJoin(df2, (l, r) => l.id == r.id)).head();
// (df1.innerJoin(df2, (l, r) => (l.id == r.id) & (l.name == r.name))).head();
// (df1.innerJoin(df2, (l, r) => (l.id == r.id) & (l.age == r.age))).head(); // Error - reference to non-existent column
// (df1.innerJoin(df2, (l, r) => l.id == r.id, false)).head(); // Error - number of arguments
// (df1.innerJoin(df2, "id")).head(); // Error - argument types
// (df1.innerJoin(df2, (l, r) => l.id == r.id)).head(); // Non-indexed inner join
// (df1.innerJoin(df2, (l, r) => (l.id == r.id) & (l.age == r.age))).head(); // Non-indexed inner join
// (df1.innerJoin(df2, ["id"])).head(); // Indexed inner join
// (df1.innerJoin(df2, ["id", "age"])).head(); // Indexed inner join
console.log('left join');
// (df1.leftJoin(df2, (l, r) => l.id == r.id)).head(); // Non-indexed left join
// (df1.leftJoin(df2, "id")).head(); // Error - argument types
// (df1.leftJoin(df2, (l, r) => l.id == r.id)).head(); // Non-indexed left join
// (df1.leftJoin(df2, (l, r) => (l.id == r.id) & (l.age == r.age))).head(); // Non-indexed left join
// (df1.leftJoin(df2, (l, r) => (l.id == r.id) | (l.age == r.age))).head();
// (df1.leftJoin(df2, ["id"])).head();
// (df1.leftJoin(df2, ["id", "age"], ["id", "age"])).head();
// df1.leftJoin(df2); // Error
console.log('right join');
// (df1.rightJoin(df2, (l, r) => l.id == r.id)).head();
// (df1.rightJoin(df2, (l, r) => l.id == r.id)).head();
// (df1.rightJoin(df2, (l, r) => (l.id == r.id) & (l.age == r.age))).head();
// (df1.rightJoin(df2, (l, r) => (l.id == r.id) | (l.age == r.age))).head();
// (df1.rightJoin(df2, ["id"])).head();
// (df1.rightJoin(df2, ["id", "age"], ["id", "age"])).head();
// df1.rightJoin(df2); // Error
console.log('left anti join');
// console.log((df1.leftAntiJoin(df2, (l, r) => (l.id == r.id) && (l.name == r.name) )).toArray()) // Non-indexed left join
// console.log(df1.rightAntiJoin(df2, (l, r) => (l.id == r.id) && (l.name == r.name) ).toArray()); // Non-indexed left join
// (df1.leftAntiJoin(df2, (l, r) => (l.id == r.id) && (l.name == r.name) )).head(100); // Non-indexed left join
// (df1.rightAntiJoin(df2, (l, r) => (l.id == r.id) && (l.name == r.name) )).head(100); // Non-indexed left join
// (df2.fullOuterJoin(df6, (l, r) => (l.id == r.id) && (l.name == r.name) )).head(100); // Non-indexed left join
// (df2.fullOuterJoin(df6, ["id", "name"])).head(100);
// console.log((df2.fullOuterJoin(df6, ["id", "name"])).toArray());
// (df1.leftJoin(df2, "id")).head(); // Error - argument types
// (df1.leftJoin(df2, (l, r) => l.id == r.id)).head(); // Non-indexed left join
// (df1.leftJoin(df2, (l, r) => (l.id == r.id) & (l.age == r.age))).head(); // Non-indexed left join
// (df1.leftJoin(df2, (l, r) => (l.id == r.id) | (l.age == r.age))).head();
// (df1.leftAntiJoin(df2, ["id", "name"])).head();
// (df1.rightAntiJoin(df2, ["id", "name"])).head();
// (df1.leftJoin(df2, ["id", "age"], ["id", "age"])).head();
// df1.leftJoin(df2); // Error
console.log("replace()");
// signature: df1.replace(["col"], valueToReplace, newValue);
// (df4.replace(["name"], "billy", "silly")).head(100);
// console.log(sDf4.regexReplace(["name"], /r/i, "rrr").toArray());
// console.log(sDf4.regexReplace(["name"], /r/ig, "rrr").toArray());
// console.log(sDf4.regexReplace(["name", "colour"], /r/ig, "rrr").toArray());
console.log("pivot()");
sDf6.head();
sDf6.pivot(
["id", "name"], // GroupBy Cols
"colour", // Pivot Col
"weight", // Value Col
"count" // Agg Type
).head();
sDf6.pivot(
["id", "name"], // GroupBy Cols
"colour", // Pivot Col
"weight", // Value Col
"sum" // Agg Type
).head();
sDf6.pivot(
["id", "name"], // GroupBy Cols
"colour", // Pivot Col
"weight", // Value Col
"mean" // Agg Type
).head();
console.log("unionByName()");
// console.log(sDf1.unionByName(sDf2).toArray());
// await lorix.writeTsv(df1, "df1_output.tsv");
// await lorix.writeCsv(df1, "df1_output.csv");
// await lorix.writeDsv(df1, "df1_output.psv"); // Error
// await lorix.writeDsv(df1, "df1_output.psv", ""); // Error
// await lorix.writeDsv(df1, "df1_output.psv", {}); // Error
// await lorix.writeDsv(df1, "df1_output.psv", "|");
// await lorix.writeJson(df1, "df1_output.json");