// lorix
// Version:
// JavaScript dataframe API
// 112 lines (91 loc) • 3.94 kB
// JavaScript
import {
group,
rollup,
sum,
mean,
min,
max
} from 'd3-array';
import { DataFrame } from "./dataframe.js";
import { _isSubsetArray } from './utils.js';
function _generateGroupByFunctions(cols) {
    /**
     * Builds one accessor function per entry of `cols`. Each accessor
     * takes a row object and returns the value stored under that
     * column name, in the same order as `cols`.
     */
    return [...cols].map((col) => (row) => row[col]);
}
function _flattenAggMap(groups, groupByCols, aggColName, resultAgg={}) {
    /**
     * Walks a (possibly nested) Map of groups and returns a flat array
     * of row objects.
     *
     * - groups: Map whose keys are the values of `groupByCols[0]`. Each
     *   value is either another Map (the next grouping level) or the
     *   already-aggregated result for that group.
     * - groupByCols: column names for the current and remaining levels.
     * - aggColName: property name under which the aggregated value is
     *   stored on each output row.
     * - resultAgg: group-column values collected from the outer levels.
     */
    const rows = [];
    for (const [groupKey, groupValue] of Array.from(groups)) {
        // Outer-level keys first, then the key of the current level.
        const partialRow = { ...resultAgg, [groupByCols[0]]: groupKey };
        if (groupValue instanceof Map) {
            // Another grouping level: descend with the remaining columns.
            const nestedRows = _flattenAggMap(groupValue, groupByCols.slice(1), aggColName, partialRow);
            for (const nestedRow of nestedRows) {
                rows.push(nestedRow);
            }
        } else {
            // Leaf: attach the aggregated value to the collected keys.
            rows.push({ ...partialRow, [aggColName]: groupValue });
        }
    }
    return rows;
}
function _aggregate(type, df, groupByFunctions, groupByCols, aggCol) {
    /**
     * Runs a single aggregation over `df.rows`, grouped by the accessor
     * functions in `groupByFunctions`, and wraps the flattened result
     * in a new DataFrame.
     *
     * - type: one of "sum", "mean", "min", "max" or "count".
     * - df: Dataframe whose `rows` are aggregated.
     * - groupByFunctions: key accessors passed through to rollup().
     * - groupByCols: names of the grouping columns, in the same order.
     * - aggCol: column whose values are aggregated ("count" ignores the
     *   values and uses the number of rows per group).
     *
     * The aggregated column is named `<aggCol>_<type>`. Throws an Error
     * when no rolled-up result was produced, e.g. for an unknown `type`.
     */
    const aggColumnName = aggCol + "_" + type;

    // Choose the per-group reducer for the requested aggregation.
    let reducer;
    if (type == "sum") {
        reducer = (rows) => sum(rows, (row) => row[aggCol]);
    } else if (type == "mean") {
        reducer = (rows) => mean(rows, (row) => row[aggCol]);
    } else if (type == "min") {
        reducer = (rows) => min(rows, (row) => row[aggCol]);
    } else if (type == "max") {
        reducer = (rows) => max(rows, (row) => row[aggCol]);
    } else if (type == "count") {
        reducer = (rows) => rows.length;
    }

    let map;
    if (reducer !== undefined) {
        map = rollup(df.rows, reducer, ...groupByFunctions);
    }

    // Catch any undefined aggregation types passed
    if (map == undefined) {
        throw Error(`Invalid aggregation provided to groupBy: '${type}'`);
    }

    // Second constructor argument is presumably the column list of the
    // resulting Dataframe: the group columns followed by the new one.
    const outputColumns = groupByCols.concat([aggColumnName]);
    return new DataFrame(_flattenAggMap(map, groupByCols, aggColumnName), outputColumns);
}
export function groupAggregation(df, groupByCols, groupByAggs) {
    /**
     * @output
     * Returns either:
     * - A new Dataframe object with the results of the aggregations
     *   defined in `groupByAggs` for each distinct group of the list
     *   of columns in `groupByCols`. When several aggregations are
     *   requested, the per-aggregation Dataframes are combined with
     *   `innerJoin` on `groupByCols`.
     * - A nested Map object, if no aggregation is provided through
     *   `groupByAggs`, with the values of each `groupByCols` column
     *   as the keys at each level of the Map.
     * Note: if `groupByAggs` is provided but defines no aggregation
     * (e.g. `{}`), the result is `undefined`.
     *
     * @input
     * - df: Dataframe on which the groupBy will be executed.
     * - groupByCols: Array of strings representing columns to group by.
     * - groupByAggs: Object of column-to-aggregation(s) mappings that define
     *   the aggregation(s) to perform for each column, across each group.
     *   A column can have one or more aggregations defined, and this is
     *   passed as either a string or an array of strings specifying the
     *   aggregations. e.g. {"colA": "sum", "colB": ["sum", "count"]}.
     *
     * @throws
     * - Error if any grouping or aggregation column fails the
     *   `_isSubsetArray` check against `df.columns`.
     */
    const keyAccessors = _generateGroupByFunctions(groupByCols);

    // Without an aggregation object, return the Map defining the groups.
    if (groupByAggs == undefined) {
        return group(df.rows, ...keyAccessors);
    }

    // Every grouping and aggregation column must exist in the Dataframe.
    const requestedCols = groupByCols.concat(Object.getOwnPropertyNames(groupByAggs));
    if (!_isSubsetArray(requestedCols, df.columns)) {
        throw Error(`Invalid columns provided to groupBy '${requestedCols}'`);
    }

    // One Dataframe per (column, aggregation) pair, in declaration order.
    const partialResults = [];
    for (const [column, spec] of Object.entries(groupByAggs)) {
        // A column maps to either one aggregation or a list of them.
        const aggTypes = spec instanceof Array ? spec : [spec];
        for (const aggType of aggTypes) {
            partialResults.push(_aggregate(aggType, df, keyAccessors, groupByCols, column));
        }
    }

    // Fold the partial Dataframes together, joining on the group columns.
    const [firstResult, ...otherResults] = partialResults;
    return otherResults.reduce(
        (joined, next) => joined.innerJoin(next, groupByCols),
        firstResult
    );
}