lorix
Version:
Javascript dataframe API
447 lines (354 loc) • 16.7 kB
JavaScript
import lodash from "lodash";
import {
_crossJoin,
_join
} from "./joins.js";
import {
groupAggregation
} from "./groups.js"
import {
applyWindowFunction
} from "./window.js"
import {
validateFunctionReferencesWithProxy,
_getUniqueObjectProperties,
_isSubsetArray,
_isValidColumnName,
_getArrayOfObjectReferences,
_getDistinctFn,
_isString,
_getDistinctColumnValues,
_getNullObjectFromProperties,
_assignPivotValues,
_getPivotColsAggregationMap
} from "./utils.js";
export class DataFrame {
constructor(rowArray=[], columns=[]) {
// Check that row properties align with specified columns
if (rowArray.length && columns.length) {
const rowColumns = _getUniqueObjectProperties(rowArray);
const diff = lodash.difference(rowColumns, columns);
if (diff.length)
throw Error(`There are differences between row properties and specified columns: '${diff.join(', ')}'`);
}
this.rows = rowArray;
this.columns = columns;
}
// Enable to iterate over DataFrame rows
*iterator() {
for (let row of this.rows)
yield row;
}
[Symbol.iterator]() {
return this.iterator();
}
static fromArray(arr) {
// Return a Dataframe from an array of objects.
if (arr instanceof Array) {
let cols = _getUniqueObjectProperties(arr);
if (cols.length > 0)
return new DataFrame(arr, cols);
}
throw Error("Dataframe.fromArray() only accepts a non-empty array of objects.");
}
toArray() {
return this.rows;
}
head(n=10) {
if (n > 0)
console.table(this.rows.slice(0, n), this.columns);
}
size() {
return [this.rows.length, this.columns.length];
}
slice(i=0, j=-1) {
// Returns a DataFrame with the subset of rows
// specified by the i, j indexes.
if (j !== -1)
return new DataFrame(this.rows.slice(i, j + 1), this.columns);
return new DataFrame(this.rows.slice(i), this.columns);
}
select(...fields) {
if (!fields.length)
throw Error("No columns provided to select().");
// Check that fields passed exist in DataFrame
const diff = lodash.difference(fields, this.columns)
if (diff.length)
throw Error(`Field(s) '${diff.join(', ')}' do not exist in DataFrame.`);
const outputArray = [];
for (const row of this.rows)
outputArray.push(lodash.pick(row, fields))
return new DataFrame(outputArray, fields);
}
drop(...fields) {
if (!fields.length)
throw Error("No columns provided to drop().");
// Check that fields passed exist in DataFrame
const diff = lodash.difference(fields, this.columns)
if (diff.length)
throw Error(`Field(s) '${diff.join(', ')}' do not exist in DataFrame.`);
const outputArray = [];
for (const row of this.rows)
outputArray.push(lodash.omit(row, fields))
return new DataFrame(outputArray, lodash.difference(this.columns, fields));
}
withColumn(col, expr) {
// Returns a new Dataframe with a new column definition.
// Note: if a reference is made to a non-existent column
// the result will be undefined.
// Check that `col` is a string that does not start with a number.
if (!_isValidColumnName(col))
throw Error(`Column name "${col}" is not valid.`);
// Check that `expr` is a function.
if ((expr == undefined) || !(expr instanceof Function))
throw Error("expr provided to withColumn needs to be a function.");
// Check if `expr` is a window function, and
// apply it to the DataFrame, if so.
if (expr.hasOwnProperty("isWindow"))
return applyWindowFunction(this, col, ...expr());
// Check what existing columns are being referenced,
// and throw an error if at least one does not exist.
// validateFunctionReferencesWithProxy(expr, this.columns);
let newRows = this.rows.map((row) => ({...row, ...{[col]: expr(row)}}));
let newCols = this.columns.includes(col) ? this.columns : this.columns.concat([col]);
return new DataFrame(newRows, newCols);
}
filter(expr) {
// Returns a new Dataframe with rows filtered according
// to the function `expr`.
// Note: if a reference is made to a non-existent column
// an error will be thrown.
// Check number of arguments.
if (arguments.length < 1 || arguments.length > 1)
throw Error(`filter() takes a single argument. Arguments passed: ${arguments.length}`);
// Check that `expr` is a function.
if((expr == undefined) || !(expr instanceof Function))
throw Error("`expr` provided to filter needs to be a function.");
// Check what existing columns are being referenced,
// if any, and throw an error if at least one does not exist.
// validateFunctionReferencesWithProxy(expr, this.columns);
let newRows = this.rows.filter((row) => expr(row));
return new DataFrame(newRows, this.columns);
}
distinct(subset=[]) {
// Return a new DataFrame with duplicate rows dropped.
// If `subset` of columns is not passed, then duplicates
// will be identified across all columns.
// Check number of arguments.
if (arguments.length > 1)
throw Error(`distinct() takes a single argument. Arguments passed: ${arguments.length}`);
if (!(subset instanceof Array))
throw Error("`subset` provided to distinct needs to be a function.");
// Check that all columns specified in `subset` exist in the DataFrame.
if ((subset.length) && !(_isSubsetArray(subset, this.columns)))
throw Error(`Invalid columns specified in distinct(): ${lodash.difference(subset, this.columns)}`);
if (subset.length)
return new DataFrame(lodash.uniqBy(this.rows, _getDistinctFn(subset)), this.columns);
// If a subset of columns isn't provided, remove
// duplicate rows across all DataFrame columns
return new DataFrame(lodash.uniqBy(this.rows, _getDistinctFn(this.columns)), this.columns);
}
regexReplace(cols, replaceRegex, newString) {
/**
* Returns a new DataFrame with regular expression
* `replaceRegex` replaced by `newString` in
* one or more columns defined in Array `cols`.
*/
// Check number of arguments.
if (arguments.length < 3 || arguments.length > 3)
throw Error(`regexReplace() takes three arguments. Number of arguments passed: ${arguments.length}`);
// Check that `cols` is an Array.
if (!(cols instanceof Array))
throw Error("First parameter provided to regexReplace() needs to be an array of columns.");
// Check that `replaceRegex` is a regular expression.
if (!(replaceRegex instanceof RegExp))
throw Error("Second parameter provided to regexReplace() needs to be a regular expression.");
// Check that `newString` is a string.
if (!_isString(newString))
throw Error("Third parameter provided to regexReplace() needs to be a string.");
// Check that columns in `cols` are valid.
const diff = lodash.difference(cols, this.columns)
if (diff.length)
throw Error(`Invalid columns provided in regexReplace(): '${diff.join(', ')}'`);
// Replace strings in columns.
let newRows = lodash.cloneDeep(this.rows);
for (let col of cols) {
newRows = newRows.map((row) => {
try {
row[col] = row[col].replace(replaceRegex, newString);
return row;
} catch(error) { /* pass if replacing a non-string */ }
});
}
return new DataFrame(newRows, this.columns);
}
replaceAll(cols, oldString, newString) {
/**
* Returns a new DataFrame with all instances of
* string `oldString` replaced by `newString` in
* one or more columns defined in Array `cols`.
*/
// Check number of arguments.
if (arguments.length < 3 || arguments.length > 3)
throw Error(`replaceAll() takes three arguments. Number of arguments passed: ${arguments.length}`);
// Check that `cols` is an Array.
if (!(cols instanceof Array))
throw Error("`cols` provided to replaceAll() needs to be an array of columns.");
// Check that `oldString` and `newString` are a strings.
if (!_isString(oldString) || !_isString(newString))
throw Error("Second and third parameters provided to replaceAll() need to be strings.");
// Check that columns in `cols` are valid.
const diff = lodash.difference(cols, this.columns)
if (diff.length)
throw Error(`Invalid columns provided in replaceAll(): '${diff.join(', ')}'`);
// Replace strings in columns.
let newRows = lodash.cloneDeep(this.rows);
for (let col of cols) {
newRows = newRows.map(row => {
try {
row[col] = row[col].replaceAll(oldString, newString);
return row;
} catch(error) { /* pass if replacing a non-string */ }
});
}
return new DataFrame(newRows, this.columns);
}
replace(cols, oldString, newString) {
/**
* Returns a new DataFrame with first instance of
* string `oldString` replaced by `newString` in
* one or more columns defined in Array `cols`.
*/
// Check number of arguments.
if (arguments.length < 3 || arguments.length > 3)
throw Error(`replace() takes three arguments. Number of arguments passed: ${arguments.length}`);
// Check that `cols` is an Array.
if(!(cols instanceof Array))
throw Error("`cols` provided to replace() needs to be an array of columns.");
// Check that `oldString` and `newString` are a strings.
if (!_isString(oldString) || !_isString(newString))
throw Error("Second and third parameters provided to replace() need to be strings.");
// Check that columns in `cols` are valid.
const diff = lodash.difference(cols, this.columns)
if (diff.length)
throw Error(`Invalid columns provided in replace(): '${diff.join(', ')}'`);
// Replace strings in columns.
let newRows = lodash.cloneDeep(this.rows);
for (let col of cols) {
newRows = newRows.map(row => {
try {
row[col] = row[col].replace(oldString, newString);
return row;
} catch(error) { /* pass if replacing a non-string */ }
});
}
return new DataFrame(newRows, this.columns);
}
crossJoin(df) {
if (arguments.length < 1 || arguments.length > 1)
throw Error(`crossJoin() takes a single argument. Arguments passed: ${arguments.length}`);
return _crossJoin(this, df);
}
innerJoin(df, leftOn, rightOn) {
if (arguments.length < 2 || arguments.length > 3)
throw Error(`innerJoin() takes either two or three arguments. Arguments passed: ${arguments.length}`);
let on;
if (arguments.length == 2) on = leftOn;
return _join("inner", this, df, on, leftOn, rightOn);
}
leftJoin(df, leftOn, rightOn) {
if (arguments.length < 2 || arguments.length > 3)
throw Error(`leftJoin() takes either two or three arguments. Arguments passed: ${arguments.length}`);
let on;
if (arguments.length == 2) on = leftOn;
return _join("left", this, df, on, leftOn, rightOn);
}
leftAntiJoin(df, leftOn, rightOn) {
if (arguments.length < 2 || arguments.length > 3)
throw Error(`leftAntiJoin() takes either two or three arguments. Arguments passed: ${arguments.length}`);
let on;
if (arguments.length == 2) on = leftOn;
return _join("leftAnti", this, df, on, leftOn, rightOn);
}
rightJoin(df, leftOn, rightOn) {
if (arguments.length < 2 || arguments.length > 3)
throw Error(`rightJoin() takes either two or three arguments. Arguments passed: ${arguments.length}`);
let on;
if (arguments.length == 2) on = leftOn;
return _join("right", this, df, on, leftOn, rightOn);
}
rightAntiJoin(df, leftOn, rightOn) {
if (arguments.length < 2 || arguments.length > 3)
throw Error(`rightAntiJoin() takes either two or three arguments. Arguments passed: ${arguments.length}`);
let on;
if (arguments.length == 2) on = leftOn;
return _join("rightAnti", this, df, on, leftOn, rightOn);
}
fullOuterJoin(df, leftOn, rightOn) {
if (arguments.length < 2 || arguments.length > 3)
throw Error(`fullOuterJoin() takes either two or three arguments. Arguments passed: ${arguments.length}`);
let on;
if (arguments.length == 2) on = leftOn;
return _join("fullOuter", this, df, on, leftOn, rightOn);
}
orderBy(cols, order) {
// Returns a new Dataframe with rows ordered by columns
// defined in the `cols` array. These can be "asc" or "desc",
// as defined for each corresponding element in the `order`
// array.
// e.g. df.orderBy(["col1", "col2"], ["asc", "desc"])
if (!(cols instanceof Array) || !(cols.length))
throw Error(`orderBy() requires non-empty array of columns.`);
if (arguments.length == 2 && (!(order instanceof Array) || !(order.length)))
throw Error(`orderBy() requires an optional non-empty sort order array.`);
// Check column array validity
if (!(_isSubsetArray(cols, this.columns)))
throw Error(`Invalid columns found in orderBy(): ${lodash.difference(cols, this.columns)}`);
if (arguments.length == 2 && !(_isSubsetArray(order, ["asc", "desc"])))
throw Error(`Invalid columns found in orderBy(): ${lodash.difference(order, ["asc", "desc"])}`);
return new DataFrame(lodash.orderBy(this.rows, cols, order || []), this.columns);
}
groupBy(cols, agg) {
// Return a Map or a DataFrame depending on whether
// `agg` is defined or not.
if (!(cols instanceof Array) || !(cols.length))
throw Error(`groupBy() requires non-empty array of columns.`);
// Check column array validity
if (!(_isSubsetArray(cols, this.columns)))
throw Error(`Invalid columns found in groupBy(): ${lodash.difference(cols, this.columns)}`);
return groupAggregation(this, cols, agg);
}
pivot(groupByCols, pivotCol, valueCol, aggType) {
/**
* Returns a new DataFrame that has been
* transposed by aggregating the value columns
* across the pivoted values in `pivotCol`, and
* grouping these by columns defined by `groupByCols`.
*/
let props = _getDistinctColumnValues(this, pivotCol);
let nullObj = _getNullObjectFromProperties(props);
let rows = [nullObj];
let nullDf = new DataFrame(rows, props);
let groupDf = this.groupBy([...groupByCols, pivotCol], {[valueCol]: aggType});
let df = groupDf.crossJoin(nullDf);
_assignPivotValues(df, pivotCol, `${valueCol}_${aggType}`, props);
let groupByMap = df.groupBy(groupByCols, _getPivotColsAggregationMap(props, aggType));
return groupByMap;
}
unionByName(df) {
/**
* Returns a new DataFrame including rows from
* both DataFrames being unioned.
* Throws an error if the columns between both
* DataFrames are different.
*/
// Check if `df` is a DataFrame
if (!(df instanceof DataFrame))
throw Error(`Can only union with another DataFrame.`);
// Check if this DataFrame and `df` have the same columns.
const diff = lodash.difference(this.columns, df.columns);
if (diff.length)
throw Error(`unionByName() cannot union DataFrames with different columns: '${diff.join(', ')}'`);
return new DataFrame([...this.rows, ...df.rows], this.columns);
}
}