/*
 * danfojs
 * Version: (not captured)
 * JavaScript library providing high-performance, intuitive, and easy-to-use
 * data structures for manipulating and processing structured data.
 * Viewer metadata: 651 lines (649 loc), 22.9 kB, JavaScript
 */
"use strict";
/**
 * Compiler helper: copy the own enumerable properties of every argument
 * after the first onto the first argument and return it.
 * Uses Object.assign when the runtime provides it, otherwise a manual copy.
 */
var __assign = (this && this.__assign) || function () {
    var hasOwn = Object.prototype.hasOwnProperty;
    // Manual fallback used only when Object.assign is unavailable.
    var copyOwnProps = function (target) {
        for (var argIdx = 1; argIdx < arguments.length; argIdx += 1) {
            var source = arguments[argIdx];
            for (var prop in source) {
                if (hasOwn.call(source, prop)) {
                    target[prop] = source[prop];
                }
            }
        }
        return target;
    };
    // Rebind on first use so later calls go straight to the chosen implementation.
    __assign = Object.assign || copyOwnProps;
    return __assign.apply(this, arguments);
};
/**
 * Compiler helper for array spread: return `to` concatenated with `from`.
 * When `pack` is truthy (or only two arguments are given), holes in `from`
 * are materialised as explicit `undefined` entries before concatenating.
 */
var __spreadArray = (this && this.__spreadArray) || function (to, from, pack) {
    // Created lazily, the first time a hole is found in `from`.
    var packed;
    if (pack || arguments.length === 2) {
        var count = from.length;
        for (var pos = 0; pos < count; pos += 1) {
            if (packed === undefined && pos in from) {
                continue;
            }
            if (packed === undefined) {
                packed = Array.prototype.slice.call(from, 0, pos);
            }
            packed[pos] = from[pos];
        }
    }
    return to.concat(packed !== undefined ? packed : Array.prototype.slice.call(from));
};
/**
 * Compiler helper for default imports: modules flagged `__esModule` are
 * returned unchanged, anything else is wrapped as `{ default: mod }`.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
/**
* @license
* Copyright 2022 JsData. All rights reserved.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* ==========================================================================
*/
var frame_1 = __importDefault(require("../core/frame"));
var mathjs_1 = require("mathjs");
var concat_1 = __importDefault(require("../transformers/concat"));
var series_1 = __importDefault(require("../core/series"));
/**
 * Performs all groupby operations on a DataFrame,
 * including all aggregate functions.
 * Internally, `colDict` holds one entry per unique group key.
 * @param {Array} keyCol Names of the columns to group by
 * @param {Array} data The DataFrame's row data
 * @param {Array} columnName Names of all columns in the DataFrame
 * @param {Array} colDtype Dtype of each column
 * @param {Array} colIndex Positions of the group-by columns within each row
 */
var Groupby = /** @class */ (function () {
/**
 * Store the inputs of a group-by and start with empty group state.
 * `colDict` and `keyToValue` are filled in later by `group()`.
 * @param keyCol names of the key columns
 * @param data row data; each row is indexed by column position
 * @param columnName names of all columns, in row order
 * @param colDtype dtype of each column, parallel to `columnName`
 * @param colIndex positions of the key columns within a row
 */
function Groupby(keyCol, data, columnName, colDtype, colIndex) {
    Object.assign(this, {
        colDict: {},
        keyToValue: {},
        keyCol: keyCol,
        data: data,
        columnName: columnName,
        colDtype: colDtype,
        colIndex: colIndex
    });
}
/**
 * Generate the group data needed by every other group operation.
 *
 *   let data = [ [ 1, 2, 3 ], [ 4, 5, 6 ], [ 20, 30, 40 ], [ 39, 89, 78 ] ];
 *   let cols = [ "A", "B", "C" ];
 *   let df = new dfd.DataFrame(data, { columns: cols });
 *   let groupDf = df.groupby([ "A" ]);
 *
 * The following object is generated and saved to this.colDict:
 *   {
 *     '1': { A: [ 1 ], B: [ 2 ], C: [ 3 ] },
 *     '4': { A: [ 4 ], B: [ 5 ], C: [ 6 ] },
 *     '20': { A: [ 20 ], B: [ 30 ], C: [ 40 ] },
 *     '39': { A: [ 39 ], B: [ 89 ], C: [ 78 ] }
 *   }
 * When grouping by more than one column the key values are joined with '-',
 * e.g. for df.groupby(['A','B']):
 *   {
 *     '1-2': { A: [ 1 ], B: [ 2 ], C: [ 3 ] },
 *     '4-5': { A: [ 4 ], B: [ 5 ], C: [ 6 ] }
 *   }
 * After selecting specific columns, e.g. df.groupby(['A','B']).col(['C']),
 * the per-group data only holds those columns:
 *   {
 *     '1-2': { C: [ 3 ] },
 *     '4-5': { C: [ 6 ] }
 *   }
 * so the values of A and B are no longer present in the group data. They
 * cannot reliably be recovered with key.split('-') because the values
 * themselves may contain '-' (e.g. the key '1--2-'). Hence `keyToValue`
 * maps every group key to the original array of key values.
 *
 * NOTE: a previous implementation used a graph representation with
 * depth-first search; a plain object is used as a hash map instead to
 * reduce lookup time.
 *
 * NOTE(review): because keys are joined with '-', different value
 * combinations that join to the same string (e.g. ['1-2', '3'] and
 * ['1', '2-3']) end up in the same group. A key of '__proto__' is also
 * not representable as an own property of the plain result object.
 *
 * @returns this Groupby, with `colDict` and `keyToValue` populated
 */
Groupby.prototype.group = function () {
    // Own-property checks: a truthiness test on a plain object would match
    // inherited members such as "constructor" or "toString".
    var hasOwn = Object.prototype.hasOwnProperty;
    var keyColumnPositions = this.colIndex;
    var columnNames = this.columnName;
    // Missing data yields no groups instead of leaving colDict undefined.
    var rows = (this.data === null || this.data === undefined) ? [] : this.data;
    var groups = {};
    var keyToValue = {};
    for (var rowIdx = 0; rowIdx < rows.length; rowIdx++) {
        var row = rows[rowIdx];
        var keyValues = [];
        for (var k = 0; k < keyColumnPositions.length; k++) {
            keyValues.push(row[keyColumnPositions[k]]);
        }
        var groupKey = keyValues.join('-');
        if (!hasOwn.call(keyToValue, groupKey)) {
            keyToValue[groupKey] = keyValues;
        }
        var c;
        if (hasOwn.call(groups, groupKey)) {
            var existing = groups[groupKey];
            for (c = 0; c < columnNames.length; c++) {
                existing[columnNames[c]].push(row[c]);
            }
        }
        else {
            var created = {};
            for (c = 0; c < columnNames.length; c++) {
                created[columnNames[c]] = [row[c]];
            }
            groups[groupKey] = created;
        }
    }
    this.colDict = groups;
    this.keyToValue = keyToValue;
    return this;
};
/**
 * Create a new Groupby restricted to the given columns.
 *   group = df.groupby(['A', 'B']).col(['C'])
 * This filters the colDict generated by `.group()` so that every group
 * only contains the selected columns, e.g.
 *   {
 *     '1-2': { A: [ 1 ], B: [ 2 ], C: [ 3 ] },
 *     '4-5': { A: [ 4 ], B: [ 5 ], C: [ 6 ] }
 *   }
 * becomes
 *   {
 *     '1-2': { C: [ 3 ] },
 *     '4-5': { C: [ 6 ] }
 *   }
 * The column arrays themselves are shared with this instance, not copied.
 * @param colNames array of column names; a single name may be passed as a
 *   string; when omitted (undefined/null) every non-key column is selected
 * @throws Error when a requested column is not one of the frame's columns
 * @return Groupby a new instance sharing `keyToValue` with this one
 */
Groupby.prototype.col = function (colNames) {
    var self = this;
    var selected;
    if (colNames === undefined || colNames === null) {
        selected = this.columnName.filter(function (_, index) {
            return !self.colIndex.includes(index);
        });
    }
    else if (typeof colNames === "string") {
        // Lets the `.col('C')` form work like `.col(['C'])`.
        selected = [colNames];
    }
    else {
        selected = colNames;
    }
    selected.forEach(function (name) {
        if (!self.columnName.includes(name))
            throw new Error("Column " + name + " does not exist in groups");
    });
    var filtered = {};
    Object.keys(this.colDict).forEach(function (groupKey) {
        var groupCols = self.colDict[groupKey];
        var kept = {};
        for (var i = 0; i < selected.length; i++) {
            kept[selected[i]] = groupCols[selected[i]];
        }
        filtered[groupKey] = kept;
    });
    var gp = new Groupby(this.keyCol, null, this.columnName, this.colDtype, this.colIndex);
    gp.colDict = filtered;
    gp.groupColNames = selected;
    gp.keyToValue = this.keyToValue;
    return gp;
};
/**
 * Perform all groupby arithmetic operations.
 * In a previous implementation every group was stored as a DataFrame,
 * which used a lot of memory. Groups are now plain JavaScript objects and
 * every operation runs directly on JavaScript arrays.
 *
 * Each result column is named `<column>_<operation>`. For example, with
 * selected columns A, B and C and this internal data
 *   {
 *     '1-2': { A: [ 1, 3 ], B: [ 2, 5 ], C: [ 3, 5 ] },
 *     '4-5': { A: [ 4, 1 ], B: [ 5, 0 ], C: [ 6, 12 ] }
 *   }
 * 1) .arithemetic("mean") gives
 *   {
 *     '1-2': { A_mean: [ 2 ], B_mean: [ 3.5 ], C_mean: [ 4 ] },
 *     '4-5': { A_mean: [ 2.5 ], B_mean: [ 2.5 ], C_mean: [ 9 ] }
 *   }
 * 2) .arithemetic({ A: 'mean', B: 'sum', C: 'min' }) gives
 *   {
 *     '1-2': { A_mean: [ 2 ], B_sum: [ 7 ], C_min: [ 3 ] },
 *     '4-5': { A_mean: [ 2.5 ], B_sum: [ 5 ], C_min: [ 6 ] }
 *   }
 * 3) .arithemetic({ A: 'mean', B: 'sum', C: ['min', 'max'] }) gives
 *   {
 *     '1-2': { A_mean: [ 2 ], B_sum: [ 7 ], C_min: [ 3 ], C_max: [ 5 ] },
 *     '4-5': { A_mean: [ 2.5 ], B_sum: [ 5 ], C_min: [ 6 ], C_max: [ 12 ] }
 *   }
 * @param operation an operation name applied to every selected column, or
 *   an object mapping column name to an operation name or array of names;
 *   selected columns without an entry in the object are skipped
 * @throws Error when an operation name is not supported, or when an
 *   operation other than "count" is requested for a "string" column
 * @returns object mapping each group key to its aggregated columns
 */
Groupby.prototype.arithemetic = function (operation) {
    var opsName = ["mean", "sum", "count", "mode", "std", "var", "cumsum", "cumprod",
        "cummax", "cummin", "median", "min", "max"];
    var hasOwn = Object.prototype.hasOwnProperty;
    var isSingleOp = typeof operation === "string";
    if (isSingleOp) {
        if (!opsName.includes(operation)) {
            throw new Error("group operation: " + operation + " is not valid");
        }
    }
    else {
        Object.keys(operation).forEach(function (key) {
            var requestedOps = Array.isArray(operation[key]) ? operation[key] : [operation[key]];
            requestedOps.forEach(function (op) {
                if (!opsName.includes(op)) {
                    throw new Error("group operation: " + op + " for column " + key + " is not valid");
                }
            });
        });
    }
    if (!this.groupColNames) {
        // No explicit column selection yet: use the default selection of
        // col(), the same fallback operations() applies.
        return this.col(undefined).arithemetic(operation);
    }
    var groupColNames = this.groupColNames;
    var result = {};
    var groupKeys = Object.keys(this.colDict);
    for (var g = 0; g < groupKeys.length; g++) {
        var groupKey = groupKeys[g];
        var groupCols = this.colDict[groupKey];
        var aggregated = {};
        for (var c = 0; c < groupColNames.length; c++) {
            var colName = groupColNames[c];
            var requested;
            if (isSingleOp) {
                requested = operation;
            }
            else {
                requested = hasOwn.call(operation, colName) ? operation[colName] : undefined;
            }
            if (requested === undefined) {
                // Nothing requested for this column; do not emit "<col>_undefined".
                continue;
            }
            var opList = Array.isArray(requested) ? requested : [requested];
            var colDtype = this.colDtype[this.columnName.indexOf(colName)];
            for (var o = 0; o < opList.length; o++) {
                var op = opList[o];
                // Checked per operation so that ['count'] stays valid on string columns.
                if (colDtype === "string" && op !== "count")
                    throw new Error("Can't perform math operation on column " + colName);
                aggregated[colName + "_" + op] = this.groupMathLog(groupCols[colName], op);
            }
        }
        result[groupKey] = aggregated;
    }
    return result;
};
/**
 * Apply one aggregate or cumulative operation to the values of a single
 * group column.
 * "max", "min", "sum", "count" and "mean" are computed directly;
 * "std", "var", "median" and "mode" delegate to the matching mathjs
 * function; "cumsum", "cumprod", "cummin" and "cummax" produce one running
 * value per input element.
 * @param colVal array of values belonging to one group
 * @param ops name of the operation
 * @returns array holding the result; empty when `ops` is not recognised
 */
Groupby.prototype.groupMathLog = function (colVal, ops) {
    var larger = function (a, b) { return a > b ? a : b; };
    var smaller = function (a, b) { return a < b ? a : b; };
    var plus = function (a, b) { return a + b; };
    var times = function (a, b) { return a * b; };
    // Running fold: emits the accumulator after each element.
    var scan = function (values, combine, seed) {
        var out = [];
        var acc = seed;
        values.forEach(function (item) {
            acc = combine(acc, item);
            out.push(acc);
        });
        return out;
    };
    // Running fold seeded with (and starting its output from) the first element.
    var scanFromFirst = function (combine) {
        var head = colVal[0];
        return [head].concat(scan(colVal.slice(1), combine, head));
    };
    var handlers = {
        "max": function () { return [colVal.reduce(larger)]; },
        "min": function () { return [colVal.reduce(smaller)]; },
        "sum": function () { return [colVal.reduce(plus)]; },
        "count": function () { return [colVal.length]; },
        "mean": function () { return [colVal.reduce(plus) / colVal.length]; },
        "std": function () { return [(0, mathjs_1.std)(colVal)]; },
        "var": function () { return [(0, mathjs_1.variance)(colVal)]; },
        "median": function () { return [(0, mathjs_1.median)(colVal)]; },
        "mode": function () { return [(0, mathjs_1.mode)(colVal)]; },
        "cumsum": function () { return scan(colVal, plus, 0); },
        "cumprod": function () { return scan(colVal, times, 1); },
        "cummin": function () { return scanFromFirst(smaller); },
        "cummax": function () { return scanFromFirst(larger); }
    };
    // Only exact string names that are own keys of the table count as a match.
    var known = typeof ops === "string" && Object.prototype.hasOwnProperty.call(handlers, ops);
    return known ? handlers[ops]() : [];
};
/**
 * Take internal groupby data and convert it to a single DataFrame.
 * Groups are visited in the order returned by `colKeyDict`. For every
 * group the key columns (named after `keyCol`, filled with the group's
 * values from `keyToValue`) are placed before the group's own columns,
 * and the rows of all groups are appended column by column.
 * The arrays inside `colDict` are copied, never modified.
 * @param colDict object mapping group key to an object of column arrays
 * @returns DataFrame built from the combined columns
 */
Groupby.prototype.toDataFrame = function (colDict) {
    var keyCol = this.keyCol;
    var orderedKeys = this.colKeyDict(colDict);
    var data = null;
    var columns = [];
    for (var g = 0; g < orderedKeys.length; g++) {
        var groupKey = orderedKeys[g];
        var value = colDict[groupKey];
        // Every column of a group has the same length; use the first one.
        var valueLen = Object.values(value)[0].length;
        var combine = {};
        for (var k = 0; k < keyCol.length; k++) {
            combine[keyCol[k]] = Array(valueLen).fill(this.keyToValue[groupKey][k]);
        }
        var valueNames = Object.keys(value);
        for (var v = 0; v < valueNames.length; v++) {
            combine[valueNames[v]] = value[valueNames[v]];
        }
        var c;
        if (data === null) {
            // First group: copy so later appends never touch the caller's arrays.
            data = {};
            columns = Object.keys(combine);
            for (c = 0; c < columns.length; c++) {
                data[columns[c]] = combine[columns[c]].slice();
            }
        }
        else {
            // Append in place rather than rebuilding each column per group.
            for (c = 0; c < columns.length; c++) {
                var target = data[columns[c]];
                var source = combine[columns[c]];
                for (var i = 0; i < source.length; i++) {
                    target.push(source[i]);
                }
            }
        }
    }
    return new frame_1.default(data === null ? {} : data);
};
/**
 * Run an aggregate operation and return the result as a DataFrame.
 * When no columns have been selected yet (`groupColNames` is unset) the
 * operation runs on the Groupby returned by `col(undefined)`.
 * @param ops operation passed through to `arithemetic`
 * @returns DataFrame
 */
Groupby.prototype.operations = function (ops) {
    var target = this.groupColNames ? this : this.col(undefined);
    var aggregated = target.arithemetic(ops);
    return target.toDataFrame(aggregated);
};
/**
 * Shorthand aggregations. Each method takes no arguments and returns the
 * DataFrame produced by `this.operations(<operation name>)`:
 *   count()   - count for each group
 *   sum()     - sum of columns for each group
 *   std()     - standard deviation of columns for each group
 *   var()     - variance of columns for each group
 *   mean()    - mean of columns for each group
 *   cumSum()  - cumsum of columns for each group
 *   cumMax()  - cummax of columns for each group
 *   cumProd() - cumprod of columns for each group
 *   cumMin()  - cummin of columns for each group
 *   max()     - max value of columns for each group
 *   min()     - min of columns for each group
 */
var shortcutOps = [
    ["count", "count"],
    ["sum", "sum"],
    ["std", "std"],
    ["var", "var"],
    ["mean", "mean"],
    ["cumSum", "cumsum"],
    ["cumMax", "cummax"],
    ["cumProd", "cumprod"],
    ["cumMin", "cummin"],
    ["max", "max"],
    ["min", "min"]
];
shortcutOps.forEach(function (entry) {
    var methodName = entry[0];
    var opName = entry[1];
    Groupby.prototype[methodName] = function () {
        return this.operations(opName);
    };
});
/**
 * Obtain a specific group.
 * The key values are joined with "-" to form the group key, matching the
 * keys of `colDict`.
 * @param keys Array<string | number> one value per key column; a single
 *   value may be passed without wrapping it in an array
 * @throws Error when no group exists for the given key
 * @returns DataFrame
 */
Groupby.prototype.getGroup = function (keys) {
    var keyParts = Array.isArray(keys) ? keys : [keys];
    var dictKey = keyParts.join("-");
    // Fail early with a clear message instead of an opaque TypeError from toDataFrame.
    if (!Object.prototype.hasOwnProperty.call(this.colDict, dictKey)) {
        throw new Error("Key " + dictKey + " does not exist in groups");
    }
    var colDict = {};
    colDict[dictKey] = Object.assign({}, this.colDict[dictKey]);
    return this.toDataFrame(colDict);
};
/**
 * Aggregate every group using a per-column operation spec.
 * The keys of `ops` choose the columns; `ops` itself is passed on to
 * `arithemetic`.
 * @param ops object mapping column name to operation(s)
 * @returns DataFrame
 */
Groupby.prototype.agg = function (ops) {
    var selection = this.col(Object.keys(ops));
    var aggregated = selection.arithemetic(ops);
    return selection.toDataFrame(aggregated);
};
/**
 * Run a custom function on every group.
 * Each group's columns are wrapped in a DataFrame that is handed to
 * `callable`; the per-group results are then combined by `concatGroups`.
 * @param callable function receiving one group's DataFrame
 * @returns DataFrame
 * @example
 * let grp = df.groupby(['A'])
 * grp.apply((x) => x.count())
 */
Groupby.prototype.apply = function (callable) {
    var self = this;
    var orderedKeys = this.colKeyDict(this.colDict);
    var results = orderedKeys.reduce(function (acc, groupKey) {
        var groupFrame = new frame_1.default(self.colDict[groupKey]);
        acc[groupKey] = callable(groupFrame);
        return acc;
    }, {});
    return this.concatGroups(results);
};
/**
 * Combine per-group results into one DataFrame.
 * A DataFrame result is copied. Any other result is read through its
 * `index` and `values` properties and turned into a one-row DataFrame,
 * using `index` as column names when it has more than one entry and
 * 'applyOps' otherwise. Each key column is then inserted at its position
 * in `keyCol`, filled with the group's key value; the column name gets a
 * "_Group" suffix when no columns were selected (`groupColNames` unset).
 * Finally all frames are concatenated along axis 0.
 * @param colDict object mapping group key to that group's result
 * @returns DataFrame
 */
Groupby.prototype.concatGroups = function (colDict) {
    var self = this;
    var frames = Object.keys(colDict).map(function (groupKey) {
        var result = colDict[groupKey];
        var frame;
        if (result instanceof frame_1.default) {
            frame = result.copy();
        }
        else {
            var labels = result.index;
            labels = labels.length > 1 ? labels : ['applyOps'];
            frame = new frame_1.default([result.values], { columns: labels });
        }
        // Row count is taken once, before any key column is inserted.
        var rowCount = frame.shape[0];
        self.keyCol.forEach(function (keyName, position) {
            var filled = Array(rowCount).fill(self.keyToValue[groupKey][position]);
            var label = self.groupColNames ? keyName : keyName + "_Group";
            frame.addColumn(label, filled, { inplace: true, atIndex: position });
        });
        return frame;
    });
    return (0, concat_1.default)({ dfList: frames, axis: 0 });
};
Object.defineProperties(Groupby.prototype, {
    ngroups: {
        /**
         * Total number of groups, i.e. the number of keys in `colDict`.
         * @returns number
         */
        get: function () {
            return Object.keys(this.colDict).length;
        },
        enumerable: false,
        configurable: true
    },
    groups: {
        /**
         * The internal group data (`colDict`) itself, not a copy.
         * @returns {[keys: string]: {}}
         */
        get: function () {
            return this.colDict;
        },
        enumerable: false,
        configurable: true
    }
});
/**
 * Obtain the first row of each group, by applying `head(1)` to every
 * group's DataFrame.
 * @returns DataFrame
 */
Groupby.prototype.first = function () {
    var takeFirstRow = function (groupFrame) {
        return groupFrame.head(1);
    };
    return this.apply(takeFirstRow);
};
/**
 * Obtain the last row of each group, by applying `tail(1)` to every
 * group's DataFrame.
 * @returns DataFrame
 */
Groupby.prototype.last = function () {
    var takeLastRow = function (groupFrame) {
        return groupFrame.tail(1);
    };
    return this.apply(takeLastRow);
};
/**
 * Obtain the size (number of rows) of each group.
 * Every group is mapped to a one-element Series holding `shape[0]` of
 * that group's DataFrame.
 * @returns DataFrame
 */
Groupby.prototype.size = function () {
    var countRows = function (groupFrame) {
        var rowCount = groupFrame.shape[0];
        return new series_1.default([rowCount]);
    };
    return this.apply(countRows);
};
/**
 * Return the group keys of `colDict` ordered so that keys sharing the same
 * first "-"-separated segment are adjacent.
 * Segments are collected as properties of an object, so the result follows
 * JavaScript property order: integer-like first segments come first in
 * ascending numeric order, all other segments follow in first-seen order.
 * Within one segment the keys keep their order from `Object.keys(colDict)`.
 * NOTE(review): a key that starts with "-" (e.g. a negative first value)
 * has an empty first segment, so all such keys are bucketed together.
 * @param colDict object whose keys are group keys
 * @returns array of group keys
 */
Groupby.prototype.colKeyDict = function (colDict) {
    // Null-prototype map: segments such as "constructor" or "toString"
    // must not resolve to members inherited from Object.prototype.
    var buckets = Object.create(null);
    var allKeys = Object.keys(colDict);
    for (var i = 0; i < allKeys.length; i++) {
        var key = allKeys[i];
        var firstKey = key.split("-")[0];
        if (buckets[firstKey] === undefined) {
            buckets[firstKey] = [key];
        }
        else {
            buckets[firstKey].push(key);
        }
    }
    var ordered = [];
    var segments = Object.keys(buckets);
    for (var s = 0; s < segments.length; s++) {
        var members = buckets[segments[s]];
        // Plain loop rather than push.apply, so bucket size is not limited
        // by the engine's maximum argument count.
        for (var m = 0; m < members.length; m++) {
            ordered.push(members[m]);
        }
    }
    return ordered;
};
return Groupby;
}());
exports.default = Groupby;