danfojs
Version:
JavaScript library providing high performance, intuitive, and easy to use data structures for manipulating and processing structured data.
273 lines (271 loc) • 8.23 kB
TypeScript
/**
* @license
* Copyright 2022 JsData. All rights reserved.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* ==========================================================================
*/
import DataFrame from "../core/frame";
import { ArrayType1D, ArrayType2D } from "../shared/types";
import Series from "../core/series";
/**
* Performs all groupby operations on a DataFrame, including the aggregate
* functions declared below (count, sum, std, var, mean, min, max, the
* cumulative variants, first, last, size) and custom aggregation through
* `agg` and `apply`.
* @param keyCol Array containing the column names (presumably the columns to group by)
* @param data The DataFrame data as a 2D array; may be null
* @param columnName Array of all column names in the DataFrame
* @param colDtype Array of the column dtypes
* @param colIndex NOTE(review): not described in the original docs; presumably
* the positions of the key columns — confirm against the implementation
*/
export default class Groupby {
/**
* Per-group data generated by `.group()`: an object keyed by the unique
* group keys, where each value holds that group's column arrays
* (see the `group()` documentation for the exact shape).
*/
colDict: {
[key: string]: {};
};
/** Array containing the column names (presumably the columns to group by). */
keyCol: ArrayType1D;
/** The DataFrame data as a 2D array; optional and may be null. */
data?: ArrayType2D | null;
/** Array of all column names in the DataFrame. */
columnName: ArrayType1D;
/** Array of the column dtypes. */
colDtype: ArrayType1D;
/** NOTE(review): undocumented; presumably the positions of the key columns — confirm. */
colIndex: ArrayType1D;
/** NOTE(review): undocumented and typed `any`; purpose must be confirmed against the implementation. */
groupDict?: any;
/** NOTE(review): undocumented; presumably the column names selected through `col()` — confirm. */
groupColNames?: Array<string>;
/**
* Maps each group key to the key-column values associated with it, so those
* values can be recovered without splitting the key string on '-'
* (see the `group()` documentation for the motivation).
*/
keyToValue: {
[key: string]: ArrayType1D;
};
constructor(keyCol: ArrayType1D, data: ArrayType2D | null, columnName: ArrayType1D, colDtype: ArrayType1D, colIndex: ArrayType1D);
/**
* Generate the group object data needed for group operations.
* let data = [ [ 1, 2, 3 ], [ 4, 5, 6 ], [ 20, 30, 40 ], [ 39, 89, 78 ] ];
* let cols = [ "A", "B", "C" ];
* let df = new dfd.DataFrame(data, { columns: cols });
* let groupDf = df.groupby([ "A" ]);
* The following internal object is generated and saved to this.colDict
* {
* '1': { A: [ 1 ], B: [ 2 ], C: [ 3 ] },
* '4': { A: [ 4 ], B: [ 5 ], C: [ 6 ] },
* '20': { A: [ 20 ], B: [ 30 ], C: [ 40 ] },
* '39': { A: [ 39 ], B: [ 89 ], C: [ 78 ] }
* }
* When grouping by more than one column, the group key is the column
* values joined with '-'
* e.g. for df.groupby(['A','B'])
* the result will look like this
* {
* '1-2': {A: [ 1 ], B: [ 2 ], C: [ 3 ]},
* '4-5': {A: [ 4 ], B: [ 5 ], C: [ 6 ]}
* }
* but when doing analysis on a specific column like this
* df.groupby(['A','B']).col(['C'])
* the internal result becomes
* {
* '1-2': { C: [ 3 ]},
* '4-5': {C: [ 6 ]}
* }
* In building our multi-index type of DataFrame for this data,
* we have lost track of the values for columns A and B.
* They could be regenerated by using split('-') on the object keys,
* e.g. '1-2'.split('-') gives the values for A and B.
* But there are edge cases where the A and B values themselves contain '-'
* e.g.
* {
* '1--2-': { C: [ 3 ]},
* '4--5-': {C: [ 6 ]}
* }
* where using `.split('-')` might not work well.
* Hence we create a key-value `keyToValue` object to store each index key
* and its associated values.
* NOTE: The previous implementation used a graph representation of the
* groupby data together with depth-first search (DFS). A plain JavaScript
* key-value object is now used as a hashmap to reduce search time compared
* to the graph and DFS approach.
* @returns Groupby
*/
group(): Groupby;
/**
* Generate new internal groupby data restricted to the given columns.
* group = df.groupby(['A', 'B']).col('C')
* This filters the colDict property generated by `.group()` so that
* each group contains only column `C` in its internal object
* e.g.
* {
* '1-2': {A: [ 1 ], B: [ 2 ], C: [ 3 ]},
* '4-5': {A: [ 4 ], B: [ 5 ], C: [ 6 ]}
* }
* to
* {
* '1-2': { C: [ 3 ]},
* '4-5': {C: [ 6 ]}
* }
* @param colNames column names to keep in each group; the type also permits
* `undefined` (behaviour in that case is not shown here — confirm against
* the implementation)
* @return Groupby
*/
col(colNames: ArrayType1D | undefined): Groupby;
/**
* Perform all groupby arithmetic operations.
* In the previous implementation every group's data was stored as a
* DataFrame, which involved a lot of memory usage. Each group is now a
* plain JavaScript object and all arithmetic operations are done directly
* on JavaScript arrays.
* NOTE: the examples below spell the method `arithmetic`; the declared
* private member is spelled `arithemetic`.
* e.g.
* using this internal data
* {
* '1-2': {A: [ 1,3 ], B: [ 2,5 ], C: [ 3, 5 ]},
* '4-5': {A: [ 4,1 ], B: [ 5,0 ], C: [ 6, 12 ]}
* }
* 1) using groupby(['A', 'B']).arithmetic("mean")
* result:
* {
* '1-2': {A_mean: [ 2 ], B_mean: [ 3.5 ], C_mean: [ 4 ]},
* '4-5': {A_mean: [ 2.5 ], B_mean: [ 2.5 ], C_mean: [ 9 ]}
* }
* 2) .arithmetic({
* A: 'mean',
* B: 'sum',
* C: 'min'
* })
* result:
* {
* '1-2': {A_mean: [ 2 ], B_sum: [ 7 ], C_min: [ 3 ]},
* '4-5': {A_mean: [ 2.5 ], B_sum: [ 5 ], C_min: [ 6 ]}
* }
* 3) .arithmetic({
* A: 'mean',
* B: 'sum',
* C: ['min', 'max']
* })
* result:
* {
* '1-2': {A_mean: [ 2 ], B_sum: [ 7 ], C_min: [ 3 ], C_max: [5]},
* '4-5': {A_mean: [ 2.5 ], B_sum: [ 5 ], C_min: [ 6 ], C_max: [12]}
* }
* @param operation a single operation name, or an object mapping a column
* name to an operation name or a list of operation names (as in the
* examples above)
*/
private arithemetic;
/**
* Perform all arithmetic logic.
* @param colVal NOTE(review): presumably the array of values of one column
* within a group — confirm against the implementation
* @param ops NOTE(review): presumably the operation name(s) to apply — confirm
*/
private groupMathLog;
/**
* Takes the internal groupby data and converts it
* to a single DataFrame.
* @param colDict the internal per-group data
*/
private toDataFrame;
/** Private member; undocumented and its type is not exposed in this declaration. */
private operations;
/**
* Obtain the count for each group
* @returns DataFrame
*
*/
count(): DataFrame;
/**
* Obtain the sum of columns for each group
* @returns DataFrame
*
*/
sum(): DataFrame;
/**
* Obtain the standard deviation of columns for each group
* @returns DataFrame
*/
std(): DataFrame;
/**
* Obtain the variance of columns for each group
* @returns DataFrame
*/
var(): DataFrame;
/**
* Obtain the mean of columns for each group
* @returns DataFrame
*/
mean(): DataFrame;
/**
* Obtain the cumulative sum of columns for each group
* @returns DataFrame
*
*/
cumSum(): DataFrame;
/**
* Obtain the cumulative maximum of columns for each group
* @returns DataFrame
*/
cumMax(): DataFrame;
/**
* Obtain the cumulative product of columns for each group
* @returns DataFrame
*/
cumProd(): DataFrame;
/**
* Obtain the cumulative minimum of columns for each group
* @returns DataFrame
*/
cumMin(): DataFrame;
/**
* Obtain the max value of columns for each group
* @returns DataFrame
*
*/
max(): DataFrame;
/**
* Obtain the min of columns for each group
* @returns DataFrame
*/
min(): DataFrame;
/**
* Obtain a specific group
* @param keys Array<string | number> identifying the group — presumably one
* value per group-by column, in the same order; confirm against the
* implementation
* @returns DataFrame
*/
getGroup(keys: Array<string | number>): DataFrame;
/**
* Perform aggregation on all groups
* @param ops object whose values are an operation name or a list of
* operation names, e.g. `{ A: 'mean', C: ['min', 'max'] }`; the keys are
* presumably column names, matching the examples on the private
* arithmetic helper
* @returns DataFrame
*/
agg(ops: {
[key: string]: Array<string> | string;
}): DataFrame;
/**
* Apply a custom aggregator function
* to each group
* @param callable function that receives a DataFrame and returns a
* DataFrame or a Series
* @returns DataFrame
* @example
* let grp = df.groupby(['A'])
* grp.apply((x) => x.count())
*/
apply(callable: (x: DataFrame) => DataFrame | Series): DataFrame;
/** Private member; undocumented and its type is not exposed in this declaration. */
private concatGroups;
/**
* Obtain the total number of groups
* @returns number
*/
get ngroups(): number;
/**
* Obtain the internal group data
* @returns an object keyed by group key, one entry per group
*/
get groups(): {
[keys: string]: {};
};
/**
* Obtain the first row of each group
* @returns DataFrame
*/
first(): DataFrame;
/**
* Obtain the last row of each group
* @returns DataFrame
*/
last(): DataFrame;
/**
* Obtain the size of each group
* (the original comment read "dataframe se"; presumably the number of
* rows per group — confirm against the implementation)
* @returns DataFrame
*/
size(): DataFrame;
/** Private member; undocumented and its type is not exposed in this declaration. */
private colKeyDict;
}