ml-random-forest — random forest for classification and regression (JavaScript bundle, 730 lines / 658 loc, 23 kB)
'use strict';
Object.defineProperty(exports, '__esModule', { value: true });
var arrayMode = require('ml-array-mode');
var mlCart = require('ml-cart');
var mlMatrix = require('ml-matrix');
var Random = require('random-js');
var arrayMean = require('ml-array-mean');
var arrayMedian = require('ml-array-median');
/**
 * Rollup interop helper: wraps a CommonJS export in `{ default: e }` unless it
 * already carries a `default` property, in which case it is returned as-is.
 * @param {*} e - whatever `require()` returned.
 * @return {object} an object with a `default` property.
 */
function _interopDefaultLegacy(e) {
  if (e && typeof e === 'object' && 'default' in e) {
    return e;
  }
  return { 'default': e };
}
function _interopNamespace(e) {
if (e && e.__esModule) return e;
var n = Object.create(null);
if (e) {
Object.keys(e).forEach(function (k) {
if (k !== 'default') {
var d = Object.getOwnPropertyDescriptor(e, k);
Object.defineProperty(n, k, d.get ? d : {
enumerable: true,
get: function () { return e[k]; }
});
}
});
}
n["default"] = e;
return Object.freeze(n);
}
// Wrap the raw requires with the interop helpers so both default-style and
// namespace-style access work regardless of how each dependency was authored.
var arrayMode__default = /*#__PURE__*/_interopDefaultLegacy(arrayMode);
var Random__namespace = /*#__PURE__*/_interopNamespace(Random);
var arrayMean__default = /*#__PURE__*/_interopDefaultLegacy(arrayMean);
var arrayMedian__default = /*#__PURE__*/_interopDefaultLegacy(arrayMedian);
/**
 * Checks whether `n` lies in the half-open interval (0, 1], i.e. is usable
 * as a fraction of the available features/samples.
 * @param {number} n
 * @return {boolean}
 */
function checkFloat(n) {
  if (n <= 0) return false;
  return n <= 1;
}
/**
 * Checks whether `n` is a non-integer number (note: like the original form
 * `Number(n) === n && n % 1 !== 0`, Infinity counts as a float and NaN does
 * not).
 * @param {*} n
 * @return {boolean}
 */
function isFloat(n) {
  return typeof n === 'number' && !Number.isNaN(n) && n % 1 !== 0;
}
/**
 * Selects n elements with replacement from the training set and values, where
 * n is the size of the training set (a standard bootstrap sample). Also
 * reports the rows that were never drawn (the Out-Of-Bag rows).
 * @ignore
 * @param {Matrix} trainingSet
 * @param {Array} trainingValue
 * @param {number} seed - seed for the random selection, must be a 32-bit integer.
 * @return {object} with:
 *   - X {Matrix}: the bootstrapped training set,
 *   - y {Array}: the matching training values,
 *   - Xoob {Matrix}: rows never selected (Out-Of-Bag),
 *   - ioob {Array}: original row indexes of Xoob rows, in increasing order,
 *   - seed {number}: next engine output, used by the caller to chain seeds.
 */
function examplesBaggingWithReplacement(
  trainingSet,
  trainingValue,
  seed,
) {
  let engine;
  let distribution = Random__namespace.integer(0, trainingSet.rows - 1);
  if (seed === undefined) {
    // No seed given: auto-seed for non-reproducible sampling.
    engine = Random__namespace.MersenneTwister19937.autoSeed();
  } else if (Number.isInteger(seed)) {
    engine = Random__namespace.MersenneTwister19937.seed(seed);
  } else {
    throw new RangeError(
      `Expected seed must be undefined or integer not ${seed}`,
    );
  }
  let Xr = new Array(trainingSet.rows);
  let yr = new Array(trainingSet.rows);
  // oob[i] counts how many times row i has been drawn; oobN tracks how many
  // rows are still never-drawn (the eventual OOB count).
  let oob = new Array(trainingSet.rows).fill(0);
  let oobN = trainingSet.rows;
  for (let i = 0; i < trainingSet.rows; ++i) {
    let index = distribution(engine);
    Xr[i] = trainingSet.getRow(index);
    yr[i] = trainingValue[index];
    // First time this row is drawn: one fewer OOB candidate.
    if (oob[index]++ === 0) {
      oobN--;
    }
  }
  let Xoob = new Array(oobN);
  let ioob = new Array(oobN);
  // run backwards to have ioob filled in increasing order
  // (oobN is decremented toward 0 while filling the arrays from the back, so
  // the loop also stops early once every OOB slot has been filled).
  for (let i = trainingSet.rows - 1; i >= 0 && oobN > 0; --i) {
    if (oob[i] === 0) {
      Xoob[--oobN] = trainingSet.getRow(i);
      ioob[oobN] = i;
    }
  }
  return {
    X: new mlMatrix.Matrix(Xr),
    y: yr,
    Xoob: new mlMatrix.Matrix(Xoob),
    ioob,
    seed: engine.next(),
  };
}
/**
 * Selects n features (columns) from the training set with or without
 * replacement; returns the new training set and the column indexes used.
 * @ignore
 * @param {Matrix} trainingSet
 * @param {number} n - number of features to select.
 * @param {boolean} replacement - whether the same column may be picked twice.
 * @param {number} seed - seed for the random selection, must be a 32-bit integer.
 * @return {object} with:
 *   - X {Matrix}: the column-sampled training set,
 *   - usedIndex {Array}: the selected column indexes (in pick order),
 *   - seed {number}: next engine output, used by the caller to chain seeds.
 */
function featureBagging(trainingSet, n, replacement, seed) {
  if (trainingSet.columns < n) {
    throw new RangeError(
      'N should be less or equal to the number of columns of X',
    );
  }
  let distribution = Random__namespace.integer(0, trainingSet.columns - 1);
  let engine;
  if (seed === undefined) {
    // No seed given: auto-seed for non-reproducible sampling.
    engine = Random__namespace.MersenneTwister19937.autoSeed();
  } else if (Number.isInteger(seed)) {
    engine = Random__namespace.MersenneTwister19937.seed(seed);
  } else {
    throw new RangeError(
      `Expected seed must be undefined or integer not ${seed}`,
    );
  }
  let toRet = new mlMatrix.Matrix(trainingSet.rows, n);
  let usedIndex;
  let index;
  if (replacement) {
    // With replacement: n independent draws, duplicates allowed.
    usedIndex = new Array(n);
    for (let i = 0; i < n; ++i) {
      index = distribution(engine);
      usedIndex[i] = index;
      toRet.setColumn(i, trainingSet.getColumn(index));
    }
  } else {
    // Without replacement: rejection-sample until an unused column comes up.
    // (The guard above guarantees n <= columns, so this terminates.)
    usedIndex = new Set();
    index = distribution(engine);
    for (let i = 0; i < n; ++i) {
      while (usedIndex.has(index)) {
        index = distribution(engine);
      }
      toRet.setColumn(i, trainingSet.getColumn(index));
      usedIndex.add(index);
    }
    usedIndex = Array.from(usedIndex);
  }
  return {
    X: toRet,
    usedIndex: usedIndex,
    seed: engine.next(),
  };
}
/**
 * Collects and combines the individual results from the tree predictions on
 * Out-Of-Bag data. Each tree's `index` array is sorted in increasing order
 * (as produced by `examplesBaggingWithReplacement`), so a single forward
 * cursor per tree is enough to match predictions to samples.
 * @ignore
 * @param {{index: Array, predicted: Array}[]} oob - array of individual tree predictions
 * @param {Array} y - true labels
 * @param {(predictions: Array) => number} aggregate - aggregation function
 * @return {Array} one `{ true, all, predicted }` entry per training sample.
 */
const collectOOB = (oob, y, aggregate) => {
  // One cursor per tree instead of repeatedly re-slicing `o.index` and
  // `o.predicted`: the original slice(1)-per-match approach was accidentally
  // O(samples * trees * predictions) and mutated the caller's arrays; this
  // version is O(samples * trees) and leaves `oob` untouched.
  const cursors = new Array(oob.length).fill(0);
  const res = new Array(y.length);
  for (let i = 0; i < y.length; i++) {
    const all = [];
    for (let j = 0; j < oob.length; j++) {
      const o = oob[j];
      if (cursors[j] < o.index.length && o.index[cursors[j]] === i) {
        all.push(o.predicted[cursors[j]]);
        cursors[j]++;
      }
    }
    res[i] = { true: y[i], all: all, predicted: aggregate(all) };
  }
  return res;
};
/**
 * Base class implementing the logic shared by the random forest classifier
 * and regressor: sample/feature bagging, per-tree training, prediction
 * aggregation, Out-Of-Bag bookkeeping and (de)serialization.
 * @class RandomForestBase
 */
class RandomForestBase {
  /**
   * Create a new base random forest for a classifier or regression model.
   * @constructor
   * @param {object} options
   * @param {number|String} [options.maxFeatures] - the number of features used on each estimator.
   * * if is an integer it selects maxFeatures elements over the sample features.
   * * if is a float between (0, 1), it takes the percentage of features.
   * @param {boolean} [options.replacement] - use replacement over the sample features.
   * @param {number} [options.seed] - seed for feature and samples selection, must be a 32-bit integer.
   * @param {number} [options.nEstimators] - number of estimator to use.
   * @param {object} [options.treeOptions] - options for the tree classifier, see [ml-cart]{@link https://mljs.github.io/decision-tree-cart/}
   * @param {boolean} [options.isClassifier] - boolean to check if is a classifier or regression model (used by subclasses).
   * @param {boolean} [options.useSampleBagging] - use bagging over training samples.
   * @param {boolean} [options.noOOB] - don't calculate Out-Of-Bag predictions.
   * @param {object} model - for load purposes.
   */
  constructor(options, model) {
    if (options === true) {
      // Load path: `options === true` signals that `model` is a plain object
      // previously produced by toJSON(); restore every field from it.
      this.replacement = model.replacement;
      this.maxFeatures = model.maxFeatures;
      this.nEstimators = model.nEstimators;
      this.treeOptions = model.treeOptions;
      this.isClassifier = model.isClassifier;
      this.seed = model.seed;
      this.n = model.n;
      this.indexes = model.indexes;
      this.useSampleBagging = model.useSampleBagging;
      // A reloaded model carries no training data, so OOB predictions can
      // never be recomputed for it.
      this.noOOB = true;
      this.maxSamples = model.maxSamples;
      let Estimator = this.isClassifier ? mlCart.DecisionTreeClassifier : mlCart.DecisionTreeRegression;
      this.estimators = model.estimators.map((est) => Estimator.load(est));
    } else {
      // Normal construction path: copy the (already defaulted) options.
      this.replacement = options.replacement;
      this.maxFeatures = options.maxFeatures;
      this.nEstimators = options.nEstimators;
      this.treeOptions = options.treeOptions;
      this.isClassifier = options.isClassifier;
      this.seed = options.seed;
      this.useSampleBagging = options.useSampleBagging;
      this.noOOB = options.noOOB;
      this.maxSamples = options.maxSamples;
    }
  }
  /**
   * Train the decision tree with the given training set and labels.
   * Trains `nEstimators` trees, each on a (possibly) bootstrapped sample of
   * the rows and a random subset of `n` columns, chaining the RNG seed from
   * one estimator to the next for reproducibility.
   * @param {Matrix|Array} trainingSet
   * @param {Array} trainingValues
   */
  train(trainingSet, trainingValues) {
    let currentSeed = this.seed;
    trainingSet = mlMatrix.Matrix.checkMatrix(trainingSet);
    // NOTE(review): `||` means maxFeatures = 0 also falls back to all columns.
    this.maxFeatures = this.maxFeatures || trainingSet.columns;
    this.numberFeatures = trainingSet.columns;
    this.numberSamples = trainingSet.rows;
    // Resolve maxFeatures into `n`, the per-tree column count: a fraction in
    // (0, 1] is a percentage of columns, an integer is an absolute count.
    if (checkFloat(this.maxFeatures)) {
      this.n = Math.floor(trainingSet.columns * this.maxFeatures);
    } else if (Number.isInteger(this.maxFeatures)) {
      if (this.maxFeatures > trainingSet.columns) {
        throw new RangeError(
          `The maxFeatures parameter should be less than ${trainingSet.columns}`,
        );
      } else {
        this.n = this.maxFeatures;
      }
    } else {
      throw new RangeError(
        `Cannot process the maxFeatures parameter ${this.maxFeatures}`,
      );
    }
    // Resolve maxSamples into `numberSamples` with the same fraction-or-count
    // convention. (A falsy maxSamples — undefined, null or 0 — is skipped.)
    if (this.maxSamples) {
      if (this.maxSamples < 0) {
        throw new RangeError(`Please choose a positive value for maxSamples`);
      } else {
        if (isFloat(this.maxSamples)) {
          if (this.maxSamples > 1.0) {
            throw new RangeError(
              'Please choose either a float value between 0 and 1 or a positive integer for maxSamples',
            );
          } else {
            this.numberSamples = Math.floor(trainingSet.rows * this.maxSamples);
          }
        } else if (Number.isInteger(this.maxSamples)) {
          if (this.maxSamples > trainingSet.rows) {
            throw new RangeError(
              `The maxSamples parameter should be less than ${trainingSet.rows}`,
            );
          } else {
            this.numberSamples = this.maxSamples;
          }
        }
      }
    }
    if (this.maxSamples) {
      if (trainingSet.rows !== this.numberSamples) {
        // Truncate the training data to its FIRST numberSamples rows (not a
        // random subset). The removeRow loop empties the freshly allocated
        // matrix before the rows are re-added one by one.
        let tmp = new mlMatrix.Matrix(this.numberSamples, trainingSet.columns);
        for (let j = 0; j < this.numberSamples; j++) {
          tmp.removeRow(0);
        }
        for (let i = 0; i < this.numberSamples; i++) {
          tmp.addRow(trainingSet.getRow(i));
        }
        trainingSet = tmp;
        trainingValues = trainingValues.slice(0, this.numberSamples);
      }
    }
    let Estimator;
    if (this.isClassifier) {
      Estimator = mlCart.DecisionTreeClassifier;
    } else {
      Estimator = mlCart.DecisionTreeRegression;
    }
    this.estimators = new Array(this.nEstimators);
    this.indexes = new Array(this.nEstimators);
    let oobResults = new Array(this.nEstimators);
    for (let i = 0; i < this.nEstimators; ++i) {
      // Sample bagging: bootstrap the rows (or pass them through unchanged).
      let res = this.useSampleBagging
        ? examplesBaggingWithReplacement(
            trainingSet,
            trainingValues,
            currentSeed,
          )
        : {
            X: trainingSet,
            y: trainingValues,
            seed: currentSeed,
            Xoob: undefined,
            yoob: [], // NOTE(review): yoob is never read anywhere below.
            ioob: [],
          };
      let X = res.X;
      let y = res.y;
      // Each helper returns the engine's next output as the seed for the
      // next random step, keeping the whole run reproducible from one seed.
      currentSeed = res.seed;
      let { Xoob, ioob } = res;
      // Other implementations of random forests apply feature bagging at every split during tree generation.
      // So I think it would be better to implement it at the CART level, not here.
      res = featureBagging(X, this.n, this.replacement, currentSeed);
      X = res.X;
      currentSeed = res.seed;
      // Remember which columns this estimator saw; predict() must re-apply
      // the same column selection.
      this.indexes[i] = res.usedIndex;
      this.estimators[i] = new Estimator(this.treeOptions);
      this.estimators[i].train(X, y);
      if (!this.noOOB && this.useSampleBagging) {
        // Predict the rows this tree never saw, restricted to its columns.
        let xoob = new mlMatrix.MatrixColumnSelectionView(Xoob, this.indexes[i]);
        oobResults[i] = {
          index: ioob,
          predicted: this.estimators[i].predict(xoob),
        };
      }
    }
    // oobResults was pre-allocated to nEstimators, so the length check only
    // guards the degenerate nEstimators === 0 case.
    if (!this.noOOB && this.useSampleBagging && oobResults.length > 0) {
      this.oobResults = collectOOB(
        oobResults,
        trainingValues,
        this.selection.bind(this),
      );
    }
  }
  /**
   * Evaluate the feature importances for each tree in the ensemble.
   * A node's contribution is its impurity gain weighted by the number of
   * samples that reach it, minus the weighted gains of its children;
   * per-tree importances are normalized to sum to 1, then accumulated per
   * original feature index and normalized again across the ensemble.
   * @return {Array} feature importances (one value per original feature,
   * summing to 1).
   */
  featureImportance() {
    // Deep-copy via JSON, presumably to walk the raw tree structure as plain
    // objects without touching the live estimators — TODO confirm.
    const trees = JSON.parse(JSON.stringify(this.estimators));
    const indexes = JSON.parse(JSON.stringify(this.indexes));
    let importance = [];
    // Recursively accumulate the weighted-gain contribution of every split
    // node of tree i into importance[i][splitColumn].
    function computeFeatureImportances(i, node) {
      // node.gain can be null or undefined
      if (!node || !('splitColumn' in node) || !(node.gain > 0)) return;
      let f = node.gain * node.numberSamples;
      if ('left' in node) {
        f -= (node.left.gain || 0) * (node.left.numberSamples || 0);
      }
      if ('right' in node) {
        f -= (node.right.gain || 0) * (node.right.numberSamples || 0);
      }
      importance[i][node.splitColumn] += f;
      if (node.left) {
        computeFeatureImportances(i, node.left);
      }
      if (node.right) {
        computeFeatureImportances(i, node.right);
      }
    }
    // Scale tree i's importances so they sum to 1.
    function normalizeImportances(i) {
      const s = importance[i].reduce((cum, v) => {
        return (cum += v);
      }, 0);
      importance[i] = importance[i].map((v) => {
        return v / s;
      });
    }
    for (let i = 0; i < trees.length; i++) {
      importance.push(new Array(this.numberFeatures).fill(0.0));
      computeFeatureImportances(i, trees[i].root);
      normalizeImportances(i);
    }
    // Map each tree's per-selected-column importances back onto the original
    // feature indexes via the column selection recorded during training.
    let avgImportance = new Array(this.numberFeatures).fill(0.0);
    for (let i = 0; i < importance.length; i++) {
      for (let x = 0; x < this.numberFeatures; x++) {
        avgImportance[indexes[i][x]] += importance[i][x];
      }
    }
    const s = avgImportance.reduce((cum, v) => {
      return (cum += v);
    }, 0);
    return avgImportance.map((v) => {
      return v / s;
    });
  }
  /**
   * Method that returns the way the algorithm generates the predictions, for example, in classification
   * you can return the mode of all predictions retrieved by the trees, or in case of regression you can
   * use the mean or the median.
   * @abstract
   * @param {Array} values - predictions of the estimators.
   * @return {number} prediction.
   */
  // eslint-disable-next-line no-unused-vars
  selection(values) {
    throw new Error("Abstract method 'selection' not implemented!");
  }
  /**
   * Predicts the output given the matrix to predict.
   * Aggregates the per-estimator predictions of each row with the subclass's
   * selection() method (mode / mean / median).
   * @param {Matrix|Array} toPredict
   * @return {Array} predictions
   */
  predict(toPredict) {
    const predictionValues = this.predictionValues(toPredict);
    let predictions = new Array(predictionValues.rows);
    for (let i = 0; i < predictionValues.rows; ++i) {
      predictions[i] = this.selection(predictionValues.getRow(i));
    }
    return predictions;
  }
  /**
   * Predicts the output given the matrix to predict.
   * Each estimator only sees the columns it was trained on (this.indexes[i]).
   * The transpose view turns the estimators-by-samples array into a
   * samples-by-estimators matrix, so getRow(i) yields all estimator votes
   * for sample i.
   * @param {Matrix|Array} toPredict
   * @return {MatrixTransposeView} predictions of estimators
   */
  predictionValues(toPredict) {
    let predictionValues = new Array(this.nEstimators);
    toPredict = mlMatrix.Matrix.checkMatrix(toPredict);
    for (let i = 0; i < this.nEstimators; ++i) {
      let X = new mlMatrix.MatrixColumnSelectionView(toPredict, this.indexes[i]);
      predictionValues[i] = this.estimators[i].predict(X);
    }
    return (predictionValues = new mlMatrix.MatrixTransposeView(
      new mlMatrix.WrapperMatrix2D(predictionValues),
    ));
  }
  /**
   * Returns the Out-Of-Bag predictions computed during train().
   * Only available when trained with useSampleBagging and without noOOB.
   * @return {Array} predictions
   */
  predictOOB() {
    if (!this.oobResults || this.oobResults.length === 0) {
      throw new Error(
        'No Out-Of-Bag results found. Did you forgot to train first?',
      );
    }
    return this.oobResults.map((v) => v.predicted);
  }
  /**
   * Export the current model to JSON.
   * Note: OOB results are intentionally not serialized; constructors set
   * noOOB = true on reload.
   * @return {object} - Current model.
   */
  toJSON() {
    return {
      indexes: this.indexes,
      n: this.n,
      replacement: this.replacement,
      maxFeatures: this.maxFeatures,
      nEstimators: this.nEstimators,
      treeOptions: this.treeOptions,
      isClassifier: this.isClassifier,
      seed: this.seed,
      estimators: this.estimators.map((est) => est.toJSON()),
      useSampleBagging: this.useSampleBagging,
    };
  }
}
// Defaults for RandomForestClassifier: all features per tree (fraction 1.0),
// feature selection with replacement, 50 trees, fixed seed for
// reproducibility, sample bagging on, OOB predictions on.
const defaultOptions$1 = {
  maxFeatures: 1.0,
  replacement: true,
  nEstimators: 50,
  seed: 42,
  useSampleBagging: true,
  noOOB: false,
};
/**
 * @class RandomForestClassifier
 * @augments RandomForestBase
 */
class RandomForestClassifier extends RandomForestBase {
  /**
   * Create a new random forest for a classification problem.
   * @constructor
   * @param {object} options
   * @param {number} [options.maxFeatures=1.0] - the number of features used on each estimator.
   * * if is an integer it selects maxFeatures elements over the sample features.
   * * if is a float between (0, 1), it takes the percentage of features.
   * @param {boolean} [options.replacement=true] - use replacement over the sample features.
   * @param {number} [options.seed=42] - seed for feature and samples selection, must be a 32-bit integer.
   * @param {number} [options.nEstimators=50] - number of estimator to use.
   * @param {object} [options.treeOptions={}] - options for the tree classifier, see [ml-cart]{@link https://mljs.github.io/decision-tree-cart/}
   * @param {boolean} [options.useSampleBagging=true] - use bagging over training samples.
   * @param {number} [options.maxSamples=null] - if null, then draw X.shape[0] samples. If int, then draw maxSamples samples. If float, then draw maxSamples * X.shape[0] samples. Thus, maxSamples should be in the interval (0.0, 1.0].
   * @param {object} model - for load purposes.
   */
  constructor(options, model) {
    if (options === true) {
      // Load path: `model` is a serialized model produced by toJSON().
      super(true, model.baseModel);
    } else {
      options = Object.assign({}, defaultOptions$1, options);
      options.isClassifier = true;
      super(options);
    }
  }
  /**
   * Retrieve the prediction given the selection method: the mode (majority
   * vote) of the estimators' predictions.
   * @param {Array} values - predictions of the estimators.
   * @return {number} prediction
   */
  selection(values) {
    return arrayMode__default["default"](values);
  }
  /**
   * Export the current model to JSON.
   * @return {object} - Current model.
   */
  toJSON() {
    let baseModel = super.toJSON();
    return {
      baseModel: baseModel,
      name: 'RFClassifier',
    };
  }
  /**
   * Returns the confusion matrix computed from the Out-Of-Bag predictions.
   * Rows correspond to the predicted label, columns to the true label, both
   * in sorted label order. Make sure to run train first.
   * @return {object} - Current model.
   */
  getConfusionMatrix() {
    if (!this.oobResults) {
      throw new Error('No Out-Of-Bag results available.');
    }
    const labels = new Set();
    // matrix[predicted][true] = count of OOB samples.
    const matrix = this.oobResults.reduce((p, v) => {
      labels.add(v.true);
      labels.add(v.predicted);
      const x = p[v.predicted] || {};
      x[v.true] = (x[v.true] || 0) + 1;
      p[v.predicted] = x;
      return p;
    }, {});
    // NOTE(review): default sort() compares as strings, so numeric labels
    // order lexicographically (e.g. 1, 10, 2) — confirm this is intended.
    const sortedLabels = [...labels].sort();
    return sortedLabels.map((v) =>
      sortedLabels.map((w) => (matrix[v] || {})[w] || 0),
    );
  }
  /**
   * Load a Decision tree classifier with the given model.
   * @param {object} model
   * @return {RandomForestClassifier}
   */
  static load(model) {
    if (model.name !== 'RFClassifier') {
      throw new RangeError(`Invalid model: ${model.name}`);
    }
    return new RandomForestClassifier(true, model);
  }
  /**
   * Predicts the probability of a label given the matrix to predict: the
   * fraction of estimators that voted for `label` on each row, rounded to
   * 6 decimal places.
   * @param {Matrix|Array} toPredict
   * @param {number} label
   * @return {Array} predictions
   */
  predictProbability(toPredict, label) {
    const predictionValues = this.predictionValues(toPredict);
    let predictions = new Array(predictionValues.rows);
    // Loop-invariant rounding factor (6 decimal places).
    const roundFactor = Math.pow(10, 6);
    for (let i = 0; i < predictionValues.rows; ++i) {
      const pvs = predictionValues.getRow(i);
      const l = pvs.length;
      // Bug fix: the reduce previously had no initial value, so the first
      // estimator's raw label leaked into the sum as the seed accumulator
      // and its vote was never counted. Starting at 0 counts every vote.
      predictions[i] =
        Math.round(
          pvs.reduce((p, v) => {
            if (v === label) {
              p += roundFactor / l;
            }
            return p;
          }, 0),
        ) / roundFactor;
    }
    return predictions;
  }
}
// Aggregation functions available to RandomForestRegression.selection().
const selectionMethods = {
  mean: arrayMean__default["default"],
  median: arrayMedian__default["default"],
};
// Defaults for RandomForestRegression: all features per tree, feature
// selection WITHOUT replacement (unlike the classifier), 50 trees, mean
// aggregation, fixed seed, sample bagging on, OOB predictions on.
const defaultOptions = {
  maxFeatures: 1.0,
  replacement: false,
  nEstimators: 50,
  treeOptions: {},
  selectionMethod: 'mean',
  seed: 42,
  useSampleBagging: true,
  noOOB: false,
};
/**
 * @class RandomForestRegression
 * @augments RandomForestBase
 */
class RandomForestRegression extends RandomForestBase {
  /**
   * Create a new random forest for a regression problem.
   * @constructor
   * @param {object} options
   * @param {number} [options.maxFeatures=1.0] - the number of features used on each estimator.
   * * if is an integer it selects maxFeatures elements over the sample features.
   * * if is a float between (0, 1), it takes the percentage of features.
   * @param {boolean} [options.replacement=false] - use replacement over the sample features.
   * @param {number} [options.seed=42] - seed for feature and samples selection, must be a 32-bit integer.
   * @param {number} [options.nEstimators=50] - number of estimator to use.
   * @param {object} [options.treeOptions={}] - options for the tree regression, see [ml-cart]{@link https://mljs.github.io/decision-tree-cart/}
   * @param {string} [options.selectionMethod="mean"] - the way to calculate the prediction from estimators, "mean" and "median" are supported.
   * @param {boolean} [options.useSampleBagging=true] - use bagging over training samples.
   * @param {number} [options.maxSamples=null] - if null, then draw X.shape[0] samples. If int, then draw maxSamples samples. If float, then draw maxSamples * X.shape[0] samples. Thus, maxSamples should be in the interval (0.0, 1.0].
   * @param {object} model - for load purposes.
   */
  constructor(options, model) {
    if (options === true) {
      // Load path: `model` is a serialized model produced by toJSON().
      super(true, model.baseModel);
      this.selectionMethod = model.selectionMethod;
    } else {
      const opts = Object.assign({}, defaultOptions, options);
      const method = opts.selectionMethod;
      if (method !== 'mean' && method !== 'median') {
        throw new RangeError(`Unsupported selection method ${method}`);
      }
      opts.isClassifier = false;
      super(opts);
      this.selectionMethod = method;
    }
  }
  /**
   * Retrieve the prediction given the configured selection method
   * (mean or median of the estimators' predictions).
   * @param {Array} values - predictions of the estimators.
   * @return {number} prediction
   */
  selection(values) {
    const aggregate = selectionMethods[this.selectionMethod];
    return aggregate(values);
  }
  /**
   * Export the current model to JSON.
   * @return {object} - Current model.
   */
  toJSON() {
    return {
      baseModel: super.toJSON(),
      selectionMethod: this.selectionMethod,
      name: 'RFRegression',
    };
  }
  /**
   * Load a random forest regression with the given serialized model.
   * @param {object} model
   * @return {RandomForestRegression}
   */
  static load(model) {
    if (model.name !== 'RFRegression') {
      throw new RangeError(`Invalid model: ${model.name}`);
    }
    return new RandomForestRegression(true, model);
  }
}
// Public API of the bundle.
exports.RandomForestClassifier = RandomForestClassifier;
exports.RandomForestRegression = RandomForestRegression;