UNPKG

ml-random-forest

Version: (not captured in this snapshot)

Random forest for classification and regression

730 lines (658 loc) 23 kB
'use strict';

Object.defineProperty(exports, '__esModule', { value: true });

var arrayMode = require('ml-array-mode');
var mlCart = require('ml-cart');
var mlMatrix = require('ml-matrix');
var Random = require('random-js');
var arrayMean = require('ml-array-mean');
var arrayMedian = require('ml-array-median');

// Bundler interop helpers: normalize CommonJS modules so that both
// default-style and namespace-style imports resolve consistently.
function _interopDefaultLegacy(e) {
  return e && typeof e === 'object' && 'default' in e ? e : { 'default': e };
}

function _interopNamespace(e) {
  if (e && e.__esModule) return e;
  var n = Object.create(null);
  if (e) {
    Object.keys(e).forEach(function (k) {
      if (k !== 'default') {
        var d = Object.getOwnPropertyDescriptor(e, k);
        Object.defineProperty(
          n,
          k,
          d.get
            ? d
            : {
                enumerable: true,
                get: function () {
                  return e[k];
                },
              },
        );
      }
    });
  }
  n['default'] = e;
  return Object.freeze(n);
}

var arrayMode__default = /*#__PURE__*/ _interopDefaultLegacy(arrayMode);
var Random__namespace = /*#__PURE__*/ _interopNamespace(Random);
var arrayMean__default = /*#__PURE__*/ _interopDefaultLegacy(arrayMean);
var arrayMedian__default = /*#__PURE__*/ _interopDefaultLegacy(arrayMedian);

/**
 * Checks that n is usable as a ratio, i.e. lies in the interval (0, 1].
 * @ignore
 * @param {number} n
 * @return {boolean}
 */
function checkFloat(n) {
  return n > 0.0 && n <= 1.0;
}

/**
 * Checks that n is a number with a fractional part (3.5 → true, 3 → false).
 * @ignore
 * @param {number} n
 * @return {boolean}
 */
function isFloat(n) {
  return Number(n) === n && n % 1 !== 0;
}

/**
 * Select n with replacement elements on the training set and values, where n is the size of the training set.
 * @ignore
 * @param {Matrix} trainingSet
 * @param {Array} trainingValue
 * @param {number} seed - seed for the random selection, must be a 32-bit integer.
 * @return {object} with new X and y, plus the Out-Of-Bag rows (Xoob) and their
 *   original indexes (ioob), and the next seed to chain into later draws.
 */
function examplesBaggingWithReplacement(trainingSet, trainingValue, seed) {
  let engine;
  let distribution = Random__namespace.integer(0, trainingSet.rows - 1);
  if (seed === undefined) {
    engine = Random__namespace.MersenneTwister19937.autoSeed();
  } else if (Number.isInteger(seed)) {
    engine = Random__namespace.MersenneTwister19937.seed(seed);
  } else {
    throw new RangeError(
      `Expected seed must be undefined or integer not ${seed}`,
    );
  }

  let Xr = new Array(trainingSet.rows);
  let yr = new Array(trainingSet.rows);

  // oob[i] counts how many times row i was drawn; oobN counts how many rows
  // were never drawn (the size of the Out-Of-Bag sample).
  let oob = new Array(trainingSet.rows).fill(0);
  let oobN = trainingSet.rows;

  for (let i = 0; i < trainingSet.rows; ++i) {
    let index = distribution(engine);
    Xr[i] = trainingSet.getRow(index);
    yr[i] = trainingValue[index];
    if (oob[index]++ === 0) {
      oobN--;
    }
  }

  let Xoob = new Array(oobN);
  let ioob = new Array(oobN);
  // run backwards to have ioob filled in increasing order
  for (let i = trainingSet.rows - 1; i >= 0 && oobN > 0; --i) {
    if (oob[i] === 0) {
      Xoob[--oobN] = trainingSet.getRow(i);
      ioob[oobN] = i;
    }
  }

  return {
    X: new mlMatrix.Matrix(Xr),
    y: yr,
    Xoob: new mlMatrix.Matrix(Xoob),
    ioob,
    seed: engine.next(),
  };
}

/**
 * selects n features from the training set with or without replacement, returns the new training set and the indexes used.
 * @ignore
 * @param {Matrix} trainingSet
 * @param {number} n - features.
 * @param {boolean} replacement
 * @param {number} seed - seed for the random selection, must be a 32-bit integer.
 * @return {object} with the column-sampled matrix X, the usedIndex array and
 *   the next seed.
 */
function featureBagging(trainingSet, n, replacement, seed) {
  if (trainingSet.columns < n) {
    throw new RangeError(
      'N should be less or equal to the number of columns of X',
    );
  }

  let distribution = Random__namespace.integer(0, trainingSet.columns - 1);
  let engine;
  if (seed === undefined) {
    engine = Random__namespace.MersenneTwister19937.autoSeed();
  } else if (Number.isInteger(seed)) {
    engine = Random__namespace.MersenneTwister19937.seed(seed);
  } else {
    throw new RangeError(
      `Expected seed must be undefined or integer not ${seed}`,
    );
  }

  let toRet = new mlMatrix.Matrix(trainingSet.rows, n);
  let usedIndex;
  let index;
  if (replacement) {
    usedIndex = new Array(n);
    for (let i = 0; i < n; ++i) {
      index = distribution(engine);
      usedIndex[i] = index;
      toRet.setColumn(i, trainingSet.getColumn(index));
    }
  } else {
    usedIndex = new Set();
    index = distribution(engine);
    for (let i = 0; i < n; ++i) {
      // redraw until an unused column index comes up
      while (usedIndex.has(index)) {
        index = distribution(engine);
      }
      toRet.setColumn(i, trainingSet.getColumn(index));
      usedIndex.add(index);
    }
    usedIndex = Array.from(usedIndex);
  }

  return {
    X: toRet,
    usedIndex: usedIndex,
    seed: engine.next(),
  };
}

/**
 * collects and combines the individual results from the tree predictions on Out-Of-Bag data
 * @ignore
 * @param {{index: {Array},predicted: {Array}}[]} oob: array of individual tree predictions
 * @param {array} y: true labels
 * @param {(predictions:{Array})=>{number}} aggregate: aggregation function
 * @return {Array} per-sample objects { true, all, predicted }
 */
const collectOOB = (oob, y, aggregate) => {
  const res = Array(y.length);
  for (let i = 0; i < y.length; i++) {
    const all = [];
    for (let j = 0; j < oob.length; j++) {
      const o = oob[j];
      // Each tree's index/predicted arrays are consumed front-to-back: the
      // head either matches the current sample i or belongs to a later one.
      if (o.index[0] === i) {
        all.push(o.predicted[0]);
        o.index = o.index.slice(1);
        o.predicted = o.predicted.slice(1);
      }
    }
    res[i] = { true: y[i], all: all, predicted: aggregate(all) };
  }
  return res;
};

/**
 * @class RandomForestBase
 */
class RandomForestBase {
  /**
   * Create a new base random forest for a classifier or regression model.
   * @constructor
   * @param {object} options
   * @param {number|String} [options.maxFeatures] - the number of features used on each estimator.
   *        * if is an integer it selects maxFeatures elements over the sample features.
   *        * if is a float between (0, 1), it takes the percentage of features.
   * @param {boolean} [options.replacement] - use replacement over the sample features.
   * @param {number} [options.seed] - seed for feature and samples selection, must be a 32-bit integer.
   * @param {number} [options.nEstimators] - number of estimator to use.
   * @param {object} [options.treeOptions] - options for the tree classifier, see [ml-cart]{@link https://mljs.github.io/decision-tree-cart/}
   * @param {boolean} [options.isClassifier] - boolean to check if is a classifier or regression model (used by subclasses).
   * @param {boolean} [options.useSampleBagging] - use bagging over training samples.
   * @param {boolean} [options.noOOB] - don't calculate Out-Of-Bag predictions.
   * @param {object} model - for load purposes.
   */
  constructor(options, model) {
    if (options === true) {
      // Load path: restore a previously serialized model.
      this.replacement = model.replacement;
      this.maxFeatures = model.maxFeatures;
      this.nEstimators = model.nEstimators;
      this.treeOptions = model.treeOptions;
      this.isClassifier = model.isClassifier;
      this.seed = model.seed;
      this.n = model.n;
      this.indexes = model.indexes;
      this.useSampleBagging = model.useSampleBagging;
      this.noOOB = true;
      this.maxSamples = model.maxSamples;

      let Estimator = this.isClassifier
        ? mlCart.DecisionTreeClassifier
        : mlCart.DecisionTreeRegression;
      this.estimators = model.estimators.map((est) => Estimator.load(est));
    } else {
      this.replacement = options.replacement;
      this.maxFeatures = options.maxFeatures;
      this.nEstimators = options.nEstimators;
      this.treeOptions = options.treeOptions;
      this.isClassifier = options.isClassifier;
      this.seed = options.seed;
      this.useSampleBagging = options.useSampleBagging;
      this.noOOB = options.noOOB;
      this.maxSamples = options.maxSamples;
    }
  }

  /**
   * Train the decision tree with the given training set and labels.
   * @param {Matrix|Array} trainingSet
   * @param {Array} trainingValues
   */
  train(trainingSet, trainingValues) {
    let currentSeed = this.seed;

    trainingSet = mlMatrix.Matrix.checkMatrix(trainingSet);

    this.maxFeatures = this.maxFeatures || trainingSet.columns;
    this.numberFeatures = trainingSet.columns;
    this.numberSamples = trainingSet.rows;

    // Resolve maxFeatures (ratio or absolute count) into this.n.
    if (checkFloat(this.maxFeatures)) {
      this.n = Math.floor(trainingSet.columns * this.maxFeatures);
    } else if (Number.isInteger(this.maxFeatures)) {
      if (this.maxFeatures > trainingSet.columns) {
        throw new RangeError(
          `The maxFeatures parameter should be less than ${trainingSet.columns}`,
        );
      } else {
        this.n = this.maxFeatures;
      }
    } else {
      throw new RangeError(
        `Cannot process the maxFeatures parameter ${this.maxFeatures}`,
      );
    }

    // Resolve maxSamples (ratio or absolute count) into this.numberSamples.
    if (this.maxSamples) {
      if (this.maxSamples < 0) {
        throw new RangeError(`Please choose a positive value for maxSamples`);
      } else {
        if (isFloat(this.maxSamples)) {
          if (this.maxSamples > 1.0) {
            throw new RangeError(
              'Please choose either a float value between 0 and 1 or a positive integer for maxSamples',
            );
          } else {
            this.numberSamples = Math.floor(
              trainingSet.rows * this.maxSamples,
            );
          }
        } else if (Number.isInteger(this.maxSamples)) {
          if (this.maxSamples > trainingSet.rows) {
            throw new RangeError(
              `The maxSamples parameter should be less than ${trainingSet.rows}`,
            );
          } else {
            this.numberSamples = this.maxSamples;
          }
        }
      }
    }

    if (this.maxSamples && trainingSet.rows !== this.numberSamples) {
      // Keep only the first numberSamples rows and the matching labels.
      // (Replaces the original remove-all-rows/re-add-rows dance with an
      // equivalent subMatrix call.)
      trainingSet = trainingSet.subMatrix(
        0,
        this.numberSamples - 1,
        0,
        trainingSet.columns - 1,
      );
      trainingValues = trainingValues.slice(0, this.numberSamples);
    }

    let Estimator;
    if (this.isClassifier) {
      Estimator = mlCart.DecisionTreeClassifier;
    } else {
      Estimator = mlCart.DecisionTreeRegression;
    }

    this.estimators = new Array(this.nEstimators);
    this.indexes = new Array(this.nEstimators);
    let oobResults = new Array(this.nEstimators);

    for (let i = 0; i < this.nEstimators; ++i) {
      let res = this.useSampleBagging
        ? examplesBaggingWithReplacement(
            trainingSet,
            trainingValues,
            currentSeed,
          )
        : {
            X: trainingSet,
            y: trainingValues,
            seed: currentSeed,
            Xoob: undefined,
            yoob: [],
            ioob: [],
          };
      let X = res.X;
      let y = res.y;
      currentSeed = res.seed;
      let { Xoob, ioob } = res;

      // Other implementations of random forests apply feature bagging at
      // every split during tree generation. So I think it would be better
      // to implement it at the CART level, not here.
      res = featureBagging(X, this.n, this.replacement, currentSeed);
      X = res.X;
      currentSeed = res.seed;
      this.indexes[i] = res.usedIndex;

      this.estimators[i] = new Estimator(this.treeOptions);
      this.estimators[i].train(X, y);

      if (!this.noOOB && this.useSampleBagging) {
        // Evaluate this tree on the rows it never saw, restricted to the
        // columns it was trained on.
        let xoob = new mlMatrix.MatrixColumnSelectionView(
          Xoob,
          this.indexes[i],
        );
        oobResults[i] = {
          index: ioob,
          predicted: this.estimators[i].predict(xoob),
        };
      }
    }

    if (!this.noOOB && this.useSampleBagging && oobResults.length > 0) {
      this.oobResults = collectOOB(
        oobResults,
        trainingValues,
        this.selection.bind(this),
      );
    }
  }

  /**
   * Evaluate the feature importances for each tree in the ensemble
   * @return {Array} feature importances
   */
  featureImportance() {
    const trees = JSON.parse(JSON.stringify(this.estimators));
    const indexes = JSON.parse(JSON.stringify(this.indexes));
    let importance = [];

    // Accumulate, per tree, the gain*samples contribution of every split
    // node, net of the children's own contributions.
    function computeFeatureImportances(i, node) {
      // node.gain can be null or undefined
      if (!node || !('splitColumn' in node) || !(node.gain > 0)) return;
      let f = node.gain * node.numberSamples;
      if ('left' in node) {
        f -= (node.left.gain || 0) * (node.left.numberSamples || 0);
      }
      if ('right' in node) {
        f -= (node.right.gain || 0) * (node.right.numberSamples || 0);
      }
      importance[i][node.splitColumn] += f;
      if (node.left) {
        computeFeatureImportances(i, node.left);
      }
      if (node.right) {
        computeFeatureImportances(i, node.right);
      }
    }

    // Normalize one tree's importances so they sum to 1.
    function normalizeImportances(i) {
      const s = importance[i].reduce((cum, v) => {
        return (cum += v);
      }, 0);
      importance[i] = importance[i].map((v) => {
        return v / s;
      });
    }

    for (let i = 0; i < trees.length; i++) {
      importance.push(new Array(this.numberFeatures).fill(0.0));
      computeFeatureImportances(i, trees[i].root);
      normalizeImportances(i);
    }

    // Map per-tree importances back to the original feature indexes and
    // average across the ensemble.
    let avgImportance = new Array(this.numberFeatures).fill(0.0);
    for (let i = 0; i < importance.length; i++) {
      for (let x = 0; x < this.numberFeatures; x++) {
        avgImportance[indexes[i][x]] += importance[i][x];
      }
    }

    const s = avgImportance.reduce((cum, v) => {
      return (cum += v);
    }, 0);
    return avgImportance.map((v) => {
      return v / s;
    });
  }

  /**
   * Method that returns the way the algorithm generates the predictions, for example, in classification
   * you can return the mode of all predictions retrieved by the trees, or in case of regression you can
   * use the mean or the median.
   * @abstract
   * @param {Array} values - predictions of the estimators.
   * @return {number} prediction.
   */
  // eslint-disable-next-line no-unused-vars
  selection(values) {
    throw new Error("Abstract method 'selection' not implemented!");
  }

  /**
   * Predicts the output given the matrix to predict.
   * @param {Matrix|Array} toPredict
   * @return {Array} predictions
   */
  predict(toPredict) {
    const predictionValues = this.predictionValues(toPredict);
    let predictions = new Array(predictionValues.rows);
    for (let i = 0; i < predictionValues.rows; ++i) {
      predictions[i] = this.selection(predictionValues.getRow(i));
    }
    return predictions;
  }

  /**
   * Predicts the output given the matrix to predict.
   * @param {Matrix|Array} toPredict
   * @return {MatrixTransposeView} predictions of estimators
   */
  predictionValues(toPredict) {
    let predictionValues = new Array(this.nEstimators);
    toPredict = mlMatrix.Matrix.checkMatrix(toPredict);
    for (let i = 0; i < this.nEstimators; ++i) {
      // Restrict the input to the columns this estimator was trained on.
      let X = new mlMatrix.MatrixColumnSelectionView(
        toPredict,
        this.indexes[i],
      );
      predictionValues[i] = this.estimators[i].predict(X);
    }
    // Transpose so each row holds all estimators' predictions for one sample.
    return new mlMatrix.MatrixTransposeView(
      new mlMatrix.WrapperMatrix2D(predictionValues),
    );
  }

  /**
   * Returns the Out-Of-Bag predictions.
   * @return {Array} predictions
   */
  predictOOB() {
    if (!this.oobResults || this.oobResults.length === 0) {
      throw new Error(
        'No Out-Of-Bag results found. Did you forget to train first?',
      );
    }
    return this.oobResults.map((v) => v.predicted);
  }

  /**
   * Export the current model to JSON.
   * @return {object} - Current model.
   */
  toJSON() {
    return {
      indexes: this.indexes,
      n: this.n,
      replacement: this.replacement,
      maxFeatures: this.maxFeatures,
      nEstimators: this.nEstimators,
      treeOptions: this.treeOptions,
      isClassifier: this.isClassifier,
      seed: this.seed,
      estimators: this.estimators.map((est) => est.toJSON()),
      useSampleBagging: this.useSampleBagging,
    };
  }
}

const defaultOptions$1 = {
  maxFeatures: 1.0,
  replacement: true,
  nEstimators: 50,
  seed: 42,
  useSampleBagging: true,
  noOOB: false,
};

/**
 * @class RandomForestClassifier
 * @augments RandomForestBase
 */
class RandomForestClassifier extends RandomForestBase {
  /**
   * Create a new base random forest for a classifier or regression model.
   * @constructor
   * @param {object} options
   * @param {number} [options.maxFeatures=1.0] - the number of features used on each estimator.
   *        * if is an integer it selects maxFeatures elements over the sample features.
   *        * if is a float between (0, 1), it takes the percentage of features.
   * @param {boolean} [options.replacement=true] - use replacement over the sample features.
   * @param {number} [options.seed=42] - seed for feature and samples selection, must be a 32-bit integer.
   * @param {number} [options.nEstimators=50] - number of estimator to use.
   * @param {object} [options.treeOptions={}] - options for the tree classifier, see [ml-cart]{@link https://mljs.github.io/decision-tree-cart/}
   * @param {boolean} [options.useSampleBagging=true] - use bagging over training samples.
   * @param {number} [options.maxSamples=null] - if null, then draw X.shape[0] samples. If int, then draw maxSamples samples. If float, then draw maxSamples * X.shape[0] samples. Thus, maxSamples should be in the interval (0.0, 1.0].
   * @param {object} model - for load purposes.
   */
  constructor(options, model) {
    if (options === true) {
      super(true, model.baseModel);
    } else {
      options = Object.assign({}, defaultOptions$1, options);
      options.isClassifier = true;
      super(options);
    }
  }

  /**
   * retrieve the prediction given the selection method.
   * @param {Array} values - predictions of the estimators.
   * @return {number} prediction
   */
  selection(values) {
    return arrayMode__default['default'](values);
  }

  /**
   * Export the current model to JSON.
   * @return {object} - Current model.
   */
  toJSON() {
    let baseModel = super.toJSON();
    return {
      baseModel: baseModel,
      name: 'RFClassifier',
    };
  }

  /**
   * Returns the confusion matrix
   * Make sure to run train first.
   * @return {object} - Current model.
   */
  getConfusionMatrix() {
    if (!this.oobResults) {
      throw new Error('No Out-Of-Bag results available.');
    }

    const labels = new Set();
    const matrix = this.oobResults.reduce((p, v) => {
      labels.add(v.true);
      labels.add(v.predicted);
      const x = p[v.predicted] || {};
      x[v.true] = (x[v.true] || 0) + 1;
      p[v.predicted] = x;
      return p;
    }, {});
    // NOTE(review): default sort is lexicographic; numeric labels >= 10 are
    // ordered as strings (1, 10, 2, ...). Kept for output compatibility.
    const sortedLabels = [...labels].sort();

    return sortedLabels.map((v) =>
      sortedLabels.map((w) => (matrix[v] || {})[w] || 0),
    );
  }

  /**
   * Load a Decision tree classifier with the given model.
   * @param {object} model
   * @return {RandomForestClassifier}
   */
  static load(model) {
    if (model.name !== 'RFClassifier') {
      throw new RangeError(`Invalid model: ${model.name}`);
    }
    return new RandomForestClassifier(true, model);
  }

  /**
   * Predicts the probability of a label given the matrix to predict.
   * @param {Matrix|Array} toPredict
   * @param {number} label
   * @return {Array} predictions
   */
  predictProbability(toPredict, label) {
    const predictionValues = this.predictionValues(toPredict);
    const roundFactor = Math.pow(10, 6);
    let predictions = new Array(predictionValues.rows);
    for (let i = 0; i < predictionValues.rows; ++i) {
      const pvs = predictionValues.getRow(i);
      const l = pvs.length;
      // Bug fix: the reduce previously had no initial value, so the first
      // estimator's raw predicted label seeded the accumulator (corrupting
      // the probability) and the first vote was never counted. Starting at 0
      // yields the vote fraction count(v === label) / l.
      const sum = pvs.reduce((p, v) => {
        if (v === label) {
          p += roundFactor / l;
        }
        return p;
      }, 0);
      predictions[i] = Math.round(sum) / roundFactor;
    }
    return predictions;
  }
}

const selectionMethods = {
  mean: arrayMean__default['default'],
  median: arrayMedian__default['default'],
};

const defaultOptions = {
  maxFeatures: 1.0,
  replacement: false,
  nEstimators: 50,
  treeOptions: {},
  selectionMethod: 'mean',
  seed: 42,
  useSampleBagging: true,
  noOOB: false,
};

/**
 * @class RandomForestRegression
 * @augments RandomForestBase
 */
class RandomForestRegression extends RandomForestBase {
  /**
   * Create a new base random forest for a classifier or regression model.
   * @constructor
   * @param {object} options
   * @param {number} [options.maxFeatures=1.0] - the number of features used on each estimator.
   *        * if is an integer it selects maxFeatures elements over the sample features.
   *        * if is a float between (0, 1), it takes the percentage of features.
   * @param {boolean} [options.replacement=true] - use replacement over the sample features.
   * @param {number} [options.seed=42] - seed for feature and samples selection, must be a 32-bit integer.
   * @param {number} [options.nEstimators=50] - number of estimator to use.
   * @param {object} [options.treeOptions={}] - options for the tree classifier, see [ml-cart]{@link https://mljs.github.io/decision-tree-cart/}
   * @param {string} [options.selectionMethod="mean"] - the way to calculate the prediction from estimators, "mean" and "median" are supported.
   * @param {boolean} [options.useSampleBagging=true] - use bagging over training samples.
   * @param {number} [options.maxSamples=null] - if null, then draw X.shape[0] samples. If int, then draw maxSamples samples. If float, then draw maxSamples * X.shape[0] samples. Thus, maxSamples should be in the interval (0.0, 1.0].
   * @param {object} model - for load purposes.
   */
  constructor(options, model) {
    if (options === true) {
      super(true, model.baseModel);
      this.selectionMethod = model.selectionMethod;
    } else {
      options = Object.assign({}, defaultOptions, options);

      if (
        !(
          options.selectionMethod === 'mean' ||
          options.selectionMethod === 'median'
        )
      ) {
        throw new RangeError(
          `Unsupported selection method ${options.selectionMethod}`,
        );
      }

      options.isClassifier = false;

      super(options);
      this.selectionMethod = options.selectionMethod;
    }
  }

  /**
   * retrieve the prediction given the selection method.
   * @param {Array} values - predictions of the estimators.
   * @return {number} prediction
   */
  selection(values) {
    return selectionMethods[this.selectionMethod](values);
  }

  /**
   * Export the current model to JSON.
   * @return {object} - Current model.
   */
  toJSON() {
    let baseModel = super.toJSON();
    return {
      baseModel: baseModel,
      selectionMethod: this.selectionMethod,
      name: 'RFRegression',
    };
  }

  /**
   * Load a Decision tree classifier with the given model.
   * @param {object} model
   * @return {RandomForestRegression}
   */
  static load(model) {
    if (model.name !== 'RFRegression') {
      throw new RangeError(`Invalid model: ${model.name}`);
    }
    return new RandomForestRegression(true, model);
  }
}

exports.RandomForestClassifier = RandomForestClassifier;
exports.RandomForestRegression = RandomForestRegression;