UNPKG

@jsmlt/jsmlt

Version:

JavaScript Machine Learning

437 lines (352 loc) 15.9 kB
'use strict';

Object.defineProperty(exports, "__esModule", { value: true });

// Internal dependencies
const { Classifier } = require('../base');
const LinAlg = require('../../math/linalg');
const Arrays = require('../../util/arrays');

/**
 * @typedef {Object} DataSplitGroups
 * @property {Array.<Array.<number>>} indices - Two-dimensional array containing, for both groups,
 *   the indices of the samples belonging to the group
 * @property {Array.<Array.<number>>} features - Two-dimensional array containing, for both groups,
 *   the features of the samples belonging to the group
 * @property {Array.<Array.<number>>} labels - Two-dimensional array containing, for both groups,
 *   the labels of the samples belonging to the group
 */

/**
 * @typedef {Object} DataSplit
 * @property {number} feature - Index of the feature by which to split
 * @property {number} featureValue - Split value of the feature by which to split
 * @property {DataSplitGroups} groups - Data groups resulting from the split
 */

/**
 * Decision tree node. Holds properties of a single tree node: `type` ('node' or 'leaf'),
 * `impurity`, and either a split (`feature`, `featureValue`, `left`, `right`) or a
 * `prediction` for leaves.
 */
class DecisionTreeNode {}

/**
 * Decision tree learner. Builds a decision tree by greedily splitting samples on one feature
 * hierarchically.
 */
class DecisionTree extends Classifier {
  /**
   * Constructor. Initialize class members and store user-defined options.
   *
   * @param {Object} [optionsUser] - User-defined options for the decision tree
   * @param {string} [optionsUser.criterion = 'gini'] - Splitting criterion. Either 'gini', for
   *   the Gini coefficient, or 'entropy' for the Shannon entropy
   * @param {number|string} [optionsUser.numFeatures = 1.0] - Number of features to subsample at
   *   each node. Either a number (float), in which case the input fraction of features is used
   *   (e.g., 1.0 for all features), or a string. If string, 'sqrt' and 'log2' are supported,
   *   causing the algorithm to use sqrt(n) and log2(n) features, respectively (where n is the
   *   total number of features)
   */
  constructor(optionsUser = {}) {
    super();

    // Parse options: user-supplied values override the defaults
    const optionsDefault = {
      criterion: 'gini',
      numFeatures: 1.0,
    };
    const options = Object.assign({}, optionsDefault, optionsUser);

    // Set options
    this.criterion = options.criterion;
    this.numFeatures = options.numFeatures;
  }

  /**
   * Calculate the impurity for multiple groups of labels. The impurity criterion used can be
   * specified by the user through the user-defined options.
   *
   * @param {Array.<Array.<mixed>>} groups - Groups of labels. Each group is an array of labels
   * @return {number} Impurity for the provided groups, or null for an unknown criterion
   */
  calculateImpurity(groups) {
    if (this.criterion === 'gini') {
      return this.calculateWeightedImpurity(groups, this.gini);
    }
    if (this.criterion === 'entropy') {
      return this.calculateWeightedImpurity(groups, this.entropy);
    }
    return null;
  }

  /**
   * Calculate the weighted impurity for multiple groups of labels. The returned impurity is
   * calculated as the weighted sum of the impurities of the individual groups, where the
   * weights are determined by the number of samples in the group.
   *
   * @param {Array.<Array.<mixed>>} groups - Groups of labels. Each group is an array of labels
   * @param {function(labels: Array.<number>): number} impurityCallback - Callback function
   *   taking an array of labels as its first and only argument
   * @return {number} Weighted impurity for the provided groups
   */
  calculateWeightedImpurity(groups, impurityCallback) {
    // Impurity per group
    const impurities = [];

    // Total number of elements
    let numElements = 0;

    // Loop over the groups and calculate the group's impurity
    for (const group of groups) {
      impurities.push(impurityCallback(group));
      numElements += group.length;
    }

    // Return the weighted sum of impurities
    return impurities.reduce((r, a, i) => r + (a * groups[i].length) / numElements, 0);
  }

  /**
   * Calculate the Gini coefficient of a set of labels.
   *
   * @param {Array.<mixed>} labels - Array of predicted labels
   * @return {number} Gini impurity
   */
  gini(labels) {
    const uniqueLabels = [...new Set(labels)];

    // Gini impurity: sum over classes of p * (1 - p)
    return uniqueLabels.reduce((r, label) => {
      const frac = labels.filter((x) => x === label).length / labels.length;
      return r + frac * (1 - frac);
    }, 0);
  }

  /**
   * Calculate the Shannon entropy of a set of labels.
   *
   * @param {Array.<mixed>} labels - Array of predicted labels
   * @return {number} Shannon entropy
   */
  entropy(labels) {
    const uniqueLabels = [...new Set(labels)];

    // Shannon entropy: sum over classes of -p * ln(p) (natural logarithm)
    return uniqueLabels.reduce((r, label) => {
      const frac = labels.filter((x) => x === label).length / labels.length;
      return r - frac * Math.log(frac);
    }, 0);
  }

  /**
   * Split a set of samples into two groups by some splitting value for a feature. The samples
   * with a feature value lower than the split value go to the left (first) group, and the other
   * samples go to the right (second) group.
   *
   * @param {Array.<Array.<number>>} XSub - Features of samples to split by some feature
   * @param {Array.<mixed>} ySub - Labels of samples
   * @param {number} fInd - Index of feature to split by
   * @param {number} splitValue - Value to be used as the splitting point for the feature
   * @return {DataSplitGroups} Assigned sample indices, features, and labels for both groups
   */
  splitSamples(XSub, ySub, fInd, splitValue) {
    const groupsIndices = [[], []];
    const groupsX = [[], []];
    const groupsY = [[], []];

    XSub.forEach((x, i) => {
      // Group 0 (left) receives strictly-smaller feature values; group 1 (right) the rest
      const g = x[fInd] < splitValue ? 0 : 1;
      groupsIndices[g].push(i);
      groupsX[g].push(x);
      groupsY[g].push(ySub[i]);
    });

    return {
      indices: groupsIndices,
      features: groupsX,
      labels: groupsY,
    };
  }

  /**
   * Find the best splitting feature and feature value for a set of data points.
   *
   * If no split yields two non-empty groups (e.g., all considered feature columns are
   * constant), the returned object's properties are all undefined; callers must check for
   * this case.
   *
   * @param {Array.<Array.<number>>} XSub - Features of samples to find the split for
   * @param {Array.<mixed>} ySub - Labels of samples
   * @param {number} baseImpurity - Impurity of parent node
   * @return {DataSplit} Best split found (see note above about the no-split case)
   */
  findSplit(XSub, ySub, baseImpurity) {
    // Extract information from training data
    const shape = LinAlg.getShape(XSub);

    // Best split found
    let bestSplitGain = -Infinity;
    let bestSplitFeature;
    let bestSplitFeatureValue;
    let bestSplitGroups;

    // Transpose features array to easily access all sample values for a given feature
    const XSubT = LinAlg.transpose(XSub);

    // Randomly sample features to consider
    const possibleIndices = Array.from({ length: shape[1] }, (_, i) => i);
    const fIndices = Arrays.sample(possibleIndices, this.numFeaturesInt, false);

    // Calculate best split by looping over all features and considering the split quality for
    // all of each feature's values. The best split is the feature value at which to split such
    // that the impurity is minimized
    fIndices.forEach((fInd) => {
      // Extract unique, sorted sample values for this feature.
      // BUG FIX: use a proper numeric comparator; the previous comparator
      // ((a > b) * 2 - 1) never returned 0 and was inconsistent for equal inputs.
      const sampleValues = [...new Set(XSubT[fInd])];
      sampleValues.sort((a, b) => a - b);

      // Find split values as the average value between all sorted unique values
      const splitValues = LinAlg.scale(
        LinAlg.sum(sampleValues.slice(1), sampleValues.slice(0, -1)),
        0.5
      );

      // Loop over all split values
      splitValues.forEach((splitValue) => {
        // Group samples. The first and second group correspond with the samples in the left
        // and right parts of the split, respectively
        const groups = this.splitSamples(XSub, ySub, fInd, splitValue);

        // Calculate impurity and impurity gain
        const impurity = this.calculateImpurity(groups.labels);
        const gain = baseImpurity - impurity;

        // Check whether this split is better than the current best split; only splits that
        // produce two non-empty groups are considered
        if (gain > bestSplitGain
            && groups.features[0].length > 0
            && groups.features[1].length > 0) {
          bestSplitGain = gain;
          bestSplitFeature = fInd;
          bestSplitFeatureValue = splitValue;
          bestSplitGroups = groups;
        }
      });
    });

    return {
      feature: bestSplitFeature,
      featureValue: bestSplitFeatureValue,
      groups: bestSplitGroups,
    };
  }

  /**
   * Build a (sub-)tree from a set of samples.
   *
   * @param {Array.<Array.<number>>} XSub - Features of samples to build a tree for
   * @param {Array.<mixed>} ySub - Labels of samples
   * @param {number} [depth = 0] - Current tree depth. 0 indicates the root node
   * @return {DecisionTreeNode} Decision tree node
   */
  buildTree(XSub, ySub, depth = 0) {
    // Create tree node
    const node = new DecisionTreeNode();

    // Calculate node impurity; a pure node becomes a leaf
    const impurity = this.calculateImpurity([ySub]);
    node.impurity = impurity;

    if (impurity === 0) {
      node.type = 'leaf';
      node.prediction = ySub[0];
      return node;
    }

    const { feature, featureValue, groups } = this.findSplit(XSub, ySub, impurity);

    // BUG FIX: if no valid split exists (e.g., impure node whose samples have identical
    // feature rows), the original code crashed reading `groups.features`. Fall back to a
    // leaf predicting the majority label instead.
    if (typeof groups === 'undefined') {
      const counts = new Map();
      ySub.forEach((label) => counts.set(label, (counts.get(label) || 0) + 1));

      let prediction = ySub[0];
      let bestCount = -1;
      counts.forEach((count, label) => {
        if (count > bestCount) {
          bestCount = count;
          prediction = label;
        }
      });

      node.type = 'leaf';
      node.prediction = prediction;
      return node;
    }

    // Fill node details and recursively build the left and right subtrees
    node.type = 'node';
    node.feature = feature;
    node.featureValue = featureValue;
    node.left = this.buildTree(groups.features[0], groups.labels[0], depth + 1);
    node.right = this.buildTree(groups.features[1], groups.labels[1], depth + 1);

    return node;
  }

  /**
   * @see {@link Classifier#train}
   */
  train(X, y) {
    if (X.length !== y.length) {
      throw new Error('Number of data points should match number of labels.');
    }

    // Process training options
    const shape = LinAlg.getShape(X);

    if (this.numFeatures === 'sqrt') {
      this.numFeaturesInt = Math.floor(Math.sqrt(shape[1]));
    } else if (this.numFeatures === 'log2') {
      this.numFeaturesInt = Math.floor(Math.log2(shape[1]));
    } else {
      // BUG FIX: numFeatures is documented as a fraction of features (1.0 = all features).
      // The previous expression clamped against the fraction itself, so any fraction <= 1
      // (including the default 1.0) collapsed to a single feature. Clamp the resulting count
      // to [1, total number of features] instead.
      this.numFeaturesInt = Math.max(
        1,
        Math.min(shape[1], Math.floor(this.numFeatures * shape[1]))
      );
    }

    // Construct decision tree
    this.tree = this.buildTree(X, y);
  }

  /**
   * @see {@link Classifier#predict}
   */
  predict(X) {
    if (typeof this.tree === 'undefined') {
      throw new Error('Model has to be trained in order to make predictions.');
    }

    // Make prediction for each data point
    return X.map((x) => this.predictSample(x));
  }

  /**
   * Make a prediction for a single sample by descending the tree from the root: at each
   * internal node, go left when the sample's split-feature value is below the node's split
   * value, otherwise go right.
   *
   * @param {Array.<number>} sampleFeatures - Data point features
   * @return {mixed} Prediction: the label stored at the reached leaf
   */
  predictSample(sampleFeatures) {
    let node = this.tree;

    while (node.type === 'node') {
      node = sampleFeatures[node.feature] < node.featureValue ? node.left : node.right;
    }

    return node.prediction;
  }
}

exports.DecisionTreeNode = DecisionTreeNode;
exports.default = DecisionTree;