@jsmlt/jsmlt
Version:
JavaScript Machine Learning
445 lines (360 loc) • 18.9 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", {
value: true
});
exports["default"] = exports.DecisionTreeNode = void 0;
var _base = require("../base");
var Arrays = _interopRequireWildcard(require("../../arrays"));
var Random = _interopRequireWildcard(require("../../random"));
/**
 * Return the WeakMap used to memoize wildcard-import wrappers, creating it on first use.
 * Returns null when the runtime has no WeakMap.
 */
function _getRequireWildcardCache() {
  var weakMapSupported = typeof WeakMap === "function";
  if (!weakMapSupported) {
    return null;
  }
  var sharedCache = new WeakMap();
  // Swap this function for a trivial accessor so later calls reuse the same map
  _getRequireWildcardCache = function _getRequireWildcardCache() {
    return sharedCache;
  };
  return sharedCache;
}
/**
 * Wrap a CommonJS export so it can be consumed like an ES module namespace. Objects flagged
 * with `__esModule` are returned untouched; anything else is copied (own enumerable keys,
 * accessors preserved) into a new object whose `default` is the original value. Wrappers are
 * memoized in the wildcard cache when one is available.
 */
function _interopRequireWildcard(obj) {
  if (obj && obj.__esModule) {
    return obj;
  }

  var cache = _getRequireWildcardCache();
  if (cache && cache.has(obj)) {
    return cache.get(obj);
  }

  var namespace = {};
  if (obj != null) {
    var canReadDescriptors = Object.defineProperty && Object.getOwnPropertyDescriptor;
    for (var key in obj) {
      if (!Object.prototype.hasOwnProperty.call(obj, key)) {
        continue;
      }
      var descriptor = canReadDescriptors ? Object.getOwnPropertyDescriptor(obj, key) : null;
      if (descriptor && (descriptor.get || descriptor.set)) {
        // Keep getters/setters live instead of snapshotting their current value
        Object.defineProperty(namespace, key, descriptor);
      } else {
        namespace[key] = obj[key];
      }
    }
  }

  namespace["default"] = obj;
  if (cache) {
    cache.set(obj, namespace);
  }
  return namespace;
}
/**
 * `typeof` that also reports "symbol" for polyfilled Symbol instances. On first call the
 * function replaces itself with the variant matching the runtime's Symbol support.
 */
function _typeof(obj) {
  var hasNativeSymbols = typeof Symbol === "function" && typeof Symbol.iterator === "symbol";
  _typeof = hasNativeSymbols
    ? function _typeof(value) {
        return typeof value;
      }
    : function _typeof(value) {
        var isPolyfilledSymbol =
          value && typeof Symbol === "function" && value.constructor === Symbol && value !== Symbol.prototype;
        return isPolyfilledSymbol ? "symbol" : typeof value;
      };
  return _typeof(obj);
}
/**
 * Spread helper: turn an array or iterable into a fresh array, throwing a TypeError for
 * anything that cannot be spread.
 */
function _toConsumableArray(arr) {
  var copiedArray = _arrayWithoutHoles(arr);
  if (copiedArray) {
    return copiedArray;
  }
  var fromIterable = _iterableToArray(arr);
  if (fromIterable) {
    return fromIterable;
  }
  return _nonIterableSpread();
}
/**
 * Always throws: signals that a value passed to a spread is neither an array nor iterable.
 */
function _nonIterableSpread() {
  var message = "Invalid attempt to spread non-iterable instance";
  throw new TypeError(message);
}
/**
 * Convert an iterable (or an `arguments` object) to an array via Array.from. Returns
 * undefined for any other value.
 */
function _iterableToArray(iter) {
  if (!(Symbol.iterator in Object(iter))) {
    // Not iterable: only `arguments` objects are still accepted
    if (Object.prototype.toString.call(iter) !== "[object Arguments]") {
      return undefined;
    }
  }
  return Array.from(iter);
}
/**
 * Copy an array into a new dense array of the same length (holes become explicit
 * `undefined` entries). Returns undefined when the argument is not an array.
 */
function _arrayWithoutHoles(arr) {
  if (!Array.isArray(arr)) {
    return undefined;
  }
  var dense = new Array(arr.length);
  var position = 0;
  while (position < arr.length) {
    dense[position] = arr[position];
    position += 1;
  }
  return dense;
}
/**
 * List an object's own enumerable string keys followed by its own symbol keys.
 *
 * @param {Object} object - Object to inspect
 * @param {boolean} [enumerableOnly] - When truthy, non-enumerable symbol keys are left out
 * @return {Array.<string|symbol>} Collected keys
 */
function ownKeys(object, enumerableOnly) {
  var collected = Object.keys(object);
  if (!Object.getOwnPropertySymbols) {
    return collected;
  }

  var symbolKeys = Object.getOwnPropertySymbols(object);
  if (enumerableOnly) {
    symbolKeys = symbolKeys.filter(function (symbolKey) {
      return Object.getOwnPropertyDescriptor(object, symbolKey).enumerable;
    });
  }
  collected.push.apply(collected, symbolKeys);
  return collected;
}
/**
 * Object-spread helper: copy the properties of every argument after `target` onto `target`
 * and return it. Null/undefined sources are treated as empty objects. Sources at odd
 * argument positions are copied by value (own enumerable keys); sources at even positions
 * are copied descriptor-by-descriptor.
 */
function _objectSpread(target) {
  var copyValues = function (source) {
    ownKeys(source, true).forEach(function (key) {
      _defineProperty(target, key, source[key]);
    });
  };
  var copyDescriptors = function (source) {
    if (Object.getOwnPropertyDescriptors) {
      Object.defineProperties(target, Object.getOwnPropertyDescriptors(source));
      return;
    }
    ownKeys(source).forEach(function (key) {
      Object.defineProperty(target, key, Object.getOwnPropertyDescriptor(source, key));
    });
  };

  for (var position = 1; position < arguments.length; position++) {
    var source = arguments[position] != null ? arguments[position] : {};
    if (position % 2) {
      copyValues(source);
    } else {
      copyDescriptors(source);
    }
  }
  return target;
}
/**
 * Set `obj[key]` to `value` and return `obj`. When the key already exists on the object or
 * its prototype chain, the property is (re)defined as a plain writable, enumerable,
 * configurable data property so inherited setters are bypassed.
 */
function _defineProperty(obj, key, value) {
  if (!(key in obj)) {
    obj[key] = value;
    return obj;
  }
  Object.defineProperty(obj, key, {
    value: value,
    enumerable: true,
    configurable: true,
    writable: true
  });
  return obj;
}
/**
 * Define each descriptor in `props` on `target` under its `key`. Descriptors are normalized
 * in place: non-enumerable unless flagged, always configurable, and writable when they
 * carry a `value`.
 */
function _defineProperties(target, props) {
  var index = 0;
  while (index < props.length) {
    var descriptor = props[index];
    descriptor.enumerable = descriptor.enumerable || false;
    descriptor.configurable = true;
    if ("value" in descriptor) {
      descriptor.writable = true;
    }
    Object.defineProperty(target, descriptor.key, descriptor);
    index += 1;
  }
}
/**
 * Attach method descriptors to a constructor: `protoProps` go on its prototype and
 * `staticProps` on the constructor itself. Returns the constructor.
 */
function _createClass(Constructor, protoProps, staticProps) {
  if (protoProps) {
    _defineProperties(Constructor.prototype, protoProps);
  }
  if (staticProps) {
    _defineProperties(Constructor, staticProps);
  }
  return Constructor;
}
/**
 * Pick the value a derived constructor should use as `this`: the parent constructor's
 * return value when it is an object or function, otherwise `self` (which must be
 * initialized).
 */
function _possibleConstructorReturn(self, call) {
  var parentReturnedObject = call && (_typeof(call) === "object" || typeof call === "function");
  return parentReturnedObject ? call : _assertThisInitialized(self);
}
/**
 * Return `self`, or throw a ReferenceError when it is undefined (i.e. super() was never
 * called).
 */
function _assertThisInitialized(self) {
  if (self !== void 0) {
    return self;
  }
  throw new ReferenceError("this hasn't been initialised - super() hasn't been called");
}
/**
 * Return the prototype of `o`. On first call the function replaces itself with either the
 * native Object.getPrototypeOf or a `__proto__`-based fallback, depending on whether
 * Object.setPrototypeOf exists.
 */
function _getPrototypeOf(o) {
  if (Object.setPrototypeOf) {
    _getPrototypeOf = Object.getPrototypeOf;
  } else {
    _getPrototypeOf = function _getPrototypeOf(target) {
      return target.__proto__ || Object.getPrototypeOf(target);
    };
  }
  return _getPrototypeOf(o);
}
/**
 * Wire up prototype inheritance for a transpiled class: `subClass.prototype` becomes a new
 * object inheriting from `superClass.prototype` (with a non-enumerable `constructor`), and
 * `subClass` itself inherits from `superClass` so static members resolve.
 *
 * Throws a TypeError when `superClass` is neither a function nor null.
 */
function _inherits(subClass, superClass) {
  var isValidSuper = typeof superClass === "function" || superClass === null;
  if (!isValidSuper) {
    throw new TypeError("Super expression must either be null or a function");
  }

  var parentPrototype = superClass && superClass.prototype;
  subClass.prototype = Object.create(parentPrototype, {
    constructor: {
      value: subClass,
      writable: true,
      configurable: true
    }
  });

  if (superClass) {
    _setPrototypeOf(subClass, superClass);
  }
}
/**
 * Set the prototype of `o` to `p` and return `o`. On first call the function replaces
 * itself with the native Object.setPrototypeOf or, when that is missing, a `__proto__`
 * assignment fallback.
 */
function _setPrototypeOf(o, p) {
  var nativeSetPrototypeOf = Object.setPrototypeOf;
  if (nativeSetPrototypeOf) {
    _setPrototypeOf = nativeSetPrototypeOf;
  } else {
    _setPrototypeOf = function _setPrototypeOf(target, proto) {
      target.__proto__ = proto;
      return target;
    };
  }
  return _setPrototypeOf(o, p);
}
/**
 * Guard for transpiled class constructors: throws a TypeError unless `instance` was created
 * from `Constructor` (i.e. the constructor was invoked with `new`).
 */
function _classCallCheck(instance, Constructor) {
  if (instance instanceof Constructor) {
    return;
  }
  throw new TypeError("Cannot call a class as a function");
}
/**
* @typedef {Object} DataSplitGroups
* @property {Array.<Array.<number>>} indices - Two-dimensional array containing, for both groups,
* the indices of the samples belonging to the group
* @property {Array.<Array.<Array.<number>>>} features - Array containing, for both groups, the
* feature vectors of the samples belonging to the group
* @property {Array.<Array.<number>>} labels - Two-dimensional array containing, for both groups,
* the labels of the samples belonging to the group
*/
/**
* @typedef {Object} DataSplit
* @property {number} feature - Index of the feature by which to split
* @property {number} featureValue - Split value of the feature by which to split
* @property {DataSplitGroups} groups - Data groups resulting from the split
*/
/**
 * Decision tree node. The constructor itself stores nothing; the tree builder attaches the
 * node's properties afterwards: `impurity` and `type` ('leaf' or 'node') on every node,
 * `prediction` on leaves, and `feature`, `featureValue`, `left` and `right` on internal
 * nodes.
 */
function DecisionTreeNode() {
  _classCallCheck(this, DecisionTreeNode);
}
/**
* Decision tree learner. Builds a decision tree by greedily splitting samples on one feature
* hierarchically.
*/
exports.DecisionTreeNode = DecisionTreeNode;
var DecisionTree =
/*#__PURE__*/
function (_Classifier) {
_inherits(DecisionTree, _Classifier);
/**
* Constructor. Initialize class members and store user-defined options.
*
* @param {Object} [optionsUser] - User-defined options for decision tree
* @param {string} [optionsUser.criterion = 'gini'] - Splitting criterion. Either 'gini', for the
* Gini coefficient, or 'entropy' for the Shannon entropy
* @param {number|string} [optionsUser.numFeatures = 1.0] - Number of features to subsample at
* each node. Either a number (float), in which case the input fraction of features is used
* (e.g., 1.0 for all features), or a string. If string, 'sqrt' and 'log2' are supported,
* causing the algorithm to use sqrt(n) and log2(n) features, respectively (where n is the
* total number of features)
* @param {number} [optionsUser.maxDepth = -1] - Maximum depth of the tree. The depth of the
* tree is the number of nodes in the longest path from the decision tree root to a leaf. It
* is an indicator of the complexity of the tree. Use -1 for no maximum depth
*/
function DecisionTree() {
  // Optional first argument: user options, defaulting to an empty object
  var optionsUser = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : {};

  _classCallCheck(this, DecisionTree);

  // Run the parent (Classifier) constructor and adopt whatever it yields as the instance
  var parentConstructor = _getPrototypeOf(DecisionTree);
  var _this = _possibleConstructorReturn(this, parentConstructor.call(this));

  // User-supplied options override the defaults key by key
  var defaults = {
    criterion: 'gini',
    numFeatures: 1.0,
    maxDepth: -1
  };
  var merged = _objectSpread({}, defaults, {}, optionsUser);

  // Expose the effective options on the instance
  ['criterion', 'numFeatures', 'maxDepth'].forEach(function (optionName) {
    _this[optionName] = merged[optionName];
  });

  return _this;
}
/**
* Calculate the impurity for multiple groups of labels. The impurity criterion used can be
* specified by the user through the user-defined options.
*
* @param {Array.<Array.<mixed>>} groups - Groups of labels. Each group is an array of labels
* @return {number} Impurity for the provided groups
*/
_createClass(DecisionTree, [{
key: "calculateImpurity",
value: function calculateImpurity(groups) {
if (this.criterion === 'gini') {
return this.calculateWeightedImpurity(groups, this.gini);
}
if (this.criterion === 'entropy') {
return this.calculateWeightedImpurity(groups, this.entropy);
}
return null;
}
/**
* Calculate the weighted impurity for multiple groups of labels. The returned impurity is
* calculated as the weighted sum of the impurities of the individual groups, where the
* weights are determined by the number of samples in the group.
*
* @param {Array.<Array.<mixed>>} groups - Groups of labels. Each group is an array of labels
* @param {function(labels: Array.<number>): number} impurityCallback - Callback function taking
* an array of labels as its first and only argument
* @return {number} Weighted impurity for the provided groups
*/
}, {
key: "calculateWeightedImpurity",
value: function calculateWeightedImpurity(groups, impurityCallback) {
// Impurity per group
var impurities = []; // Total number of elements
var numElements = 0; // Loop over the groups and calculate the group's impurity
var _iteratorNormalCompletion = true;
var _didIteratorError = false;
var _iteratorError = undefined;
try {
for (var _iterator = groups[Symbol.iterator](), _step; !(_iteratorNormalCompletion = (_step = _iterator.next()).done); _iteratorNormalCompletion = true) {
var group = _step.value;
impurities.push(impurityCallback(group));
numElements += group.length;
} // Return the weighted sum of impurities
} catch (err) {
_didIteratorError = true;
_iteratorError = err;
} finally {
try {
if (!_iteratorNormalCompletion && _iterator["return"] != null) {
_iterator["return"]();
}
} finally {
if (_didIteratorError) {
throw _iteratorError;
}
}
}
return impurities.reduce(function (r, a, i) {
return r + a * groups[i].length / numElements;
}, 0);
}
/**
* Calculate the Gini impurity of a set of labels.
*
* @param {Array.<mixed>} labels - Array of predicted labels
* @return {number} Gini impurity
*/
}, {
key: "gini",
value: function gini(labels) {
var uniqueLabels = Arrays.unique(labels);
return uniqueLabels.reduce(function (r, label) {
var frac = labels.filter(function (x) {
return x === label;
}).length / labels.length;
return r + frac * (1 - frac);
}, 0);
}
/**
* Calculate the Shannon entropy of a set of labels.
*
* @param {Array.<mixed>} labels - Array of predicted labels
* @return {number} Shannon entropy
*/
}, {
key: "entropy",
value: function entropy(labels) {
var uniqueLabels = Arrays.unique(labels);
return uniqueLabels.reduce(function (r, label) {
var frac = labels.filter(function (x) {
return x === label;
}).length / labels.length;
return r - frac * Math.log(frac);
}, 0);
}
/**
* Split a set of samples into two groups by some splitting value for a feature. The samples with
* a feature value lower than the split value go to the left (first) group, and the other samples
* go to the right (second) group.
*
* @param {Array.<Array.<number>>} XSub - Features of samples to split by some feature
* @param {Array.<mixed>} ySub - Labels of samples
* @param {number} fInd - Index of feature to split by
* @param {number} splitValue - Value to be used as the splitting point for the feature
* @return {DataSplitGroups} Assigned sample indices, features, and labels for both of the groups
*/
}, {
key: "splitSamples",
value: function splitSamples(XSub, ySub, fInd, splitValue) {
var groupsIndices = [[], []];
var groupsX = [[], []];
var groupsY = [[], []];
XSub.forEach(function (x, i) {
if (x[fInd] < splitValue) {
groupsIndices[0].push(i);
groupsX[0].push(x);
groupsY[0].push(ySub[i]);
} else {
groupsIndices[1].push(i);
groupsX[1].push(x);
groupsY[1].push(ySub[i]);
}
});
return {
indices: groupsIndices,
features: groupsX,
labels: groupsY
};
}
/**
* Find the best splitting feature and feature value for a set of data points.
*
* @param {Array.<Array.<number>>} XSub - Features of samples to find the split for
* @param {Array.<mixed>} ySub - Labels of samples
* @param {number} baseImpurity - Impurity of parent node
* @return {DataSplit}
*/
}, {
key: "findSplit",
value: function findSplit(XSub, ySub, baseImpurity) {
var _this2 = this;
// Extract information from training data
var shape = Arrays.getShape(XSub); // Best split found
var bestSplitGain = -Infinity;
var bestSplitFeature;
var bestSplitFeatureValue;
var bestSplitGroups; // Transpose features array to easily access all sample values for a given feature
var XSubT = Arrays.transpose(XSub); // Randomly sample features to consider
var possibleIndices = _toConsumableArray(Array(shape[1])).map(function (x, i) {
return i;
});
var fIndices = Random.sample(possibleIndices, this.numFeaturesInt, false); // Calculate best split by looping over all features and considering the split quality for
// all of each feature's values. The best split is the feature value at which to split such
// that the impurity is minimized
fIndices.forEach(function (fInd) {
// Extract unique, sorted sample values for this feature
var sampleValues = Arrays.unique(XSubT[fInd]);
sampleValues.sort(function (a, b) {
return (a > b) * 2 - 1;
}); // Find split values as the average value between all sorted unique values
var splitValues = Arrays.scale(Arrays.sum(sampleValues.slice(1), sampleValues.slice(0, -1)), 0.5); // Loop over all split values
splitValues.forEach(function (splitValue) {
// Groups samples. The first and second group correspond with the samples in the left
// and right parts of the split, respectively
var groups = _this2.splitSamples(XSub, ySub, fInd, splitValue); // Calculate impurity and impurity gain
var impurity = _this2.calculateImpurity(groups.labels);
var gain = baseImpurity - impurity; // Check whether this split is better than the current best split
if (gain > bestSplitGain && groups.features[0].length > 0 && groups.features[1].length > 0) {
bestSplitGain = gain;
bestSplitFeature = fInd;
bestSplitFeatureValue = splitValue;
bestSplitGroups = groups;
}
});
});
return {
feature: bestSplitFeature,
featureValue: bestSplitFeatureValue,
groups: bestSplitGroups
};
}
/**
* Build a (sub-)tree from a set of samples.
*
* @param {Array.<Array.<number>>} XSub - Features of samples to build a tree for
* @param {Array.<mixed>} ySub - Labels of samples
* @param {number} [depth = 0] - Current tree depth. 0 indicates the root node
* @return {DecisionTreeNode} Decision tree node
*/
}, {
key: "buildTree",
value: function buildTree(XSub, ySub) {
var depth = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 0;
// Create tree node
var node = new DecisionTreeNode(); // Calculate node impurity
var impurity = this.calculateImpurity([ySub]);
node.impurity = impurity; // If the node has only samples from a single class, no further splitting is possible
if (impurity === 0) {
node.type = 'leaf';
node.prediction = ySub[0];
return node;
} // Check whether the maximum depth has been reached, and make the node a leaf if that's the case
if (this.maxDepth >= 0 && depth >= this.maxDepth) {
node.type = 'leaf';
node.prediction = Arrays.valueCounts(ySub).reduce(function (r, x) {
return x[1] > r[1] ? x : r;
})[0];
return node;
}
var _this$findSplit = this.findSplit(XSub, ySub, impurity),
feature = _this$findSplit.feature,
featureValue = _this$findSplit.featureValue,
groups = _this$findSplit.groups; // Fill node details
node.type = 'node';
node.feature = feature;
node.featureValue = featureValue;
node.left = this.buildTree(groups.features[0], groups.labels[0], depth + 1);
node.right = this.buildTree(groups.features[1], groups.labels[1], depth + 1);
return node;
}
/**
* @see {@link Classifier#train}
*/
}, {
key: "train",
value: function train(X, y) {
if (X.length !== y.length) {
throw new Error('Number of data points should match number of labels.');
} // Process training options
var shape = Arrays.getShape(X);
if (this.numFeatures === 'sqrt') {
this.numFeaturesInt = Math.floor(Math.sqrt(shape[1]));
} else if (this.numFeatures === 'log2') {
this.numFeaturesInt = Math.floor(Math.log2(shape[1]));
} else {
this.numFeaturesInt = Math.max(1, Math.min(shape[1], Math.floor(this.numFeatures * shape[1])));
} // Construct decision tree
this.tree = this.buildTree(X, y);
}
/**
* @see {@link Classifier#predict}
*/
}, {
key: "predict",
value: function predict(X) {
var _this3 = this;
if (typeof this.tree === 'undefined') {
throw new Error('Model has to be trained in order to make predictions.');
} // Make prediction for each data point
var predictions = X.map(function (x) {
return _this3.predictSample(x);
});
return predictions;
}
/**
* Make a prediction for a single sample.
*
* @param {Array.<number>} sampleFeatures - Data point features
* @return {mixed} Prediction. Label stored in the leaf node that the sample ends up in
*/
}, {
key: "predictSample",
value: function predictSample(sampleFeatures) {
var node = this.tree;
while (node.type === 'node') {
node = sampleFeatures[node.feature] < node.featureValue ? node.left : node.right;
}
return node.prediction;
}
}]);
return DecisionTree;
}(_base.Classifier);
exports["default"] = DecisionTree;