UNPKG

lumenize

Version:

Illuminating the forest AND the trees in your data.

github.com/lmaccherone/Lumenize

lmaccherone/Lumenize

688 lines (601 loc) • 29.8 kB

JavaScript

// Generated by CoffeeScript 1.7.1 (function() { var BayesianClassifier, Classifier, JSON, OLAPCube, functions, utils, __hasProp = {}.hasOwnProperty, __extends = function(child, parent) { for (var key in parent) { if (__hasProp.call(parent, key)) child[key] = parent[key]; } function ctor() { this.constructor = child; } ctor.prototype = parent.prototype; child.prototype = new ctor(); child.__super__ = parent.prototype; return child; }; functions = require('./functions').functions; utils = require('tztime').utils; OLAPCube = require('./OLAPCube').OLAPCube; JSON = require('JSON2'); Classifier = (function() { function Classifier() {} /* @class Classifier __Base class for all Classifiers__ See individual subclasses for usage details */ Classifier.getBucketCountMinMax = function(values) { var max, min, targetBucketCount; targetBucketCount = Math.floor(Math.sqrt(values.length)) + 1; if (targetBucketCount < 3) { throw new Error("Need more training data"); } min = functions.min(values); max = functions.max(values); return { targetBucketCount: targetBucketCount, min: min, max: max }; }; Classifier.generateConstantWidthBucketer = function(values) { var bucketSize, bucketer, i, max, min, targetBucketCount, _i, _ref, _ref1; _ref = Classifier.getBucketCountMinMax(values), targetBucketCount = _ref.targetBucketCount, min = _ref.min, max = _ref.max; bucketSize = (max - min) / targetBucketCount; bucketer = []; bucketer.push({ value: 'B' + 0, startOn: null, endBelow: min + bucketSize }); for (i = _i = 1, _ref1 = targetBucketCount - 2; 1 <= _ref1 ? _i <= _ref1 : _i >= _ref1; i = 1 <= _ref1 ? ++_i : --_i) { bucketer.push({ value: 'B' + i, startOn: min + bucketSize * i, endBelow: min + bucketSize * (i + 1) }); } bucketer.push({ value: 'B' + (targetBucketCount - 1), startOn: min + bucketSize * (targetBucketCount - 1), endBelow: null }); return bucketer; }; Classifier.generateConstantQuantityBucketer = function(values) { var bucketSize, bucketer, currentBoundary, i, lastBoundary, max, min, targetBucketCount, _i, _ref, _ref1; _ref = Classifier.getBucketCountMinMax(values), targetBucketCount = _ref.targetBucketCount, min = _ref.min, max = _ref.max; bucketSize = 100 / targetBucketCount; bucketer = []; currentBoundary = functions.percentileCreator(bucketSize)(values); bucketer.push({ value: 'B' + 0, startOn: null, endBelow: currentBoundary }); for (i = _i = 1, _ref1 = targetBucketCount - 2; 1 <= _ref1 ? _i <= _ref1 : _i >= _ref1; i = 1 <= _ref1 ? ++_i : --_i) { lastBoundary = currentBoundary; currentBoundary = functions.percentileCreator(bucketSize * (i + 1))(values); bucketer.push({ value: 'B' + i, startOn: lastBoundary, endBelow: currentBoundary }); } bucketer.push({ value: 'B' + (targetBucketCount - 1), startOn: currentBoundary, endBelow: null }); return bucketer; }; Classifier.splitAt = function(values, index) { var left, right; left = values.slice(0, index); right = values.slice(index); return { left: left, right: right }; }; Classifier.optimalSplitFor2Buckets = function(values) { var bestIndex, bestLeft, bestRight, bestTotalErrorSquared, i, left, right, splitAt, totalErrorSquared, _i, _ref, _ref1; bestIndex = 1; bestTotalErrorSquared = Number.MAX_VALUE; for (i = _i = 1, _ref = values.length - 1; 1 <= _ref ? _i <= _ref : _i >= _ref; i = 1 <= _ref ? ++_i : --_i) { _ref1 = Classifier.splitAt(values, i), left = _ref1.left, right = _ref1.right; totalErrorSquared = functions.errorSquared(left) + functions.errorSquared(right); if (totalErrorSquared < bestTotalErrorSquared) { bestTotalErrorSquared = totalErrorSquared; bestIndex = i; bestLeft = left; bestRight = right; } } splitAt = (values[bestIndex - 1] + values[bestIndex]) / 2; return { splitAt: splitAt, left: bestLeft, right: bestRight }; }; Classifier.areAllSame = function(values) { var firstValue, value, _i, _len; firstValue = values[0]; for (_i = 0, _len = values.length; _i < _len; _i++) { value = values[_i]; if (value !== firstValue) { return false; } } return true; }; Classifier.findBucketSplits = function(currentSplits, values, targetBucketCount) { var left, right, splitAt, _ref; if (values.length < 5 || Classifier.areAllSame(values)) { return null; } _ref = Classifier.optimalSplitFor2Buckets(values), splitAt = _ref.splitAt, left = _ref.left, right = _ref.right; currentSplits.push(splitAt); if (currentSplits.length < targetBucketCount) { Classifier.findBucketSplits(currentSplits, left, targetBucketCount); Classifier.findBucketSplits(currentSplits, right, targetBucketCount); } return currentSplits; }; Classifier.generateVOptimalBucketer = function(values) { var bucketer, currentBoundary, i, lastBoundary, max, min, splits, targetBucketCount, _i, _ref, _ref1; _ref = Classifier.getBucketCountMinMax(values), targetBucketCount = _ref.targetBucketCount, min = _ref.min, max = _ref.max; values.sort(function(a, b) { return a - b; }); splits = []; Classifier.findBucketSplits(splits, values, targetBucketCount); splits.sort(function(a, b) { return a - b; }); bucketer = []; currentBoundary = splits[0]; bucketer.push({ value: 'B' + 0, startOn: null, endBelow: currentBoundary }); for (i = _i = 1, _ref1 = splits.length - 1; 1 <= _ref1 ? _i <= _ref1 : _i >= _ref1; i = 1 <= _ref1 ? ++_i : --_i) { lastBoundary = currentBoundary; currentBoundary = splits[i]; bucketer.push({ value: 'B' + i, startOn: lastBoundary, endBelow: currentBoundary }); } bucketer.push({ value: 'B' + splits.length, startOn: currentBoundary, endBelow: null }); return bucketer; }; Classifier.prototype.discreteizeRow = function(row) { var bin, feature, index, value, _i, _j, _len, _len1, _ref, _ref1; _ref = this.features; for (_i = 0, _len = _ref.length; _i < _len; _i++) { feature = _ref[_i]; if (feature.type === 'continuous') { value = row[feature.field]; if (value == null) { throw new Error("Could not find field " + feature.field + " in " + (JSON.stringify(row)) + "."); } _ref1 = feature.bins; for (index = _j = 0, _len1 = _ref1.length; _j < _len1; index = ++_j) { bin = _ref1[index]; if (bin.startOn != null) { if (bin.endBelow != null) { if ((bin.startOn <= value && value < bin.endBelow)) { row[feature.field] = bin.value; break; } } else if (bin.startOn <= value) { row[feature.field] = bin.value; break; } } else if (value < bin.endBelow) { row[feature.field] = bin.value; break; } } } } return row; }; return Classifier; })(); BayesianClassifier = (function(_super) { __extends(BayesianClassifier, _super); /* @class BayesianClassifier __A Bayesian classifier with non-parametric modeling of distributions using v-optimal bucketing.__ If you look for libraries for Bayesian classification, the primary use case is spam filtering and they assume that the presence or absence of a word is the only feature you are interested in. This is a more general purpose tool. *# Features ## * Works even for bi-modal and other non-normal distributions * No requirement that you identify the distribution * Uses [non-parametric modeling](http://en.wikipedia.org/wiki/Non-parametric_statistics) * Uses v-optimal bucketing so it deals well with outliers and sharp cliffs * Serialize (`getStateForSaving()`) and deserialize (`newFromSavedState()`) to preserve training between sessions *# Why the assumption of a normal distribution is bad in some cases ## The [wikipedia example of using Bayes](https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Sex_classification) tries to determine if someone was male or female based upon the height, weight and shoe size. The assumption is that men are generally larger, heavier, and have larger shoe size than women. In the example, they use the mean and variance of the male-only and female-only populations to characterize those distributions. This works because these characteristics are generally normally distributed **and the distribution for men is generally to the right of the distribution for women**. However, let's ask a group of folks who work together if they consider themselves a team and let's try to use the size of the group as a feature to predict what a new group would say. If the group is very small (1-2 people), they are less likely to consider themselves a team (partnership maybe), but if they are too large (say > 10), they are also unlikely to refer to themselves as a team. The non-team distribution is bimodal, looking at its mean and variance completely mis-characterizes it. Also, the distribution is zero bound so it's likely to be asymmetric, which also poses problems for a normal distribution assumption. *# So what do we do instead? ## This classifier uses the actual sampled percentage for buckets of the data. This approach is often referred to as "building a non-parametric model", although "un-named distribution" strikes me a better label. **Pros/Cons**. The use of a non-parametric approach will allow us to deal with non-normal distributions (asymmetric, bimodal, etc.) without ever having to identify which nominal distribution is the best fit or having to ask the user (who may not know) what distribution to use. The downside to this approach is that it generally requires a larger training set. You will need to experiment to determine how small is too small for your situation. This approach is hinted at in the [wikipedia article on Bayesian classifiers](https://en.wikipedia.org/wiki/Naive_Bayes_classifier) as "binning to discretize the feature values, to obtain a new set of Bernoulli-distributed features". However, this classifier does not create new separate Bernoulli features for each bin. Rather, it creates a mapping function from a feature value to a probability indicating how often the feature value is coincident with a particular outputField value. This mapping function is different for each bin. *# V-optimal bucketing ## There are two common approaches to bucketing: 1. Make each bucket be equal in width along the x-axis (like we would for a histogram) (equi-width) 2. Make each bucket have roughly the same number of data points (equi-depth) It turns out neither of the above works out well unless the training set is relatively large. Rather, there is an approach called [v-optimal bucketing](http://en.wikipedia.org/wiki/V-optimal_histograms) which attempts to find the optimal boundaries in the data. The basic idea is to look for the splits that provide the minimum total error-squared where the "error" for each point is the distance of that point from the arithmetic mean. This classifier uses v-optimal bucketing when the training set hass 144 or fewer rows. Above that it switches to equi-depth bucketing. Note, I only evaluated a single scenario (Rally RealTeam), but 144 was the point where equi-depth started to provide as-good results as v-optimal bucketing. Note, in my test, much larger sets had moderately _better_ results with equi-depth bucketing. The algorithm used here for v-optimal bucketing is slightly inspired by [this non-recursive code](http://www.mathcs.emory.edu/~cheung/Courses/584-StreamDB/Syllabus/06-Histograms/v-opt3.html). However, the implementation here is recursive and I've made some different choices about when to terminate the splitting. To understand the essence of the algorithm used, you need only look at the 9 lines of code in the `findBucketSplits()` function. The `optimalSplitFor2Buckets()` function will split the values into two buckets. It tries each possible split starting with only one in the bucket on the left all the way down to a split with only one in the bucket on the right. One of the design choices I made for this algorithm means that you can't precicely control the number of buckets. It also seems to have a tendency to create very lopsided bucketing breakdowns. The latter may be the reason that equi-depth bucketing has better results when there are hundreds of rows in the training set. We may wish to revisit this algorithm at a later time because my instinct is that there is probably some definition of "optimal" that is at least as good as equi-depth for large training sets. I suspect the current algorith favors splitting the left. A better algorithm wouldn't have a left and a right. It would find the optimal split for each of the current splits and take the one that gave the entire new splitting regime the lowest overall error. This new algorithm would be much more computationally intensive but for small training sets, I don't think it will be a deal breaker and we can always use equi-depth once for larger sets. *# Simple example ## First we need to require the classifier. {BayesianClassifier} = require('../') Before we start, let's take a look at our training set. The assumption is that we think TeamSize and HasChildProject will be predictors for RealTeam. trainingSet = [ {TeamSize: 5, HasChildProject: 0, RealTeam: 1}, {TeamSize: 3, HasChildProject: 1, RealTeam: 0}, {TeamSize: 3, HasChildProject: 1, RealTeam: 1}, {TeamSize: 1, HasChildProject: 0, RealTeam: 0}, {TeamSize: 2, HasChildProject: 1, RealTeam: 0}, {TeamSize: 2, HasChildProject: 0, RealTeam: 0}, {TeamSize: 15, HasChildProject: 1, RealTeam: 0}, {TeamSize: 27, HasChildProject: 1, RealTeam: 0}, {TeamSize: 13, HasChildProject: 1, RealTeam: 1}, {TeamSize: 7, HasChildProject: 0, RealTeam: 1}, {TeamSize: 7, HasChildProject: 0, RealTeam: 0}, {TeamSize: 9, HasChildProject: 1, RealTeam: 1}, {TeamSize: 6, HasChildProject: 0, RealTeam: 1}, {TeamSize: 5, HasChildProject: 0, RealTeam: 1}, {TeamSize: 5, HasChildProject: 0, RealTeam: 0}, ] Now, let's set up a simple config indicating our assumptions. Note how the type for TeamSize is 'continuous' whereas the type for HasChildProject is 'discrete' eventhough a number is stored. Continuous types must be numbers but discrete types can either be numbers or strings. config = outputField: "RealTeam" features: [ {field: 'TeamSize', type: 'continuous'}, {field: 'HasChildProject', type: 'discrete'} ] We can now instantiate the classifier with that config, classifier = new BayesianClassifier(config) and pass in our training set. percentWins = classifier.train(trainingSet) The call to `train()` returns the percentage of times that the trained classifier gets the right answer for the training set. This should usually be pretty high. Anything below say, 70% and you probably don't have the right "features" in your training set or you don't have enough training set data. Our made up exmple is a borderline case. console.log(percentWins) * 0.7333333333333333 Now, let's see how the trained classifier is used to predict "RealTeam"-ness. We simply pass in an object with fields for each of our features. A very small team with child projects are definitely not a RealTeam. console.log(classifier.predict({TeamSize: 1, HasChildProject: 1})) * 0 However, a mid-sized project with no child projects most certainly is a RealTeam. console.log(classifier.predict({TeamSize: 7, HasChildProject: 0})) * 1 Here is a less obvious case, with one indicator going one way (too big) and another going the other way (no child projects). console.log(classifier.predict({TeamSize: 29, HasChildProject: 0})) * 0 If you want to know the strength of the prediction, you can pass in `true` as the second parameter to the `predict()` method. console.log(classifier.predict({TeamSize: 29, HasChildProject: 0}, true)) * { '0': 0.6956521739130435, '1': 0.30434782608695654 } We're only 69.6% sure this is not a RealTeam. Notice how the keys for the output are strings eventhough we passed in values of type Number for the RealTeam field in our training set. We had no choice in this case because keys of JavaScript Objects must be strings. However, the classifier is smart enough to know that you wanted Like the Lumenize calculators, you can save and restore the state of a trained classifier. savedState = classifier.getStateForSaving('some meta data') newClassifier = BayesianClassifier.newFromSavedState(savedState) console.log(newClassifier.meta) * some meta data It will make the same predictions. console.log(newClassifier.predict({TeamSize: 29, HasChildProject: 0}, true)) * { '0': 0.6956521739130435, '1': 0.30434782608695654 } */ function BayesianClassifier(userConfig) { this.userConfig = userConfig; /* @constructor @param {Object} userConfig See Config options for details. @cfg {String} outputField String indicating which field in the training set is what we are trying to predict @cfg {Object[]} features Array of Maps which specifies the fields to use as features. Each row in the array should be in the form of `{field: <fieldName>, type: <'continuous' | 'discrete'>}`. Note, that you can even declare Number type fields as 'discrete'. It is preferable to do this if you know that it can only be one of a hand full of values (0 vs 1 for example). **WARNING: If you choose 'discrete' for the feature type, then ALL possible values for that feature must appear in the training set. If the classifier is asked to make a prediction with a value that it has never seen before, it will fail catostrophically.** */ this.config = utils.clone(this.userConfig); this.outputField = this.config.outputField; this.features = this.config.features; } BayesianClassifier.prototype.train = function(userSuppliedTrainingSet) { /* @method train Train the classifier with a training set. @return {Number} The percentage of time that the trained classifier returns the expected outputField for the rows in the training set. If this is low (say below 70%), you need more predictive fields and/or more data in your training set. @param {Object[]} userSuppliedTrainingSet an Array of Maps containing a field for the outputField as well as a field for each of the features specified in the config. */ var bin, bucketGenerator, bucketer, countForThisValue, denominator, denominatorCell, dimensions, feature, featureCube, featureValues, filter, loses, n, numerator, numeratorCell, outputDimension, outputValue, outputValuesCube, percentWins, prediction, row, trainingSet, value, values, wins, _i, _j, _k, _l, _len, _len1, _len2, _len3, _len4, _len5, _len6, _len7, _m, _n, _o, _p, _ref, _ref1, _ref2, _ref3, _ref4, _ref5; trainingSet = utils.clone(userSuppliedTrainingSet); outputDimension = [ { field: this.outputField } ]; outputValuesCube = new OLAPCube({ dimensions: outputDimension }, trainingSet); this.outputValues = outputValuesCube.getDimensionValues(this.outputField); this.outputFieldTypeIsNumber = true; _ref = this.outputValues; for (_i = 0, _len = _ref.length; _i < _len; _i++) { value = _ref[_i]; if (utils.type(value) !== 'number') { this.outputFieldTypeIsNumber = false; } } n = trainingSet.length; filter = {}; this.baseProbabilities = {}; _ref1 = this.outputValues; for (_j = 0, _len1 = _ref1.length; _j < _len1; _j++) { outputValue = _ref1[_j]; filter[this.outputField] = outputValue; countForThisValue = outputValuesCube.getCell(filter)._count; this.baseProbabilities[outputValue] = countForThisValue / n; } if (n >= 144) { bucketGenerator = Classifier.generateConstantQuantityBucketer; } else { bucketGenerator = Classifier.generateVOptimalBucketer; } _ref2 = this.features; for (_k = 0, _len2 = _ref2.length; _k < _len2; _k++) { feature = _ref2[_k]; if (feature.type === 'continuous') { values = (function() { var _l, _len3, _results; _results = []; for (_l = 0, _len3 = trainingSet.length; _l < _len3; _l++) { row = trainingSet[_l]; _results.push(row[feature.field]); } return _results; })(); bucketer = bucketGenerator(values); feature.bins = bucketer; } else if (feature.type === 'discrete') { } else { throw new Error("Unrecognized feature type: " + feature.type + "."); } } for (_l = 0, _len3 = trainingSet.length; _l < _len3; _l++) { row = trainingSet[_l]; this.discreteizeRow(row); } _ref3 = this.features; for (_m = 0, _len4 = _ref3.length; _m < _len4; _m++) { feature = _ref3[_m]; dimensions = [ { field: this.outputField, keepTotals: true } ]; dimensions.push({ field: feature.field }); featureCube = new OLAPCube({ dimensions: dimensions }, trainingSet); featureValues = featureCube.getDimensionValues(feature.field); if (feature.type === 'discrete') { feature.bins = (function() { var _len5, _n, _results; _results = []; for (_n = 0, _len5 = featureValues.length; _n < _len5; _n++) { value = featureValues[_n]; _results.push({ value: value }); } return _results; })(); } _ref4 = feature.bins; for (_n = 0, _len5 = _ref4.length; _n < _len5; _n++) { bin = _ref4[_n]; bin.probabilities = {}; _ref5 = this.outputValues; for (_o = 0, _len6 = _ref5.length; _o < _len6; _o++) { outputValue = _ref5[_o]; filter = {}; filter[feature.field] = bin.value; denominatorCell = featureCube.getCell(filter); if (denominatorCell != null) { denominator = denominatorCell._count; } else { denominator = 0; } filter[this.outputField] = outputValue; numeratorCell = featureCube.getCell(filter); numerator = (numeratorCell != null ? numeratorCell._count : void 0) | 0; bin.probabilities[outputValue] = numerator / denominator; } } } trainingSet = utils.clone(userSuppliedTrainingSet); wins = 0; loses = 0; for (_p = 0, _len7 = trainingSet.length; _p < _len7; _p++) { row = trainingSet[_p]; prediction = this.predict(row); if (prediction === row[this.outputField]) { wins++; } else { loses++; } } percentWins = wins / (wins + loses); return percentWins; }; BayesianClassifier.prototype.predict = function(row, returnProbabilities) { var bin, feature, matchingBin, max, outputValue, outputValueForMax, probabilities, probability, _i, _j, _len, _len1, _ref, _ref1, _ref2; if (returnProbabilities == null) { returnProbabilities = false; } /* @method predict Use the trained classifier to make a prediction. @return {String|Number|Object} If returnProbabilities is false (the default), then it will return the prediction. If returnProbabilities is true, then it will return an Object indicating the probability for each possible outputField value. @param {Object} row an Object containing a field for each of the features specified by the config. @param {Boolean} [returnProbabilities = false] If true, then the output will indicate the probabilities of each possible outputField value. Otherwise, the output of a call to `predict()` will return the predicted value with the highest probability. */ row = this.discreteizeRow(row); probabilities = {}; _ref = this.baseProbabilities; for (outputValue in _ref) { probability = _ref[outputValue]; probabilities[outputValue] = probability; } _ref1 = this.features; for (_i = 0, _len = _ref1.length; _i < _len; _i++) { feature = _ref1[_i]; matchingBin = null; _ref2 = feature.bins; for (_j = 0, _len1 = _ref2.length; _j < _len1; _j++) { bin = _ref2[_j]; if (row[feature.field] === bin.value) { matchingBin = bin; break; } } if (matchingBin == null) { throw new Error("No matching bin for " + feature.field + "=" + row[feature.field] + " in the training set."); } for (outputValue in probabilities) { probability = probabilities[outputValue]; probabilities[outputValue] = probability * matchingBin.probabilities[outputValue] / (probability * matchingBin.probabilities[outputValue] + (1 - probability) * (1 - matchingBin.probabilities[outputValue])); } } max = 0; outputValueForMax = null; for (outputValue in probabilities) { probability = probabilities[outputValue]; if (probability > max) { max = probability; outputValueForMax = outputValue; } } if (returnProbabilities) { return probabilities; } else { if (this.outputFieldTypeIsNumber) { return Number(outputValueForMax); } else { return outputValueForMax; } } }; BayesianClassifier.prototype.getStateForSaving = function(meta) { /* @method getStateForSaving Enables saving the state of a Classifier. See the bottom of the "Simple example" for example code of using this saving and restoring functionality. @param {Object} [meta] An optional parameter that will be added to the serialized output and added to the meta field within the deserialized Classifier @return {Object} Returns an Ojbect representing the state of the Classifier. This Object is suitable for saving to an object store. Use the static method `newFromSavedState()` with this Object as the parameter to reconstitute the Classifier. */ var out; out = { userConfig: this.userConfig, outputField: this.outputField, outputValues: this.outputValues, outputFieldTypeIsNumber: this.outputFieldTypeIsNumber, baseProbabilities: this.baseProbabilities, features: this.features }; if (meta != null) { out.meta = meta; } return out; }; BayesianClassifier.newFromSavedState = function(p) { /* @method newFromSavedState Deserializes a previously stringified Classifier and returns a new Classifier. See the bottom of the "Simple example" for example code of using this saving and restoring functionality. @static @param {String/Object} p A String or Object from a previously saved Classifier state @return {Classifier} */ var classifier; if (utils.type(p) === 'string') { p = JSON.parse(p); } classifier = new BayesianClassifier(p.userConfig); classifier.outputField = p.outputField; classifier.outputValues = p.outputValues; classifier.outputFieldTypeIsNumber = p.outputFieldTypeIsNumber; classifier.baseProbabilities = p.baseProbabilities; classifier.features = p.features; if (p.meta != null) { classifier.meta = p.meta; } return classifier; }; return BayesianClassifier; })(Classifier); exports.Classifier = Classifier; exports.BayesianClassifier = BayesianClassifier; }).call(this);