lumenize

Illuminating the forest AND the trees in your data.

// Generated by CoffeeScript 1.10.0
(function() {
  var BayesianClassifier, Classifier, OLAPCube, functions, utils,
    extend = function(child, parent) {
      for (var key in parent) {
        if (hasProp.call(parent, key)) child[key] = parent[key];
      }
      function ctor() { this.constructor = child; }
      ctor.prototype = parent.prototype;
      child.prototype = new ctor();
      child.__super__ = parent.prototype;
      return child;
    },
    hasProp = {}.hasOwnProperty;

  functions = require('./functions').functions;

  utils = require('tztime').utils;

  OLAPCube = require('./OLAPCube').OLAPCube;

  Classifier = (function() {
    function Classifier() {}

    /*
    @class Classifier
    __Base class for all Classifiers__

    See individual subclasses for usage details
     */

    Classifier.getBucketCountMinMax = function(values) {
      var max, min, targetBucketCount;
      targetBucketCount = Math.floor(Math.sqrt(values.length)) + 1;
      if (targetBucketCount < 3) {
        throw new Error("Need more training data");
      }
      min = functions.min(values);
      max = functions.max(values);
      return {
        targetBucketCount: targetBucketCount,
        min: min,
        max: max
      };
    };

    Classifier.generateConstantWidthBucketer = function(values) {
      var bucketSize, bucketer, i, j, max, min, ref, ref1, targetBucketCount;
      ref = Classifier.getBucketCountMinMax(values), targetBucketCount = ref.targetBucketCount, min = ref.min, max = ref.max;
      bucketSize = (max - min) / targetBucketCount;
      bucketer = [];
      bucketer.push({
        value: 'B' + 0,
        startOn: null,
        endBelow: min + bucketSize
      });
      for (i = j = 1, ref1 = targetBucketCount - 2; 1 <= ref1 ? j <= ref1 : j >= ref1; i = 1 <= ref1 ? ++j : --j) {
        bucketer.push({
          value: 'B' + i,
          startOn: min + bucketSize * i,
          endBelow: min + bucketSize * (i + 1)
        });
      }
      bucketer.push({
        value: 'B' + (targetBucketCount - 1),
        startOn: min + bucketSize * (targetBucketCount - 1),
        endBelow: null
      });
      return bucketer;
    };

    Classifier.generateConstantQuantityBucketer = function(values) {
      var bucketSize, bucketer, currentBoundary, i, j, lastBoundary, max, min, ref, ref1, targetBucketCount;
      ref = Classifier.getBucketCountMinMax(values), targetBucketCount = ref.targetBucketCount, min = ref.min, max = ref.max;
      bucketSize = 100 / targetBucketCount;
      bucketer = [];
      currentBoundary = functions.percentileCreator(bucketSize)(values);
      bucketer.push({
        value: 'B' + 0,
        startOn: null,
        endBelow: currentBoundary
      });
      for (i = j = 1, ref1 = targetBucketCount - 2; 1 <= ref1 ? j <= ref1 : j >= ref1; i = 1 <= ref1 ? ++j : --j) {
        lastBoundary = currentBoundary;
        currentBoundary = functions.percentileCreator(bucketSize * (i + 1))(values);
        bucketer.push({
          value: 'B' + i,
          startOn: lastBoundary,
          endBelow: currentBoundary
        });
      }
      bucketer.push({
        value: 'B' + (targetBucketCount - 1),
        startOn: currentBoundary,
        endBelow: null
      });
      return bucketer;
    };
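
    /*
    For orientation, a bucketer is just an ordered array of bins. A sketch with hypothetical
    numbers: given nine training values spanning 0 to 12, `getBucketCountMinMax` yields
    targetBucketCount = floor(sqrt(9)) + 1 = 4, so `generateConstantWidthBucketer` would
    produce something shaped like:

        [
          {value: 'B0', startOn: null, endBelow: 3},
          {value: 'B1', startOn: 3,    endBelow: 6},
          {value: 'B2', startOn: 6,    endBelow: 9},
          {value: 'B3', startOn: 9,    endBelow: null}
        ]

    The open-ended first and last bins (`startOn: null` / `endBelow: null`) guarantee that
    values outside the training range still land in a bin at prediction time.
     */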

    Classifier.splitAt = function(values, index) {
      var left, right;
      left = values.slice(0, index);
      right = values.slice(index);
      return {
        left: left,
        right: right
      };
    };

    Classifier.splitAtValue = function(values, split) {
      var j, left, len, right, value;
      left = [];
      right = [];
      for (j = 0, len = values.length; j < len; j++) {
        value = values[j];
        if (value < split) {
          left.push(value);
        } else {
          right.push(value);
        }
      }
      return {
        left: left,
        right: right
      };
    };

    Classifier.optimalSplitFor2Buckets = function(values) {
      var bestIndex, bestLeft, bestRight, bestTotalErrorSquared, i, j, left, ref, ref1, right, splitAt, totalErrorSquared;
      bestIndex = 1;
      bestTotalErrorSquared = Number.MAX_VALUE;
      for (i = j = 1, ref = values.length - 1; 1 <= ref ? j <= ref : j >= ref; i = 1 <= ref ? ++j : --j) {
        ref1 = Classifier.splitAt(values, i), left = ref1.left, right = ref1.right;
        totalErrorSquared = functions.errorSquared(left) + functions.errorSquared(right);
        if (totalErrorSquared < bestTotalErrorSquared) {
          bestTotalErrorSquared = totalErrorSquared;
          bestIndex = i;
          bestLeft = left;
          bestRight = right;
        }
      }
      splitAt = (values[bestIndex - 1] + values[bestIndex]) / 2;
      return {
        splitAt: splitAt,
        left: bestLeft,
        right: bestRight
      };
    };

    Classifier.areAllSame = function(values) {
      var firstValue, j, len, value;
      firstValue = values[0];
      for (j = 0, len = values.length; j < len; j++) {
        value = values[j];
        if (value !== firstValue) {
          return false;
        }
      }
      return true;
    };

    Classifier.findBucketSplits = function(currentSplits, values, targetBucketCount, originalValues) {
      var errorSquared, j, left, left2, len, maxErrorSquared, maxErrorSquaredValues, ref, ref1, ref2, right, right2, split, splitAt;
      if (originalValues == null) {
        originalValues = values.slice(0);
      }
      if (values.length < 5 || Classifier.areAllSame(values)) {
        return null;
      }
      ref = Classifier.optimalSplitFor2Buckets(values), splitAt = ref.splitAt, left = ref.left, right = ref.right;
      currentSplits.push(splitAt);
      currentSplits.sort(function(a, b) {
        return a - b;
      });
      while (currentSplits.length < targetBucketCount - 1) {
        right = originalValues;
        maxErrorSquared = 0;
        maxErrorSquaredValues = null;
        for (j = 0, len = currentSplits.length; j < len; j++) {
          split = currentSplits[j];
          ref1 = Classifier.splitAtValue(right, split), left = ref1.left, right = ref1.right;
          errorSquared = functions.errorSquared(left);
          if (errorSquared > maxErrorSquared) {
            maxErrorSquared = errorSquared;
            maxErrorSquaredValues = left;
          }
        }
        errorSquared = functions.errorSquared(right);
        if (errorSquared > maxErrorSquared) {
          maxErrorSquared = errorSquared;
          maxErrorSquaredValues = right;
        }
        ref2 = Classifier.optimalSplitFor2Buckets(maxErrorSquaredValues), splitAt = ref2.splitAt, left2 = ref2.left, right2 = ref2.right;
        currentSplits.push(splitAt);
        currentSplits.sort(function(a, b) {
          return a - b;
        });
      }
      return currentSplits;
    };
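
    /*
    A minimal sketch of how the splitting behaves, with hypothetical numbers and assuming
    `functions.errorSquared` is the sum of squared deviations from the arithmetic mean, as
    described in the class docs below. Given sorted values with an obvious gap, say
    [1, 2, 2, 3, 10, 11, 12], `optimalSplitFor2Buckets` tries every split index and keeps the
    one with the lowest total error squared, which here falls in the gap:

        var ref = Classifier.optimalSplitFor2Buckets([1, 2, 2, 3, 10, 11, 12]);
        // ref.splitAt === 6.5, i.e. (3 + 10) / 2
        // ref.left  -> [1, 2, 2, 3]
        // ref.right -> [10, 11, 12]

    `findBucketSplits` then repeatedly re-splits whichever current bucket has the highest
    error squared until `targetBucketCount - 1` boundaries exist.
     */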

    Classifier.generateVOptimalBucketer = function(values) {
      var bucketer, currentBoundary, i, j, lastBoundary, max, min, ref, ref1, splits, targetBucketCount;
      ref = Classifier.getBucketCountMinMax(values), targetBucketCount = ref.targetBucketCount, min = ref.min, max = ref.max;
      values.sort(function(a, b) {
        return a - b;
      });
      splits = [];
      Classifier.findBucketSplits(splits, values, targetBucketCount);
      splits.sort(function(a, b) {
        return a - b;
      });
      bucketer = [];
      currentBoundary = splits[0];
      bucketer.push({
        value: 'B' + 0,
        startOn: null,
        endBelow: currentBoundary
      });
      for (i = j = 1, ref1 = splits.length - 1; 1 <= ref1 ? j <= ref1 : j >= ref1; i = 1 <= ref1 ? ++j : --j) {
        lastBoundary = currentBoundary;
        currentBoundary = splits[i];
        bucketer.push({
          value: 'B' + i,
          startOn: lastBoundary,
          endBelow: currentBoundary
        });
      }
      bucketer.push({
        value: 'B' + splits.length,
        startOn: currentBoundary,
        endBelow: null
      });
      return bucketer;
    };

    Classifier.prototype.discreteizeRow = function(row) {
      var bin, feature, index, j, k, len, len1, ref, ref1, value;
      ref = this.features;
      for (j = 0, len = ref.length; j < len; j++) {
        feature = ref[j];
        if (feature.type === 'continuous') {
          value = row[feature.field];
          if (value == null) {
            throw new Error("Could not find field " + feature.field + " in " + (JSON.stringify(row)) + ".");
          }
          ref1 = feature.bins;
          for (index = k = 0, len1 = ref1.length; k < len1; index = ++k) {
            bin = ref1[index];
            if (bin.startOn != null) {
              if (bin.endBelow != null) {
                if ((bin.startOn <= value && value < bin.endBelow)) {
                  row[feature.field] = bin.value;
                  break;
                }
              } else if (bin.startOn <= value) {
                row[feature.field] = bin.value;
                break;
              }
            } else if (value < bin.endBelow) {
              row[feature.field] = bin.value;
              break;
            }
          }
        }
      }
      return row;
    };

    return Classifier;

  })();

  BayesianClassifier = (function(superClass) {
    extend(BayesianClassifier, superClass);

    /*
    @class BayesianClassifier
    __A Bayesian classifier with non-parametric modeling of distributions using v-optimal bucketing.__

    If you look for libraries for Bayesian classification, the primary use case is spam filtering, and they assume
    that the presence or absence of a word is the only feature you are interested in. This is a more general purpose
    tool.

    ## Features ##

    * Works even for bi-modal and other non-normal distributions
    * No requirement that you identify the distribution
    * Uses [non-parametric modeling](http://en.wikipedia.org/wiki/Non-parametric_statistics)
    * Uses v-optimal bucketing so it deals well with outliers and sharp cliffs
    * Serialize (`getStateForSaving()`) and deserialize (`newFromSavedState()`) to preserve training between sessions

    ## Why the assumption of a normal distribution is bad in some cases ##

    The [wikipedia example of using Bayes](https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Sex_classification)
    tries to determine whether someone is male or female based upon height, weight, and shoe size. The assumption is
    that men are generally larger, heavier, and have larger shoe sizes than women. In the example, they use the mean
    and variance of the male-only and female-only populations to characterize those distributions. This works because
    these characteristics are generally normally distributed **and the distribution for men is generally to the right
    of the distribution for women**.

    However, let's ask a group of folks who work together if they consider themselves a team, and let's try to use
    the size of the group as a feature to predict what a new group would say. If the group is very small (1-2
    people), they are less likely to consider themselves a team (partnership maybe), but if they are too large (say
    > 10), they are also unlikely to refer to themselves as a team. The non-team distribution is bimodal, so looking
    at its mean and variance completely mis-characterizes it. Also, the distribution is zero-bound, so it's likely to
    be asymmetric, which also poses problems for a normal distribution assumption.

    ## So what do we do instead? ##

    This classifier uses the actual values (in buckets) rather than characterizing the distribution as "normal",
    "log-normal", etc. This approach is often referred to as "building a non-parametric model".

    **Pros/Cons.** The use of a non-parametric approach allows us to deal with non-normal distributions (asymmetric,
    bimodal, etc.) without ever having to identify which nominal distribution is the best fit, or having to ask the
    user (who may not know) what distribution to use. The downside to this approach is that it generally requires a
    larger training set. You will need to experiment to determine how small is too small for your situation.

    This approach is hinted at in the
    [wikipedia article on Bayesian classifiers](https://en.wikipedia.org/wiki/Naive_Bayes_classifier) as "binning to
    discretize the feature values, to obtain a new set of Bernoulli-distributed features". However, this classifier
    does not create new separate Bernoulli features for each bin. Rather, it creates a mapping function from a
    feature value to a probability indicating how often the feature value is coincident with a particular outputField
    value. This mapping function is different for each bin.

    ## V-optimal bucketing ##

    There are two common approaches to bucketing:

    1. Make each bucket equal in width along the x-axis, like we would for a histogram (equi-width)
    2. Make each bucket have roughly the same number of data points (equi-depth)

    It turns out neither of the above works out well unless the training set is relatively large. Rather, there is an
    approach called [v-optimal bucketing](http://en.wikipedia.org/wiki/V-optimal_histograms) which attempts to find
    the optimal boundaries in the data. The basic idea is to look for the splits that provide the minimum total
    error-squared, where the "error" for each point is the distance of that point from the arithmetic mean.

    This classifier uses v-optimal bucketing when the training set has 144 or fewer rows. Above that, it switches to
    equi-depth bucketing. Note, I only evaluated a single scenario (Rally RealTeam), but 144 was the point where
    equi-depth started to provide as-good results as v-optimal bucketing. Note, in my test, much larger sets had
    moderately _better_ results with equi-depth bucketing. That said, the 144 cutoff was determined with an older
    version of the v-optimal bucketing. I've since fixed that old algorithm's tendency to produce lopsided
    distributions. It may very well be possible for v-optimal to be better for even larger numbers of data points. I
    need to run a new experiment to see.

    The algorithm used here for v-optimal bucketing is slightly inspired by
    [this](http://www.mathcs.emory.edu/~cheung/Courses/584-StreamDB/Syllabus/06-Histograms/v-opt3.html). However,
    I've made some different choices about when to terminate the splitting and about deciding what portion to split
    again. To understand the essence of the algorithm used, you need only look at the 9 lines of code in the
    `findBucketSplits()` function. The `optimalSplitFor2Buckets()` function will split the values into two buckets.
    It tries each possible split, starting with only one value in the bucket on the left, all the way down to a split
    with only one value in the bucket on the right. It then figures out which bucket has the highest error and splits
    that again, until we have the target number of splits.

    ## Simple example ##

    First, we need to require the classifier.

        {BayesianClassifier} = require('../')

    Before we start, let's take a look at our training set. The assumption is that we think TeamSize and
    HasChildProject will be predictors for RealTeam.

        trainingSet = [
          {TeamSize: 5, HasChildProject: 0, RealTeam: 1},
          {TeamSize: 3, HasChildProject: 1, RealTeam: 0},
          {TeamSize: 3, HasChildProject: 1, RealTeam: 1},
          {TeamSize: 1, HasChildProject: 0, RealTeam: 0},
          {TeamSize: 2, HasChildProject: 1, RealTeam: 0},
          {TeamSize: 2, HasChildProject: 0, RealTeam: 0},
          {TeamSize: 15, HasChildProject: 1, RealTeam: 0},
          {TeamSize: 27, HasChildProject: 1, RealTeam: 0},
          {TeamSize: 13, HasChildProject: 1, RealTeam: 1},
          {TeamSize: 7, HasChildProject: 0, RealTeam: 1},
          {TeamSize: 7, HasChildProject: 0, RealTeam: 0},
          {TeamSize: 9, HasChildProject: 1, RealTeam: 1},
          {TeamSize: 6, HasChildProject: 0, RealTeam: 1},
          {TeamSize: 5, HasChildProject: 0, RealTeam: 1},
          {TeamSize: 5, HasChildProject: 0, RealTeam: 0},
        ]

    Now, let's set up a simple config indicating our assumptions. Note how the type for TeamSize is 'continuous'
    whereas the type for HasChildProject is 'discrete', even though a number is stored. Continuous types must be
    numbers, but discrete types can be either numbers or strings.

        config =
          outputField: "RealTeam"
          features: [
            {field: 'TeamSize', type: 'continuous'},
            {field: 'HasChildProject', type: 'discrete'}
          ]

    We can now instantiate the classifier with that config,

        classifier = new BayesianClassifier(config)

    and pass in our training set.

        percentWins = classifier.train(trainingSet)

    The call to `train()` returns the percentage of times that the trained classifier gets the right answer for the
    training set. This should usually be pretty high. Anything below, say, 70% and you probably don't have the right
    "features" in your training set, or you don't have enough training set data. Our made-up example is a borderline
    case.

        console.log(percentWins)
        * 0.7333333333333333

    Now, let's see how the trained classifier is used to predict "RealTeam"-ness. We simply pass in an object with
    fields for each of our features. A very small team with child projects is definitely not a RealTeam.

        console.log(classifier.predict({TeamSize: 1, HasChildProject: 1}))
        * 0

    However, a mid-sized project with no child projects most certainly is a RealTeam.

        console.log(classifier.predict({TeamSize: 7, HasChildProject: 0}))
        * 1

    Here is a less obvious case, with one indicator going one way (the right size) and another going the other way
    (has child projects).

        console.log(classifier.predict({TeamSize: 5, HasChildProject: 1}))
        * 1

    If you want to know the strength of the prediction, you can pass in `true` as the second parameter to the
    `predict()` method.

        console.log(classifier.predict({TeamSize: 5, HasChildProject: 1}, true))
        * { '0': 0.3786982248520709, '1': 0.6213017751479291 }

    We're only 62.1% sure this is a RealTeam. Notice how the keys for the output are strings, even though we passed
    in values of type Number for the RealTeam field in our training set. We had no choice in this case because keys
    of JavaScript Objects must be strings. However, the classifier is smart enough to convert the prediction back to
    the correct type if you call `predict()` without passing in true for the second parameter.

    Like the Lumenize calculators, you can save and restore the state of a trained classifier.

        savedState = classifier.getStateForSaving('some meta data')
        newClassifier = BayesianClassifier.newFromSavedState(savedState)
        console.log(newClassifier.meta)
        * some meta data

    It will make the same predictions.

        console.log(newClassifier.predict({TeamSize: 5, HasChildProject: 1}, true))
        * { '0': 0.3786982248520709, '1': 0.6213017751479291 }
     */
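
    /*
    Since this compiled file is what JavaScript consumers actually load, here is the same flow
    as a minimal JavaScript sketch (assuming the package is required as `lumenize`, which
    exports `BayesianClassifier` from its root; `trainingSet` is the same array shown above):

        var BayesianClassifier = require('lumenize').BayesianClassifier;

        var classifier = new BayesianClassifier({
          outputField: 'RealTeam',
          features: [
            {field: 'TeamSize', type: 'continuous'},
            {field: 'HasChildProject', type: 'discrete'}
          ]
        });

        var percentWins = classifier.train(trainingSet);              // fraction of training rows predicted correctly
        classifier.predict({TeamSize: 7, HasChildProject: 0});        // -> 1
        classifier.predict({TeamSize: 5, HasChildProject: 1}, true);  // -> {'0': ..., '1': ...}
     */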

    function BayesianClassifier(userConfig) {
      this.userConfig = userConfig;

      /*
      @constructor
      @param {Object} userConfig See Config options for details.

      @cfg {String} outputField String indicating which field in the training set we are trying to predict
      @cfg {Object[]} features Array of Maps which specifies the fields to use as features. Each row in the array
        should be in the form of `{field: <fieldName>, type: <'continuous' | 'discrete'>}`. Note that you can even
        declare Number type fields as 'discrete'. It is preferable to do this if you know that the field can only be
        one of a handful of values (0 vs 1, for example).

        **WARNING: If you choose 'discrete' for the feature type, then ALL possible values for that feature must
        appear in the training set. If the classifier is asked to make a prediction with a value that it has never
        seen before, it will fail catastrophically.**
       */
      this.config = utils.clone(this.userConfig);
      this.outputField = this.config.outputField;
      this.features = this.config.features;
    }

    BayesianClassifier.prototype.train = function(userSuppliedTrainingSet) {

      /*
      @method train
      Train the classifier with a training set.
      @return {Number} The percentage of time that the trained classifier returns the expected outputField for the
        rows in the training set. If this is low (say, below 70%), you need more predictive fields and/or more data
        in your training set.
      @param {Object[]} userSuppliedTrainingSet an Array of Maps containing a field for the outputField as well as a
        field for each of the features specified in the config.
       */
      var bin, bucketGenerator, bucketer, countForThisValue, denominator, denominatorCell, dimensions, feature, featureCube, featureValues, filter, j, k, l, len, len1, len2, len3, len4, len5, len6, len7, loses, m, n, numerator, numeratorCell, o, outputDimension, outputValue, outputValuesCube, percentWins, prediction, q, r, ref, ref1, ref2, ref3, ref4, ref5, row, s, trainingSet, value, values, wins;
      trainingSet = utils.clone(userSuppliedTrainingSet);
      outputDimension = [
        {
          field: this.outputField
        }
      ];
      outputValuesCube = new OLAPCube({
        dimensions: outputDimension
      }, trainingSet);
      this.outputValues = outputValuesCube.getDimensionValues(this.outputField);
      this.outputFieldTypeIsNumber = true;
      ref = this.outputValues;
      for (j = 0, len = ref.length; j < len; j++) {
        value = ref[j];
        if (utils.type(value) !== 'number') {
          this.outputFieldTypeIsNumber = false;
        }
      }
      n = trainingSet.length;
      filter = {};
      this.baseProbabilities = {};
      ref1 = this.outputValues;
      for (k = 0, len1 = ref1.length; k < len1; k++) {
        outputValue = ref1[k];
        filter[this.outputField] = outputValue;
        countForThisValue = outputValuesCube.getCell(filter)._count;
        this.baseProbabilities[outputValue] = countForThisValue / n;
      }
      if (n >= 144) {
        bucketGenerator = Classifier.generateConstantQuantityBucketer;
      } else {
        bucketGenerator = Classifier.generateVOptimalBucketer;
      }
      ref2 = this.features;
      for (l = 0, len2 = ref2.length; l < len2; l++) {
        feature = ref2[l];
        if (feature.type === 'continuous') {
          values = (function() {
            var len3, m, results;
            results = [];
            for (m = 0, len3 = trainingSet.length; m < len3; m++) {
              row = trainingSet[m];
              results.push(row[feature.field]);
            }
            return results;
          })();
          bucketer = bucketGenerator(values);
          feature.bins = bucketer;
        } else if (feature.type === 'discrete') {

        } else {
          throw new Error("Unrecognized feature type: " + feature.type + ".");
        }
      }
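      // At this point, each continuous feature has `bins` shaped like the bucketer output shown
      // earlier (e.g., hypothetically, {value: 'B1', startOn: 3, endBelow: 6}). The pass below
      // replaces each row's raw continuous values with those bin labels, so every feature is
      // effectively discrete from here on.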
      for (m = 0, len3 = trainingSet.length; m < len3; m++) {
        row = trainingSet[m];
        this.discreteizeRow(row);
      }
      // Build an OLAPCube per feature to count how often each bin value co-occurs with each
      // outputField value.
      ref3 = this.features;
      for (o = 0, len4 = ref3.length; o < len4; o++) {
        feature = ref3[o];
        dimensions = [
          {
            field: this.outputField,
            keepTotals: true
          }
        ];
        dimensions.push({
          field: feature.field
        });
        featureCube = new OLAPCube({
          dimensions: dimensions
        }, trainingSet);
        featureValues = featureCube.getDimensionValues(feature.field);
        if (feature.type === 'discrete') {
          feature.bins = (function() {
            var len5, q, results;
            results = [];
            for (q = 0, len5 = featureValues.length; q < len5; q++) {
              value = featureValues[q];
              results.push({
                value: value
              });
            }
            return results;
          })();
        }
        ref4 = feature.bins;
        for (q = 0, len5 = ref4.length; q < len5; q++) {
          bin = ref4[q];
          bin.probabilities = {};
          ref5 = this.outputValues;
          for (r = 0, len6 = ref5.length; r < len6; r++) {
            outputValue = ref5[r];
            filter = {};
            filter[feature.field] = bin.value;
            denominatorCell = featureCube.getCell(filter);
            if (denominatorCell != null) {
              denominator = denominatorCell._count;
            } else {
              denominator = 0;
            }
            filter[this.outputField] = outputValue;
            numeratorCell = featureCube.getCell(filter);
            numerator = (numeratorCell != null ? numeratorCell._count : void 0) | 0;  // `| 0` coerces a missing count (undefined) to 0
            bin.probabilities[outputValue] = numerator / denominator;  // P(outputValue | feature value lands in this bin)
          }
        }
      }
      // Measure how often the trained classifier reproduces the training set's own labels.
      trainingSet = utils.clone(userSuppliedTrainingSet);
      wins = 0;
      loses = 0;
      for (s = 0, len7 = trainingSet.length; s < len7; s++) {
        row = trainingSet[s];
        prediction = this.predict(row);
        if (prediction === row[this.outputField]) {
          wins++;
        } else {
          loses++;
        }
      }
      percentWins = wins / (wins + loses);
      return percentWins;
    };
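
    /*
    How `predict()` (below) combines evidence: it starts from the base probability p of each
    outputField value and, for each feature, folds in that feature's matching-bin probability
    q = bin.probabilities[outputValue] using the update

        p' = p * q / (p * q + (1 - p) * (1 - q))

    With two output values this is the standard Bayes update, because 1 - p and 1 - q are
    exactly the probability and bin likelihood of the other value. A worked example with
    hypothetical numbers: if p = 7/15 (about 0.467, the base rate of RealTeam = 1 in the
    training set above) and the matching bin has q = 0.7, then
    p' = 0.467 * 0.7 / (0.467 * 0.7 + 0.533 * 0.3) = 0.327 / 0.487, or about 0.67.
     */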

    BayesianClassifier.prototype.predict = function(row, returnProbabilities) {
      var bin, feature, j, k, len, len1, matchingBin, max, outputValue, outputValueForMax, probabilities, probability, ref, ref1, ref2;
      if (returnProbabilities == null) {
        returnProbabilities = false;
      }

      /*
      @method predict
      Use the trained classifier to make a prediction.
      @return {String|Number|Object} If returnProbabilities is false (the default), then it will return the
        prediction. If returnProbabilities is true, then it will return an Object indicating the probability for
        each possible outputField value.
      @param {Object} row an Object containing a field for each of the features specified by the config.
      @param {Boolean} [returnProbabilities = false] If true, then the output will indicate the probabilities of
        each possible outputField value. Otherwise, the output of a call to `predict()` will return the predicted
        value with the highest probability.
       */
      row = this.discreteizeRow(row);
      probabilities = {};
      ref = this.baseProbabilities;
      for (outputValue in ref) {
        probability = ref[outputValue];
        probabilities[outputValue] = probability;
      }
      ref1 = this.features;
      for (j = 0, len = ref1.length; j < len; j++) {
        feature = ref1[j];
        matchingBin = null;
        ref2 = feature.bins;
        for (k = 0, len1 = ref2.length; k < len1; k++) {
          bin = ref2[k];
          if (row[feature.field] === bin.value) {
            matchingBin = bin;
            break;
          }
        }
        if (matchingBin == null) {
          throw new Error("No matching bin for " + feature.field + "=" + row[feature.field] + " in the training set.");
        }
        for (outputValue in probabilities) {
          probability = probabilities[outputValue];
          probabilities[outputValue] = probability * matchingBin.probabilities[outputValue] / (probability * matchingBin.probabilities[outputValue] + (1 - probability) * (1 - matchingBin.probabilities[outputValue]));
        }
      }
      max = 0;
      outputValueForMax = null;
      for (outputValue in probabilities) {
        probability = probabilities[outputValue];
        if (probability > max) {
          max = probability;
          outputValueForMax = outputValue;
        }
      }
      if (returnProbabilities) {
        return probabilities;
      } else {
        if (this.outputFieldTypeIsNumber) {
          return Number(outputValueForMax);
        } else {
          return outputValueForMax;
        }
      }
    };

    BayesianClassifier.prototype.getStateForSaving = function(meta) {

      /*
      @method getStateForSaving
      Enables saving the state of a Classifier. See the bottom of the "Simple example" for example code of using
      this saving and restoring functionality.
      @param {Object} [meta] An optional parameter that will be added to the serialized output and added to the meta
        field within the deserialized Classifier
      @return {Object} Returns an Object representing the state of the Classifier. This Object is suitable for
        saving to an object store. Use the static method `newFromSavedState()` with this Object as the parameter to
        reconstitute the Classifier.
       */
      var out;
      out = {
        userConfig: this.userConfig,
        outputField: this.outputField,
        outputValues: this.outputValues,
        outputFieldTypeIsNumber: this.outputFieldTypeIsNumber,
        baseProbabilities: this.baseProbabilities,
        features: this.features
      };
      if (meta != null) {
        out.meta = meta;
      }
      return out;
    };

    BayesianClassifier.newFromSavedState = function(p) {

      /*
      @method newFromSavedState
      Deserializes a previously stringified Classifier and returns a new Classifier. See the bottom of the "Simple
      example" for example code of using this saving and restoring functionality.
      @static
      @param {String/Object} p A String or Object from a previously saved Classifier state
      @return {Classifier}
       */
      var classifier;
      if (utils.type(p) === 'string') {
        p = JSON.parse(p);
      }
      classifier = new BayesianClassifier(p.userConfig);
      classifier.outputField = p.outputField;
      classifier.outputValues = p.outputValues;
      classifier.outputFieldTypeIsNumber = p.outputFieldTypeIsNumber;
      classifier.baseProbabilities = p.baseProbabilities;
      classifier.features = p.features;
      if (p.meta != null) {
        classifier.meta = p.meta;
      }
      return classifier;
    };

    return BayesianClassifier;

  })(Classifier);

  exports.Classifier = Classifier;

  exports.BayesianClassifier = BayesianClassifier;

}).call(this);
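
/*
A minimal JavaScript sketch of persisting a trained classifier across sessions (assuming the
package is required as `lumenize` and `classifier` was trained as in the example above; the
meta payload here is hypothetical). Because `newFromSavedState()` accepts either an Object or
a JSON String, the state can round-trip through any string-based store:

    var json = JSON.stringify(classifier.getStateForSaving({trainedOn: 'RealTeam data'}));
    // ... persist `json` to disk, a database, localStorage, etc. ...
    var restored = require('lumenize').BayesianClassifier.newFromSavedState(json);
    restored.predict({TeamSize: 7, HasChildProject: 0});  // same prediction as the original
*/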