UNPKG

lumenize

Version:

Illuminating the forest AND the trees in your data.

github.com/lmaccherone/Lumenize

lmaccherone/Lumenize

996 lines (905 loc) • 39.2 kB

JavaScript

// Generated by CoffeeScript 1.10.0
(function() {
  var RandomPicker, extractValues, functions, getBucketCountMinMax, histogram, justHereForDocsAndDoctest, roundDownToSignificance, roundUpToSignificance, setParameters, utils;

  functions = require('./functions').functions;

  utils = require('tztime').utils;

  RandomPicker = require('./RandomPicker').RandomPicker;

  histogram = {};

  // Never called; it only exists to carry the class-level documentation below.
  justHereForDocsAndDoctest = function() {
    /*
    @class histogram

    This module has functionality that will allow you to create histograms and do bucketing.

    Features:

      * Three bucketing strategies:
        1. constant width (default)
        2. constant depth - for an example of using this mode, look at the source code for the `bucketsPercentile()` function
        3. [v-optimal](http://en.wikipedia.org/wiki/V-optimal_histograms)
      * Two operating modes:
        1. Automatic. Call histogram with data and all of your parameters and out pops a histogram.
        2. Piecemeal. Create buckets, put data into buckets, generate histograms from data and pre-calculated buckets.
           Sometimes you are less interested in the histogram than you are in the bucketing.

    Let's walk through some examples of both modes. But first a general discussion about how these functions accept raw data.

    ## Getting data into the histogram functions ##

    We have two ways to define data. We can pass in an Array of Objects and specify the field to use.

        grades = [
          {name: 'Joe', average: 105},
          {name: 'Jeff', average: 104.9}, # ...
        ]

        {histogram} = require('../')
        h = histogram.histogram(grades, 'average')

        console.log(h)
        * [ { index: 0, startOn: null, endBelow: null, label: 'all', count: 2 } ]

    Or, we can just pass in a list of values

        grades = [105, 104.9, 99, 98.7, 85, 78, 54, 98, 78, 20]
        h = histogram.histogram(grades)
        console.log((row.label + ': ' + row.count for row in h))
        * [ '< 41.25: 1', '41.25-62.5: 1', '62.5-83.75: 2', '>= 83.75: 6' ]

    ## Automatic histogram creation ##

    The above examples for the two ways of getting data into the histogram functions also demonstrates the use of
    automatic histogram creation. There are additional parameters to this function that allow you to control the
    type of bucketing (constantWidth, constantDepth, etc.), min and max values, significance of the bucket boundaries, etc.
    See the individual functions for details on these parameters.

    ## Piecemeal usage ##

    Sometimes you don't actually want a histogram. You want a way to create constantWidth, constantDepth, log, or v-optimal buckets
    and you want a tool to know which bucket a particular value falls into. The canonical example of this is for calculating
    percentiles for standardized testing... or for grading on a curve. The documentation for the `bucketsPercentile()`
    function walks you through an example like this.
    */
  };

  /*
  Private helper. Returns the Array of values to operate on: when valueField is non-null, the
  value of that field for every row (in order); otherwise rows itself (same reference, not a copy).
  */
  extractValues = function(rows, valueField) {
    var j, len, results;
    if (valueField == null) {
      return rows;
    }
    results = [];
    for (j = 0, len = rows.length; j < len; j++) {
      results.push(rows[j][valueField]);
    }
    return results;
  };

  /*
  Private helper. Returns {targetBucketCount, min, max} for values. targetBucketCount is
  floor(sqrt(n)) + 1, except that anything below 3 is forced to 2 (the bucket builders treat
  fewer than 3 buckets as "one bucket holding everything").
  */
  getBucketCountMinMax = function(values) {
    var max, min, targetBucketCount;
    targetBucketCount = Math.floor(Math.sqrt(values.length)) + 1;
    if (targetBucketCount < 3) {
      targetBucketCount = 2;
    }
    min = functions.min(values);
    max = functions.max(values);
    return {
      targetBucketCount: targetBucketCount,
      min: min,
      max: max
    };
  };

  /*
  Private helper. Rounds value up to the nearest multiple of significance.
  Returns value untouched when significance is null/undefined.
  */
  roundUpToSignificance = function(value, significance) {
    var multiple;
    if (significance == null) {
      return value;
    }
    multiple = 1 / significance;
    return Math.ceil(value * multiple) / multiple;
  };

  /*
  Private helper. Rounds value down to the nearest multiple of significance.
  Returns value untouched when significance is null/undefined.
  */
  roundDownToSignificance = function(value, significance) {
    var multiple;
    if (significance == null) {
      return value;
    }
    multiple = 1 / significance;
    return Math.floor(value * multiple) / multiple;
  };

  /*
  Private helper shared by the bucketsXXX functions. Normalizes their common arguments and returns
  {values, bucketCount, firstStartOn, lowerBase, lastEndBelow, upperBase}.

    * bucketCount defaults to the targetBucketCount calculated from the data.
    * lowerBase/upperBase are the numeric bounds used for calculating edges: the caller supplied
      firstStartOn/lastEndBelow when given, otherwise the data min/max rounded down/up to significance.
    * firstStartOn/lastEndBelow are returned as null when the caller did not supply them. The open
      ended first/last buckets rely on that null.
  */
  setParameters = function(rows, valueField, firstStartOn, lastEndBelow, bucketCount, significance) {
    var lowerBase, stats, upperBase, values;
    values = extractValues(rows, valueField);
    stats = getBucketCountMinMax(values);
    if (bucketCount == null) {
      bucketCount = stats.targetBucketCount;
    }
    if (firstStartOn != null) {
      lowerBase = firstStartOn;
    } else {
      lowerBase = roundDownToSignificance(stats.min, significance);
      firstStartOn = null;
    }
    if (lastEndBelow != null) {
      upperBase = lastEndBelow;
    } else {
      upperBase = roundUpToSignificance(stats.max, significance);
      lastEndBelow = null;
    }
    return {
      values: values,
      bucketCount: bucketCount,
      firstStartOn: firstStartOn,
      lowerBase: lowerBase,
      lastEndBelow: lastEndBelow,
      upperBase: upperBase
    };
  };

  histogram.bucketsLog = function(rows, valueField, significance, firstStartOn, lastEndBelow, bucketCount) {
    /*
    @method bucketsLog
    @static
    Creates buckets whose edges are powers of ten. The first bucket starts at 0 when the lower bound is 0, otherwise at
    the largest power of ten that is <= the lower bound. Buckets are added until the power of ten just above the
    upper bound is reached.
    @param {Object[]/Number[]} rows
    @param {String} [valueField]
    @param {Number} [significance] Must be null/undefined. An Error is thrown otherwise.
    @param {Number} [firstStartOn] Used as the lower bound instead of the data minimum
    @param {Number} [lastEndBelow] Used as the upper bound instead of the data maximum
    @param {Number} [bucketCount] Accepted for signature compatibility. It has no effect on the edges.
    @return {Object[]} Array of {index, startOn, endBelow}
    */
    var buckets, endBelow, firstStartOnExponent, index, lastEndBelowExponent, lowerBase, params, startOn, upperBase;
    if (significance != null) {
      throw new Error("Significance not supported for bucketsLog.");
    }
    params = setParameters(rows, valueField, firstStartOn, lastEndBelow, bucketCount, significance);
    lowerBase = params.lowerBase;
    upperBase = params.upperBase;
    if (lowerBase < 0) {
      throw new Error("bucketsLog do not support values below zero. Strip those out if you want to use this.");
    }
    if (lowerBase === 0) {
      firstStartOn = 0;
    } else {
      firstStartOnExponent = Math.floor(Math.log10(lowerBase));
      firstStartOn = Math.pow(10, firstStartOnExponent);
    }
    lastEndBelowExponent = Math.floor(Math.log10(upperBase)) + 1;
    lastEndBelow = Math.pow(10, lastEndBelowExponent);
    index = 0;
    startOn = firstStartOn;
    if (startOn === 0) {
      // log10(0) is -Infinity so the bucket starting at zero is special-cased to 0-1
      endBelow = 1;
    } else {
      endBelow = Math.pow(10, firstStartOnExponent + 1);
    }
    buckets = [];
    while (endBelow <= lastEndBelow) {
      buckets.push({
        index: index,
        startOn: startOn,
        endBelow: endBelow
      });
      startOn = endBelow;
      endBelow = endBelow * 10;
      index++;
    }
    return buckets;
  };

  histogram.bucketsConstantWidth = function(rows, valueField, significance, firstStartOn, lastEndBelow, bucketCount) {
    /*
    @method bucketsConstantWidth
    @static
    Creates bucketCount buckets of equal width. The first bucket's startOn and the last bucket's endBelow are the
    caller supplied firstStartOn/lastEndBelow, or null (open ended) when those were not supplied.
    If bucketCount is less than 3, a single bucket labeled 'all' is returned.
    @param {Object[]/Number[]} rows
    @param {String} [valueField]
    @param {Number} [significance] The multiple to which the bucket width (and data derived bounds) are rounded
    @param {Number} [firstStartOn]
    @param {Number} [lastEndBelow]
    @param {Number} [bucketCount]
    @return {Object[]} Array of {index, startOn, endBelow}
    */
    var bucketSize, buckets, edge, i, lastEdge, lowerBase, params, upperBase;
    params = setParameters(rows, valueField, firstStartOn, lastEndBelow, bucketCount, significance);
    bucketCount = params.bucketCount;
    firstStartOn = params.firstStartOn;
    lowerBase = params.lowerBase;
    lastEndBelow = params.lastEndBelow;
    upperBase = params.upperBase;
    buckets = [];
    if (bucketCount < 3) {
      buckets.push({
        index: 0,
        startOn: firstStartOn,
        endBelow: lastEndBelow,
        label: 'all'
      });
      return buckets;
    }
    bucketSize = roundDownToSignificance((upperBase - lowerBase) / bucketCount, significance);
    if (bucketSize <= 0) {
      throw new Error("Calculated bucketSizes <= 0 are not allowed. Try a smaller significance.");
    }
    lastEdge = lowerBase + bucketSize;
    buckets.push({
      index: 0,
      startOn: firstStartOn,
      endBelow: lastEdge
    });
    // Interior buckets. bucketCount >= 3 here so this always counts upward.
    for (i = 1; i <= bucketCount - 2; i++) {
      edge = lastEdge + bucketSize;
      buckets.push({
        index: i,
        startOn: lastEdge,
        endBelow: edge
      });
      lastEdge = edge;
    }
    if ((lastEdge != null) && (lastEndBelow != null) && lastEdge >= lastEndBelow) {
      throw new Error("Somehow, the last bucket didn't work out. Try a smaller significance. lastEdge: " + lastEdge + " lastEndBelow: " + lastEndBelow);
    }
    buckets.push({
      index: bucketCount - 1,
      startOn: lastEdge,
      endBelow: lastEndBelow
    });
    return buckets;
  };

  histogram.bucketsConstantDepth = function(rows, valueField, significance, firstStartOn, lastEndBelow, bucketCount) {
    /*
    @method bucketsConstantDepth
    @static
    Creates bucketCount buckets whose edges are percentiles of the data (100 / bucketCount percent apart), rounded
    down to significance. The first bucket's startOn and the last bucket's endBelow are the caller supplied
    firstStartOn/lastEndBelow, or null (open ended) when those were not supplied.
    If bucketCount is less than 3, a single bucket is returned.
    @param {Object[]/Number[]} rows
    @param {String} [valueField]
    @param {Number} [significance]
    @param {Number} [firstStartOn]
    @param {Number} [lastEndBelow]
    @param {Number} [bucketCount]
    @return {Object[]} Array of {index, startOn, endBelow}. Adjacent buckets can be identical when the data has
      many repeated values. histogram.buckets() merges those.
    */
    var bucketSize, buckets, currentBoundary, i, lastBoundary, params, values;
    params = setParameters(rows, valueField, firstStartOn, lastEndBelow, bucketCount, significance);
    values = params.values;
    bucketCount = params.bucketCount;
    firstStartOn = params.firstStartOn;
    lastEndBelow = params.lastEndBelow;
    // Must be initialized before the early return below. It used to be assigned after it, which
    // made the bucketCount < 3 path throw a TypeError on `buckets.push`.
    buckets = [];
    if (bucketCount < 3) {
      buckets.push({
        index: 0,
        startOn: firstStartOn,
        endBelow: lastEndBelow
      });
      return buckets;
    }
    bucketSize = 100 / bucketCount;
    currentBoundary = roundDownToSignificance(functions.percentileCreator(bucketSize)(values), significance);
    buckets.push({
      index: 0,
      startOn: firstStartOn,
      endBelow: currentBoundary
    });
    // Interior buckets. bucketCount >= 3 here so this always counts upward.
    for (i = 1; i <= bucketCount - 2; i++) {
      lastBoundary = currentBoundary;
      currentBoundary = roundDownToSignificance(functions.percentileCreator(bucketSize * (i + 1))(values), significance);
      buckets.push({
        index: i,
        startOn: lastBoundary,
        endBelow: currentBoundary
      });
    }
    if ((lastBoundary != null) && (lastEndBelow != null) && lastBoundary >= lastEndBelow) {
      throw new Error("Somehow, the last bucket didn't work out. Try a different bucketCount.");
    }
    buckets.push({
      index: bucketCount - 1,
      startOn: currentBoundary,
      endBelow: lastEndBelow
    });
    return buckets;
  };

  histogram.bucketsPercentile = function(rows, valueField) {
    /*
    @method bucketsPercentile

    This is a short cut to creating a set of buckets for "scoring" in percentiles (think standardized testing).

    Note: You can't score in the 100th percentile because you can't beat your own score.
    If you have a higher score than anybody else, you didn't beat your own score. So, you aren't better than 100%. If there are
    less than 100 total scores then you technically can't even be in the 99th percentile. This function is hard-coded
    to only create 100 buckets. However, if you wanted to calculate fractional percentiles. Say you want to know who
    is in the 99.9th percentile, then you could simulate that yourself by calling bucketsConstantDepth with 1000 as
    the bucketCount parameter.

    Let's say you are a teacher and you only give out A's, B's, C's, and F's. Let's say you
    want the top 10% to get an A. This should only be one student, no matter what he scores. The next 30% of students
    to get a B. The next 50% of students to get a C and the last 10% to get an F (again, only 1 student). So with 10 students,
    the final distribution of grades will be this:

      * A: 1
      * B: 3
      * C: 5
      * F: 1
      * Total: 10

    Let's say you have these grades:

        grades = [
          {name: 'Joe', average: 105},    # 1 A 90th percentile and above
          {name: 'Jeff', average: 104.9}, # 1 B 60th percentile and above
          {name: 'John', average: 92},    # 2
          {name: 'Jess', average: 90},    # 3
          {name: 'Joseph', average: 87},  # 1 C 10th percentile and above
          {name: 'Julie', average: 87},   # 2
          {name: 'Juan', average: 75},    # 3
          {name: 'Jill', average: 73},    # 4
          {name: 'Jon', average: 71},     # 5
          {name: 'Jorge', average: 32}    # 1 F rest
        ]

    Now, let's create the percentile buckets for this by calling bucketsPercentile.

        {histogram} = require('../')
        buckets = histogram.bucketsPercentile(grades, 'average')

    Let's create a little helper function to convert the percentiles to grades. It includes a call to `histogram.bucket`.

        getGrade = (average, buckets) ->
          percentile = histogram.bucket(average, buckets).percentileHigherIsBetter
          if percentile >= 90
            return 'A'
          else if percentile >= 60
            return 'B'
          else if percentile >= 10
            return 'C'
          else
            return 'F'

    Now, if we loop over this and call getGrade, we can print out the final grade for each student.

        for student in grades
          console.log(student.name, getGrade(student.average, buckets))

        * Joe A
        * Jeff B
        * John B
        * Jess B
        * Joseph C
        * Julie C
        * Juan C
        * Jill C
        * Jon C
        * Jorge F

    @static
    @param {Object[]/Number[]} rows If no valueField is provided or the valueField parameter is null, then the first parameter is
      assumed to be an Array of Numbers representing the values to bucket. Otherwise, it is assumed to be an Array of Objects
      with a bunch of fields.
    @param {String} [valueField] Specifies the field containing the values to bucket
    @return {Object[]}

    Returns an Array of Objects (buckets) in the form of {index, startOn, endBelow, label, percentileHigherIsBetter, percentileLowerIsBetter}

    To convert a value into a percentile call `histogram.bucket(value, bucketsFromCallToBucketsPercentile)` and
    then read the percentileHigherIsBetter or percentileLowerIsBetter of the bucket that is returned.
    */
    var b, buckets, j, len, percentile;
    buckets = histogram.buckets(rows, valueField, histogram.bucketsConstantDepth, null, null, null, 100);
    percentile = 0;
    for (j = 0, len = buckets.length; j < len; j++) {
      b = buckets[j];
      if (b.matchingRangeIndexEnd != null) {
        // A merged bucket stands in for a whole range of the original 100 buckets
        b.percentileHigherIsBetter = b.matchingRangeIndexStart;
        b.percentileLowerIsBetter = 99 - b.matchingRangeIndexEnd;
        percentile = b.matchingRangeIndexEnd;
        delete b.matchingRangeIndexEnd;
        delete b.matchingRangeIndexStart;
      } else {
        b.percentileHigherIsBetter = percentile;
        b.percentileLowerIsBetter = 99 - percentile;
      }
      percentile++;
    }
    return buckets;
  };

  histogram.buckets = function(rows, valueField, type, significance, firstStartOn, lastEndBelow, bucketCount) {
    var bucket, buckets, currentBucket, gotToEnd, i, index, j, len, startOfMatching, tempBuckets;
    if (type == null) {
      type = histogram.bucketsConstantWidth;
    }

    /*
    @method buckets
    @static
    @param {Object[]/Number[]} rows If no valueField is provided or the valueField parameter is null, then the first parameter is
      assumed to be an Array of Numbers representing the values to bucket. Otherwise, it is assumed to be an Array of Objects
      with a bunch of fields.
    @param {String} [valueField] Specifies the field containing the values to calculate the histogram on
    @param {function} [type = histogram.bucketsConstantWidth] Specifies how to pick the edges of the buckets. Four schemes
      are provided: histogram.bucketsConstantWidth, histogram.bucketsConstantDepth, histogram.bucketsLog, and histogram.bucketsVOptimal.
      You could inject your own but this function simply calls that so you may as well just create the buckets yourself.
    @param {Number} [significance] The multiple to which you want to round the bucket edges. 1 means whole numbers.
      0.1 means to round to tenths. 0.01 to hundredths. Etc. If you provide all of these last four parameters, ensure
      that (lastEndBelow - firstStartOn) / bucketCount will naturally come out in the significance specified. So,
      (100 - 0) / 100 = 1. This works well with a significance of 1, 0.1, 0.01, etc. But (13 - 0) / 10 = 1.3. This
      would not work with a significance of 1. However, a significance of 0.1 would work fine.
    @param {Number} [firstStartOn] This will be the startOn of the first bucket. Think of it as the min value.
    @param {Number} [lastEndBelow] This will be the endBelow of the last bucket. Think of it as the max value.
    @param {Number} [bucketCount] If provided, the histogram will have this many buckets.
    @return {Object[]}

    Returns an Array of Objects (buckets) in the form of {index, startOn, endBelow, label}

    The buckets array that is returned will have these properties:

      * Each bucket (row) will have these fields {index, startOn, endBelow, label}.
      * Duplicate buckets are merged. When they are merged two fields are added to the resulting merged bucket:
        {matchingRangeIndexStart, matchingRangeIndexEnd} indicating the range that this bucket replaces.
      * If firstStartOn is not provided, it will be null indicating -Infinity
      * If lastEndBelow is not provided, it will be null indicating Infinity.
    */
    tempBuckets = type(rows, valueField, significance, firstStartOn, lastEndBelow, bucketCount);
    if (tempBuckets.length < 2) {
      buckets = tempBuckets;
    } else {
      // Walk the raw buckets collapsing runs that share a startOn into the first bucket of the run.
      // The statement order in here is load bearing. Be careful when changing it.
      buckets = [];
      startOfMatching = tempBuckets[0];
      gotToEnd = false;
      i = 1;
      while (i < tempBuckets.length) {
        currentBucket = tempBuckets[i];
        if (startOfMatching.startOn === currentBucket.startOn) {
          i++;
          currentBucket = tempBuckets[i];
          while ((currentBucket != null) && startOfMatching.startOn === currentBucket.startOn && startOfMatching.endBelow === currentBucket.endBelow) {
            i++;
            currentBucket = tempBuckets[i];
          }
          if (i >= tempBuckets.length - 1) {
            currentBucket = tempBuckets[tempBuckets.length - 1];
            gotToEnd = true;
          }
          startOfMatching.matchingRangeIndexStart = startOfMatching.index;
          startOfMatching.matchingRangeIndexEnd = currentBucket.index;
          startOfMatching.endBelow = currentBucket.endBelow;
          buckets.push(startOfMatching);
          i++;
          currentBucket = tempBuckets[i];
        } else {
          buckets.push(startOfMatching);
        }
        startOfMatching = currentBucket;
        i++;
      }
      if (!gotToEnd) {
        buckets.push(currentBucket);
      }
    }
    // Renumber after merging and (re)generate every label from the final edges
    for (index = j = 0, len = buckets.length; j < len; index = ++j) {
      bucket = buckets[index];
      bucket.index = index;
      if ((bucket.startOn != null) && (bucket.endBelow != null)) {
        bucket.label = bucket.startOn + "-" + bucket.endBelow;
      } else if (bucket.startOn != null) {
        bucket.label = ">= " + bucket.startOn;
      } else if (bucket.endBelow != null) {
        bucket.label = "< " + bucket.endBelow;
      } else {
        bucket.label = "all";
      }
    }
    return buckets;
  };

  histogram.bucket = function(value, buckets) {
    /*
    @method bucket
    @static
    @param {Number} value The value to bucket
    @param {Object[]} buckets Array of objects where each row is in the form {index, startOn, endBelow, label}
    @return {Object}

    Returns the bucket that contains the given value unless the data fits in none of the buckets, in which case, it returns
    `null`. It also returns `null` when value is null/undefined or when buckets is empty.

    Note: With default parameters, the buckets generated by this module will cover -Infinity to Infinity, (i.e. all
    possible values). However, if you hand generate your own buckets or you use firstStartOn or lastEndBelow parameters,
    when calling histogram.buckets, then it's possible for values to fall into no buckets.
    You can effectively use this as a way to filter out outliers or unexpected
    negative values. Also note that the firstStartOn (min) is inclusive, but the lastEndBelow (max) is exclusive. If
    you set the lastEndBelow to 100, then no values of 100 will get bucketed. You can't score in the 100th percentile
    because you can't beat your own score. This is similar logic.
    */
    var b, i;
    if (value == null) {
      return null;
    }
    if (buckets.length === 0) {
      // Nothing can contain the value. This used to throw a TypeError reading buckets[0].startOn.
      return null;
    }
    // Interior buckets always have both edges. bucketCount >= 3 here so this always counts upward.
    if (buckets.length >= 3) {
      for (i = 1; i <= buckets.length - 2; i++) {
        b = buckets[i];
        if (b.startOn <= value && value < b.endBelow) {
          return b;
        }
      }
    }
    // The first bucket may be open ended on either (or both) sides
    b = buckets[0];
    if ((b.startOn != null) && (b.endBelow != null)) {
      if (b.startOn <= value && value < b.endBelow) {
        return b;
      }
    } else if (b.startOn != null) {
      if (b.startOn <= value) {
        return b;
      }
    } else if (b.endBelow != null) {
      if (value < b.endBelow) {
        return b;
      }
    } else if ((b.startOn == null) && (b.endBelow == null)) {
      return b;
    }
    // The last bucket may be open ended on the high side
    b = buckets[buckets.length - 1];
    if (b.endBelow != null) {
      if (b.startOn <= value && value < b.endBelow) {
        return b;
      }
    } else {
      if (b.startOn <= value) {
        return b;
      }
    }
    return null;
  };

  histogram.histogramFromBuckets = function(rows, valueField, buckets) {
    /*
    @method histogramFromBuckets
    @static
    @param {Object[]/Number[]} rows If no valueField is provided or the valueField parameter is null, then the first parameter is
      assumed to be an Array of Numbers representing the values to bucket. Otherwise, it is assumed to be an Array of Objects
      with a bunch of fields.
    @param {String} valueField Specifies the field containing the values to calculate the histogram on
    @param {Object[]} buckets Array of Objects as output from a get...Buckets() function. Each row {index, startOn, endBelow, label}
    @return {Object[]}

    Returns a histogram from rows using the provided buckets. The buckets are cloned so the ones passed in are not
    modified. Values that fall in none of the buckets are not counted. See histogram.histogram() for details on the
    returned Array.
    */
    var bucket, h, j, k, len, len1, values;
    values = extractValues(rows, valueField);
    h = utils.clone(buckets);
    for (j = 0, len = h.length; j < len; j++) {
      h[j].count = 0;
    }
    for (k = 0, len1 = values.length; k < len1; k++) {
      bucket = histogram.bucket(values[k], buckets);
      if (bucket != null) {
        // bucket.index is relied upon to be the bucket's position in the buckets Array
        h[bucket.index].count++;
      }
    }
    return h;
  };

  histogram.histogram = function(rows, valueField, type, significance, firstStartOn, lastEndBelow, bucketCount) {
    var buckets;
    if (type == null) {
      // Was `histogram.constantWidth`, which does not exist. It only worked because
      // histogram.buckets() falls back to bucketsConstantWidth when handed undefined.
      type = histogram.bucketsConstantWidth;
    }

    /*
    @method histogram
    @static
    @param {Object[]/Number[]} rows If no valueField is provided or the valueField parameter is null, then the first parameter is
      assumed to be an Array of Numbers representing the values to bucket. Otherwise, it is assumed to be an Array of Objects
      with a bunch of fields.
    @param {String} [valueField] Specifies the field containing the values to calculate the histogram on
    @param {function} [type = histogram.bucketsConstantWidth] Specifies how to pick the edges of the buckets. Four schemes
      are provided: histogram.bucketsConstantWidth, histogram.bucketsConstantDepth, histogram.bucketsLog, and histogram.bucketsVOptimal.
      However, you can inject your own.
    @param {Number} [significance] The multiple to which you want to round the bucket edges. 1 means whole numbers.
      0.1 means to round to tenths. 0.01 to hundredths. Etc. If you provide all of these last four parameters, ensure
      that (lastEndBelow - firstStartOn) / bucketCount will naturally come out in the significance specified. So,
      (100 - 0) / 100 = 1. This works well with a significance of 1, 0.1, 0.01, etc. But (13 - 0) / 10 = 1.3. This
      would not work with a significance of 1. However, a significance of 0.1 would work fine.
    @param {Number} [firstStartOn] This will be the startOn of the first bucket.
    @param {Number} [lastEndBelow] This will be the endBelow of the last bucket. Think of it as the max value.
    @param {Number} [bucketCount] If provided, the histogram will have this many buckets.
    @return {Object[]}

    Returns an Array of Objects (buckets) in the form of {index, startOn, endBelow, label, count} where count is the
    number of values in each bucket.

    Note: With default parameters, the buckets will cover -Infinity to Infinity, (i.e. all
    possible values). However, if firstStartOn or lastEndBelow are provided, then any values that you pass in that
    fall outside of this range will be ignored. You can effectively use this as a way to filter out outliers or unexpected
    negative values. Also note that the firstStartOn (min) is inclusive, but the lastEndBelow (max) is exclusive. If
    you set the lastEndBelow to 100, then no values of 100 will get counted. You can't score in the 100th percentile
    because you can't beat your own score. This is similar logic.
    */
    buckets = histogram.buckets(rows, valueField, type, significance, firstStartOn, lastEndBelow, bucketCount);
    return histogram.histogramFromBuckets(rows, valueField, buckets);
  };

  histogram.discriminated = function(rows, valueField, discriminatorField, type, significance) {
    var actualValue, boxPlotArrays, bucketCount, buckets, categories, columnDistribution, columnIndex, columnPickers, correct, count, counts, data, discriminatedData, discriminatorValue, discriminatorValues, distributionOverall, h, histograms, i, index, j, k, len, len1, lowerQuartileCalculator, max, maxValue, median, min, minCount, minValue, p25, p75, pickerOverall, pickers, qtyOverall, row, series, simulationIterations, smallestSetOfData, stats, successfulClassificationRate, targetIndex, targetValue, total, upperQuartileCalculator, value, values;
    if (type == null) {
      // Was `histogram.constantWidth`, which does not exist. It only worked because
      // histogram.buckets() falls back to bucketsConstantWidth when handed undefined.
      type = histogram.bucketsConstantWidth;
    }
    if (significance == null) {
      significance = 1;
    }

    /*
    @method discriminated
    @static
    @param {Object[]} rows Unlike the other histogram methods, this one requires the rows to be Objects because we need both
      a valueField and a discriminatorField.
    @param {String} valueField Specifies the field containing the values to calculate the histogram on
    @param {String} discriminatorField Specifies the field containing the discriminator to split the histogram series by
    @param {function} [type = histogram.bucketsConstantWidth] Specifies how to pick the edges of the buckets. Four schemes
      are provided: histogram.bucketsConstantWidth, histogram.bucketsConstantDepth, histogram.bucketsLog, and histogram.bucketsVOptimal.
      However, you can inject your own.
    @param {Number} [significance = 1] Currently ignored. The buckets are always calculated with a significance of 1.
    @return {Object}

    Will split the rows into series based upon unique discriminator values. It uses the smallest set to determine
    the number of buckets, but it uses the entire set to determine the min, and max values. Then it calculates the
    histogram for each series using the same buckets.

    Note the shape of this output is very different from the other histogram methods. It's designed to be easily graphed
    with HighCharts. The returned Object has these fields:
    {categories, series, discriminatorValues, stats, boxPlotArrays, successfulClassificationRate}.
    successfulClassificationRate is a whole number percentage estimated with 1000 random draws, so it can vary
    slightly from call to call.
    */

    // Split the rows by discriminator while tracking the overall min and max
    discriminatedData = {};
    for (j = 0, len = rows.length; j < len; j++) {
      row = rows[j];
      value = row[valueField];
      discriminatorValue = row[discriminatorField];
      if (minValue != null) {
        minValue = Math.min(minValue, value);
      } else {
        minValue = value;
      }
      if (maxValue != null) {
        maxValue = Math.max(maxValue, value);
      } else {
        maxValue = value;
      }
      if (discriminatedData[discriminatorValue] == null) {
        discriminatedData[discriminatorValue] = [];
      }
      discriminatedData[discriminatorValue].push(row);
    }

    // The smallest series determines the number of buckets
    minCount = null;
    for (discriminatorValue in discriminatedData) {
      data = discriminatedData[discriminatorValue];
      if (minCount != null) {
        if (data.length < minCount) {
          minCount = data.length;
          smallestSetOfData = data;
        }
      } else {
        minCount = data.length;
        smallestSetOfData = data;
      }
    }
    bucketCount = Math.ceil(Math.sqrt(minCount));
    // NOTE(review): this unconditionally overrides the significance parameter. Kept as-is because
    // honoring the parameter would change the output for callers that currently pass one.
    significance = 1;
    // maxValue + significance because lastEndBelow is exclusive and the max must land in a bucket
    buckets = histogram.buckets(smallestSetOfData, valueField, type, significance, minValue, maxValue + significance, bucketCount);

    series = [];
    categories = [];
    for (k = 0, len1 = buckets.length; k < len1; k++) {
      categories.push(buckets[k].label);
    }
    for (discriminatorValue in discriminatedData) {
      h = histogram.histogramFromBuckets(discriminatedData[discriminatorValue], valueField, buckets);
      counts = [];
      for (k = 0, len1 = h.length; k < len1; k++) {
        counts.push(h[k].count);
      }
      series.push({
        name: discriminatorValue,
        data: counts,
        histogram: h
      });
    }

    // Box plot statistics for each series
    lowerQuartileCalculator = functions.percentileCreator(25);
    upperQuartileCalculator = functions.percentileCreator(75);
    discriminatorValues = [];
    stats = [];
    boxPlotArrays = [];
    for (discriminatorValue in discriminatedData) {
      values = extractValues(discriminatedData[discriminatorValue], valueField);
      min = functions.min(values);
      p25 = lowerQuartileCalculator(values);
      median = functions.median(values);
      p75 = upperQuartileCalculator(values);
      max = functions.max(values);
      stats.push({
        min: min,
        p25: p25,
        median: median,
        p75: p75,
        max: max,
        count: values.length
      });
      boxPlotArrays.push([min, p25, median, p75, max]);
      discriminatorValues.push(discriminatorValue);
    }

    // Random pickers: one for which series a row comes from, one per series for which bucket
    // (column) its value falls into, and one per column for which series that column suggests.
    qtyOverall = rows.length;
    distributionOverall = [];
    histograms = [];
    pickers = [];
    for (index = 0; index < series.length; index++) {
      row = series[index];
      distributionOverall.push({
        p: stats[index].count / qtyOverall,
        value: index
      });
      histograms.push(row.histogram);
      pickers.push(new RandomPicker({
        histogram: row.histogram,
        returnValueField: 'index'
      }));
    }
    pickerOverall = new RandomPicker({
      distribution: distributionOverall
    });
    columnPickers = [];
    for (columnIndex = 0; columnIndex < histograms[0].length; columnIndex++) {
      total = 0;
      counts = [];
      for (index = 0; index < series.length; index++) {
        count = histograms[index][columnIndex].count;
        counts.push(count);
        total += count;
      }
      columnDistribution = [];
      for (index = 0; index < counts.length; index++) {
        columnDistribution.push({
          p: counts[index] / total,
          value: index
        });
      }
      columnPickers.push(new RandomPicker({
        distribution: columnDistribution
      }));
    }

    // Simulate classifying random rows by their bucket alone and count how often that is right
    correct = 0;
    simulationIterations = 1000;
    for (i = 1; i <= simulationIterations; i++) {
      targetValue = pickerOverall.get();
      targetIndex = pickers[targetValue].get();
      actualValue = columnPickers[targetIndex].get();
      if (targetValue === actualValue) {
        correct++;
      }
    }
    successfulClassificationRate = Math.floor(100 * correct / simulationIterations + 0.5);
    return {
      categories: categories,
      series: series,
      discriminatorValues: discriminatorValues,
      stats: stats,
      boxPlotArrays: boxPlotArrays,
      successfulClassificationRate: successfulClassificationRate
    };
  };

  histogram.clipping = function(rows, valueField, noClipping) {
    var b, bucket, bucketCount, bucketSize, buckets, c, chartMax, chartValues, chartValuesMinusOutliers, clipped, i, iqr, j, len, max, percentile, q1, q3, row, total, upperBound, valueMax;
    if (noClipping == null) {
      noClipping = false;
    }

    /*
    @method clipping
    @static

    Note: The calling pattern and functionality of this method is legacy and a bit different from the other members of
    this histogram module. I just haven't yet had the opportunity to upgrade it to the new pattern.

    This histogram function is designed to work with data that is zero bound on the low end and might have outliers
    on the high end. It's not very general purpose but it's ideal for distributions that have a long-fat-tail.

    @param {Object[]} rows Note that a `clippedChartValue` field is added to each of the rows that you pass in.
    @param {String} valueField Specifies the field containing the values to calculate the histogram on
    @param {Boolean} [noClipping = false] If set to true, then it will not create a non-linear band for the outliers. The
      default behavior (noClipping = false) is to lump together outliers into a single bucket at the top.
    @return {Object[]}

    Returns an object containing the following:

    * buckets - An Array containing {label, count, rows, percentile}. Each of the rows has a clippedChartValue.
    * bucketSize - The size of each bucket (except the top one)
    * chartMax - The maximum to use for charting using clipped values
    * clipped - A Boolean indicating if the result is clipped
    * valueMax - The actual maximum value found. Will always be >= chartMax

    Given an array of rows like:

        {histogram} = require('../')

        rows = [
          {age: 7},
          {age: 25},
          {age: 23},
          {age: 27},
          {age: 34},
          {age: 55},
          {age: 42},
          {age: 13},
          {age: 11},
          {age: 23},
          {age: 31},
          {age: 32},
          {age: 29},
          {age: 16},
          {age: 31},
          {age: 22},
          {age: 25},
        ]

    histogram will calculate a histogram. There will be sqrt(n) + 1 buckets

        {buckets, chartMax} = histogram.clipping(rows, 'age')
        for b in buckets
          console.log(b.label, b.count)
        * 0-12 2
        * 12-24 5
        * 24-36 8
        * 36-48 1
        * 48-60 1

        console.log(chartMax)
        * 60

    This histogram calculator will also attempt to lump outliers into a single bucket at the top.

        rows.push({age: 85})

        {buckets, chartMax} = histogram.clipping(rows, 'age')

        lastBucket = buckets[buckets.length - 1]
        console.log(lastBucket.label, lastBucket.count)
        * 48-86* 2

    The asterisk `*` is there to indicate that this bucket is not the same size as the others and non-linear.
    The histogram calculator will also "clip" the values for these outliers so that you can
    display them in a scatter chart on a linear scale with the last band compressed.
    The `clippedChartValue` will be guaranteed to be below the `chartMax` by interpolating its position between
    the bounds of the top band where the actual max value is scaled down to the `chartMax`

        lastBucket = buckets[buckets.length - 1]
        console.log(lastBucket.rows[1].age, lastBucket.rows[1].clippedChartValue)
        * 85 59.68421052631579
    */
    chartValues = extractValues(rows, valueField);
    max = functions.max(chartValues);
    max = Math.max(max, 1);
    if (noClipping) {
      upperBound = max;
      chartValuesMinusOutliers = chartValues;
    } else {
      // Anything more than 1.5 times the inter-quartile range above the third quartile is an outlier
      q3 = functions.percentileCreator(75)(chartValues);
      q1 = functions.percentileCreator(25)(chartValues);
      iqr = q3 - q1;
      upperBound = q3 + 1.5 * iqr;
      if (isNaN(upperBound) || upperBound > max) {
        upperBound = max;
      }
      chartValuesMinusOutliers = [];
      for (j = 0, len = chartValues.length; j < len; j++) {
        c = chartValues[j];
        if (c <= upperBound) {
          chartValuesMinusOutliers.push(c);
        }
      }
    }
    bucketCount = Math.floor(Math.sqrt(chartValuesMinusOutliers.length));
    if (bucketCount < 3) {
      bucketCount = 2;
    }
    bucketSize = Math.floor(upperBound / bucketCount) + 1;
    // From here on upperBound is the start of the top band, which is one bucketSize tall on the chart
    upperBound = bucketSize * bucketCount;
    chartMax = upperBound + bucketSize;
    valueMax = Math.floor(functions.max(chartValues)) + 1;
    valueMax = Math.max(chartMax, valueMax);
    // NOTE(review): everything below reads row[valueField] so, unlike the values extraction
    // above, it does not work when rows is a plain Array of Numbers (valueField null).
    for (j = 0, len = rows.length; j < len; j++) {
      row = rows[j];
      if (row[valueField] >= upperBound) {
        // Compress upperBound..valueMax into upperBound..chartMax
        row.clippedChartValue = upperBound + bucketSize * (row[valueField] - upperBound) / (valueMax - upperBound);
      } else {
        row.clippedChartValue = row[valueField];
      }
    }
    // bucketCount regular buckets plus the one on top (index bucketCount)
    buckets = [];
    for (i = 0; i <= bucketCount; i++) {
      buckets.push({
        label: (Math.floor(i * bucketSize)) + "-" + (Math.floor((i + 1) * bucketSize)),
        rows: [],
        count: 0
      });
    }
    clipped = !(valueMax === chartMax);
    if (clipped) {
      buckets[bucketCount].label = upperBound + "-" + valueMax + "*";
    } else {
      buckets[bucketCount].label = upperBound + "-" + valueMax;
    }
    total = 0;
    for (j = 0, len = rows.length; j < len; j++) {
      row = rows[j];
      if (row[valueField] >= upperBound) {
        bucket = buckets[buckets.length - 1];
      } else {
        bucket = buckets[Math.floor(row[valueField] / bucketSize)];
      }
      bucket.rows.push(row);
      bucket.count++;
      total++;
    }
    // Cumulative fraction of the rows at or below each bucket
    percentile = 0;
    for (j = 0, len = buckets.length; j < len; j++) {
      b = buckets[j];
      percentile += b.count / total;
      if (isNaN(percentile)) {
        b.percentile = 0;
      } else {
        b.percentile = percentile;
      }
    }
    // Pinned so floating point accumulation error can never leave the top at something like 0.9999
    buckets[buckets.length - 1].percentile = 1.0;
    return {
      buckets: buckets,
      bucketSize: bucketSize,
      chartMax: chartMax,
      clipped: clipped,
      valueMax: valueMax
    };
  };

  exports.histogram = histogram;

}).call(this);