lumenize
Version:
Illuminating the forest AND the trees in your data.
828 lines (748 loc) • 35.1 kB
HTML
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>The source code</title>
<link href="../resources/prettify/prettify.css" type="text/css" rel="stylesheet" />
<script type="text/javascript" src="../resources/prettify/prettify.js"></script>
<style type="text/css">
.highlight { display: block; background-color: #ddd; }
</style>
<script type="text/javascript">
/**
 * Highlights the element whose id equals the URL fragment (the part after "#").
 *
 * Does nothing when the page is opened without a fragment or when no element
 * carries that id, so the onload handler never throws in those cases.
 */
function highlight() {
    var targetId = location.hash.replace(/#/, "");
    if (!targetId) {
        return;
    }
    var target = document.getElementById(targetId);
    if (target) {
        target.className = "highlight";
    }
}
</script>
</head>
<body onload="prettyPrint(); highlight();">
<pre class="prettyprint lang-js">/* <CoffeeScript>
functions = require('./functions').functions
utils = require('tztime').utils
{RandomPicker} = require('./RandomPicker')
histogram = {}
justHereForDocsAndDoctest = () ->
</CoffeeScript> */
<span id='Lumenize-histogram'> /**
</span> * @class Lumenize.histogram
*
* This module has functionality that will allow you to create histograms and do bucketing.
*
* Features:
*
* * Three bucketing strategies:
* 1. constant width (default)
* 2. constant depth - for an example of using this mode, look at the source code for the `bucketsPercentile()` function
* 3. [v-optimal](http://en.wikipedia.org/wiki/V-optimal_histograms)
* * Two operating modes:
* 1. Automatic. Call histogram with data and all of your parameters and out pops a histogram.
* 2. Piecemeal. Create buckets, put data into buckets, generate histograms from data and pre-calculated buckets.
* Sometimes you are less interested in the histogram than you are in the bucketing.
*
* Let's walk through some examples of both modes. But first a general discussion about how these functions accept raw data.
*
* ## Getting data into the histogram functions ##
*
* We have two ways to define data. We can pass in an Array of Objects and specify the field to use.
*
* grades = [
* {name: 'Joe', average: 105},
* {name: 'Jeff', average: 104.9}, # ...
*
* ]
*
* {histogram} = require('../')
* h = histogram.histogram(grades, 'average')
*
* console.log(h)
* # [ { index: 0, startOn: null, endBelow: null, label: 'all', count: 2 } ]
*
* Or, we can just pass in a list of values
*
* grades = [105, 104.9, 99, 98.7, 85, 78, 54, 98, 78, 20]
* h = histogram.histogram(grades)
* console.log((row.label + ': ' + row.count for row in h))
* # [ '< 41.25: 1', '41.25-62.5: 1', '62.5-83.75: 2', '>= 83.75: 6' ]
*
* ## Automatic histogram creation ##
*
* The above examples for the two ways of getting data into the histogram functions also demonstrates the use of
* automatic histogram creation. There are additional parameters to this function that allow you to control the
* type of bucketing (constantWidth, constantDepth, etc.), min and max values, significance of the bucket boundaries, etc.
* See the individual functions for details on these parameters.
*
* ## Piecemeal usage ##
*
* Sometimes you don't actually want a histogram. You want a way to create constantWidth, constantDepth, log, or v-optimal buckets
* and you want a tool to know which bucket a particular value falls into. The canonical example of this is for calculating
* percentiles for standardized testing... or for grading on a curve. The documentation for the `bucketsPercentile()`
* function walks you through an example like this.
*/
/* <CoffeeScript>
# Suggests a bucket count for `values` and reports their smallest and largest value.
#
# The suggestion is floor(sqrt(values.length)) + 1, except that anything below 3 is
# reported as 2.
#
# Returns {targetBucketCount, min, max}.
getBucketCountMinMax = (values) ->
  suggested = Math.floor(Math.sqrt(values.length)) + 1
  targetBucketCount = if suggested < 3 then 2 else suggested
  # !TODO: Optimize this for a single loop
  min = functions.min(values)
  max = functions.max(values)
  {targetBucketCount, min, max}
# Rounds `value` up to the next multiple of `significance`.
# When `significance` is null or undefined the value is returned untouched.
roundUpToSignificance = (value, significance) ->
  return value unless significance?
  scale = 1 / significance
  Math.ceil(value * scale) / scale
# Rounds `value` down to the previous multiple of `significance`.
# When `significance` is null or undefined the value is returned untouched.
roundDownToSignificance = (value, significance) ->
  return value unless significance?
  scale = 1 / significance
  Math.floor(value * scale) / scale
# Normalizes the arguments shared by the bucket builders.
#
# * values      - row[valueField] for every row when valueField is given, otherwise rows itself
# * bucketCount - the caller's value, or the count suggested by getBucketCountMinMax
# * lowerBase   - firstStartOn when given, otherwise the minimum rounded down to significance
# * upperBase   - lastEndBelow when given, otherwise the maximum rounded up to significance
# * firstStartOn / lastEndBelow - passed through when given, otherwise set to null
#
# Returns {values, bucketCount, firstStartOn, lowerBase, lastEndBelow, upperBase}.
setParameters = (rows, valueField, firstStartOn, lastEndBelow, bucketCount, significance) ->
  values = if valueField? then (entry[valueField] for entry in rows) else rows
  suggestion = getBucketCountMinMax(values)
  bucketCount = suggestion.targetBucketCount unless bucketCount?

  if firstStartOn?
    lowerBase = firstStartOn
  else
    lowerBase = roundDownToSignificance(suggestion.min, significance)
    firstStartOn = null

  if lastEndBelow?
    upperBase = lastEndBelow
  else
    upperBase = roundUpToSignificance(suggestion.max, significance)
    lastEndBelow = null

  {values, bucketCount, firstStartOn, lowerBase, lastEndBelow, upperBase}
# Builds buckets whose edges are consecutive powers of ten.
#
# The lower and upper bases come from setParameters (firstStartOn / lastEndBelow when given,
# otherwise the data's min / max). A lower base of exactly 0 starts with a 0-1 bucket.
# Every bucket is {index, startOn, endBelow}, meaning startOn <= x < endBelow.
#
# Throws when a significance is supplied or when the lower base is negative.
histogram.bucketsLog = (rows, valueField, significance, firstStartOn, lastEndBelow, bucketCount) ->
  if significance?
    throw new Error("Significance not supported for bucketsLog.")

  {lowerBase, upperBase} = setParameters(rows, valueField, firstStartOn, lastEndBelow, bucketCount, significance)
  if lowerBase < 0
    throw new Error("bucketsLog do not support values below zero. Strip those out if you want to use this.")

  # lowest edge: 0 for a zero lower base, otherwise the power of ten at or below it
  if lowerBase is 0
    startOn = 0
  else
    lowExponent = Math.floor(Math.log10(lowerBase))
    startOn = Math.pow(10, lowExponent)

  # first upper edge; a starting edge of 0 is followed directly by 1
  endBelow = if startOn is 0 then 1 else Math.pow(10, lowExponent + 1)

  # highest edge: the first power of ten strictly above the upper base
  ceiling = Math.pow(10, Math.floor(Math.log10(upperBase)) + 1)

  buckets = []
  index = 0
  while endBelow <= ceiling
    buckets.push({index, startOn, endBelow})
    startOn = endBelow
    endBelow *= 10
    index++
  buckets
# Builds buckets of equal width between the lower and upper bases computed by setParameters.
#
# With fewer than 3 buckets a single bucket labelled 'all' is returned. Otherwise every bucket is
# {index, startOn, endBelow}, meaning startOn <= x < endBelow; the first bucket starts at
# firstStartOn and the last one ends at lastEndBelow (each null when the caller did not supply it).
#
# Throws when the rounded bucket width is <= 0, or when the last edge reaches lastEndBelow.
histogram.bucketsConstantWidth = (rows, valueField, significance, firstStartOn, lastEndBelow, bucketCount) ->
  {bucketCount, firstStartOn, lowerBase, lastEndBelow, upperBase} = setParameters(rows, valueField, firstStartOn, lastEndBelow, bucketCount, significance)

  if bucketCount < 3
    return [{index: 0, startOn: firstStartOn, endBelow: lastEndBelow, label: 'all'}]

  width = roundDownToSignificance((upperBase - lowerBase) / bucketCount, significance)
  if width <= 0
    throw new Error("Calculated bucketSizes <= 0 are not allowed. Try a smaller significance.")

  # first bucket
  lastEdge = lowerBase + width
  buckets = [{index: 0, startOn: firstStartOn, endBelow: lastEdge}]

  # all the buckets in the middle
  for i in [1..bucketCount - 2]
    upperEdge = lastEdge + width
    buckets.push({index: i, startOn: lastEdge, endBelow: upperEdge})
    lastEdge = upperEdge

  # last bucket
  if lastEdge? and lastEndBelow? and lastEdge >= lastEndBelow
    throw new Error("Somehow, the last bucket didn't work out. Try a smaller significance. lastEdge: #{lastEdge} lastEndBelow: #{lastEndBelow}")
  buckets.push({index: bucketCount - 1, startOn: lastEdge, endBelow: lastEndBelow})
  buckets
# Builds buckets whose edges are taken at evenly spaced percentiles of the values
# (100 / bucketCount percent apart), each edge rounded down to `significance`.
#
# With fewer than 3 buckets a single bucket spanning firstStartOn..lastEndBelow is returned.
# Otherwise every bucket is {index, startOn, endBelow}, meaning startOn <= x < endBelow; the first
# bucket starts at firstStartOn and the last one ends at lastEndBelow (each null when the caller
# did not supply it).
#
# Throws when the boundary check before the last bucket fails.
histogram.bucketsConstantDepth = (rows, valueField, significance, firstStartOn, lastEndBelow, bucketCount) ->
  {values, bucketCount, firstStartOn, lastEndBelow} = setParameters(rows, valueField, firstStartOn, lastEndBelow, bucketCount, significance)

  # Must exist before the early return just below, which pushes onto it.
  buckets = [] # each row is {index, startOn, endBelow} meaning bucket startOn <= x < endBelow

  if bucketCount < 3
    buckets.push({index: 0, startOn: firstStartOn, endBelow: lastEndBelow})
    return buckets

  bucketSize = 100 / bucketCount

  # first bucket
  currentBoundary = roundDownToSignificance(functions.percentileCreator(bucketSize)(values), significance)
  buckets.push({index: 0, startOn: firstStartOn, endBelow: currentBoundary})

  # all the buckets in the middle
  for i in [1..bucketCount - 2]
    lastBoundary = currentBoundary
    currentBoundary = roundDownToSignificance(functions.percentileCreator(bucketSize * (i + 1))(values), significance)
    buckets.push({index: i, startOn: lastBoundary, endBelow: currentBoundary})

  # last bucket
  # NOTE(review): this guard tests lastBoundary (the startOn of the final middle bucket), while the
  # last bucket actually starts at currentBoundary. Left as-is to avoid changing when this throws;
  # confirm which boundary was intended.
  if lastBoundary? and lastEndBelow? and lastBoundary >= lastEndBelow
    throw new Error("Somehow, the last bucket didn't work out. Try a different bucketCount.")
  buckets.push({index: bucketCount - 1, startOn: currentBoundary, endBelow: lastEndBelow})
  return buckets
histogram.bucketsPercentile = (rows, valueField) ->
</CoffeeScript> */
<span id='Lumenize-histogram-static-method-bucketsPercentile'> /**
</span> * @method bucketsPercentile
* @member Lumenize.histogram
*
* This is a shortcut for creating a set of buckets for "scoring" in percentiles (think standardized testing).
*
* Note: You can't score in the 100th percentile because you can't beat your own score.
* If you have a higher score than anybody else, you didn't beat your own score. So, you aren't better than 100%. If there are
* fewer than 100 total scores then you technically can't even be in the 99th percentile. This function is hard-coded
* to only create 100 buckets. However, if you want to calculate fractional percentiles (say you want to know who
* is in the 99.9th percentile), you can simulate that yourself by calling bucketsConstantDepth with 1000 as
* the bucketCount parameter.
*
* Let's say you are a teacher and you only give out A's, B's, C's, and F's. Let's say you
* want the top 10% to get an A. This should only be one student, no matter what he scores. The next 30% of students
* to get a B. The next 50% of students to get a C and the last 10% to get an F (again, only 1 student). So with 10 students,
* the final distribution of grades will be this:
*
* * A: 1
* * B: 3
* * C: 5
* * F: 1
* * Total: 10
*
* Let's say you have these grades:
*
* grades = [
*   {name: 'Joe', average: 105}, # 1 A 90th percentile and above
*   {name: 'Jeff', average: 104.9}, # 1 B 60th percentile and above
*   {name: 'John', average: 92}, # 2
*   {name: 'Jess', average: 90}, # 3
*   {name: 'Joseph', average: 87}, # 1 C 10th percentile and above
*   {name: 'Julie', average: 87}, # 2
*   {name: 'Juan', average: 75}, # 3
*   {name: 'Jill', average: 73}, # 4
*   {name: 'Jon', average: 71}, # 5
*   {name: 'Jorge', average: 32} # 1 F rest
* ]
*
* Now, let's create the percentile buckets for this by calling bucketsPercentile.
*
* {histogram} = require('../')
* buckets = histogram.bucketsPercentile(grades, 'average')
*
* Let's create a little helper function to convert the percentiles to grades. It includes a call to `histogram.bucket`.
*
* getGrade = (average, buckets) ->
*   percentile = histogram.bucket(average, buckets).percentileHigherIsBetter
*   if percentile >= 90
*     return 'A'
*   else if percentile >= 60
*     return 'B'
*   else if percentile >= 10
*     return 'C'
*   else
*     return 'F'
*
* Now, if we loop over this and call getGrade, we can print out the final grade for each student.
*
* for student in grades
*   console.log(student.name, getGrade(student.average, buckets))
*
* # Joe A
* # Jeff B
* # John B
* # Jess B
* # Joseph C
* # Julie C
* # Juan C
* # Jill C
* # Jon C
* # Jorge F
*
* @static
* @param {Object[]/Number[]} rows If no valueField is provided or the valueField parameter is null, then the first parameter is
* assumed to be an Array of Numbers representing the values to bucket. Otherwise, it is assumed to be an Array of Objects
* with a bunch of fields.
* @param {String} [valueField] Specifies the field containing the values to bucket
*
* @return {Object[]}
*
* Returns an Array of Objects (buckets) in the form of {index, startOn, endBelow, label, percentileHigherIsBetter, percentileLowerIsBetter}
*
* To convert a value into a percentile call `histogram.bucket(value, bucketsFromCallToBucketsPercentile)` and
* then read the percentileHigherIsBetter or percentileLowerIsBetter of the bucket that is returned.
*/
/* <CoffeeScript>
  # 100 constant-depth buckets requested; histogram.buckets merges duplicates and marks merged
  # buckets with matchingRangeIndexStart / matchingRangeIndexEnd
  buckets = histogram.buckets(rows, valueField, histogram.bucketsConstantDepth, null, null, null, 100)
  cursor = 0
  for entry in buckets
    if entry.matchingRangeIndexEnd?
      # merged bucket: it stands in for a whole range of the original buckets
      rangeEnd = entry.matchingRangeIndexEnd
      entry.percentileHigherIsBetter = entry.matchingRangeIndexStart
      entry.percentileLowerIsBetter = 99 - rangeEnd
      delete entry.matchingRangeIndexEnd
      delete entry.matchingRangeIndexStart
      cursor = rangeEnd
    else
      entry.percentileHigherIsBetter = cursor
      entry.percentileLowerIsBetter = 99 - cursor
    cursor++
  buckets
histogram.buckets = (rows, valueField, type = histogram.bucketsConstantWidth, significance, firstStartOn, lastEndBelow, bucketCount) ->
</CoffeeScript> */
<span id='Lumenize-histogram-static-method-buckets'> /**
</span> * @method buckets
* @member Lumenize.histogram
* @static
* @param {Object[]/Number[]} rows If no valueField is provided or the valueField parameter is null, then the first parameter is
* assumed to be an Array of Numbers representing the values to bucket. Otherwise, it is assumed to be an Array of Objects
* with a bunch of fields.
* @param {String} [valueField] Specifies the field containing the values to calculate the histogram on
* @param {function} [type = histogram.bucketsConstantWidth] Specifies how to pick the edges of the buckets. Four schemes
* are provided: histogram.bucketsConstantWidth, histogram.bucketsConstantDepth, histogram.bucketsLog, and histogram.bucketsVOptimal.
* You could inject your own but this function simply calls that so you may as well just create the buckets yourself.
* @param {Number} [significance] The multiple to which you want to round the bucket edges. 1 means whole numbers.
* 0.1 means to round to tenths. 0.01 to hundredths. Etc. If you provide all of these last four parameters, ensure
* that (lastEndBelow - firstStartOn) / bucketCount will naturally come out in the significance specified. So,
* (100 - 0) / 100 = 1. This works well with a significance of 1, 0.1, 0.01, etc. But (13 - 0) / 10 = 1.3. This
* would not work with a significance of 1. However, a significance of 0.1 would work fine.
*
* @param {Number} [firstStartOn] This will be the startOn of the first bucket. Think of it as the min value.
* @param {Number} [lastEndBelow] This will be the endBelow of the last bucket. Think of it as the max value.
* @param {Number} [bucketCount] If provided, the histogram will have this many buckets.
* @return {Object[]}
*
* Returns an Array of Objects (buckets) in the form of {index, startOn, endBelow, label}
*
* The buckets array that is returned will have these properties:
*
* * Each bucket (row) will have these fields {index, startOn, endBelow, label}.
* * Duplicate buckets are merged. When they are merged two fields are added to the resulting merged bucket:
* {matchingRangeIndexStart, matchingRangeIndexEnd} indicating the range that this bucket replaces.
* * If firstStartOn is not provided, it will be null indicating -Infinity
* * If lastEndBelow is not provided, it will be null indicating Infinity.
*/
/* <CoffeeScript>
# Delegate edge selection to the chosen strategy; the result is post-processed below
tempBuckets = type(rows, valueField, significance, firstStartOn, lastEndBelow, bucketCount)
# return tempBuckets
# With zero or one bucket there is nothing to merge
if tempBuckets.length < 2
buckets = tempBuckets
else # merge duplicate buckets
# NOTE(review): indentation was lost in this listing, so the nesting of the statements in the
# loop below cannot be confirmed from here; verify against the original source before restructuring.
buckets = []
# startOfMatching is the bucket currently being compared against the ones that follow it
startOfMatching = tempBuckets[0]
# gotToEnd records that the final bucket was absorbed into a merged bucket
gotToEnd = false
i = 1
while i < tempBuckets.length
currentBucket = tempBuckets[i]
# NOTE(review): this test compares only startOn, while the inner loop below also compares
# endBelow - confirm that the difference is intended.
if startOfMatching.startOn == currentBucket.startOn
i++
currentBucket = tempBuckets[i]
# skip past every following bucket with the same startOn and endBelow
while currentBucket? and startOfMatching.startOn == currentBucket.startOn and startOfMatching.endBelow == currentBucket.endBelow
i++
currentBucket = tempBuckets[i]
if i >= tempBuckets.length - 1
currentBucket = tempBuckets[tempBuckets.length - 1]
gotToEnd = true
# record which original indexes the merged bucket replaces and stretch it to the end of the run
startOfMatching.matchingRangeIndexStart = startOfMatching.index
startOfMatching.matchingRangeIndexEnd = currentBucket.index
startOfMatching.endBelow = currentBucket.endBelow
buckets.push(startOfMatching)
i++
currentBucket = tempBuckets[i]
else
buckets.push(startOfMatching)
startOfMatching = currentBucket
i++
unless gotToEnd
buckets.push(currentBucket)
# reindex and add labels
for bucket, index in buckets
bucket.index = index
# delete bucket.index
# label form depends on which edges are non-null: "a-b", ">= a", "< b", or "all"
if bucket.startOn? and bucket.endBelow?
bucket.label = "#{bucket.startOn}-#{bucket.endBelow}"
else if bucket.startOn?
bucket.label = ">= #{bucket.startOn}"
else if bucket.endBelow?
bucket.label = "< #{bucket.endBelow}"
else
bucket.label = "all"
return buckets
histogram.bucket = (value, buckets) ->
</CoffeeScript> */
<span id='Lumenize-histogram-static-method-bucket'> /**
</span> * @method bucket
* @member Lumenize.histogram
* @static
* @param {Number} value The value to bucket
* @param {Object[]} buckets Array of objects where each row is in the form {index, startOn, endBelow, label}
* @return {Object}
*
* Returns the bucket that contains the given value. Returns `null` when the value is null or undefined, when
* `buckets` is empty, or when the value fits in none of the buckets.
*
* Note: With default parameters, the buckets generated by this module will cover -Infinity to Infinity, (i.e. all
* possible values). However, if you hand generate your own buckets or you use firstStartOn or lastEndBelow parameters,
* when calling histogram.buckets, then it's possible for values to fall into no buckets.
* You can effectively use this as a way to filter out outliers or unexpected
* negative values. Also note that the firstStartOn (min) is inclusive, but the lastEndBelow (max) is exclusive. If
* you set the lastEndBelow to 100, then no values of 100 will get bucketed. You can't score in the 100th percentile
* because you can't beat your own score. This is similar logic.
*/
/* <CoffeeScript>
  unless value?
    return null
  # Nothing can match an empty list; without this guard buckets[0] below would be undefined.
  if buckets.length is 0
    return null
  # middle buckets
  if buckets.length >= 3
    for i in [1..buckets.length - 2]
      b = buckets[i]
      if b.startOn <= value < b.endBelow
        return b
  # convoluted logic so it works for buckets of length 1, 2, and 3+
  # first bucket: either edge may be null, meaning open on that side
  b = buckets[0]
  if b.startOn? and b.endBelow?
    if b.startOn <= value < b.endBelow
      return b
  else if b.startOn?
    if b.startOn <= value
      return b
  else if b.endBelow?
    if value < b.endBelow
      return b
  else if !b.startOn? and !b.endBelow?
    return b
  # the only situation where you get to this point is when startOn is non-null and it might be the last bucket
  b = buckets[buckets.length - 1]
  if b.endBelow?
    if b.startOn <= value < b.endBelow
      return b
  else
    if b.startOn <= value
      return b
  return null
histogram.histogramFromBuckets = (rows, valueField, buckets) ->
</CoffeeScript> */
<span id='Lumenize-histogram-static-method-histogramFromBuckets'> /**
</span> * @method histogramFromBuckets
* @member Lumenize.histogram
* @static
* @param {Object[]/Number[]} rows If no valueField is provided or the valueField parameter is null, then the first parameter is
* assumed to be an Array of Numbers representing the values to bucket. Otherwise, it is assumed to be an Array of Objects
* with a bunch of fields.
* @param {String} [valueField] Specifies the field containing the values to calculate the histogram on
* @param {Object[]} buckets Array of Objects as output from one of the buckets...() functions. Each row {index, startOn, endBelow, label}
* @return {Object[]}
*
* Returns a histogram from rows using the provided buckets: a clone of `buckets` in which every row also carries a
* `count` of the values that fell into it. Values that fit in no bucket are not counted.
* See histogram.histogram() for details on the returned Array.
*/
/* <CoffeeScript>
  values = if valueField? then (record[valueField] for record in rows) else rows
  # counts go on a clone (utils.clone comes from tztime) rather than on the buckets passed in
  h = utils.clone(buckets)
  for tally in h
    tally.count = 0
  for v in values
    match = histogram.bucket(v, buckets)
    h[match.index].count++ if match?
  h
histogram.histogram = (rows, valueField, type = histogram.bucketsConstantWidth, significance, firstStartOn, lastEndBelow, bucketCount) ->
</CoffeeScript> */
<span id='Lumenize-histogram-static-method-histogram'> /**
</span> * @method histogram
* @member Lumenize.histogram
* @static
* @param {Object[]/Number[]} rows If no valueField is provided or the valueField parameter is null, then the first parameter is
* assumed to be an Array of Numbers representing the values to bucket. Otherwise, it is assumed to be an Array of Objects
* with a bunch of fields.
* @param {String} [valueField] Specifies the field containing the values to calculate the histogram on
* @param {function} [type = histogram.bucketsConstantWidth] Specifies how to pick the edges of the buckets. Four schemes
* are provided: histogram.bucketsConstantWidth, histogram.bucketsConstantDepth, histogram.bucketsLog, and histogram.bucketsVOptimal.
* However, you can inject your own.
* @param {Number} [significance] The multiple to which you want to round the bucket edges. 1 means whole numbers.
* 0.1 means to round to tenths. 0.01 to hundredths. Etc. If you provide all of these last four parameters, ensure
* that (lastEndBelow - firstStartOn) / bucketCount will naturally come out in the significance specified. So,
* (100 - 0) / 100 = 1. This works well with a significance of 1, 0.1, 0.01, etc. But (13 - 0) / 10 = 1.3. This
* would not work with a significance of 1. However, a significance of 0.1 would work fine.
* @param {Number} [firstStartOn] This will be the startOn of the first bucket.
* @param {Number} [lastEndBelow] This will be the endBelow of the last bucket. Think of it as the max value.
* @param {Number} [bucketCount] If provided, the histogram will have this many buckets.
* @return {Object[]}
*
* Returns an Array of Objects (buckets) in the form of {index, startOn, endBelow, label, count} where count is the
* number of values in each bucket.
*
* Note: With default parameters, the buckets will cover -Infinity to Infinity, (i.e. all
* possible values). However, if firstStartOn or lastEndBelow are provided, then any values that you pass in that
* fall outside of this range will be ignored. You can effectively use this as a way to filter out outliers or unexpected
* negative values. Also note that the firstStartOn (min) is inclusive, but the lastEndBelow (max) is exclusive. If
* you set the lastEndBelow to 100, then no values of 100 will get counted. You can't score in the 100th percentile
* because you can't beat your own score. This is similar logic.
*/
/* <CoffeeScript>
  # build the buckets with the chosen strategy, then count the same rows into them
  buckets = histogram.buckets(rows, valueField, type, significance, firstStartOn, lastEndBelow, bucketCount)
  return histogram.histogramFromBuckets(rows, valueField, buckets)
histogram.discriminated = (rows, valueField, discriminatorField, type = histogram.bucketsConstantWidth, significance = 1) ->
</CoffeeScript> */
<span id='Lumenize-histogram-static-method-discriminated'> /**
</span> * @method discriminated
* @member Lumenize.histogram
* @static
* @param {Object[]} rows Unlike the other histogram methods, this one requires the rows to be Objects because we need
* both a valueField and a discriminatorField.
* @param {String} valueField Specifies the field containing the values to calculate the histogram on
* @param {String} discriminatorField Specifies the field containing the discriminator to split the histogram series by
* @param {function} [type = histogram.bucketsConstantWidth] Specifies how to pick the edges of the buckets. Four schemes
* are provided: histogram.bucketsConstantWidth, histogram.bucketsConstantDepth, histogram.bucketsLog, and histogram.bucketsVOptimal.
* However, you can inject your own.
* @param {Number} [significance = 1] The multiple to which the bucket edges are rounded. It is also added to the overall
* maximum value to form the (exclusive) upper edge of the last bucket so that the maximum itself is counted.
* @return {Object}
*
* Will split the rows into series based upon unique discriminator values. It uses the smallest set to determine the number
* of buckets, but it uses the entire set to determine the min and max values. Then it calculates the histogram for each
* series using the same buckets.
*
* Returns {categories, series, discriminatorValues, stats, boxPlotArrays, successfulClassificationRate}.
*
* Note the shape of this output is very different from the other histogram methods. It's designed to be easily graphed
* with HighCharts.
*/
/* <CoffeeScript>
  # Group the rows by discriminator value while tracking the overall min and max
  discriminatedData = {}
  for row in rows
    value = row[valueField]
    discriminatorValue = row[discriminatorField]
    if minValue?
      minValue = Math.min(minValue, value)
    else
      minValue = value
    if maxValue?
      maxValue = Math.max(maxValue, value)
    else
      maxValue = value
    unless discriminatedData[discriminatorValue]?
      discriminatedData[discriminatorValue] = []
    discriminatedData[discriminatorValue].push(row)
  # Find the smallest group; it decides how many buckets are used
  minCount = null
  for discriminatorValue, data of discriminatedData
    if minCount?
      if data.length < minCount
        minCount = data.length
        smallestSetOfData = data
    else
      minCount = data.length
      smallestSetOfData = data
  bucketCount = Math.ceil(Math.sqrt(minCount)) # Rounding up instead of rounding down because bucketCount is calculated from the smallestSetOfData
  # The significance parameter is used as given (default 1); maxValue + significance makes the
  # exclusive upper edge sit above the largest value.
  buckets = histogram.buckets(smallestSetOfData, valueField, type, significance, minValue, maxValue + significance, bucketCount)
  # One histogram per discriminator value, all on the same buckets
  series = []
  categories = (bucket.label for bucket in buckets)
  for discriminatorValue, data of discriminatedData
    h = histogram.histogramFromBuckets(data, valueField, buckets)
    data = (row.count for row in h)
    row = {name: discriminatorValue, data: data, histogram: h}
    series.push(row)
  # Calculate stats for each series
  lowerQuartileCalculator = functions.percentileCreator(25)
  upperQuartileCalculator = functions.percentileCreator(75)
  discriminatorValues = []
  stats = []
  boxPlotArrays = []
  for discriminatorValue, data of discriminatedData
    values = (row[valueField] for row in data)
    min = functions.min(values)
    p25 = lowerQuartileCalculator(values)
    median = functions.median(values)
    p75 = upperQuartileCalculator(values)
    max = functions.max(values)
    row = {min, p25, median, p75, max, count: values.length}
    boxPlotArray = [min, p25, median, p75, max]
    stats.push(row)
    boxPlotArrays.push(boxPlotArray)
    discriminatorValues.push(discriminatorValue)
  # Monte Carlo simulation to determine correct classification rate
  qtyOverall = rows.length
  distributionOverall = []
  histograms = []
  pickers = []
  for row, index in series
    distributionOverall.push({p: stats[index].count / qtyOverall, value: index})
    histograms.push(row.histogram)
    pickers.push(new RandomPicker({histogram: row.histogram, returnValueField: 'index'}))
  pickerOverall = new RandomPicker({distribution: distributionOverall})
  columnPickers = [] # Key is column index
  for column, columnIndex in histograms[0]
    total = 0
    counts = []
    for row, index in series
      count = histograms[index][columnIndex].count
      counts.push(count)
      total += count
    columnDistribution = []
    for count, index in counts
      columnDistribution.push({p: count / total, value: index})
    columnPickers.push(new RandomPicker({distribution: columnDistribution}))
  # Each iteration picks a series, then a bucket from that series' histogram, then a series from
  # that bucket's column; it counts as correct when the two series agree.
  correct = 0
  simulationIterations = 1000
  for i in [1..simulationIterations]
    targetValue = pickerOverall.get()
    targetIndex = pickers[targetValue].get()
    actualValue = columnPickers[targetIndex].get()
    if targetValue is actualValue
      correct++
  # percentage rounded to the nearest whole number
  successfulClassificationRate = Math.floor(100 * correct / simulationIterations + 0.5)
  return {categories, series, discriminatorValues, stats, boxPlotArrays, successfulClassificationRate}
histogram.clipping = (rows, valueField, noClipping = false) ->
</CoffeeScript> */
<span id='Lumenize-histogram-static-method-clipping'> /**
</span> * @method clipping
* @member Lumenize.histogram
* @static
*
* Note: The calling pattern and functionality of this method is legacy and a bit different from the other members of
* this histogram module. I just haven't yet had the opportunity to upgrade it to the new pattern.
*
* This histogram function is designed to work with data that is zero bound on the low end and might have outliers
* on the high end. It's not very general purpose but it's ideal for distributions that have a long-fat-tail.
*
* Note: this method adds a `clippedChartValue` field to each of the rows that you pass in.
*
* @param {Object[]} rows
* @param {String} valueField Specifies the field containing the values to calculate the histogram on
* @param {Boolean} [noClipping = false] If set to true, then it will not create a non-linear band for the outliers. The
* default behavior (noClipping = false) is to lump together outliers into a single bucket at the top.
* @return {Object}
*
* Returns an object containing the following:
*
* * buckets - An Array containing {label, count, rows, percentile}
* * bucketSize - The size of each bucket (except the top one)
* * chartMax - The maximum to use for charting using clipped values
* * clipped - A Boolean indicating if the result is clipped
* * valueMax - The actual maximum value found. Will always be >= chartMax
*
* Given an array of rows like:
*
* {histogram} = require('../')
*
* rows = [
* {age: 7},
* {age: 25},
* {age: 23},
* {age: 27},
* {age: 34},
* {age: 55},
* {age: 42},
* {age: 13},
* {age: 11},
* {age: 23},
* {age: 31},
* {age: 32},
* {age: 29},
* {age: 16},
* {age: 31},
* {age: 22},
* {age: 25},
* ]
*
* histogram will calculate a histogram. There will be sqrt(n) + 1 buckets
*
* {buckets, chartMax} = histogram.clipping(rows, 'age')
* for b in buckets
*   console.log(b.label, b.count)
* # 0-12 2
* # 12-24 5
* # 24-36 8
* # 36-48 1
* # 48-60 1
*
* console.log(chartMax)
* # 60
*
* This histogram calculator will also attempt to lump outliers into a single bucket at the top.
*
* rows.push({age: 85})
*
* {buckets, chartMax} = histogram.clipping(rows, 'age')
*
* lastBucket = buckets[buckets.length - 1]
* console.log(lastBucket.label, lastBucket.count)
* # 48-86* 2
*
* The asterisk `*` is there to indicate that this bucket is not the same size as the others and non-linear.
* The histogram calculator will also "clip" the values for these outliers so that you can
* display them in a scatter chart on a linear scale with the last band compressed.
* The `clippedChartValue` will be guaranteed to be below the `chartMax` by interpolating its position between
* the bounds of the top band where the actual max value is scaled down to the `chartMax`
*
* lastBucket = buckets[buckets.length - 1]
* console.log(lastBucket.rows[1].age, lastBucket.rows[1].clippedChartValue)
* # 85 59.68421052631579
*
*/
/* <CoffeeScript>
# NOTE(review): chartValues falls back to rows when valueField is absent, but the loops further
# down always read row[valueField] and write row.clippedChartValue - confirm whether calling
# without a valueField is actually supported.
if valueField?
chartValues = (row[valueField] for row in rows)
else
chartValues = rows
max = functions.max(chartValues)
# never let max drop below 1
max = Math.max(max, 1)
if noClipping
upperBound = max
chartValuesMinusOutliers = chartValues
else
q3 = functions.percentileCreator(75)(chartValues)
q1 = functions.percentileCreator(25)(chartValues)
iqr = q3 - q1
upperBound = q3 + 1.5 * iqr # This is the Tukey recommendation http://exploringdata.net/why_1_5.htm
# fall back to max when the fence cannot be computed or lies above every value
if isNaN(upperBound) or upperBound > max
upperBound = max
chartValuesMinusOutliers = (c for c in chartValues when c <= upperBound)
# floor(sqrt(n)) regular buckets, but anything below 3 becomes 2
bucketCount = Math.floor(Math.sqrt(chartValuesMinusOutliers.length))
if bucketCount < 3
bucketCount = 2
bucketSize = Math.floor(upperBound / bucketCount) + 1
# upperBound is re-derived here as a whole number of buckets; from this point on it is the
# bottom of the top band
upperBound = bucketSize * bucketCount
chartMax = upperBound + bucketSize # This will be at the very top of the top bucket
valueMax = Math.floor(functions.max(chartValues)) + 1
valueMax = Math.max(chartMax, valueMax)
# add clippedChartValues to timeInState
# the clippedChartValue is interpolated between upperBound and valueMax to fit within one bucketSize
for row in rows
if row[valueField] >= upperBound
row.clippedChartValue = upperBound + bucketSize * (row[valueField] - upperBound) / (valueMax - upperBound)
else
row.clippedChartValue = row[valueField]
buckets = []
# [0..bucketCount] is an inclusive range, so bucketCount + 1 buckets are created; the extra one is the top band
for i in [0..bucketCount]
bucket = {
label: "#{Math.floor(i * bucketSize)}-#{Math.floor((i + 1) * bucketSize)}",
rows: []
count: 0
}
buckets.push(bucket)
# the result is clipped whenever the real maximum lies above the chart maximum
clipped = not (valueMax == chartMax)
# relabel the top band with its real extent; a trailing * marks it as clipped
if clipped
buckets[bucketCount].label = "#{upperBound}-#{valueMax}*"
else
buckets[bucketCount].label = "#{upperBound}-#{valueMax}"
total = 0
# assign every row to its bucket; everything at or above upperBound goes to the top band
for row in rows
if row[valueField] >= upperBound
bucket = buckets[buckets.length - 1]
else
bucket = buckets[Math.floor(row[valueField] / bucketSize)]
bucket.rows.push(row)
bucket.count++
total++
# running (cumulative) fraction of rows up to and including each bucket
percentile = 0
for b in buckets
percentile += b.count / total
if isNaN(percentile)
b.percentile = 0
else
b.percentile = percentile
# pin the last cumulative fraction to exactly 1.0
buckets[buckets.length - 1].percentile = 1.0
return {buckets, bucketSize, chartMax, clipped, valueMax}
exports.histogram = histogram
</CoffeeScript> */</pre>
</body>
</html>