simple-statistics
Version:
1,601 lines (1,441 loc) • 146 kB
JavaScript
/**
 * [Simple linear regression](http://en.wikipedia.org/wiki/Simple_linear_regression)
 * fits a straight line through a set of `[x, y]` coordinates using the
 * least sum of squares, and reports that line's slope and y-intercept.
 *
 * @param {Array<Array<number>>} data an array of two-element arrays,
 * like `[[0, 1], [2, 3]]`
 * @returns {Object} object with the slope `m` and the y-intercept `b`
 * of the regression line
 * @example
 * linearRegression([[0, 0], [1, 1]]); // => { m: 1, b: 0 }
 */
function linearRegression(data) {
    var count = data.length;

    // A single point does not determine a line: arbitrarily pick a flat
    // line (slope 0) that passes through that point's y value.
    if (count === 1) {
        return {
            m: 0,
            b: data[0][1]
        };
    }

    // Running totals of x, y, x^2 and x*y across all points.
    var totalX = 0;
    var totalY = 0;
    var totalXX = 0;
    var totalXY = 0;
    var index = 0;
    while (index < count) {
        var pair = data[index];
        var px = pair[0];
        var py = pair[1];
        totalX += px;
        totalY += py;
        totalXX += px * px;
        totalXY += px * py;
        index++;
    }

    // Slope of the least-squares line.
    var slope =
        (count * totalXY - totalX * totalY) /
        (count * totalXX - totalX * totalX);
    // The y-intercept follows from the slope and the two plain sums.
    var intercept = totalY / count - (slope * totalX) / count;

    return {
        m: slope,
        b: intercept
    };
}
/**
 * Turn the output of `linearRegression` - an object whose `m` and `b`
 * members hold a slope and a y-intercept - into a function that maps
 * x values to y values on that line.
 *
 * @param {Object} mb object with `m` and `b` members, representing
 * slope and intersect of desired line
 * @returns {Function} method that computes y-value at any given
 * x-value on the line.
 * @example
 * var l = linearRegressionLine(linearRegression([[0, 0], [1, 1]]));
 * l(0) // = 0
 * l(2) // = 2
 * linearRegressionLine({ b: 0, m: 1 })(1); // => 1
 * linearRegressionLine({ b: 1, m: 1 })(1); // => 2
 */
function linearRegressionLine(mb /*: { b: number, m: number }*/) {
    // `mb` is read on every call rather than copied up front, so the
    // returned function always uses the current `b` and `m` values.
    return function (x) {
        var intercept = mb.b;
        var slope = mb.m;
        return intercept + slope * x;
    };
}
/**
 * Our default sum is the [Kahan-Babuska algorithm](https://pdfs.semanticscholar.org/1760/7d467cda1d0277ad272deb2113533131dc09.pdf),
 * an improvement over the classical
 * [Kahan summation algorithm](https://en.wikipedia.org/wiki/Kahan_summation_algorithm).
 * Adding numbers one after another rounds at every step, and those
 * roundoff losses accumulate as the list grows. This algorithm keeps a
 * separate running correction for the lost low-order parts and applies it
 * once at the end, which is more accurate than simple addition.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x input
 * @return {number} sum of all input numbers; `NaN` as soon as an element
 * that is not of type `number` is encountered
 * @example
 * sum([1, 2, 3]); // => 6
 */
function sum(x) {
    var count = x.length;
    // Nothing to add up.
    if (count === 0) {
        return 0;
    }
    // The running total starts out as the first element.
    var total = x[0];
    if (typeof total !== "number") {
        return Number.NaN;
    }
    // Accumulated floating-point error to add back at the end.
    var compensation = 0;
    var index = 1;
    while (index < count) {
        var term = x[index];
        if (typeof term !== "number") {
            return Number.NaN;
        }
        var next = total + term;
        // Recover the low-order bits lost in `total + term`. Which operand
        // is subtracted first depends on which has the larger magnitude.
        compensation +=
            Math.abs(total) >= Math.abs(term)
                ? total - next + term
                : term - next + total;
        total = next;
        index++;
    }
    // Apply the correction once, at the very end.
    return total + compensation;
}
/**
 * The mean, _also known as average_,
 * is the sum of all values divided by the number of values.
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x sample of one or more data points
 * @throws {Error} if the length of x is less than one
 * @returns {number} mean
 * @example
 * mean([0, 10]); // => 5
 */
function mean(x) {
    var count = x.length;
    // An empty sample has no mean.
    if (count === 0) {
        throw new Error("mean requires at least one data point");
    }
    var total = sum(x);
    return total / count;
}
/**
 * The sum of deviations from the mean, each raised to the Nth power.
 * When n=2 it's the sum of squared deviations.
 * When n=3 it's the sum of cubed deviations.
 *
 * @param {Array<number>} x
 * @param {number} n power
 * @returns {number} sum of nth power deviations
 *
 * @example
 * var input = [1, 2, 3];
 * // since the variance of a set is the mean squared
 * // deviations, we can calculate that with sumNthPowerDeviations:
 * sumNthPowerDeviations(input, 2) / input.length;
 */
function sumNthPowerDeviations(x, n) {
    var center = mean(x);
    var total = 0;
    // Squaring is special-cased: multiplying a number by itself is
    // significantly faster than going through Math.pow.
    var squaring = n === 2;
    for (var index = 0; index < x.length; index++) {
        var deviation = x[index] - center;
        total += squaring ? deviation * deviation : Math.pow(deviation, n);
    }
    return total;
}
/**
 * The [variance](http://en.wikipedia.org/wiki/Variance)
 * is the mean of the squared deviations from the mean.
 *
 * This is an implementation of variance, not sample variance:
 * see the `sampleVariance` method if you want a sample measure.
 *
 * @param {Array<number>} x a population of one or more data points
 * @returns {number} variance: a value greater than or equal to zero.
 * zero indicates that all values are identical.
 * @throws {Error} if x's length is 0
 * @example
 * variance([1, 2, 3, 4, 5, 6]); // => 2.9166666666666665
 */
function variance(x) {
    var count = x.length;
    if (count === 0) {
        throw new Error("variance requires at least one data point");
    }
    // Total of the squared distances between each value and the mean...
    var squaredDeviations = sumNthPowerDeviations(x, 2);
    // ...averaged over the whole population.
    return squaredDeviations / count;
}
/**
 * The [standard deviation](http://en.wikipedia.org/wiki/Standard_deviation)
 * is the square root of the variance. This is also known as the population
 * standard deviation. It's useful for measuring the amount
 * of variation or dispersion in a set of values.
 *
 * Standard deviation is only appropriate for full-population knowledge: for
 * samples of a population, {@link sampleStandardDeviation} is
 * more appropriate.
 *
 * @param {Array<number>} x input
 * @returns {number} standard deviation
 * @example
 * variance([2, 4, 4, 4, 5, 5, 7, 9]); // => 4
 * standardDeviation([2, 4, 4, 4, 5, 5, 7, 9]); // => 2
 */
function standardDeviation(x) {
    // A single value has no spread, so the variance is not computed at all.
    return x.length === 1 ? 0 : Math.sqrt(variance(x));
}
/**
 * The [R Squared](http://en.wikipedia.org/wiki/Coefficient_of_determination)
 * value of data compared with a function `f`: one minus the ratio between
 * the sum of squared prediction errors and the total sum of squares
 * of the actual values around their average.
 *
 * @param {Array<Array<number>>} x input data: this should be doubly-nested
 * @param {Function} func function called on `[i][0]` values within the dataset
 * @returns {number} r-squared value; `1` when fewer than two points are given
 * @example
 * var samples = [[0, 0], [1, 1]];
 * var regressionLine = linearRegressionLine(linearRegression(samples));
 * rSquared(samples, regressionLine); // = 1 this line is a perfect fit
 */
function rSquared(x, func) {
    var count = x.length;
    if (count < 2) {
        return 1;
    }

    var index;

    // Average of the actual y values, needed for the
    // _total sum of squares_.
    var totalY = 0;
    for (index = 0; index < count; index++) {
        totalY += x[index][1];
    }
    var averageY = totalY / count;

    // Total sum of squares: squared distance between each
    // actual y value and the average y value.
    var totalSquares = 0;
    for (index = 0; index < count; index++) {
        totalSquares += Math.pow(averageY - x[index][1], 2);
    }

    // Residual sum of squares: squared distance between each
    // actual y value and the value predicted by `func`.
    var residualSquares = 0;
    for (index = 0; index < count; index++) {
        residualSquares += Math.pow(x[index][1] - func(x[index][0]), 2);
    }

    // The larger the error relative to the total sum of squares,
    // the lower the r squared value.
    return 1 - residualSquares / totalSquares;
}
/**
 * The [mode](https://en.wikipedia.org/wiki/Mode_%28statistics%29) is the number
 * that appears in a list the highest number of times.
 * There can be multiple modes in a list: in the event of a tie, this
 * algorithm returns the tied value whose run comes first in `sorted`,
 * because a later run only replaces the mode when it is strictly longer.
 *
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * This runs in `O(n)` because the input is sorted.
 *
 * @param {Array<number>} sorted a sample of one or more data points
 * @returns {number} mode
 * @throws {Error} if sorted is empty
 * @example
 * modeSorted([0, 0, 1]); // => 0
 */
function modeSorted(sorted) {
    var count = sorted.length;
    // The mode of an empty list is undefined.
    if (count === 0) {
        throw new Error("mode requires at least one data point");
    }
    // A lone value is trivially the mode.
    if (count === 1) {
        return sorted[0];
    }

    // Best mode found so far, and the length of its run.
    var best = Number.NaN;
    var bestRun = 0;
    // The run of equal values currently being counted.
    var current = sorted[0];
    var currentRun = 1;

    // The loop deliberately goes one step past the end of the array:
    // on that last step `sorted[index]` is undefined, which closes the
    // final run so that it can still become the mode.
    for (var index = 1; index <= count; index++) {
        var item = sorted[index];
        if (item === current) {
            // one more occurrence of the current candidate
            currentRun++;
            continue;
        }
        // The run just ended; it becomes the mode only if it is
        // strictly longer than the previous best run.
        if (currentRun > bestRun) {
            bestRun = currentRun;
            best = current;
        }
        currentRun = 1;
        current = item;
    }
    return best;
}
/**
 * Return a copy of an array of numbers, sorted by numeric value.
 * The input array is never changed in place.
 *
 * A comparator is required because the default behavior of .sort
 * in JavaScript is to sort arrays as string values
 *
 * [1, 10, 12, 102, 20].sort()
 * // output
 * [1, 10, 102, 12, 20]
 *
 * @param {Array<number>} x input array
 * @return {Array<number>} sorted array
 * @private
 * @example
 * numericSort([3, 2, 1]) // => [1, 2, 3]
 */
function numericSort(x) {
    // Work on a shallow copy so the caller's array keeps its order.
    var copy = x.slice();
    // Order by numeric difference rather than by string comparison.
    copy.sort(function (left, right) {
        return left - right;
    });
    return copy;
}
/**
 * The [mode](https://en.wikipedia.org/wiki/Mode_%28statistics%29) is the number
 * that appears in a list the highest number of times.
 * There can be multiple modes in a list; ties are resolved by `modeSorted`,
 * which this function runs on a numerically sorted copy of the input.
 *
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * This runs in `O(n log(n))` because it needs to sort the array internally
 * before running an `O(n)` search to find the mode.
 *
 * @param {Array<number>} x input
 * @returns {number} mode
 * @example
 * mode([0, 0, 1]); // => 0
 */
function mode(x) {
    // With the values sorted, equal numbers sit next to each other, so
    // a single pass over the copy is enough to find the longest run.
    var ordered = numericSort(x);
    return modeSorted(ordered);
}
/* globals Map: false */
/**
 * The [mode](https://en.wikipedia.org/wiki/Mode_%28statistics%29) is the number
 * that appears in a list the highest number of times.
 * There can be multiple modes in a list: in the event of a tie, this
 * algorithm returns the value that reached the highest count first,
 * because a value only replaces the running mode when its count is
 * strictly greater.
 *
 * modeFast uses a Map object to keep track of the mode, instead of the approach
 * used with `mode`, a sorted array. As a result, it does not need to sort
 * the input and supports any data type that can be used as a `Map` key
 * (values are matched using `Map` key equality).
 * It also requires a
 * [JavaScript environment with support for Map](https://kangax.github.io/compat-table/es6/#test-Map),
 * and will throw an error if Map is not available.
 *
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * @param {Array<*>} x a sample of one or more data points
 * @returns {?*} mode
 * @throws {ReferenceError} if the JavaScript environment doesn't support Map
 * @throws {Error} if x is empty
 * @example
 * modeFast(['rabbits', 'rabbits', 'squirrels']); // => 'rabbits'
 */
function modeFast(x) {
    // Number of times each distinct value has been seen so far,
    // conceptually { value: count }.
    var counts = new Map();
    // The running mode and the number of times it has been encountered.
    var mode;
    var modeCount = 0;
    for (var i = 0; i < x.length; i++) {
        var item = x[i];
        var seen = counts.get(item);
        var newCount = seen === undefined ? 1 : seen + 1;
        // Strict comparison: a value that merely ties the current mode
        // does not replace it.
        if (newCount > modeCount) {
            mode = item;
            modeCount = newCount;
        }
        counts.set(item, newCount);
    }
    // No value was counted at all: there is no mode to report.
    if (modeCount === 0) {
        throw new Error("mode requires at least one data point");
    }
    return mode;
}
/**
 * The min is the lowest number in the array.
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x sample of one or more data points
 * @throws {Error} if the length of x is less than one
 * @returns {number} minimum value
 * @example
 * min([1, 5, -10, 100, 2]); // => -10
 */
function min(x) {
    var count = x.length;
    if (count === 0) {
        throw new Error("min requires at least one data point");
    }
    // Start from the first element and keep whatever compares lower.
    var smallest = x[0];
    var index = 1;
    while (index < count) {
        var candidate = x[index];
        if (candidate < smallest) {
            smallest = candidate;
        }
        index++;
    }
    return smallest;
}
/**
 * This computes the maximum number in an array.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x sample of one or more data points
 * @returns {number} maximum value
 * @throws {Error} if the length of x is less than one
 * @example
 * max([1, 2, 3, 4]);
 * // => 4
 */
function max(x) {
    var count = x.length;
    if (count === 0) {
        throw new Error("max requires at least one data point");
    }
    // Start from the first element and keep whatever compares higher.
    var largest = x[0];
    var index = 1;
    while (index < count) {
        var candidate = x[index];
        if (candidate > largest) {
            largest = candidate;
        }
        index++;
    }
    return largest;
}
/**
 * This computes the minimum & maximum number in an array,
 * in a single pass.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x sample of one or more data points
 * @returns {Array<number>} minimum & maximum value
 * @throws {Error} if the length of x is less than one
 * @example
 * extent([1, 2, 3, 4]);
 * // => [1, 4]
 */
function extent(x) {
    var count = x.length;
    if (count === 0) {
        throw new Error("extent requires at least one data point");
    }
    // Both bounds start at the first element and are widened as needed.
    var lowest = x[0];
    var highest = x[0];
    var index = 1;
    while (index < count) {
        var candidate = x[index];
        if (candidate > highest) {
            highest = candidate;
        }
        if (candidate < lowest) {
            lowest = candidate;
        }
        index++;
    }
    return [lowest, highest];
}
/**
 * The minimum is the lowest number in the array. With a sorted array,
 * the first element in the array is always the smallest, so this
 * is a single lookup that takes constant time.
 *
 * @param {Array<number>} x input
 * @returns {number} minimum value
 * @example
 * minSorted([-100, -10, 1, 2, 5]); // => -100
 */
function minSorted(x) {
    var first = x[0];
    return first;
}
/**
 * The maximum is the highest number in the array. With a sorted array,
 * the last element in the array is always the largest, so this
 * is a single lookup that takes constant time.
 *
 * @param {Array<number>} x input
 * @returns {number} maximum value
 * @example
 * maxSorted([-100, -10, 1, 2, 5]); // => 5
 */
function maxSorted(x) {
    var lastIndex = x.length - 1;
    return x[lastIndex];
}
/**
 * The extent is the lowest & highest number in the array. With a sorted array,
 * the first element is always the lowest and the last element is always
 * the largest, so this takes two lookups, or constant time.
 *
 * @param {Array<number>} x input
 * @returns {Array<number>} minimum & maximum value
 * @example
 * extentSorted([-100, -10, 1, 2, 5]); // => [-100, 5]
 */
function extentSorted(x) {
    var lowest = x[0];
    var highest = x[x.length - 1];
    return [lowest, highest];
}
/**
 * The simple [sum](https://en.wikipedia.org/wiki/Summation) of an array
 * is the result of adding all numbers together, starting from zero.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x input
 * @return {number} sum of all input numbers; `NaN` as soon as an element
 * that is not of type `number` is encountered
 * @example
 * sumSimple([1, 2, 3]); // => 6
 */
function sumSimple(x) {
    var total = 0;
    var index = 0;
    while (index < x.length) {
        var term = x[index];
        // Anything that is not a number makes the whole sum NaN.
        if (typeof term !== "number") {
            return Number.NaN;
        }
        total += term;
        index++;
    }
    return total;
}
/**
 * The [product](https://en.wikipedia.org/wiki/Product_(mathematics)) of an array
 * is the result of multiplying all numbers together, using one as
 * the multiplicative identity to start from.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x input
 * @return {number} product of all input numbers
 * @example
 * product([1, 2, 3, 4]); // => 24
 */
function product(x) {
    // Starting at one means an empty array yields 1.
    var result = 1;
    var index = 0;
    while (index < x.length) {
        result *= x[index];
        index++;
    }
    return result;
}
/**
 * This is the internal implementation of quantiles: when you know
 * that the input is already sorted, you don't need to re-sort it, and
 * the computations are faster.
 *
 * @param {Array<number>} x sample of one or more data points
 * @param {number} p desired quantile: a number between 0 to 1, inclusive
 * @returns {number} quantile value
 * @throws {Error} if p is outside of the range from 0 to 1
 * @throws {Error} if x is empty
 * @example
 * quantileSorted([3, 6, 7, 8, 8, 9, 10, 13, 15, 16, 20], 0.5); // => 9
 */
function quantileSorted(x, p) {
    var count = x.length;
    var idx = count * p;
    if (count === 0) {
        throw new Error("quantile requires at least one data point.");
    }
    if (p < 0 || p > 1) {
        throw new Error("quantiles must be between 0 and 1");
    }
    // The two extremes map directly onto the last and first elements.
    if (p === 1) {
        return x[count - 1];
    }
    if (p === 0) {
        return x[0];
    }
    // A fractional index is rounded up to the next element.
    if (idx % 1 !== 0) {
        return x[Math.ceil(idx) - 1];
    }
    // Integer index on an even-length list: average the element
    // just before the index with the element at the index.
    if (count % 2 === 0) {
        return (x[idx - 1] + x[idx]) / 2;
    }
    // Integer index on an odd-length list: the element at the index.
    return x[idx];
}
/**
 * Rearrange items in `arr` so that all items in `[left, k]` range are the smallest.
 * The `k`-th element will have the `(k - left + 1)`-th smallest value in `[left, right]`.
 *
 * Implements Floyd-Rivest selection algorithm https://en.wikipedia.org/wiki/Floyd-Rivest_algorithm
 *
 * @param {Array<number>} arr input array
 * @param {number} k pivot index
 * @param {number} [left] left index; defaults to `0`
 * @param {number} [right] right index; defaults to the last index of `arr`.
 * An explicit `0` is honoured as the right bound.
 * @returns {void} mutates input array
 * @example
 * var arr = [65, 28, 59, 33, 21, 56, 22, 95, 50, 12, 90, 53, 28, 77, 39];
 * quickselect(arr, 8);
 * // = [39, 28, 28, 33, 21, 12, 22, 50, 53, 56, 59, 65, 90, 77, 95]
 */
function quickselect(arr, k, left, right) {
    left = left || 0;
    // `right || arr.length - 1` alone would treat an explicit right bound
    // of 0 as "not given" and widen the range to the whole array, so 0 is
    // kept as-is and only other falsy values fall back to the last index.
    if (right !== 0) {
        right = right || arr.length - 1;
    }
    while (right > left) {
        // 600 and 0.5 are arbitrary constants chosen in the original paper to minimize execution time
        if (right - left > 600) {
            // For large ranges, first select within a smaller window
            // [newLeft, newRight] around `k`, computed from the range size.
            var n = right - left + 1;
            var m = k - left + 1;
            var z = Math.log(n);
            var s = 0.5 * Math.exp((2 * z) / 3);
            var sd = 0.5 * Math.sqrt((z * s * (n - s)) / n);
            if (m - n / 2 < 0) { sd *= -1; }
            var newLeft = Math.max(left, Math.floor(k - (m * s) / n + sd));
            var newRight = Math.min(
                right,
                Math.floor(k + ((n - m) * s) / n + sd)
            );
            quickselect(arr, k, newLeft, newRight);
        }
        // Partition [left, right] around `t`, the value currently at index `k`.
        var t = arr[k];
        var i = left;
        var j = right;
        swap(arr, left, k);
        if (arr[right] > t) { swap(arr, left, right); }
        while (i < j) {
            swap(arr, i, j);
            i++;
            j--;
            while (arr[i] < t) { i++; }
            while (arr[j] > t) { j--; }
        }
        if (arr[left] === t) { swap(arr, left, j); }
        else {
            j++;
            swap(arr, j, right);
        }
        // Continue only on the side of `j` that still contains index `k`.
        if (j <= k) { left = j + 1; }
        if (k <= j) { right = j - 1; }
    }
}
/**
 * Exchange the items at indices `i` and `j` of `arr`, in place.
 *
 * @param {Array} arr array to modify
 * @param {number} i first index
 * @param {number} j second index
 * @returns {void} mutates input array
 */
function swap(arr, i, j) {
    var held = arr[j];
    arr[j] = arr[i];
    arr[i] = held;
}
/**
 * The [quantile](https://en.wikipedia.org/wiki/Quantile):
 * this is a population quantile, since we assume to know the entire
 * dataset in this library. This is an implementation of the
 * [Quantiles of a Population](http://en.wikipedia.org/wiki/Quantile#Quantiles_of_a_population)
 * algorithm from wikipedia.
 *
 * Sample is a one-dimensional array of numbers,
 * and p is either a decimal number from 0 to 1 or an array of decimal
 * numbers from 0 to 1.
 * In terms of a k/q quantile, p = k/q - it's just dealing with fractions or dealing
 * with decimal values.
 * When p is an array, the result of the function is also an array containing the appropriate
 * quantiles in input order
 *
 * The input is copied first, so `x` itself is never reordered.
 *
 * @param {Array<number>} x sample of one or more numbers
 * @param {Array<number> | number} p the desired quantile, as a number between 0 and 1
 * @returns {number | Array<number>} quantile, or an array of quantiles
 * when `p` is an array
 * @example
 * quantile([3, 6, 7, 8, 8, 9, 10, 13, 15, 16, 20], 0.5); // => 9
 */
function quantile(x, p) {
    var copy = x.slice();

    if (!Array.isArray(p)) {
        // Single quantile: move only the element(s) at the needed index
        // into sorted position, then read the value off the copy.
        var idx = quantileIndex(copy.length, p);
        quantileSelect(copy, idx, 0, copy.length - 1);
        return quantileSorted(copy, p);
    }

    // Several quantiles: rearrange elements so that each element
    // corresponding to a requested quantile is at the place it would
    // be if the array was fully sorted.
    multiQuantileSelect(copy, p);
    var results = [];
    for (var n = 0; n < p.length; n++) {
        results[n] = quantileSorted(copy, p[n]);
    }
    return results;
}
/**
 * Run `quickselect` for a quantile index `k` that may be fractional.
 * An integer `k` selects that single index. A fractional `k` selects
 * both neighbouring integer indices, `floor(k)` and `floor(k) + 1`.
 *
 * @param {Array<number>} arr array to rearrange in place
 * @param {number} k index to select, possibly fractional
 * @param {number} left left index of the range
 * @param {number} right right index of the range
 * @returns {void} mutates input array
 */
function quantileSelect(arr, k, left, right) {
    var fractional = k % 1 !== 0;
    var lower = fractional ? Math.floor(k) : k;
    quickselect(arr, lower, left, right);
    if (fractional) {
        // The upper neighbour is selected within the part right of `lower`.
        quickselect(arr, lower + 1, lower + 1, right);
    }
}
/**
 * Rearrange `arr` in place for several quantiles at once: the index of
 * each quantile in `p` is computed with `quantileIndex`, and the
 * selections are then run by repeatedly halving the sorted list of those
 * indices, each time selecting the middle index within the bounds given
 * by the two ends of the current span.
 *
 * @param {Array<number>} arr array to rearrange in place
 * @param {Array<number>} p requested quantiles
 * @returns {void} mutates input array
 */
function multiQuantileSelect(arr, p) {
    // Target indices, bracketed by the first and last index of `arr`.
    var targets = [0];
    for (var n = 0; n < p.length; n++) {
        targets.push(quantileIndex(arr.length, p[n]));
    }
    targets.push(arr.length - 1);
    targets.sort(compare);

    // Explicit stack of [low, high] positions within `targets`,
    // stored as a flat list of pairs.
    var pending = [0, targets.length - 1];
    while (pending.length) {
        // Pairs are pushed low-then-high, so they pop high-then-low.
        var high = Math.ceil(pending.pop());
        var low = Math.floor(pending.pop());
        // Nothing lies strictly between adjacent positions.
        if (high - low <= 1) { continue; }
        var middle = Math.floor((low + high) / 2);
        quantileSelect(
            arr,
            targets[middle],
            Math.floor(targets[low]),
            Math.ceil(targets[high])
        );
        // Queue both halves around the position that was just selected.
        pending.push(low, middle, middle, high);
    }
}
/**
 * Numeric comparator for `Array.prototype.sort`, giving ascending order.
 *
 * @param {number} a first value
 * @param {number} b second value
 * @returns {number} `a - b`: negative when `a` is smaller, positive when
 * it is larger
 */
function compare(a, b) {
    var difference = a - b;
    return difference;
}
/**
 * Compute the array index at which the quantile `p` of a sorted list of
 * `len` items is found. A result ending in `.5` means the quantile is
 * the average of the two neighbouring indices.
 *
 * @param {number} len length of the list
 * @param {number} p quantile, a number between 0 and 1
 * @returns {number} index of the quantile, possibly fractional
 */
function quantileIndex(len, p) {
    var idx = len * p;
    // The two extremes map directly onto the last and first index.
    if (p === 1) {
        return len - 1;
    }
    if (p === 0) {
        return 0;
    }
    // A fractional index is rounded up to the next index in the array.
    if (idx % 1 !== 0) {
        return Math.ceil(idx) - 1;
    }
    // Integer index on an even-length list: return the middle of the two
    // indices around the quantile, to signal that their average is needed.
    if (len % 2 === 0) {
        return idx - 0.5;
    }
    // Integer index on an odd-length list: the index itself.
    return idx;
}
/* eslint no-bitwise: 0 */
/**
 * This function returns the quantile in which one would find the given value in
 * the given array. With a sorted array, leveraging binary search, we can find
 * this information in logarithmic time.
 *
 * @param {Array<number>} x input, already sorted
 * @param {number} value the value whose quantile rank is wanted
 * @returns {number} quantile rank of `value`, between 0 and 1
 * @example
 * quantileRankSorted([1, 2, 3, 4], 3); // => 0.75
 * quantileRankSorted([1, 2, 3, 3, 4], 3); // => 0.7
 * quantileRankSorted([1, 2, 3, 4], 6); // => 1
 * quantileRankSorted([1, 2, 3, 3, 5], 4); // => 0.8
 */
function quantileRankSorted(x, value) {
    var count = x.length;
    // Below the first element: rank 0.
    if (value < x[0]) {
        return 0;
    }
    // Above the last element: rank 1.
    if (value > x[count - 1]) {
        return 1;
    }

    var first = lowerBound(x, value);
    // The value is absent: its rank is the share of elements before
    // the position where it would be inserted.
    if (x[first] !== value) {
        return first / count;
    }

    // One-based rank of the first occurrence, and of the last one.
    var lowRank = first + 1;
    var highRank = upperBound(x, value);
    // The value occurs exactly once.
    if (highRank === lowRank) {
        return lowRank / count;
    }

    // Several occurrences: use the mean of their ranks. The sum of the
    // consecutive ranks comes from the arithmetic-series formula, so no
    // array of ranks has to be built.
    var span = highRank - lowRank + 1;
    var rankTotal = (span * (highRank + lowRank)) / 2;
    var averageRank = rankTotal / span;
    return averageRank / count;
}
/**
 * Binary search for the first index of `x` whose element is greater than
 * or equal to `value`. Returns `x.length` when no element qualifies.
 * `x` is expected to be sorted in ascending order.
 *
 * @param {Array<number>} x sorted input
 * @param {number} value value to locate
 * @returns {number} index between 0 and `x.length`, inclusive
 */
function lowerBound(x, value) {
    var lo = 0;
    var hi = x.length;
    while (lo < hi) {
        // Midpoint of the remaining half-open range [lo, hi).
        var mid = (lo + hi) >>> 1;
        if (value <= x[mid]) {
            // The answer is at `mid` or to its left.
            hi = mid;
        } else {
            // Everything up to and including `mid` is too small.
            lo = mid + 1;
        }
    }
    return lo;
}
/**
 * Binary search for the first index of `x` whose element is strictly
 * greater than `value`. Returns `x.length` when no element qualifies.
 * `x` is expected to be sorted in ascending order.
 *
 * @param {Array<number>} x sorted input
 * @param {number} value value to locate
 * @returns {number} index between 0 and `x.length`, inclusive
 */
function upperBound(x, value) {
    var lo = 0;
    var hi = x.length;
    while (lo < hi) {
        // Midpoint of the remaining half-open range [lo, hi).
        var mid = (lo + hi) >>> 1;
        if (value >= x[mid]) {
            // Everything up to and including `mid` is not greater.
            lo = mid + 1;
        } else {
            // The answer is at `mid` or to its left.
            hi = mid;
        }
    }
    return lo;
}
/**
 * This function returns the quantile in which one would find the given value in
 * the given array. It will copy and sort your array before each run, so
 * if you know your array is already sorted, you should use `quantileRankSorted`
 * instead.
 *
 * @param {Array<number>} x input
 * @param {number} value the value whose quantile rank is wanted
 * @returns {number} quantile rank of `value`
 * @example
 * quantileRank([4, 3, 1, 2], 3); // => 0.75
 * quantileRank([4, 3, 2, 3, 1], 3); // => 0.7
 * quantileRank([2, 4, 1, 3], 6); // => 1
 * quantileRank([5, 3, 1, 2, 3], 4); // => 0.8
 */
function quantileRank(x, value) {
    // The sorted variant does the real work on a sorted copy of the input.
    return quantileRankSorted(numericSort(x), value);
}
/**
 * The [Interquartile range](http://en.wikipedia.org/wiki/Interquartile_range) is
 * a measure of statistical dispersion, or how scattered, spread, or
 * concentrated a distribution is. It's computed as the difference between
 * the third quartile and first quartile.
 *
 * @param {Array<number>} x sample of one or more numbers
 * @returns {number} interquartile range: the span between lower and upper quartile,
 * 0.25 and 0.75. `undefined` if either quartile is not a number.
 * @example
 * interquartileRange([0, 1, 2, 3]); // => 2
 */
function interquartileRange(x) {
    // Upper quartile at `0.75`, lower quartile at `0.25`.
    var upperQuartile = quantile(x, 0.75);
    var lowerQuartile = quantile(x, 0.25);
    if (typeof upperQuartile !== "number" || typeof lowerQuartile !== "number") {
        return undefined;
    }
    return upperQuartile - lowerQuartile;
}
/**
 * The [median](http://en.wikipedia.org/wiki/Median) is
 * the middle number of a list. This is often a good indicator of 'the middle'
 * when there are outliers that skew the `mean()` value.
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * The median isn't necessarily one of the elements in the list: the value
 * can be the average of two elements if the list has an even length
 * and the two central values are different.
 *
 * @param {Array<number>} x input
 * @returns {number} median value
 * @example
 * median([10, 2, 5, 100, 2, 1]); // => 3.5
 */
function median(x) {
    // The median is the 0.5 quantile; unary plus coerces it to a number.
    var middle = quantile(x, 0.5);
    return +middle;
}
/**
 * The [Median Absolute Deviation](http://en.wikipedia.org/wiki/Median_absolute_deviation) is
 * a robust measure of statistical
 * dispersion. It is more resilient to outliers than the standard deviation.
 *
 * @param {Array<number>} x input array
 * @returns {number} median absolute deviation
 * @example
 * medianAbsoluteDeviation([1, 1, 2, 2, 4, 6, 9]); // => 1
 */
function medianAbsoluteDeviation(x) {
    var center = median(x);
    // Absolute distance of every value from the median...
    var deviations = [];
    for (var index = 0; index < x.length; index++) {
        var distance = Math.abs(x[index] - center);
        deviations.push(distance);
    }
    // ...and the median of those distances is the result.
    return median(deviations);
}
/**
 * Split an array into chunks of a specified size. This function
 * has the same behavior as [PHP's array_chunk](http://php.net/manual/en/function.array-chunk.php)
 * function, and thus will insert smaller-sized chunks at the end if
 * the input size is not divisible by the chunk size.
 *
 * `x` is expected to be an array, and `chunkSize` a number.
 * The `x` array can contain any kind of data.
 *
 * @param {Array} x a sample
 * @param {number} chunkSize size of each output array. must be a positive integer
 * @returns {Array<Array>} a chunked array
 * @throws {Error} if chunk size is less than 1 or not an integer
 * @example
 * chunk([1, 2, 3, 4, 5, 6], 2);
 * // => [[1, 2], [3, 4], [5, 6]]
 */
function chunk(x, chunkSize) {
    // `chunkSize` must be at least 1 - otherwise the loop below, which
    // advances by `start += chunkSize`, would never move forward.
    // Invalid sizes are rejected up front.
    if (chunkSize < 1) {
        throw new Error("chunk size must be a positive number");
    }
    if (Math.floor(chunkSize) !== chunkSize) {
        throw new Error("chunk size must be an integer");
    }

    // the result chunks, as arrays in an array
    var pieces = [];
    // `start` is the index at which `.slice` begins selecting elements;
    // `.slice` does not change the original array.
    var start = 0;
    while (start < x.length) {
        pieces.push(x.slice(start, start + chunkSize));
        start += chunkSize;
    }
    return pieces;
}
/**
 * Sampling with replacement is a type of sampling that allows the same
 * item to be picked out of a population more than once.
 *
 * @param {Array<*>} x an array of any kind of value
 * @param {number} n count of how many elements to take
 * @param {Function} [randomSource=Math.random] an optional entropy source that
 * returns numbers between 0 inclusive and 1 exclusive: the range [0, 1)
 * @return {Array} n sampled items from the population; an empty array
 * when the population itself is empty
 * @example
 * var values = [1, 2, 3, 4];
 * sampleWithReplacement(values, 2); // returns 2 random values, like [2, 4];
 */
function sampleWithReplacement(x, n, randomSource) {
    var size = x.length;
    // Nothing can be drawn from an empty population.
    if (size === 0) {
        return [];
    }
    // a custom random number source can be provided if you want to use
    // a fixed seed or another random number generator, like
    // [random-js](https://www.npmjs.org/package/random-js)
    var random = randomSource || Math.random;
    var picked = [];
    var drawn = 0;
    while (drawn < n) {
        // Every draw is independent, so an item can be chosen repeatedly.
        var position = Math.floor(random() * size);
        picked.push(x[position]);
        drawn++;
    }
    return picked;
}
/**
 * A [Fisher-Yates shuffle](http://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle)
 * in-place - which means that it **will change the order of the original
 * array by reference**.
 *
 * This is an algorithm that generates a random [permutation](https://en.wikipedia.org/wiki/Permutation)
 * of a set.
 *
 * @param {Array} x sample of one or more numbers
 * @param {Function} [randomSource=Math.random] an optional entropy source that
 * returns numbers between 0 inclusive and 1 exclusive: the range [0, 1)
 * @returns {Array} x
 * @example
 * var x = [1, 2, 3, 4];
 * shuffleInPlace(x);
 * // x is shuffled to a value like [2, 1, 4, 3]
 */
function shuffleInPlace(x, randomSource) {
    // a custom random number source can be provided if you want to use
    // a fixed seed or another random number generator, like
    // [random-js](https://www.npmjs.org/package/random-js)
    var random = randomSource || Math.random;
    // Number of leading elements that have not been shuffled yet.
    var remaining = x.length;
    while (remaining > 0) {
        // Pick a random position within the part not yet shuffled...
        var pick = Math.floor(random() * remaining);
        // ...then shrink that part by one; `remaining` now points at
        // its last slot.
        remaining--;
        // Exchange the picked item with the one in that last slot.
        var held = x[remaining];
        x[remaining] = x[pick];
        x[pick] = held;
    }
    return x;
}
/**
 * Produce a random permutation of a finite set using a
 * [Fisher-Yates shuffle](http://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle).
 * This is a thin wrapper around `shuffleInPlace` that adds the guarantee
 * that the input array is left unmodified.
 *
 * @param {Array} x sample of 0 or more numbers
 * @param {Function} [randomSource=Math.random] an optional entropy source that
 * returns numbers between 0 inclusive and 1 exclusive: the range [0, 1)
 * @return {Array} shuffled version of input
 * @example
 * var shuffled = shuffle([1, 2, 3, 4]);
 * shuffled; // = [2, 3, 1, 4] or any other random permutation
 */
function shuffle(x, randomSource) {
    // Shuffle a shallow copy so that the caller's array keeps its order.
    return shuffleInPlace(x.slice(), randomSource);
}
/**
 * Draw a [simple random sample](http://en.wikipedia.org/wiki/Simple_random_sample)
 * of `n` elements from a given array.
 *
 * The sampled values come back in shuffled order, which is not necessarily
 * the order in which they appear in the input.
 *
 * @param {Array<any>} x input array. can contain any type
 * @param {number} n count of how many elements to take
 * @param {Function} [randomSource=Math.random] an optional entropy source that
 * returns numbers between 0 inclusive and 1 exclusive: the range [0, 1)
 * @return {Array} subset of n elements in original array
 *
 * @example
 * var values = [1, 2, 4, 5, 6, 7, 8, 9];
 * sample(values, 3); // returns 3 random values, like [2, 5, 8];
 */
function sample(x, n, randomSource) {
    // Shuffle a copy of the input, then keep only its first `n` entries.
    return shuffle(x, randomSource).slice(0, n);
}
/**
 * Build a zero-filled matrix stored as an array of `columns` arrays,
 * each of which holds `rows` entries.
 *
 * @private
 * @param {number} columns number of inner arrays to create
 * @param {number} rows number of zeroes in each inner array
 * @return {Array<Array<number>>} matrix
 * @example
 * makeMatrix(10, 10);
 */
function makeMatrix(columns, rows) {
    var matrix = [];
    while (matrix.length < columns) {
        // every column gets its own array so that writes never alias
        var column = [];
        while (column.length < rows) {
            column.push(0);
        }
        matrix.push(column);
    }
    return matrix;
}
/**
 * For a sorted input, the number of unique values can be counted in a
 * single linear pass using constant extra memory. This is a simple
 * implementation of that idea: it counts the runs of equal values.
 *
 * Values are compared with `===`, so objects and other non-primitive
 * values are not handled in any special way.
 *
 * @param {Array<*>} x an array of any kind of value
 * @returns {number} count of unique values
 * @example
 * uniqueCountSorted([1, 2, 3]); // => 3
 * uniqueCountSorted([1, 1, 1]); // => 1
 */
function uniqueCountSorted(x) {
    var runs = 0;
    for (var position = 0; position < x.length; position++) {
        // The first element always opens a run; afterwards a new run
        // begins wherever a value differs from its predecessor.
        if (position === 0 || x[position] !== x[position - 1]) {
            runs += 1;
        }
    }
    return runs;
}
/**
 * Compute the sum of squared deviations from the mean for the elements at
 * indices `j` through `i` (inclusive), using only the cumulative sums and
 * cumulative sums of squares of the data array.
 *
 * Negative results are clamped to zero.
 *
 * @private
 * @param {number} j index of the first element in the range
 * @param {number} i index of the last element in the range
 * @param {Array<number>} sums cumulative sums of the data
 * @param {Array<number>} sumsOfSquares cumulative sums of the squared data
 * @return {number} the sum of squared deviations, never below zero
 * @example
 * ssq(0, 1, [-1, 0, 2], [1, 1, 5]);
 */
function ssq(j, i, sums, sumsOfSquares) {
    var deviation;
    if (j > 0) {
        // The range starts inside the array: subtract everything that
        // was accumulated before index j.
        var count = i - j + 1;
        var mean = (sums[i] - sums[j - 1]) / count;
        deviation =
            sumsOfSquares[i] - sumsOfSquares[j - 1] - count * mean * mean;
    } else {
        // The range starts at the first element, so the cumulative values
        // at i already describe exactly this range.
        deviation = sumsOfSquares[i] - (sums[i] * sums[i]) / (i + 1);
    }
    return deviation < 0 ? 0 : deviation;
}
/**
* Function that recursively divides and conquers computations
* for cluster j
*
* For the given `cluster`, this fills `matrix[cluster][i]` and
* `backtrackMatrix[cluster][i]` for every index `i` in `[iMin, iMax]`:
* it handles the midpoint of the range itself and then recurses into the
* left and right halves. Both matrices are modified in place and nothing
* is returned.
*
* The function reads `matrix[cluster - 1]` and
* `backtrackMatrix[cluster - 1]`, so `cluster` is expected to be at least 1
* and the entries it reads from the previous cluster must already be
* filled.
*
* @private
* @param {number} iMin Minimum index in cluster to be computed
* @param {number} iMax Maximum index in cluster to be computed
* @param {number} cluster Index of the cluster currently being computed
* @param {Array<Array<number>>} matrix cost matrix, written at `matrix[cluster][i]`
* @param {Array<Array<number>>} backtrackMatrix matrix of cluster start
* indices, written at `backtrackMatrix[cluster][i]`
* @param {Array<number>} sums cumulative sums, passed through to `ssq`
* @param {Array<number>} sumsOfSquares cumulative sums of squares, passed
* through to `ssq`
*/
function fillMatrixColumn(
iMin,
iMax,
cluster,
matrix,
backtrackMatrix,
sums,
sumsOfSquares
) {
// An empty range ends the recursion.
if (iMin > iMax) {
return;
}
// Start at midpoint between iMin and iMax
var i = Math.floor((iMin + iMax) / 2);
// Initial candidate: the current cluster starts at `i` itself, so the
// cost is whatever the previous cluster needed for the elements up to i - 1.
matrix[cluster][i] = matrix[cluster - 1][i - 1];
backtrackMatrix[cluster][i] = i;
var jlow = cluster; // the lower end for j
// Raise the lower end using backtrack entries that were filled earlier:
// the one just left of this range (same cluster) and the one for the
// previous cluster at `i`. `|| 0` substitutes 0 for a falsy entry.
if (iMin > cluster) {
jlow = Math.max(jlow, backtrackMatrix[cluster][iMin - 1] || 0);
}
jlow = Math.max(jlow, backtrackMatrix[cluster - 1][i] || 0);
var jhigh = i - 1; // the upper end for j
// Unless this range reaches the last column, cap the upper end with the
// backtrack entry just right of the range.
if (iMax < matrix[0].length - 1) {
/* c8 ignore start */
jhigh = Math.min(jhigh, backtrackMatrix[cluster][iMax + 1] || 0);
/* c8 ignore end */
}
var sji;
var sjlowi;
var ssqjlow;
var ssqj;
// Scan candidate start indices from the top (jhigh) downwards, while the
// lower end (jlow) moves upwards by one on every iteration.
for (var j = jhigh; j >= jlow; --j) {
sji = ssq(j, i, sums, sumsOfSquares);
// Stop as soon as this bound cannot beat the best cost found so far.
// NOTE(review): this presumably relies on the remaining candidates
// being no cheaper than the bound - confirm against the reference
// Ckmeans.1d.dp implementation.
if (sji + matrix[cluster - 1][jlow - 1] >= matrix[cluster][i]) {
break;
}
// Examine the lower bound of the cluster border
sjlowi = ssq(jlow, i, sums, sumsOfSquares);
ssqjlow = sjlowi + matrix[cluster - 1][jlow - 1];
if (ssqjlow < matrix[cluster][i]) {
// Shrink the lower bound
matrix[cluster][i] = ssqjlow;
backtrackMatrix[cluster][i] = jlow;
}
// jlow has been evaluated; exclude it from the rest of the scan.
jlow++;
// Now evaluate the candidate at the upper end, j.
ssqj = sji + matrix[cluster - 1][j - 1];
if (ssqj < matrix[cluster][i]) {
matrix[cluster][i] = ssqj;
backtrackMatrix[cluster][i] = j;
}
}
// Recurse into the part of the range left of the midpoint...
fillMatrixColumn(
iMin,
i - 1,
cluster,
matrix,
backtrackMatrix,
sums,
sumsOfSquares
);
// ...and into the part right of the midpoint.
fillMatrixColumn(
i + 1,
iMax,
cluster,
matrix,
backtrackMatrix,
sums,
sumsOfSquares
);
}
/**
 * Prepares the two matrices used by Ckmeans: builds the cumulative sums,
 * fills the entries for the first cluster directly, and then starts the
 * divide and conquer computation for each of the remaining clusters.
 *
 * Both matrices are modified in place.
 *
 * @private
 * @param {Array<number>} data sorted array of values
 * @param {Array<Array<number>>} matrix
 * @param {Array<Array<number>>} backtrackMatrix
 */
function fillMatrices(data, matrix, backtrackMatrix) {
    var firstRow = matrix[0];
    var nValues = firstRow.length;
    var lastIndex = nValues - 1;
    var lastCluster = matrix.length - 1;
    // Subtracting the middle (median) value from every data point
    // improves numeric stability
    var median = data[Math.floor(nValues / 2)];
    // Running totals of the shifted values and of their squares
    var sums = [];
    var sumsOfSquares = [];
    for (var i = 0; i < nValues; i++) {
        var centered = data[i] - median;
        var squared = centered * centered;
        sums[i] = i === 0 ? centered : sums[i - 1] + centered;
        sumsOfSquares[i] = i === 0 ? squared : sumsOfSquares[i - 1] + squared;
        // With a single cluster everything up to `i` belongs together
        // and the cluster necessarily starts at index 0.
        firstRow[i] = ssq(0, i, sums, sumsOfSquares);
        backtrackMatrix[0][i] = 0;
    }
    // Fill in the remaining clusters one after the other
    for (var cluster = 1; cluster <= lastCluster; cluster++) {
        fillMatrixColumn(
            // For the final cluster only the very last entry is needed:
            // no need to compute matrix[K-1][0] ... matrix[K-1][N-2]
            cluster < lastCluster ? cluster : lastIndex,
            lastIndex,
            cluster,
            matrix,
            backtrackMatrix,
            sums,
            sumsOfSquares
        );
    }
}
/**
 * Ckmeans clustering is an improvement on heuristic-based clustering
 * approaches like Jenks. The algorithm was developed by
 * [Haizhou Wang and Mingzhou Song](http://journal.r-project.org/archive/2011-2/RJournal_2011-2_Wang+Song.pdf)
 * as a [dynamic programming](https://en.wikipedia.org/wiki/Dynamic_programming) approach
 * to the problem of clustering numeric data into groups with the least
 * within-group sum-of-squared-deviations.
 *
 * Minimizing the difference within groups - what Wang & Song refer to as
 * `withinss`, or within sum-of-squares - means that groups are optimally
 * homogeneous within and the data is split into representative groups.
 * This is very useful for visualization, where you may want to represent
 * a continuous variable in discrete color or style groups. This function
 * can provide groups that emphasize differences between data.
 *
 * Being a dynamic approach, this algorithm is based on two matrices that
 * store incrementally-computed values for squared deviations and backtracking
 * indexes.
 *
 * This implementation is based on Ckmeans 3.4.6, which introduced a new divide
 * and conquer approach that improved runtime from O(kn^2) to O(kn log(n)).
 *
 * Unlike the [original implementation](https://cran.r-project.org/web/packages/Ckmeans.1d.dp/index.html),
 * this implementation does not include any code to automatically determine
 * the optimal number of clusters: this information needs to be explicitly
 * provided.
 *
 * ### References
 * _Ckmeans.1d.dp: Optimal k-means Clustering in One Dimension by Dynamic
 * Programming_ Haizhou Wang and Mingzhou Song ISSN 2073-4859
 *
 * from The R Journal Vol. 3/2, December 2011
 * @param {Array<number>} x input data, as an array of number values
 * @param {number} nClusters number of desired classes. This cannot be
 * greater than the number of values in the data array.
 * @returns {Array<Array<number>>} clustered input
 * @throws {Error} if the number of requested clusters is higher than the size of the data
 * @example
 * ckmeans([-1, 2, -1, 2, 4, 5, 6, -1, 2, -1], 3);
 * // The input, clustered into groups of similar numbers.
 * //= [[-1, -1, -1, -1], [2, 2, 2], [4, 5, 6]]);
 */
function ckmeans(x, nClusters) {
    if (nClusters > x.length) {
        throw new Error(
            "cannot generate more classes than there are data values"
        );
    }
    var sorted = numericSort(x);
    // When every input value is identical there is nothing to split:
    // the whole input forms a single cluster.
    if (uniqueCountSorted(sorted) === 1) {
        return [sorted];
    }
    var nValues = sorted.length;
    // The cost matrix (named 'S' originally) and the matrix of cluster
    // start indices (named 'J' originally)
    var matrix = makeMatrix(nClusters, nValues);
    var backtrackMatrix = makeMatrix(nClusters, nValues);
    // The real work happens here: dynamic programming fills both matrices
    // with the incrementally computed sums of squares and the matching
    // cluster borders, after which the best grouping can be read off
    // very quickly.
    fillMatrices(sorted, matrix, backtrackMatrix);
    // Walk backwards from the last cluster, whose right edge is the last
    // value. For each cluster the backtrack matrix holds the index at
    // which it starts, and the element just before that index is the
    // right edge of the preceding cluster.
    var groups = [];
    var right = backtrackMatrix[0].length - 1;
    var k = backtrackMatrix.length;
    while (k-- > 0) {
        var left = backtrackMatrix[k][right];
        groups[k] = sorted.slice(left, right + 1);
        right = left - 1;
    }
    return groups;
}
/*
 * Pull Breaks Values for Jenks
 *
 * The second part of the jenks recipe: walk the matrix of lower class
 * limits backwards and derive the array of break values, which has
 * `nClasses + 1` entries.
 *
 * @private
 */
function jenksBreaks(data, lowerClassLimits, nClasses) {
    var breaks = [];
    // the row of `lowerClassLimits` to read next; it starts at the
    // row for the complete data set
    var row = data.length;
    // the walk below never produces the upper bound, so it is set
    // explicitly to the last data value
    breaks[nClasses] = data[row - 1];
    // each lookup yields both the break value for the class and the
    // row to consult for the class below it
    for (var classIndex = nClasses; classIndex > 0; classIndex--) {
        var lowerIndex = lowerClassLimits[row][classIndex] - 1;
        breaks[classIndex - 1] = data[lowerIndex];
        row = lowerIndex;
    }
    return breaks;
}
/*
* Compute Matrices for Jenks
*
* Compute the matrices required for Jenks breaks. These matrices
* can be used for any classing of data with `classes <= nClasses`
*
* @private
*/
function jenksMatrices(data, nClasses) {
// in the original implementation, these matrices are referred to
// as `LC` and `OP`
//
// * lowerClassLimits (LC): optimal lower class limits
// * varianceCombinations (OP): optimal variance combinations for all classes
var lowerClassLimits = [];
var varianceCombinations = [];
// loop counters
var i;
var j;
// the variance, as computed at each step in the calculation
var variance = 0;
// Initialize and fill each matrix with zeroes
for (i = 0; i < data.length + 1; i++) {
var tmp1 = [];
var tmp2 = [];
// despite these arrays having the same values, we need
// to keep them separate so that changing one does not change
// the other
for (j = 0; j < nClasses + 1; j++) {
tmp1.push(0);
tmp2.push(0);