UNPKG

simple-statistics

Version:

Simple Statistics

github.com/simple-statistics/simple-statistics

simple-statistics/simple-statistics

1,603 lines (1,441 loc) • 148 kB

JavaScript

'use strict';

/**
 * [Simple linear regression](http://en.wikipedia.org/wiki/Simple_linear_regression)
 * is a simple way to find a fitted line
 * between a set of coordinates. This algorithm finds the slope and y-intercept of a regression line
 * using the least sum of squares.
 *
 * @param {Array<Array<number>>} data an array of two-element arrays,
 * like `[[0, 1], [2, 3]]`
 * @returns {Object} object containing slope (`m`) and intersect (`b`) of regression line
 * @example
 * linearRegression([[0, 0], [1, 1]]); // => { m: 1, b: 0 }
 */
function linearRegression(data) {
    var m;
    var b;

    // Store data length in a local variable to reduce
    // repeated object property lookups
    var dataLength = data.length;

    // if there's only one point, arbitrarily choose a slope of 0
    // and a y-intercept of whatever the y of the initial point is
    if (dataLength === 1) {
        m = 0;
        b = data[0][1];
    } else {
        // Initialize our sums and scope the `m` and `b`
        // variables that define the line.
        var sumX = 0;
        var sumY = 0;
        var sumXX = 0;
        var sumXY = 0;

        // Use local variables to grab point values
        // with minimal object property lookups
        var point;
        var x;
        var y;

        // Gather the sum of all x values, the sum of all
        // y values, and the sum of x^2 and (x*y) for each
        // value.
        //
        // In math notation, these would be SS_x, SS_y, SS_xx, and SS_xy
        for (var i = 0; i < dataLength; i++) {
            point = data[i];
            x = point[0];
            y = point[1];

            sumX += x;
            sumY += y;

            sumXX += x * x;
            sumXY += x * y;
        }

        // `m` is the slope of the regression line
        m =
            (dataLength * sumXY - sumX * sumY) /
            (dataLength * sumXX - sumX * sumX);

        // `b` is the y-intercept of the line.
        b = sumY / dataLength - (m * sumX) / dataLength;
    }

    // Return both values as an object.
    return {
        m: m,
        b: b
    };
}

/**
 * Given the output of `linearRegression`: an object
 * with `m` and `b` values indicating slope and intercept,
 * respectively, generate a line function that translates
 * x values into y values.
 */
/**
 * Build the line function `y = b + m * x` from a slope/intercept pair.
 *
 * @param {Object} mb object with `m` and `b` members, representing
 * slope and intersect of desired line
 * @returns {Function} method that computes y-value at any given
 * x-value on the line.
 * @example
 * var l = linearRegressionLine(linearRegression([[0, 0], [1, 1]]));
 * l(0) // = 0
 * l(2) // = 2
 * linearRegressionLine({ b: 0, m: 1 })(1); // => 1
 * linearRegressionLine({ b: 1, m: 1 })(1); // => 2
 */
function linearRegressionLine(mb /*: { b: number, m: number }*/) {
    // Return a function that computes a `y` value for each
    // x value it is given, based on the values of `b` and `m`
    // carried by `mb`. `mb` is read at call time, not copied.
    return function (x) {
        return mb.b + mb.m * x;
    };
}

/**
 * Our default sum is the [Kahan-Babuska algorithm](https://pdfs.semanticscholar.org/1760/7d467cda1d0277ad272deb2113533131dc09.pdf).
 * This method is an improvement over the classical
 * [Kahan summation algorithm](https://en.wikipedia.org/wiki/Kahan_summation_algorithm).
 * It aims at computing the sum of a list of numbers while correcting for
 * floating-point errors. Traditionally, sums are calculated as many
 * successive additions, each one with its own floating-point roundoff. These
 * losses in precision add up as the number of numbers increases. This alternative
 * algorithm is more accurate than the simple way of calculating sums by simple
 * addition.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 */
/**
 * Compensated (Kahan-Babuska) summation of an array of numbers.
 * Returns `NaN` as soon as an element that is not of type `number`
 * is encountered.
 *
 * @param {Array<number>} x input
 * @return {number} sum of all input numbers
 * @example
 * sum([1, 2, 3]); // => 6
 */
function sum(x) {
    // If the array is empty, we needn't bother computing its sum
    if (x.length === 0) {
        return 0;
    }

    // Initializing the sum as the first number in the array
    var sum = x[0];

    // Keeping track of the floating-point error correction
    var correction = 0;

    var transition;

    if (typeof sum !== "number") {
        return Number.NaN;
    }

    for (var i = 1; i < x.length; i++) {
        if (typeof x[i] !== "number") {
            return Number.NaN;
        }
        transition = sum + x[i];

        // Here we need to update the correction in a different fashion
        // if the new absolute value is greater than the absolute sum
        if (Math.abs(sum) >= Math.abs(x[i])) {
            correction += sum - transition + x[i];
        } else {
            correction += x[i] - transition + sum;
        }

        sum = transition;
    }

    // Returning the corrected sum
    return sum + correction;
}

/**
 * The mean, _also known as average_,
 * is the sum of all values over the number of values.
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x sample of one or more data points
 * @throws {Error} if the length of x is less than one
 * @returns {number} mean
 * @example
 * mean([0, 10]); // => 5
 */
function mean(x) {
    if (x.length === 0) {
        throw new Error("mean requires at least one data point");
    }

    return sum(x) / x.length;
}

/**
 * The sum of deviations to the Nth power.
 * When n=2 it's the sum of squared deviations.
 * When n=3 it's the sum of cubed deviations.
 */
/**
 * Sum, over every element of `x`, of its deviation from the mean of `x`
 * raised to the power `n`.
 *
 * @param {Array<number>} x
 * @param {number} n power
 * @returns {number} sum of nth power deviations
 *
 * @example
 * var input = [1, 2, 3];
 * // since the variance of a set is the mean squared
 * // deviations, we can calculate that with sumNthPowerDeviations:
 * sumNthPowerDeviations(input, 2) / input.length;
 */
function sumNthPowerDeviations(x, n) {
    var meanValue = mean(x);
    var sum = 0;
    var tempValue;
    var i;

    // This is an optimization: when n is 2 (we're computing a number squared),
    // multiplying the number by itself is significantly faster than using
    // the Math.pow method.
    if (n === 2) {
        for (i = 0; i < x.length; i++) {
            tempValue = x[i] - meanValue;
            sum += tempValue * tempValue;
        }
    } else {
        for (i = 0; i < x.length; i++) {
            sum += Math.pow(x[i] - meanValue, n);
        }
    }

    return sum;
}

/**
 * The [variance](http://en.wikipedia.org/wiki/Variance)
 * is the mean of squared deviations from the mean.
 *
 * This is an implementation of variance, not sample variance:
 * see the `sampleVariance` method if you want a sample measure.
 *
 * @param {Array<number>} x a population of one or more data points
 * @returns {number} variance: a value greater than or equal to zero.
 * zero indicates that all values are identical.
 * @throws {Error} if x's length is 0
 * @example
 * variance([1, 2, 3, 4, 5, 6]); // => 2.9166666666666665
 */
function variance(x) {
    if (x.length === 0) {
        throw new Error("variance requires at least one data point");
    }

    // Find the mean of squared deviations between the
    // mean value and each value.
    return sumNthPowerDeviations(x, 2) / x.length;
}

/**
 * The [standard deviation](http://en.wikipedia.org/wiki/Standard_deviation)
 * is the square root of the variance. This is also known as the population
 * standard deviation. It's useful for measuring the amount
 * of variation or dispersion in a set of values.
 *
 * Standard deviation is only appropriate for full-population knowledge: for
 * samples of a population, {@link sampleStandardDeviation} is
 * more appropriate.
 */
/**
 * Population standard deviation: the square root of `variance(x)`.
 * A single-element input short-circuits to 0.
 *
 * @param {Array<number>} x input
 * @returns {number} standard deviation
 * @example
 * variance([2, 4, 4, 4, 5, 5, 7, 9]); // => 4
 * standardDeviation([2, 4, 4, 4, 5, 5, 7, 9]); // => 2
 */
function standardDeviation(x) {
    if (x.length === 1) {
        return 0;
    }
    var v = variance(x);
    return Math.sqrt(v);
}

/**
 * The [R Squared](http://en.wikipedia.org/wiki/Coefficient_of_determination)
 * value of data compared with a function `f`
 * is one minus the ratio between the sum of squared prediction errors
 * and the total sum of squares of the actual values around their average.
 *
 * Inputs with fewer than two points return 1.
 *
 * @param {Array<Array<number>>} x input data: this should be doubly-nested
 * @param {Function} func function called on `[i][0]` values within the dataset
 * @returns {number} r-squared value
 * @example
 * var samples = [[0, 0], [1, 1]];
 * var regressionLine = linearRegressionLine(linearRegression(samples));
 * rSquared(samples, regressionLine); // = 1 this line is a perfect fit
 */
function rSquared(x, func) {
    if (x.length < 2) {
        return 1;
    }

    // Compute the average y value for the actual
    // data set in order to compute the
    // _total sum of squares_
    var sum = 0;
    for (var i = 0; i < x.length; i++) {
        sum += x[i][1];
    }
    var average = sum / x.length;

    // Compute the total sum of squares - the
    // squared difference between each point
    // and the average of all points.
    var sumOfSquares = 0;
    for (var j = 0; j < x.length; j++) {
        sumOfSquares += Math.pow(average - x[j][1], 2);
    }

    // Finally estimate the error: the squared
    // difference between the estimate and the actual data
    // value at each point.
    var err = 0;
    for (var k = 0; k < x.length; k++) {
        err += Math.pow(x[k][1] - func(x[k][0]), 2);
    }

    // As the error grows larger, its ratio to the
    // sum of squares increases and the r squared
    // value grows lower.
    return 1 - err / sumOfSquares;
}

/**
 * The [mode](https://en.wikipedia.org/wiki/Mode_%28statistics%29) is the number
 * that appears in a list the highest number of times.
 */
/**
 * Find the most frequent value of an already sorted array.
 * There can be multiple modes in a list: in the event of a tie, this
 * algorithm keeps the first mode it found, because a later run of equal
 * values only replaces it when it is strictly longer.
 *
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * This runs in `O(n)` because the input is sorted.
 *
 * @param {Array<number>} sorted a sample of one or more data points
 * @returns {number} mode
 * @throws {Error} if sorted is empty
 * @example
 * modeSorted([0, 0, 1]); // => 0
 */
function modeSorted(sorted) {
    // Handle edge cases:
    // The mode of an empty list is undefined
    if (sorted.length === 0) {
        throw new Error("mode requires at least one data point");
    }
    if (sorted.length === 1) {
        return sorted[0];
    }

    // This assumes it is dealing with an array of size > 1, since size
    // 0 and 1 are handled immediately. Hence it starts at index 1 in the
    // array.
    var last = sorted[0];
    // store the mode as we find new modes
    var value = Number.NaN;
    // store how many times we've seen the mode
    var maxSeen = 0;
    // how many times the current candidate for the mode
    // has been seen
    var seenThis = 1;

    // end at sorted.length + 1 to fix the case in which the mode is
    // the highest number that occurs in the sequence. the last iteration
    // compares sorted[i], which is undefined, to the highest number
    // in the series
    for (var i = 1; i < sorted.length + 1; i++) {
        // we're seeing a new number pass by
        if (sorted[i] !== last) {
            // the last number is the new mode since we saw it more
            // often than the old one
            if (seenThis > maxSeen) {
                maxSeen = seenThis;
                value = last;
            }
            seenThis = 1;
            last = sorted[i];
            // if this isn't a new number, it's one more occurrence of
            // the potential mode
        } else {
            seenThis++;
        }
    }
    return value;
}

/**
 * Sort an array of numbers by their numeric value, ensuring that the
 * array is not changed in place.
 */
/**
 * Return a numerically sorted copy of `x`; the input is left untouched.
 *
 * This is necessary because the default behavior of .sort
 * in JavaScript is to sort arrays as string values
 *
 *     [1, 10, 12, 102, 20].sort()
 *     // output
 *     [1, 10, 102, 12, 20]
 *
 * @param {Array<number>} x input array
 * @return {Array<number>} sorted array
 * @private
 * @example
 * numericSort([3, 2, 1]) // => [1, 2, 3]
 */
function numericSort(x) {
    return (
        x
            // ensure the array is not changed in-place
            .slice()
            // comparator function that treats input as numeric
            .sort(function (a, b) {
                return a - b;
            })
    );
}

/**
 * The [mode](https://en.wikipedia.org/wiki/Mode_%28statistics%29) is the number
 * that appears in a list the highest number of times.
 * There can be multiple modes in a list: ties are resolved by `modeSorted`,
 * which receives the values in ascending order.
 *
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * This runs in `O(n log(n))` because it needs to sort the array internally
 * before running an `O(n)` search to find the mode.
 *
 * @param {Array<number>} x input
 * @returns {number} mode
 * @example
 * mode([0, 0, 1]); // => 0
 */
function mode(x) {
    // Sorting the array lets us iterate through it below and be sure
    // that every time we see a new number it's new and we'll never
    // see the same number twice
    return modeSorted(numericSort(x));
}

/* globals Map: false */

/**
 * The [mode](https://en.wikipedia.org/wiki/Mode_%28statistics%29) is the number
 * that appears in a list the highest number of times.
 * There can be multiple modes in a list: in the event of a tie, this
 * algorithm will return the value that reached the highest count first.
 *
 * modeFast uses a Map object to keep track of the mode, instead of the approach
 * used with `mode`, a sorted array. As a result, it is faster
 * than `mode` and supports any data type that can be used as a `Map` key
 * (keys are matched with SameValueZero equality).
 */
/**
 * Find the most frequent value in `x` by counting occurrences in a `Map`.
 * When several values share the highest count, the one that reached that
 * count first is returned.
 *
 * It requires a
 * [JavaScript environment with support for Map](https://kangax.github.io/compat-table/es6/#test-Map),
 * and will throw an error if Map is not available.
 *
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * @param {Array<*>} x a sample of one or more data points
 * @returns {?*} mode
 * @throws {ReferenceError} if the JavaScript environment doesn't support Map
 * @throws {Error} if x is empty
 * @example
 * modeFast(['rabbits', 'rabbits', 'squirrels']); // => 'rabbits'
 */
function modeFast(x) {
    // This index will reflect the incidence of different values, indexing
    // them like
    // { value: count }
    var index = new Map();

    // A running `mode` and the number of times it has been encountered.
    var mode;
    var modeCount = 0;

    for (var i = 0; i < x.length; i++) {
        var newCount = index.get(x[i]);
        if (newCount === undefined) {
            newCount = 1;
        } else {
            newCount++;
        }
        if (newCount > modeCount) {
            mode = x[i];
            modeCount = newCount;
        }
        index.set(x[i], newCount);
    }

    // No element was counted, so the input was empty.
    if (modeCount === 0) {
        throw new Error("mode requires at least one data point");
    }

    return mode;
}

/**
 * The min is the lowest number in the array.
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x sample of one or more data points
 * @throws {Error} if the length of x is less than one
 * @returns {number} minimum value
 * @example
 * min([1, 5, -10, 100, 2]); // => -10
 */
function min(x) {
    if (x.length === 0) {
        throw new Error("min requires at least one data point");
    }

    var value = x[0];
    for (var i = 1; i < x.length; i++) {
        if (x[i] < value) {
            value = x[i];
        }
    }
    return value;
}

/**
 * This computes the maximum number in an array.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 */
/**
 * Return the largest element of `x`, found with a single linear scan.
 *
 * @param {Array<number>} x sample of one or more data points
 * @returns {number} maximum value
 * @throws {Error} if the length of x is less than one
 * @example
 * max([1, 2, 3, 4]);
 * // => 4
 */
function max(x) {
    if (x.length === 0) {
        throw new Error("max requires at least one data point");
    }

    var value = x[0];
    for (var i = 1; i < x.length; i++) {
        if (x[i] > value) {
            value = x[i];
        }
    }
    return value;
}

/**
 * This computes the minimum & maximum number in an array.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x sample of one or more data points
 * @returns {Array<number>} minimum & maximum value
 * @throws {Error} if the length of x is less than one
 * @example
 * extent([1, 2, 3, 4]);
 * // => [1, 4]
 */
function extent(x) {
    if (x.length === 0) {
        throw new Error("extent requires at least one data point");
    }

    var min = x[0];
    var max = x[0];
    for (var i = 1; i < x.length; i++) {
        if (x[i] > max) {
            max = x[i];
        }
        if (x[i] < min) {
            min = x[i];
        }
    }
    return [min, max];
}

/**
 * The minimum is the lowest number in the array. With a sorted array,
 * the first element in the array is always the smallest, so this calculation
 * can be done in one step, or constant time.
 *
 * @param {Array<number>} x input
 * @returns {number} minimum value
 * @example
 * minSorted([-100, -10, 1, 2, 5]); // => -100
 */
function minSorted(x) {
    return x[0];
}

/**
 * The maximum is the highest number in the array. With a sorted array,
 * the last element in the array is always the largest, so this calculation
 * can be done in one step, or constant time.
 *
 * @param {Array<number>} x input
 * @returns {number} maximum value
 * @example
 * maxSorted([-100, -10, 1, 2, 5]); // => 5
 */
function maxSorted(x) {
    return x[x.length - 1];
}

/**
 * The extent is the lowest & highest number in the array.
 */
/**
 * Return the first and last elements of a sorted array as `[min, max]`.
 * With a sorted array,
 * the first element in the array is always the lowest while the last element is always the largest, so this calculation
 * can be done in one step, or constant time.
 *
 * @param {Array<number>} x input
 * @returns {Array<number>} minimum & maximum value
 * @example
 * extentSorted([-100, -10, 1, 2, 5]); // => [-100, 5]
 */
function extentSorted(x) {
    return [x[0], x[x.length - 1]];
}

/**
 * The simple [sum](https://en.wikipedia.org/wiki/Summation) of an array
 * is the result of adding all numbers together, starting from zero.
 * Returns `NaN` as soon as an element that is not of type `number`
 * is encountered.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x input
 * @return {number} sum of all input numbers
 * @example
 * sumSimple([1, 2, 3]); // => 6
 */
function sumSimple(x) {
    var value = 0;
    for (var i = 0; i < x.length; i++) {
        if (typeof x[i] !== "number") {
            return Number.NaN;
        }
        value += x[i];
    }
    return value;
}

/**
 * The [product](https://en.wikipedia.org/wiki/Product_(mathematics)) of an array
 * is the result of multiplying all numbers together, using one as the multiplicative identity.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x input
 * @return {number} product of all input numbers
 * @example
 * product([1, 2, 3, 4]); // => 24
 */
function product(x) {
    var value = 1;
    for (var i = 0; i < x.length; i++) {
        value *= x[i];
    }
    return value;
}

/**
 * This is the internal implementation of quantiles: when you know
 * that the order is sorted, you don't need to re-sort it, and the computations
 * are faster.
 */
/**
 * Read the `p`-quantile out of an array that is already sorted.
 *
 * @param {Array<number>} x sample of one or more data points
 * @param {number} p desired quantile: a number between 0 to 1, inclusive
 * @returns {number} quantile value
 * @throws {Error} if p is outside of the range from 0 to 1
 * @throws {Error} if x is empty
 * @example
 * quantileSorted([3, 6, 7, 8, 8, 9, 10, 13, 15, 16, 20], 0.5); // => 9
 */
function quantileSorted(x, p) {
    var idx = x.length * p;
    if (x.length === 0) {
        throw new Error("quantile requires at least one data point.");
    } else if (p < 0 || p > 1) {
        throw new Error("quantiles must be between 0 and 1");
    } else if (p === 1) {
        // If p is 1, directly return the last element
        return x[x.length - 1];
    } else if (p === 0) {
        // If p is 0, directly return the first element
        return x[0];
    } else if (idx % 1 !== 0) {
        // If idx is not an integer, return the next element in array
        return x[Math.ceil(idx) - 1];
    } else if (x.length % 2 === 0) {
        // If the list has even-length, we'll take the average of this number
        // and the next value, if there is one
        return (x[idx - 1] + x[idx]) / 2;
    } else {
        // Finally, in the simple case of an integer value
        // with an odd-length list, return the x value at the index.
        return x[idx];
    }
}

/**
 * Rearrange items in `arr` so that all items in `[left, k]` range are the smallest.
 * The `k`-th element will have the `(k - left + 1)`-th smallest value in `[left, right]`.
 */
/**
 * Partially sort `arr` in place so that index `k` holds the value it would
 * hold in a full sort of the `[left, right]` range.
 *
 * Implements Floyd-Rivest selection algorithm https://en.wikipedia.org/wiki/Floyd-Rivest_algorithm
 *
 * @param {Array<number>} arr input array
 * @param {number} k pivot index
 * @param {number} [left] left index, defaults to 0
 * @param {number} [right] right index, defaults to `arr.length - 1`
 * @returns {void} mutates input array
 * @example
 * var arr = [65, 28, 59, 33, 21, 56, 22, 95, 50, 12, 90, 53, 28, 77, 39];
 * quickselect(arr, 8);
 * // = [39, 28, 28, 33, 21, 12, 22, 50, 53, 56, 59, 65, 90, 77, 95]
 */
function quickselect(arr, k, left, right) {
    left = left || 0;
    // A plain `right || arr.length - 1` would throw away an explicit
    // `right` of 0 and silently widen the range to the whole array.
    right = right === 0 ? 0 : right || arr.length - 1;

    while (right > left) {
        // 600 and 0.5 are arbitrary constants chosen in the original paper to minimize execution time
        if (right - left > 600) {
            var n = right - left + 1;
            var m = k - left + 1;
            var z = Math.log(n);
            var s = 0.5 * Math.exp((2 * z) / 3);
            var sd = 0.5 * Math.sqrt((z * s * (n - s)) / n);
            if (m - n / 2 < 0) {
                sd *= -1;
            }
            var newLeft = Math.max(left, Math.floor(k - (m * s) / n + sd));
            var newRight = Math.min(
                right,
                Math.floor(k + ((n - m) * s) / n + sd)
            );
            // Recurse on the narrowed window computed above before
            // partitioning the full range.
            quickselect(arr, k, newLeft, newRight);
        }

        var t = arr[k];
        var i = left;
        var j = right;

        swap(arr, left, k);
        if (arr[right] > t) {
            swap(arr, left, right);
        }

        while (i < j) {
            swap(arr, i, j);
            i++;
            j--;
            while (arr[i] < t) {
                i++;
            }
            while (arr[j] > t) {
                j--;
            }
        }

        if (arr[left] === t) {
            swap(arr, left, j);
        } else {
            j++;
            swap(arr, j, right);
        }

        // Keep only the side of the partition that still contains `k`.
        if (j <= k) {
            left = j + 1;
        }
        if (k <= j) {
            right = j - 1;
        }
    }
}

/**
 * Exchange the elements at indices `i` and `j` of `arr` in place.
 *
 * @private
 * @param {Array} arr array to mutate
 * @param {number} i first index
 * @param {number} j second index
 * @returns {void}
 */
function swap(arr, i, j) {
    var tmp = arr[i];
    arr[i] = arr[j];
    arr[j] = tmp;
}

/**
 * The [quantile](https://en.wikipedia.org/wiki/Quantile):
 * this is a population quantile, since we assume to know the entire
 * dataset in this library. This is an implementation of the
 * [Quantiles of a Population](http://en.wikipedia.org/wiki/Quantile#Quantiles_of_a_population)
 * algorithm from wikipedia.
 *
 * Sample is a one-dimensional array of numbers,
 * and p is either a decimal number from 0 to 1 or an array of decimal
 * numbers from 0 to 1.
 */
/**
 * Compute one or several population quantiles of an unsorted sample.
 * In terms of a k/q quantile, p = k/q - it's just dealing with fractions or dealing
 * with decimal values.
 * When p is an array, the result of the function is also an array containing the appropriate
 * quantiles in input order
 *
 * @param {Array<number>} x sample of one or more numbers
 * @param {Array<number> | number} p the desired quantile, as a number between 0 and 1
 * @returns {number | Array<number>} quantile, or one quantile per entry of `p`
 * @example
 * quantile([3, 6, 7, 8, 8, 9, 10, 13, 15, 16, 20], 0.5); // => 9
 */
function quantile(x, p) {
    // work on a copy: the selection helpers reorder elements in place
    var copy = x.slice();

    if (Array.isArray(p)) {
        // rearrange elements so that each element corresponding to a requested
        // quantile is on a place it would be if the array was fully sorted
        multiQuantileSelect(copy, p);
        // Initialize the result array
        var results = [];
        // For each requested quantile
        for (var i = 0; i < p.length; i++) {
            results[i] = quantileSorted(copy, p[i]);
        }
        return results;
    } else {
        var idx = quantileIndex(copy.length, p);
        quantileSelect(copy, idx, 0, copy.length - 1);
        return quantileSorted(copy, p);
    }
}

/**
 * Put the element(s) needed for index `k` in their sorted position.
 * A fractional `k` (as produced by `quantileIndex`) selects both
 * neighbouring integer indices.
 *
 * @private
 * @param {Array<number>} arr array to rearrange in place
 * @param {number} k integer or half-integer index
 * @param {number} left left bound passed to `quickselect`
 * @param {number} right right bound passed to `quickselect`
 * @returns {void}
 */
function quantileSelect(arr, k, left, right) {
    if (k % 1 === 0) {
        quickselect(arr, k, left, right);
    } else {
        k = Math.floor(k);
        quickselect(arr, k, left, right);
        quickselect(arr, k + 1, k + 1, right);
    }
}

/**
 * Run `quantileSelect` for every quantile in `p`, bisecting the sorted list
 * of target indices so each selection works on a bounded range.
 *
 * @private
 * @param {Array<number>} arr array to rearrange in place
 * @param {Array<number>} p requested quantiles
 * @returns {void}
 */
function multiQuantileSelect(arr, p) {
    var indices = [0];
    for (var i = 0; i < p.length; i++) {
        indices.push(quantileIndex(arr.length, p[i]));
    }
    indices.push(arr.length - 1);
    indices.sort(compare);

    // explicit stack of (left, right) positions into `indices`
    var stack = [0, indices.length - 1];

    while (stack.length) {
        var r = Math.ceil(stack.pop());
        var l = Math.floor(stack.pop());
        if (r - l <= 1) {
            continue;
        }

        var m = Math.floor((l + r) / 2);
        quantileSelect(
            arr,
            indices[m],
            Math.floor(indices[l]),
            Math.ceil(indices[r])
        );

        stack.push(l, m, m, r);
    }
}

/**
 * Ascending numeric comparator for `Array.prototype.sort`.
 *
 * @private
 * @param {number} a
 * @param {number} b
 * @returns {number} negative, zero or positive
 */
function compare(a, b) {
    return a - b;
}

/**
 * Index in a sorted array of length `len` at which the `p`-quantile is read.
 * A value ending in `.5` means the quantile is the average of the two
 * surrounding elements.
 *
 * @private
 * @param {number} len array length
 * @param {number} p quantile
 * @returns {number} integer or half-integer index
 */
function quantileIndex(len, p) {
    var idx = len * p;
    if (p === 1) {
        // If p is 1, directly return the last index
        return len - 1;
    } else if (p === 0) {
        // If p is 0, directly return the first index
        return 0;
    } else if (idx % 1 !== 0) {
        // If index is not integer, return the next index in array
        return Math.ceil(idx) - 1;
    } else if (len % 2 === 0) {
        // If the list has even-length, we'll return the middle of two indices
        // around quantile to indicate that we need an average value of the two
        return idx - 0.5;
    } else {
        // Finally, in the simple case of an integer index
        // with an odd-length list, return the index
        return idx;
    }
}

/* eslint no-bitwise: 0 */

/**
 * This function returns the quantile in which one would find the given value in
 * the given array. With a sorted array, leveraging binary search, we can find
 * this information in logarithmic time.
 *
 * @param {Array<number>} x input, sorted in ascending order
 * @param {number} value the value to rank
 * @returns {number} quantile rank, between 0 and 1
 * @example
 * quantileRankSorted([1, 2, 3, 4], 3); // => 0.75
 * quantileRankSorted([1, 2, 3, 3, 4], 3); // => 0.7
 * quantileRankSorted([1, 2, 3, 4], 6); // => 1
 * quantileRankSorted([1, 2, 3, 3, 5], 4); // => 0.8
 */
function quantileRankSorted(x, value) {
    // Value is lesser than any value in the array
    if (value < x[0]) {
        return 0;
    }

    // Value is greater than any value in the array
    if (value > x[x.length - 1]) {
        return 1;
    }

    var l = lowerBound(x, value);

    // Value is not in the array
    if (x[l] !== value) {
        return l / x.length;
    }

    l++;

    var u = upperBound(x, value);

    // The value exists only once in the array
    if (u === l) {
        return l / x.length;
    }

    // Here, we are basically computing the mean of the range of indices
    // containing our searched value. But, instead, of initializing an
    // array and looping over it, there is a dedicated math formula that
    // we apply below to get the result.
    var r = u - l + 1;
    var sum = (r * (u + l)) / 2;
    var mean = sum / r;

    return mean / x.length;
}

/**
 * Binary search for the first index whose element is not less than `value`.
 *
 * @private
 * @param {Array<number>} x sorted input
 * @param {number} value searched value
 * @returns {number} index in `[0, x.length]`
 */
function lowerBound(x, value) {
    var mid = 0;
    var lo = 0;
    var hi = x.length;

    while (lo < hi) {
        mid = (lo + hi) >>> 1;

        if (value <= x[mid]) {
            hi = mid;
        } else {
            lo = mid + 1;
        }
    }

    return lo;
}

/**
 * Binary search for the first index whose element is greater than `value`.
 *
 * @private
 * @param {Array<number>} x sorted input
 * @param {number} value searched value
 * @returns {number} index in `[0, x.length]`
 */
function upperBound(x, value) {
    var mid = 0;
    var lo = 0;
    var hi = x.length;

    while (lo < hi) {
        mid = (lo + hi) >>> 1;

        if (value >= x[mid]) {
            lo = mid + 1;
        } else {
            hi = mid;
        }
    }

    return lo;
}

/**
 * This function returns the quantile in which one would find the given value in
 * the given array. It will copy and sort your array before each run, so
 * if you know your array is already sorted, you should use `quantileRankSorted`
 * instead.
 *
 * @param {Array<number>} x input
 * @param {number} value the value to rank
 * @returns {number} quantile rank, between 0 and 1
 * @example
 * quantileRank([4, 3, 1, 2], 3); // => 0.75
 * quantileRank([4, 3, 2, 3, 1], 3); // => 0.7
 * quantileRank([2, 4, 1, 3], 6); // => 1
 * quantileRank([5, 3, 1, 2, 3], 4); // => 0.8
 */
function quantileRank(x, value) {
    // Cloning and sorting the array
    var sortedCopy = numericSort(x);

    return quantileRankSorted(sortedCopy, value);
}

/**
 * The [Interquartile range](http://en.wikipedia.org/wiki/Interquartile_range) is
 * a measure of statistical dispersion, or how scattered, spread, or
 * concentrated a distribution is. It's computed as the difference between
 * the third quartile and first quartile.
 *
 * @param {Array<number>} x sample of one or more numbers
 * @returns {number} interquartile range: the span between lower and upper quartile,
 * 0.25 and 0.75
 * @example
 * interquartileRange([0, 1, 2, 3]); // => 2
 */
function interquartileRange(x) {
    // Interquartile range is the span between the upper quartile,
    // at `0.75`, and lower quartile, `0.25`
    var q1 = quantile(x, 0.75);
    var q2 = quantile(x, 0.25);

    // Falls through (returning undefined) if either quartile is not a number.
    if (typeof q1 === "number" && typeof q2 === "number") {
        return q1 - q2;
    }
}

/**
 * The [median](http://en.wikipedia.org/wiki/Median) is
 * the middle number of a list.
 */
/**
 * Return the 0.5 quantile of `x`, coerced to a number.
 * This is often a good indicator of 'the middle'
 * when there are outliers that skew the `mean()` value.
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * The median isn't necessarily one of the elements in the list: the value
 * can be the average of two elements if the list has an even length
 * and the two central values are different.
 *
 * @param {Array<number>} x input
 * @returns {number} median value
 * @example
 * median([10, 2, 5, 100, 2, 1]); // => 3.5
 */
function median(x) {
    return +quantile(x, 0.5);
}

/**
 * The [Median Absolute Deviation](http://en.wikipedia.org/wiki/Median_absolute_deviation) is
 * a robust measure of statistical
 * dispersion. It is more resilient to outliers than the standard deviation.
 *
 * @param {Array<number>} x input array
 * @returns {number} median absolute deviation
 * @example
 * medianAbsoluteDeviation([1, 1, 2, 2, 4, 6, 9]); // => 1
 */
function medianAbsoluteDeviation(x) {
    var medianValue = median(x);
    var medianAbsoluteDeviations = [];

    // Make a list of absolute deviations from the median
    for (var i = 0; i < x.length; i++) {
        medianAbsoluteDeviations.push(Math.abs(x[i] - medianValue));
    }

    // Find the median value of that list
    return median(medianAbsoluteDeviations);
}

/**
 * Split an array into chunks of a specified size. This function
 * has the same behavior as [PHP's array_chunk](http://php.net/manual/en/function.array-chunk.php)
 * function, and thus will insert smaller-sized chunks at the end if
 * the input size is not divisible by the chunk size.
 *
 * `x` is expected to be an array, and `chunkSize` a number.
 * The `x` array can contain any kind of data.
 *
 * @param {Array} x a sample
 * @param {number} chunkSize size of each output array.
 */
/**
 * Split `x` into consecutive slices of `chunkSize` elements; the last slice
 * is shorter when the length of `x` is not a multiple of `chunkSize`.
 * `chunkSize` must be a positive integer.
 *
 * @param {Array} x a sample
 * @param {number} chunkSize size of each output array. must be a positive integer
 * @returns {Array<Array>} a chunked array
 * @throws {Error} if chunk size is less than 1 or not an integer
 * @example
 * chunk([1, 2, 3, 4, 5, 6], 2);
 * // => [[1, 2], [3, 4], [5, 6]]
 */
function chunk(x, chunkSize) {
    // a list of result chunks, as arrays in an array
    var output = [];

    // `chunkSize` must be one or higher - otherwise the loop below,
    // in which we call `start += chunkSize`, will loop infinitely.
    // So, we'll detect and throw in that case to indicate
    // invalid input.
    if (chunkSize < 1) {
        throw new Error("chunk size must be a positive number");
    }

    if (Math.floor(chunkSize) !== chunkSize) {
        throw new Error("chunk size must be an integer");
    }

    // `start` is the index at which `.slice` will start selecting
    // new array elements
    for (var start = 0; start < x.length; start += chunkSize) {
        // for each chunk, slice that part of the array and add it
        // to the output. The `.slice` function does not change
        // the original array.
        output.push(x.slice(start, start + chunkSize));
    }
    return output;
}

/**
 * Sampling with replacement is a type of sampling that allows the same
 * item to be picked out of a population more than once.
 */
/**
 * Draw `n` items from `x`, each pick made independently so the same item
 * can appear more than once. An empty `x` yields an empty array.
 *
 * @param {Array<*>} x an array of any kind of value
 * @param {number} n count of how many elements to take
 * @param {Function} [randomSource=Math.random] an optional entropy source that
 * returns numbers between 0 inclusive and 1 exclusive: the range [0, 1)
 * @return {Array} n sampled items from the population
 * @example
 * var values = [1, 2, 3, 4];
 * sampleWithReplacement(values, 2); // returns 2 random values, like [2, 4];
 */
function sampleWithReplacement(x, n, randomSource) {
    if (x.length === 0) {
        return [];
    }

    // a custom random number source can be provided if you want to use
    // a fixed seed or another random number generator, like
    // [random-js](https://www.npmjs.org/package/random-js)
    randomSource = randomSource || Math.random;

    var length = x.length;
    var sample = [];

    for (var i = 0; i < n; i++) {
        var index = Math.floor(randomSource() * length);

        sample.push(x[index]);
    }

    return sample;
}

/**
 * A [Fisher-Yates shuffle](http://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle)
 * in-place - which means that it **will change the order of the original
 * array by reference**.
 *
 * This is an algorithm that generates a random [permutation](https://en.wikipedia.org/wiki/Permutation)
 * of a set.
 *
 * @param {Array} x sample of one or more numbers
 * @param {Function} [randomSource=Math.random] an optional entropy source that
 * returns numbers between 0 inclusive and 1 exclusive: the range [0, 1)
 * @returns {Array} x
 * @example
 * var x = [1, 2, 3, 4];
 * shuffleInPlace(x);
 * // x is shuffled to a value like [2, 1, 4, 3]
 */
function shuffleInPlace(x, randomSource) {
    // a custom random number source can be provided if you want to use
    // a fixed seed or another random number generator, like
    // [random-js](https://www.npmjs.org/package/random-js)
    randomSource = randomSource || Math.random;

    // store the current length of the x to determine
    // when no elements remain to shuffle.
    var length = x.length;

    // temporary is used to hold an item when it is being
    // swapped between indices.
    var temporary;

    // The index to swap at each stage.
    var index;

    // While there are still items to shuffle
    while (length > 0) {
        // choose a random index within the subset of the array
        // that is not yet shuffled; `length--` uses the current length
        // for the draw and then shrinks the unshuffled range by one
        index = Math.floor(randomSource() * length--);

        // store the value that we'll move temporarily
        temporary = x[length];

        // swap the value at `x[length]` with `x[index]`
        x[length] = x[index];
        x[index] = temporary;
    }

    return x;
}

/**
 * A [Fisher-Yates shuffle](http://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle)
 * is a fast way to create a random permutation of a finite set. This is
 * a function around `shuffleInPlace` that adds the guarantee that
 * it will not modify its input.
 *
 * @param {Array} x sample of 0 or more numbers
 * @param {Function} [randomSource=Math.random] an optional entropy source that
 * returns numbers between 0 inclusive and 1 exclusive: the range [0, 1)
 * @return {Array} shuffled version of input
 * @example
 * var shuffled = shuffle([1, 2, 3, 4]);
 * shuffled; // = [2, 3, 1, 4] or any other random permutation
 */
function shuffle(x, randomSource) {
    // slice the original array so that it is not modified
    var sample = x.slice();

    // and then shuffle that shallow-copied array, in place
    return shuffleInPlace(sample, randomSource);
}

/**
 * Create a [simple random sample](http://en.wikipedia.org/wiki/Simple_random_sample)
 * from a given array of `n` elements.
 *
 * The sampled values will be in any order, not necessarily the order
 * they appear in the input.
 *
 * @param {Array<any>} x input array.
 */
can contain any type * @param {number} n count of how many elements to take * @param {Function} [randomSource=Math.random] an optional entropy source that * returns numbers between 0 inclusive and 1 exclusive: the range [0, 1) * @return {Array} subset of n elements in original array * * @example * var values = [1, 2, 4, 5, 6, 7, 8, 9]; * sample(values, 3); // returns 3 random values, like [2, 5, 8]; */ function sample(x, n, randomSource) { // shuffle the original array using a fisher-yates shuffle var shuffled = shuffle(x, randomSource); // and then return a subset of it - the first `n` elements. return shuffled.slice(0, n); } /** * Create a new column x row matrix. * * @private * @param {number} columns * @param {number} rows * @return {Array<Array<number>>} matrix * @example * makeMatrix(10, 10); */ function makeMatrix(columns, rows) { var matrix = []; for (var i = 0; i < columns; i++) { var column = []; for (var j = 0; j < rows; j++) { column.push(0); } matrix.push(column); } return matrix; } /** * For a sorted input, counting the number of unique values * is possible in constant time and constant memory. This is * a simple implementation of the algorithm. * * Values are compared with `===`, so objects and non-primitive objects * are not handled in any special way. 
*
 * @param {Array<*>} x an array of any kind of value
 * @returns {number} count of unique values
 * @example
 * uniqueCountSorted([1, 2, 3]); // => 3
 * uniqueCountSorted([1, 1, 1]); // => 1
 */
function uniqueCountSorted(x) {
    var uniqueValueCount = 0;
    // the most recent value that was counted; only meaningful once the
    // first element has been visited
    var lastSeenValue;
    for (var i = 0; i < x.length; i++) {
        // the first element always counts; afterwards an element counts
        // only when it differs (by `!==`) from the last counted value
        if (i === 0 || x[i] !== lastSeenValue) {
            lastSeenValue = x[i];
            uniqueValueCount++;
        }
    }
    return uniqueValueCount;
}

/**
 * Generates incrementally computed values based on the sums and sums of
 * squares for the data array.
 *
 * With `sums` and `sumsOfSquares` holding running (cumulative) totals, as
 * built by `fillMatrices`, the value computed for the index range `j..i`
 * is the sum of squares over that range minus `count * mean * mean`,
 * i.e. the sum of squared deviations from the range's mean.
 *
 * @private
 * @param {number} j first index of the range
 * @param {number} i last index of the range
 * @param {Array<number>} sums cumulative sums of the values
 * @param {Array<number>} sumsOfSquares cumulative sums of the squared values
 * @return {number} the computed value, never below zero
 * @example
 * ssq(0, 1, [-1, 0, 2], [1, 1, 5]);
 */
function ssq(j, i, sums, sumsOfSquares) {
    var sji; // s(j, i)
    if (j > 0) {
        // subtract the running totals just before `j` to isolate `j..i`
        var muji = (sums[i] - sums[j - 1]) / (i - j + 1); // mu(j, i)
        sji =
            sumsOfSquares[i] - sumsOfSquares[j - 1] - (i - j + 1) * muji * muji;
    } else {
        // the range starts at the beginning, so the running totals at `i`
        // already describe it
        sji = sumsOfSquares[i] - (sums[i] * sums[i]) / (i + 1);
    }
    // negative results are replaced by zero
    // NOTE(review): presumably this guards against floating-point
    // round-off producing a tiny negative value - confirm
    if (sji < 0) {
        return 0;
    }
    return sji;
}

/**
 * Function that recursively divides and conquers computations
 * for cluster j
 *
 * Each call fills `matrix[cluster][i]` and `backtrackMatrix[cluster][i]`
 * for the midpoint `i` of `iMin..iMax`, then recurses into the index
 * ranges on either side of that midpoint.
 *
 * @private
 * @param {number} iMin Minimum index in cluster to be computed
 * @param {number} iMax Maximum index in cluster to be computed
 * @param {number} cluster Index of the cluster currently being computed
 * @param {Array<Array<number>>} matrix
 * @param {Array<Array<number>>} backtrackMatrix
 * @param {Array<number>} sums
 * @param {Array<number>} sumsOfSquares
 */
function fillMatrixColumn(
    iMin,
    iMax,
    cluster,
    matrix,
    backtrackMatrix,
    sums,
    sumsOfSquares
) {
    // an empty index range ends the recursion
    if (iMin > iMax) {
        return;
    }

    // Start at midpoint between iMin and iMax
    var i = Math.floor((iMin + iMax) / 2);

    // initial candidate: the previous cluster row's value at `i - 1`, with
    // `i` itself recorded as the backtrack index
    matrix[cluster][i] = matrix[cluster - 1][i - 1];
    backtrackMatrix[cluster][i] = i;

    var jlow = cluster; // the lower end for j

    // raise the lower end using backtrack entries that are already filled
    // in; `|| 0` substitutes zero for a falsy entry
    if (iMin > cluster) {
        jlow = Math.max(jlow, backtrackMatrix[cluster][iMin - 1] || 0);
    }
    jlow = Math.max(jlow, backtrackMatrix[cluster - 1][i] || 0);

    var jhigh = i - 1; // the upper end for j
    // when `iMax` is not the last index, the backtrack entry just past it
    // caps the upper end
    if (iMax < matrix[0].length - 1) {
        /* c8 ignore start */
        jhigh = Math.min(jhigh, backtrackMatrix[cluster][iMax + 1] || 0);
        /* c8 ignore end */
    }

    var sji;
    var sjlowi;
    var ssqjlow;
    var ssqj;

    // `j` walks down from the upper end while `jlow` is advanced upwards
    // inside the loop body, so the range narrows from both sides
    for (var j = jhigh; j >= jlow; --j) {
        sji = ssq(j, i, sums, sumsOfSquares);

        // stop as soon as this bound is no better than the current value
        if (sji + matrix[cluster - 1][jlow - 1] >= matrix[cluster][i]) {
            break;
        }

        // Examine the lower bound of the cluster border
        sjlowi = ssq(jlow, i, sums, sumsOfSquares);

        ssqjlow = sjlowi + matrix[cluster - 1][jlow - 1];

        if (ssqjlow < matrix[cluster][i]) {
            // Shrink the lower bound
            matrix[cluster][i] = ssqjlow;
            backtrackMatrix[cluster][i] = jlow;
        }
        jlow++;

        // then examine the candidate at `j` itself
        ssqj = sji + matrix[cluster - 1][j - 1];
        if (ssqj < matrix[cluster][i]) {
            matrix[cluster][i] = ssqj;
            backtrackMatrix[cluster][i] = j;
        }
    }

    // recurse into the indices below the midpoint ...
    fillMatrixColumn(
        iMin,
        i - 1,
        cluster,
        matrix,
        backtrackMatrix,
        sums,
        sumsOfSquares
    );
    // ... and into the indices above it
    fillMatrixColumn(
        i + 1,
        iMax,
        cluster,
        matrix,
        backtrackMatrix,
        sums,
        sumsOfSquares
    );
}

/**
 * Initializes the main matrices used in Ckmeans and kicks
 * off the divide and conquer cluster computation strategy
 *
 * Both matrices are modified in place; nothing is returned.
 *
 * @private
 * @param {Array<number>} data sorted array of values
 * @param {Array<Array<number>>} matrix
 * @param {Array<Array<number>>} backtrackMatrix
 */
function fillMatrices(data, matrix, backtrackMatrix) {
    // the number of values is taken from the width of the first matrix row
    var nValues = matrix[0].length;

    // Shift values by the median to improve numeric stability
    var shift = data[Math.floor(nValues / 2)];

    // Cumulative sum and cumulative sum of squares for all values in data array
    var sums = [];
    var sumsOfSquares = [];

    // Initialize first column in matrix & backtrackMatrix
    for (var i = 0, shiftedValue = (void 0); i < nValues; ++i) {
        shiftedValue = data[i] - shift;
        if (i === 0) {
            sums.push(shiftedValue);
            sumsOfSquares.push(shiftedValue * shiftedValue);
        } else {
            // extend the running totals by the current shifted value
            sums.push(sums[i - 1] + shiftedValue);
            sumsOfSquares.push(
                sumsOfSquares[i - 1] + shiftedValue * shiftedValue
            );
        }

        // Initialize for cluster = 0
        matrix[0][i] = ssq(0, i, sums, sumsOfSquares);
        backtrackMatrix[0][i] = 0;
    }

    // Initialize the rest of the columns
    var iMin;
    for (var cluster = 1; cluster < matrix.length; ++cluster) {
        if (cluster < matrix.length - 1) {
            iMin = cluster;
        } else {
            // No need to compute matrix[K-1][0] ... matrix[K-1][N-2]
            iMin = nValues - 1;
        }

        fillMatrixColumn(
            iMin,
            nValues - 1,
            cluster,
            matrix,
            backtrackMatrix,
            sums,
            sumsOfSquares
        );
    }
}

/**
 * Ckmeans clustering is an improvement on heuristic-based clustering
 * approaches like Jenks. The algorithm was developed in
 * [Haizhou Wang and Mingzhou Song](http://journal.r-project.org/archive/2011-2/RJournal_2011-2_Wang+Song.pdf)
 * as a [dynamic programming](https://en.wikipedia.org/wiki/Dynamic_programming) approach
 * to the problem of clustering numeric data into groups with the least
 * within-group sum-of-squared-deviations.
 *
 * Minimizing the difference within groups - what Wang & Song refer to as
 * `withinss`, or within sum-of-squares, means that groups are optimally
 * homogeneous within and the data is split into representative groups.
 * This is very useful for visualization, where you may want to represent
 * a continuous variable in discrete color or style groups. This function
 * can provide groups that emphasize differences between data.
 *
 * Being a dynamic approach, this algorithm is based on two matrices that
 * store incrementally-computed values for squared deviations and backtracking
 * indexes.
 *
 * This implementation is based on Ckmeans 3.4.6, which introduced a new divide
 * and conquer approach that improved runtime from O(kn^2) to O(kn log(n)).
 *
 * Unlike the [original implementation](https://cran.r-project.org/web/packages/Ckmeans.1d.dp/index.html),
 * this implementation does not include any code to automatically determine
 * the optimal number of clusters: this information needs to be explicitly
 * provided.
 *
 * ### References
 * _Ckmeans.1d.dp: Optimal k-means Clustering in One Dimension by Dynamic
 * Programming_ Haizhou Wang and Mingzhou Song ISSN 2073-4859
 *
 * from The R Journal Vol.
3/2, December 2011 * @param {Array<number>} x input data, as an array of number values * @param {number} nClusters number of desired classes. This cannot be * greater than the number of values in the data array. * @returns {Array<Array<number>>} clustered input * @throws {Error} if the number of requested clusters is higher than the size of the data * @example * ckmeans([-1, 2, -1, 2, 4, 5, 6, -1, 2, -1], 3); * // The input, clustered into groups of similar numbers. * //= [[-1, -1, -1, -1], [2, 2, 2], [4, 5, 6]]); */ function ckmeans(x, nClusters) { if (nClusters > x.length) { throw new Error( "cannot generate more classes than there are data values" ); } var sorted = numericSort(x); // we'll use this as the maximum number of clusters var uniqueCount = uniqueCountSorted(sorted); // if all of the input values are identical, there's one cluster // with all of the input in it. if (uniqueCount === 1) { return [sorted]; } // named 'S' originally var matrix = makeMatrix(nClusters, sorted.length); // named 'J' originally var backtrackMatrix = makeMatrix(nClusters, sorted.length); // This is a dynamic programming way to solve the problem of minimizing // within-cluster sum of squares. It's similar to linear regression // in this way, and this calculation incrementally computes the // sum of squares that are later read. fillMatrices(sorted, matrix, backtrackMatrix); // The real work of Ckmeans clustering happens in the matrix generation: // the generated matrices encode all possible clustering combinations, and // once they're generated we can solve for the best clustering groups // very quickly. var clusters = []; var clusterRight = backtrackMatrix[0].length - 1; // Backtrack the clusters from the dynamic programming matrix. This // starts at the bottom-right corner of the matrix (if the top-left is 0, 0), // and moves the cluster target with the loop. 
for (var cluster = backtrackMatrix.length - 1; cluster >= 0; cluster--) { var clusterLeft = backtrackMatrix[cluster][clusterRight]; // fill the cluster from the sorted input by taking a slice of the // array. the backtrack matrix makes this easy - it stores the // indexes where the cluster should start and end. clusters[cluster] = sorted.slice(clusterLeft, clusterRight + 1); if (cluster > 0) { clusterRight = clusterLeft - 1; } } return clusters; } /* * Pull Breaks Values for Jenks * * the second part of the jenks recipe: take the calculated matrices * and derive an array of n breaks. * * @private */ function jenksBreaks(data, lowerClassLimits, nClasses) { var k = data.length; var kclass = []; var countNum = nClasses; // the calculation of classes will never include the upper // bound, so we need to explicitly set it kclass[nClasses] = data[data.length - 1]; // the lowerClassLimits matrix is used as indices into itself // here: the `k` variable is reused in each iteration. while (countNum > 0) { kclass[countNum - 1] = data[lowerClassLimits[k][countNum] - 1]; k = lowerClassLimits[k][countNum] - 1; countNum--; } return kclass; } /* * Compute Matrices for Jenks * * Compute the matrices required for Jenks breaks. 
These matrices * can be used for any classing of data with `classes <= nClasses` * * @private */ function jenksMatrices(data, nClasses) { // in the original implementation, these matrices are referred to // as `LC` and `OP` // // * lowerClassLimits (LC): optimal lower class limits // * varianceCombinations (OP): optimal variance combinations for all classes var lowerClassLimits = []; var varianceCombinations = []; // loop counters var i; var j; // the variance, as computed at each step in the calculation var variance = 0; // Initialize and fill each matrix with zeroes for (i = 0; i < data.length + 1; i++) { var tmp1 = []; var tmp2 = []; // despite these arrays having the same values, we need // to keep them separate so that changing one does not change // the other for (j = 0; j < nClasses + 1; j++) { tmp1.push(0);