UNPKG

datalib

Version:

JavaScript utilities for loading, summarizing and working with data.

721 lines (640 loc) 19.4 kB
var util = require('./util');
var type = require('./import/type');
var gen = require('./generate');

var stats = module.exports;

// Own-property test used for value-keyed lookup tables. The `in` operator
// also matches inherited keys such as "constructor" or "toString", which
// would make those values look as if they had already been observed.
var hasOwn = Object.prototype.hasOwnProperty;

// Collect unique values.
// Arguments: an array, an optional accessor (resolved via util.$), and an
// optional array to append results to.
// Output: an array of unique values, in first-observed order.
// Values are compared by their property-key (string) form, so 1 and "1"
// are treated as the same value.
stats.unique = function(values, f, results) {
  f = util.$(f);
  results = results || [];
  // prototype-free table: no inherited keys can shadow observed values
  var seen = Object.create(null), v, i, n;
  for (i=0, n=values.length; i<n; ++i) {
    v = f ? f(values[i]) : values[i];
    if (v in seen) continue;
    seen[v] = 1;
    results.push(v);
  }
  return results;
};

// Return the length of the input array.
// Returns 0 if the input is falsy or has no length.
stats.count = function(values) {
  return values && values.length || 0;
};

// Count the number of non-null, non-undefined, non-NaN values.
// Validity is decided by util.isValid.
stats.count.valid = function(values, f) {
  f = util.$(f);
  var v, i, n, valid = 0;
  for (i=0, n=values.length; i<n; ++i) {
    v = f ? f(values[i]) : values[i];
    if (util.isValid(v)) valid += 1;
  }
  return valid;
};

// Count the number of null or undefined values.
stats.count.missing = function(values, f) {
  f = util.$(f);
  var v, i, n, count = 0;
  for (i=0, n=values.length; i<n; ++i) {
    v = f ? f(values[i]) : values[i];
    // loose equality matches both null and undefined
    if (v == null) count += 1;
  }
  return count;
};

// Count the number of distinct values.
// Null, undefined and NaN are each considered distinct values.
// Values are compared by their property-key (string) form.
stats.count.distinct = function(values, f) {
  f = util.$(f);
  // prototype-free table: no inherited keys can shadow observed values
  var seen = Object.create(null), v, i, n, count = 0;
  for (i=0, n=values.length; i<n; ++i) {
    v = f ? f(values[i]) : values[i];
    if (v in seen) continue;
    seen[v] = 1;
    count += 1;
  }
  return count;
};

// Construct a map from distinct values to occurrence counts.
// The result is a plain object keyed by the string form of each value.
stats.count.map = function(values, f) {
  f = util.$(f);
  var map = {}, v, i, n;
  for (i=0, n=values.length; i<n; ++i) {
    v = f ? f(values[i]) : values[i];
    // own-property check so that inherited members are never incremented
    map[v] = hasOwn.call(map, v) ? map[v] + 1 : 1;
  }
  return map;
};

// Compute the median of an array of numbers.
// Invalid values (per util.isValid) are dropped first. The filter step
// produces a new array, so the caller's array is not reordered by the sort.
stats.median = function(values, f) {
  if (f) values = values.map(util.$(f));
  values = values.filter(util.isValid).sort(util.cmp);
  return stats.quantile(values, 0.5);
};

// Computes the quartile boundaries of an array of numbers.
// Returns [q1, median, q3] computed over the valid values only.
stats.quartile = function(values, f) {
  var sorted = f ? values.map(util.$(f)) : values;
  sorted = sorted.filter(util.isValid).sort(util.cmp);
  return [0.25, 0.50, 0.75].map(function(p) {
    return stats.quantile(sorted, p);
  });
};

// Quantile of an already-sorted array of numbers, using linear
// interpolation between the two neighbouring entries.
// May be called as quantile(values, p) or quantile(values, f, p).
// Adapted from the D3.js implementation.
stats.quantile = function(values, f, p) {
  if (p === undefined) {
    // two-argument form: the second argument is the probability
    p = f;
    f = util.identity;
  }
  f = util.$(f);

  var pos = (values.length - 1) * p + 1,  // 1-based fractional position
      lo = Math.floor(pos),
      base = +f(values[lo - 1]),
      frac = pos - lo;

  if (!frac) return base;
  return base + frac * (f(values[lo]) - base);
};

// Sum of the valid entries of an array of numbers.
stats.sum = function(values, f) {
  f = util.$(f);
  var total = 0, len = values.length, k, x;
  for (k = 0; k < len; ++k) {
    x = f ? f(values[k]) : values[k];
    if (!util.isValid(x)) continue;
    total += x;
  }
  return total;
};

// Arithmetic mean (average) of the valid entries of an array of numbers,
// accumulated incrementally. Returns 0 when there are no valid entries.
stats.mean = function(values, f) {
  f = util.$(f);
  var avg = 0, seen = 0, len = values.length, k, x;
  for (k = 0; k < len; ++k) {
    x = f ? f(values[k]) : values[k];
    if (!util.isValid(x)) continue;
    seen += 1;
    avg += (x - avg) / seen;
  }
  return avg;
};

// Geometric mean of the valid entries of an array of numbers.
// Throws if any valid entry is zero or negative; returns 0 when there are
// no valid entries.
stats.mean.geometric = function(values, f) {
  f = util.$(f);
  var product = 1, used = 0, len = values.length, k, x;
  for (k = 0; k < len; ++k) {
    x = f ? f(values[k]) : values[k];
    if (!util.isValid(x)) continue;
    if (x <= 0) {
      throw Error("Geometric mean only defined for positive values.");
    }
    product *= x;
    used += 1;
  }
  if (used === 0) return 0;
  return Math.pow(product, 1 / used);
};

// Harmonic mean of the valid entries of an array of numbers:
// the number of valid entries divided by the sum of their reciprocals.
stats.mean.harmonic = function(values, f) {
  f = util.$(f);
  var reciprocals = 0, used = 0, len = values.length, k, x;
  for (k = 0; k < len; ++k) {
    x = f ? f(values[k]) : values[k];
    if (!util.isValid(x)) continue;
    reciprocals += 1 / x;
    used += 1;
  }
  return used / reciprocals;
};

// Compute the sample variance of an array of numbers.
// Uses a single-pass running update of the mean and of the sum of squared
// deviations (M2). Invalid values (per util.isValid) are skipped.
// Returns 0 when the input is not an array or when fewer than two valid
// values are present, since the sample variance has no degrees of freedom.
stats.variance = function(values, f) {
  f = util.$(f);
  if (!util.isArray(values) || values.length < 2) return 0;
  var mean = 0, M2 = 0, delta, i, c, v;
  for (i=0, c=0; i<values.length; ++i) {
    v = f ? f(values[i]) : values[i];
    if (util.isValid(v)) {
      delta = v - mean;
      mean = mean + delta / (++c);
      M2 = M2 + delta * (v - mean);
    }
  }
  // c counts valid values only; guard the division so that a single valid
  // value yields 0 rather than NaN (0/0)
  return c > 1 ? M2 / (c - 1) : 0;
};

// Compute the sample standard deviation of an array of numbers.
stats.stdev = function(values, f) {
  return Math.sqrt(stats.variance(values, f));
};

// Compute the Pearson mode skewness ((mean-median)/stdev) of an array of numbers.
// Returns 0 when the standard deviation is 0.
stats.modeskew = function(values, f) {
  var avg = stats.mean(values, f),
      med = stats.median(values, f),
      std = stats.stdev(values, f);
  return std === 0 ? 0 : (avg - med) / std;
};

// Find the minimum value in an array.
stats.min = function(values, f) {
  return stats.extent(values, f)[0];
};

// Find the maximum value in an array.
stats.max = function(values, f) {
  return stats.extent(values, f)[1];
};

// Find the minimum and maximum of an array of values.
// Returns [min, max]; both are undefined if no valid value is found.
stats.extent = function(values, f) {
  f = util.$(f);
  var a, b, v, i, n = values.length;
  // locate the first valid value to seed both bounds
  for (i=0; i<n; ++i) {
    v = f ? f(values[i]) : values[i];
    if (util.isValid(v)) { a = b = v; break; }
  }
  // continue from the seed position, widening the bounds
  for (; i<n; ++i) {
    v = f ? f(values[i]) : values[i];
    if (util.isValid(v)) {
      if (v < a) a = v;
      if (v > b) b = v;
    }
  }
  return [a, b];
};

// Find the integer indices of the minimum and maximum values.
// Returns [minIndex, maxIndex]; both are -1 if no valid value is found.
// On ties the earliest index is kept (comparisons are strict).
stats.extent.index = function(values, f) {
  f = util.$(f);
  var x = -1, y = -1, a, b, v, i, n = values.length;
  // locate the first valid value to seed both bounds and indices
  for (i=0; i<n; ++i) {
    v = f ? f(values[i]) : values[i];
    if (util.isValid(v)) { a = b = v; x = y = i; break; }
  }
  for (; i<n; ++i) {
    v = f ? f(values[i]) : values[i];
    if (util.isValid(v)) {
      if (v < a) { a = v; x = i; }
      if (v > b) { b = v; y = i; }
    }
  }
  return [x, y];
};

// Compute the dot product of two arrays of numbers.
// Call as dot(x, y) with two equal-length arrays, or as dot(table, fa, fb)
// with one array and two accessors. Products that evaluate to NaN are
// left out of the sum.
stats.dot = function(values, a, b) {
  var total = 0, k, prod, fa, fb;

  if (b) {
    // table + two accessors
    fa = util.$(a);
    fb = util.$(b);
    for (k = 0; k < values.length; ++k) {
      prod = fa(values[k]) * fb(values[k]);
      if (prod === prod) total += prod;
    }
    return total;
  }

  // two plain arrays
  if (values.length !== a.length) {
    throw Error('Array lengths must match.');
  }
  for (k = 0; k < values.length; ++k) {
    prod = values[k] * a[k];
    if (prod === prod) total += prod;
  }
  return total;
};

// Vector distance between two arrays of numbers.
// Euclidean distance (exp=2) is the default; any other exponent gives the
// corresponding Minkowski-style distance.
// Call as dist(x, y, exp) or dist(table, fa, fb, exp).
stats.dist = function(values, a, b, exp) {
  var byAccessor = util.isFunction(b) || util.isString(b),
      other = byAccessor ? values : a,
      power = byAccessor ? exp : b,
      euclid = power === 2 || power == null,
      len = values.length,
      acc = 0, k, diff, fa, fb;

  if (byAccessor) {
    fa = util.$(a);
    fb = util.$(b);
  }

  for (k = 0; k < len; ++k) {
    if (byAccessor) {
      diff = fa(values[k]) - fb(other[k]);
    } else {
      diff = values[k] - other[k];
    }
    acc += euclid ? diff * diff : Math.pow(Math.abs(diff), power);
  }

  return euclid ? Math.sqrt(acc) : Math.pow(acc, 1 / power);
};

// Cohen's d effect size between two arrays of numbers.
// Call as cohensd(x, y) or cohensd(table, fa, fb).
stats.cohensd = function(values, a, b) {
  var X, Y;
  if (b) {
    X = values.map(util.$(a));
    Y = values.map(util.$(b));
  } else {
    X = values;
    Y = a;
  }

  var meanX = stats.mean(X),
      meanY = stats.mean(Y),
      nx = stats.count.valid(X),
      ny = stats.count.valid(Y),
      dof = nx + ny - 2;

  // both samples of size 1, or one of them empty: no effect size
  if (dof <= 0) return 0;

  // pooled standard deviation
  var varX = stats.variance(X),
      varY = stats.variance(Y),
      pooled = Math.sqrt((((nx - 1) * varX) + ((ny - 1) * varY)) / dof);

  // no variance at all: no effect size
  if (pooled === 0) return 0;
  return (meanX - meanY) / pooled;
};

// Sample covariance between two arrays of numbers.
// Call as covariance(x, y) or covariance(table, fa, fb).
// Throws when the inputs differ in length, or when a valid value in one
// input is paired with an invalid value in the other.
stats.covariance = function(values, a, b) {
  var X, Y;
  if (b) {
    X = values.map(util.$(a));
    Y = values.map(util.$(b));
  } else {
    X = values;
    Y = a;
  }

  var len = X.length,
      meanX = stats.mean(X),
      meanY = stats.mean(Y),
      acc = 0, pairs = 0, k, okX, okY;

  if (len !== Y.length) {
    throw Error('Input lengths must match.');
  }

  for (k = 0; k < len; ++k) {
    okX = util.isValid(X[k]);
    okY = util.isValid(Y[k]);
    if (!okX && !okY) continue;  // both missing: skip the pair
    if (!(okX && okY)) {
      throw Error('Valid values must align.');
    }
    acc += (X[k] - meanX) * (Y[k] - meanY);
    pairs += 1;
  }
  return acc / (pairs - 1);
};

// Ascending rank scores (1-based) for an array of values.
// Runs of equal values all receive the mean of the ranks they span.
stats.rank = function(values, f) {
  f = util.$(f) || util.identity;

  // pair every value with its original position, then order by value
  var order = values.map(function(d, pos) {
    return {idx: pos, val: f(d)};
  });
  order.sort(util.comparator('val'));

  var len = values.length,
      ranks = Array(len),
      runStart = -1,   // sorted index where the current tie run began
      prev = {},       // sentinel: never strictly equal to any value
      k, cur;

  // give every sorted entry in [from, to) the mean rank of that span
  function spread(from, to) {
    var shared = 1 + (to - 1 + from) / 2, t;
    for (t = from; t < to; ++t) ranks[order[t].idx] = shared;
  }

  for (k = 0; k < len; ++k) {
    cur = order[k].val;
    if (runStart < 0) {
      if (prev === cur) runStart = k - 1;  // a tie run starts at the previous entry
    } else if (prev !== cur) {
      spread(runStart, k);                 // the tie run ended just before k
      runStart = -1;
    }
    ranks[order[k].idx] = k + 1;
    prev = cur;
  }

  // a tie run that reaches the end of the data
  if (runStart > -1) spread(runStart, len);

  return ranks;
};

// Sample Pearson product-moment correlation of two arrays of numbers.
// Call as cor(x, y) or cor(table, fa, fb).
stats.cor = function(values, a, b) {
  var X, Y;
  if (b) {
    Y = values.map(util.$(b));
    X = values.map(util.$(a));
  } else {
    Y = a;
    X = values;
  }

  var cross = stats.dot(X, Y),
      meanX = stats.mean(X),
      meanY = stats.mean(Y),
      sdX = stats.stdev(X),
      sdY = stats.stdev(Y),
      len = values.length;

  return (cross - len * meanX * meanY) / ((len - 1) * sdX * sdY);
};

// Spearman rank correlation of two arrays of values, computed from the
// sum of squared rank differences.
// Call as cor.rank(x, y) or cor.rank(table, fa, fb).
stats.cor.rank = function(values, a, b) {
  var ranksA, ranksB;
  if (b) {
    ranksA = stats.rank(values, a);
    ranksB = stats.rank(values, b);
  } else {
    ranksA = stats.rank(values);
    ranksB = stats.rank(a);
  }

  var len = values.length, sumSq = 0, k, gap;
  for (k = 0; k < len; ++k) {
    gap = ranksA[k] - ranksB[k];
    sumSq += gap * gap;
  }
  return 1 - 6 * sumSq / (len * (len * len - 1));
};

// Distance correlation of two arrays of numbers, built from the
// mean-centered distance matrices returned by stats.dist.mat.
// http://en.wikipedia.org/wiki/Distance_correlation
stats.cor.dist = function(values, a, b) {
  var X, Y;
  if (b) {
    X = values.map(util.$(a));
    Y = values.map(util.$(b));
  } else {
    X = values;
    Y = a;
  }

  var matX = stats.dist.mat(X),
      matY = stats.dist.mat(Y),
      cells = matX.length,
      xx = 0, yy = 0, xy = 0, k;

  for (k = 0; k < cells; ++k) {
    xx += matX[k] * matX[k];
    yy += matY[k] * matY[k];
    xy += matX[k] * matY[k];
  }

  return Math.sqrt(xy / Math.sqrt(xx * yy));
};

// Simple linear regression.
// Produces a "fit" object holding the slope, the intercept, the r value
// (R) and the sum of squared residuals (rss).
// Call as linearRegression(x, y) or linearRegression(table, fa, fb).
stats.linearRegression = function(values, a, b) {
  var X, Y;
  if (b) {
    X = values.map(util.$(a));
    Y = values.map(util.$(b));
  } else {
    X = values;
    Y = a;
  }

  var len = X.length,
      cov = stats.covariance(X, Y),  // throws if valid values do not align
      sdX = stats.stdev(X),
      sdY = stats.stdev(Y),
      slope = cov / (sdX * sdX),
      icept = stats.mean(Y) - slope * stats.mean(X),
      rss = 0, k, res;

  for (k = 0; k < len; ++k) {
    if (!util.isValid(X[k]) || !util.isValid(Y[k])) continue;
    res = (slope * X[k] + icept) - Y[k];
    rss += res * res;
  }

  return {slope: slope, intercept: icept, R: cov / (sdX * sdY), rss: rss};
};

// Namespace for bootstrap
stats.bootstrap = {};

// Bootstrapped confidence interval for the mean.
// Call as ci(values, n, alpha, smooth) or ci(table, f, n, alpha, smooth).
// n is the number of resamples (defaults to 1000), alpha the significance
// level (defaults to 0.05), and smooth an optional smoothing parameter
// handed to the bootstrap sampler.
stats.bootstrap.ci = function(values, a, b, c, d) {
  var withAccessor = util.isFunction(a) || util.isString(a),
      X = withAccessor ? values.map(util.$(a)) : values,
      N = withAccessor ? b : a,
      alpha = withAccessor ? c : b,
      smooth = withAccessor ? d : c,
      sampler, means, k;

  N = N ? +N : 1000;
  alpha = alpha || 0.05;

  sampler = gen.random.bootstrap(X, smooth);
  means = Array(N);
  for (k = 0; k < N; ++k) {
    means[k] = stats.mean(sampler.samples(X.length));
  }
  means.sort(util.numcmp);

  return [
    stats.quantile(means, alpha / 2),
    stats.quantile(means, 1 - (alpha / 2))
  ];
};

// Namespace for z-tests
stats.z = {};

// Construct a z-confidence interval at a given significance level
// Arguments are an array and an optional alpha (defaults to 0.05).
// May also be called with a table, an accessor and an optional alpha.
// Returns [lower, upper] bounds around the sample mean.
stats.z.ci = function(values, a, b) {
  var X = values, alpha = a;
  if (util.isFunction(a) || util.isString(a)) {
    X = values.map(util.$(a));
    alpha = b;
  }
  alpha = alpha || 0.05;

  // 1.96 is the hard-coded critical value for the default alpha; other
  // levels are looked up from the standard normal inverse cdf
  var z = alpha===0.05 ? 1.96 : gen.random.normal(0, 1).icdf(1-(alpha/2)),
      mu = stats.mean(X),
      SE = stats.stdev(X) / Math.sqrt(stats.count.valid(X));
  return [mu - (z*SE), mu + (z*SE)];
};

// Perform a z-test of means. Returns the p-value.
// If a single array is provided, performs a one-sample location test.
// If two arrays or a table and two accessors are provided, performs
// a two-sample location test. A paired test is performed if specified
// by the options hash.
// The options hash format is: {paired: boolean, nullh: number}.
// http://en.wikipedia.org/wiki/Z-test
// http://en.wikipedia.org/wiki/Paired_difference_test
stats.z.test = function(values, a, b, opt) {
  if (util.isFunction(b) || util.isString(b)) { // table and accessors
    return (opt && opt.paired ? ztestP : ztest2)(opt, values, a, b);
  } else if (util.isArray(a)) { // two arrays
    return (b && b.paired ? ztestP : ztest2)(b, values, a);
  } else if (util.isFunction(a) || util.isString(a)) {
    return ztest1(b, values, a); // table and accessor
  } else {
    return ztest1(a, values); // one array
  }
};

// Perform a one-sample z-test of means. Returns the p-value.
// Assuming we have a list of values, and a null hypothesis. If no null
// hypothesis, assume our null hypothesis is mu=0.
// opt is an options hash ({nullh: number}) or a falsy value.
function ztest1(opt, X, f) {
  var nullH = opt && opt.nullh || 0,
      gaussian = gen.random.normal(0, 1),
      mu = stats.mean(X,f),
      SE = stats.stdev(X,f) / Math.sqrt(stats.count.valid(X,f));

  if (SE===0) {
    // Test not well defined when standard error is 0.
    return (mu - nullH) === 0 ? 1 : 0;
  }
  // Two-sided, so twice the one-sided cdf.
  var z = (mu - nullH) / SE;
  return 2 * gaussian.cdf(-Math.abs(z));
}

// Perform a two sample paired z-test of means. Returns the p-value.
// Called with two arrays, or with a table and two accessors.
// Throws if the two inputs differ in length.
function ztestP(opt, values, a, b) {
  var X = b ? values.map(util.$(a)) : values,
      Y = b ? values.map(util.$(b)) : a,
      n1 = stats.count(X),
      n2 = stats.count(Y),
      diffs = Array(), i;

  if (n1 !== n2) {
    throw Error('Array lengths must match.');
  }
  for (i=0; i<n1; ++i) {
    // Only valid differences should contribute to the test statistic
    if (util.isValid(X[i]) && util.isValid(Y[i])) {
      diffs.push(X[i] - Y[i]);
    }
  }
  // Run the one-sample test on the differences. The null hypothesis must
  // travel inside an options hash: ztest1 reads opt.nullh, so passing the
  // bare number (as stats.z.test(diffs, number) would) silently resets it
  // to 0.
  return ztest1({nullh: opt && opt.nullh || 0}, diffs);
}

// Perform a two sample z-test of means. Returns the p-value.
// Called with two arrays, or with a table and two accessors.
function ztest2(opt, values, a, b) {
  var X = b ? values.map(util.$(a)) : values,
      Y = b ? values.map(util.$(b)) : a,
      n1 = stats.count.valid(X),
      n2 = stats.count.valid(Y),
      gaussian = gen.random.normal(0, 1),
      meanDiff = stats.mean(X) - stats.mean(Y) - (opt && opt.nullh || 0),
      SE = Math.sqrt(stats.variance(X)/n1 + stats.variance(Y)/n2);

  if (SE===0) {
    // Not well defined when pooled standard error is 0.
    return meanDiff===0 ? 1 : 0;
  }
  // Two-tailed, so twice the one-sided cdf.
  var z = meanDiff / SE;
  return 2 * gaussian.cdf(-Math.abs(z));
}

// Construct a mean-centered distance matrix for an array of numbers.
// The result is a flat array of n*n entries in row-major order.
stats.dist.mat = function(X) {
  var n = X.length,
      m = n*n,
      A = Array(m),
      R = gen.zeros(n),
      M = 0, v, i, j;

  // pairwise absolute differences, filled symmetrically, with row sums in R
  for (i=0; i<n; ++i) {
    A[i*n+i] = 0;
    for (j=i+1; j<n; ++j) {
      A[i*n+j] = (v = Math.abs(X[i] - X[j]));
      A[j*n+i] = v;
      R[i] += v;
      R[j] += v;
    }
  }

  // turn row sums into row means, and accumulate the grand mean M
  for (i=0; i<n; ++i) {
    M += R[i];
    R[i] /= n;
  }
  M /= m;

  // double centering: subtract row and column means, add the grand mean
  for (i=0; i<n; ++i) {
    for (j=i; j<n; ++j) {
      A[i*n+j] += M - R[i] - R[j];
      A[j*n+i] = A[i*n+j];
    }
  }

  return A;
};

// Compute the Shannon entropy (log base 2) of an array of counts.
// Returns 0 when the counts sum to 0.
stats.entropy = function(counts, f) {
  f = util.$(f);
  var i, p, s = 0, H = 0, n = counts.length;
  for (i=0; i<n; ++i) {
    s += (f ? f(counts[i]) : counts[i]);
  }
  if (s === 0) return 0;
  for (i=0; i<n; ++i) {
    p = (f ? f(counts[i]) : counts[i]) / s;
    // zero-probability entries contribute nothing (and avoid log(0))
    if (p) H += p * Math.log(p);
  }
  return -H / Math.LN2;
};

// Compute the mutual information between two discrete variables.
// Returns an array of the form [MI, MI_distance]
// MI_distance is defined as 1 - I(a,b) / H(a,b).
// http://en.wikipedia.org/wiki/Mutual_information
// Call as mutual(x, y, counts) with three parallel arrays, or as
// mutual(table, fa, fb, fcounts) with a table and three accessors.
stats.mutual = function(values, a, b, counts) {
  var x = counts ? values.map(util.$(a)) : values,
      y = counts ? values.map(util.$(b)) : a,
      z = counts ? values.map(util.$(counts)) : b;

  var px = {},
      py = {},
      n = z.length,
      s = 0, I = 0, H = 0, p, t, i;

  // initialise the marginal totals for every observed category
  for (i=0; i<n; ++i) {
    px[x[i]] = 0;
    py[y[i]] = 0;
  }

  // accumulate marginal totals and the grand total s
  for (i=0; i<n; ++i) {
    px[x[i]] += z[i];
    py[y[i]] += z[i];
    s += z[i];
  }

  // 1/s normalises counts to probabilities; 1/LN2 converts to log base 2
  t = 1 / (s * Math.LN2);
  for (i=0; i<n; ++i) {
    if (z[i] === 0) continue;
    p = (s * z[i]) / (px[x[i]] * py[y[i]]);
    I += z[i] * t * Math.log(p);
    H += z[i] * t * Math.log(z[i]/s);
  }

  // H is accumulated as sum(p * log p), i.e. the negated joint entropy,
  // hence the plus sign here
  return [I, 1 + I/H];
};

// Compute the mutual information between two discrete variables.
stats.mutual.info = function(values, a, b, counts) {
  return stats.mutual(values, a, b, counts)[0];
};

// Compute the mutual information distance between two discrete variables.
// MI_distance is defined as 1 - I(a,b) / H(a,b).
stats.mutual.dist = function(values, a, b, counts) {
  return stats.mutual(values, a, b, counts)[1];
};

// Compute a profile of summary statistics for a variable.
// Arguments are an array and an optional accessor. String values are
// summarised by their length. The returned object holds: type, unique (a
// map from value to occurrence count), count, valid, missing, distinct,
// min, max, mean, stdev, median, q1, q3 and modeskew.
stats.profile = function(values, f) {
  // accept field names as well as functions, like the other stats methods
  if (f) f = util.$(f);

  var hasOwn = Object.prototype.hasOwnProperty,
      mean = 0,
      valid = 0,
      missing = 0,
      distinct = 0,
      min = null,
      max = null,
      M2 = 0,
      vals = [],
      u = {}, delta, sd, i, v, x;

  // compute summary stats
  for (i=0; i<values.length; ++i) {
    v = f ? f(values[i]) : values[i];

    // update unique values; an own-property check keeps values such as
    // "constructor" from being mistaken for inherited members of u
    if (hasOwn.call(u, v)) {
      u[v] += 1;
    } else {
      u[v] = 1;
      distinct += 1;
    }

    if (v == null) {
      ++missing;
    } else if (util.isValid(v)) {
      // update stats
      x = (typeof v === 'string') ? v.length : v;
      if (min===null || x < min) min = x;
      if (max===null || x > max) max = x;
      delta = x - mean;
      mean = mean + delta / (++valid);
      M2 = M2 + delta * (x - mean);
      vals.push(x);
    }
  }

  // sample variance; with fewer than two valid values report 0, matching
  // stats.variance, instead of dividing by zero
  M2 = valid > 1 ? M2 / (valid - 1) : 0;
  sd = Math.sqrt(M2);

  // sort values for median and iqr
  vals.sort(util.cmp);

  return {
    type:     type(values, f),
    unique:   u,
    count:    values.length,
    valid:    valid,
    missing:  missing,
    distinct: distinct,
    min:      min,
    max:      max,
    mean:     mean,
    stdev:    sd,
    median:   (v = stats.quantile(vals, 0.5)),
    q1:       stats.quantile(vals, 0.25),
    q3:       stats.quantile(vals, 0.75),
    modeskew: sd === 0 ? 0 : (mean - v) / sd
  };
};

// Compute profiles for all variables in a data set.
// fields defaults to the keys of the first datum. Each profile is tagged
// with its field, and the returned array carries a __summary__ flag.
stats.summary = function(data, fields) {
  fields = fields || util.keys(data[0]);
  var s = fields.map(function(f) {
    var p = stats.profile(data, util.$(f));
    return (p.field = f, p);
  });
  return (s.__summary__ = true, s);
};