datalib
Version:
JavaScript utilities for loading, summarizing and working with data.
721 lines (640 loc) • 19.4 kB
JavaScript
var util = require('./util');
var type = require('./import/type');
var gen = require('./generate');
var stats = module.exports;
// Collect unique values.
// values: the input array.
// f: optional accessor, resolved through util.$ and applied to each entry.
// results: optional array to append to (a new array is used by default).
// Output: an array of unique values, in first-observed order.
// Values are told apart by their property-key (string) form, so 1 and '1'
// count as the same value.
stats.unique = function(values, f, results) {
  f = util.$(f);
  results = results || [];
  // A prototype-less lookup table: with a plain {} the `in` test would also
  // match inherited names such as 'toString' or 'constructor', and those
  // values would be silently dropped from the output.
  var seen = Object.create(null), v, i, n;
  for (i=0, n=values.length; i<n; ++i) {
    v = f ? f(values[i]) : values[i];
    if (v in seen) continue;
    seen[v] = 1;
    results.push(v);
  }
  return results;
};
// Return the length of the input array.
// A missing (falsy) input, or one without a usable length, counts as 0.
stats.count = function(values) {
  if (!values) return 0;
  return values.length || 0;
};
// Count the number of non-null, non-undefined, non-NaN values,
// as judged by util.isValid.
// f: optional accessor (resolved through util.$) applied to each entry first.
stats.count.valid = function(values, f) {
  f = util.$(f);
  var total = 0, idx = 0, len = values.length, item;
  while (idx < len) {
    item = f ? f(values[idx]) : values[idx];
    if (util.isValid(item)) ++total;
    ++idx;
  }
  return total;
};
// Count the number of null or undefined values.
// f: optional accessor (resolved through util.$) applied to each entry first.
// NaN is not treated as missing here.
stats.count.missing = function(values, f) {
  f = util.$(f);
  var absent = 0, idx = 0, len = values.length, item;
  while (idx < len) {
    item = f ? f(values[idx]) : values[idx];
    if (item == null) ++absent;
    ++idx;
  }
  return absent;
};
// Count the number of distinct values.
// Null, undefined and NaN are each considered distinct values.
// f: optional accessor (resolved through util.$) applied to each entry first.
// Values are told apart by their property-key (string) form.
stats.count.distinct = function(values, f) {
  f = util.$(f);
  // Prototype-less table so that inherited names ('toString', 'constructor',
  // ...) are not treated as already seen and therefore left uncounted.
  var seen = Object.create(null), v, i, n, count = 0;
  for (i=0, n=values.length; i<n; ++i) {
    v = f ? f(values[i]) : values[i];
    if (v in seen) continue;
    seen[v] = 1;
    count += 1;
  }
  return count;
};
// Construct a map from distinct values to occurrence counts.
// f: optional accessor (resolved through util.$) applied to each entry first.
// Returns a plain object keyed by the string form of each value.
stats.count.map = function(values, f) {
  f = util.$(f);
  var owns = Object.prototype.hasOwnProperty,
      map = {}, v, i, n;
  for (i=0, n=values.length; i<n; ++i) {
    v = f ? f(values[i]) : values[i];
    // Test own properties only: `in` would also find inherited members
    // (e.g. 'toString'), and adding 1 to those yields a garbage string.
    map[v] = owns.call(map, v) ? map[v] + 1 : 1;
  }
  return map;
};
// Compute the median of an array of numbers.
// f: optional accessor applied to each entry before anything else.
// Invalid entries (per util.isValid) are discarded; the remainder is sorted
// in a fresh array, so the caller's array is left untouched.
stats.median = function(values, f) {
  var data = f ? values.map(util.$(f)) : values;
  var sorted = data.filter(util.isValid);
  sorted.sort(util.cmp);
  return stats.quantile(sorted, 0.5);
};
// Computes the quartile boundaries of an array of numbers.
// f: optional accessor applied to each entry first.
// Returns [q1, median, q3], computed with stats.quantile over the valid
// entries after sorting a copy of them.
stats.quartile = function(values, f) {
  var data = f ? values.map(util.$(f)) : values,
      sorted = data.filter(util.isValid).sort(util.cmp);
  return [0.25, 0.50, 0.75].map(function(p) {
    return stats.quantile(sorted, p);
  });
};
// Compute the quantile of a sorted array of numbers.
// Adapted from the D3.js implementation.
// Call as quantile(values, p) or quantile(values, f, p), where f is an
// accessor and p is the quantile to compute (0.5 for the median).
// The result is linearly interpolated between the two nearest entries.
stats.quantile = function(values, f, p) {
  if (p === undefined) { p = f; f = util.identity; }
  f = util.$(f);
  var pos = (values.length - 1) * p + 1,   // 1-based fractional position
      lo = Math.floor(pos),
      base = +f(values[lo - 1]),
      frac = pos - lo;
  if (!frac) return base;
  return base + frac * (f(values[lo]) - base);
};
// Compute the sum of an array of numbers.
// f: optional accessor (resolved through util.$) applied to each entry first.
// Entries rejected by util.isValid are skipped; an empty input sums to 0.
stats.sum = function(values, f) {
  f = util.$(f);
  var total = 0, idx = 0, len = values.length, item;
  for (; idx < len; ++idx) {
    item = f ? f(values[idx]) : values[idx];
    if (!util.isValid(item)) continue;
    total += item;
  }
  return total;
};
// Compute the mean (average) of an array of numbers.
// f: optional accessor (resolved through util.$) applied to each entry first.
// Uses a running (incremental) update over the valid entries only;
// returns 0 when there are none.
stats.mean = function(values, f) {
  f = util.$(f);
  var avg = 0, seen = 0, len = values.length, idx, item;
  for (idx = 0; idx < len; ++idx) {
    item = f ? f(values[idx]) : values[idx];
    if (!util.isValid(item)) continue;
    seen += 1;
    avg = avg + (item - avg) / seen;
  }
  return avg;
};
// Compute the geometric mean of an array of numbers.
// f: optional accessor (resolved through util.$) applied to each entry first.
// Invalid entries are skipped. Throws if any valid entry is zero or
// negative. Returns 0 when there are no valid entries.
stats.mean.geometric = function(values, f) {
  f = util.$(f);
  var product = 1, seen = 0, len = values.length, idx, item;
  for (idx = 0; idx < len; ++idx) {
    item = f ? f(values[idx]) : values[idx];
    if (!util.isValid(item)) continue;
    if (item <= 0) {
      throw Error("Geometric mean only defined for positive values.");
    }
    product *= item;
    seen += 1;
  }
  if (seen > 0) return Math.pow(product, 1/seen);
  return 0;
};
// Compute the harmonic mean of an array of numbers.
// f: optional accessor (resolved through util.$) applied to each entry first.
// Invalid entries are skipped. The result is the count of valid entries
// divided by the sum of their reciprocals.
stats.mean.harmonic = function(values, f) {
  f = util.$(f);
  var reciprocals = 0, seen = 0, len = values.length, idx, item;
  for (idx = 0; idx < len; ++idx) {
    item = f ? f(values[idx]) : values[idx];
    if (!util.isValid(item)) continue;
    reciprocals += 1/item;
    seen += 1;
  }
  return seen / reciprocals;
};
// Compute the sample variance of an array of numbers.
// f: optional accessor (resolved through util.$) applied to each entry first.
// Uses a single-pass running update of the mean and of the sum of squared
// deviations (M2), skipping entries rejected by util.isValid.
// Returns 0 when the input is not an array or holds fewer than two valid
// values.
stats.variance = function(values, f) {
  f = util.$(f);
  if (!util.isArray(values) || values.length < 2) return 0;
  var mean = 0, M2 = 0, delta, i, c, v;
  for (i=0, c=0; i<values.length; ++i) {
    v = f ? f(values[i]) : values[i];
    if (util.isValid(v)) {
      delta = v - mean;
      mean = mean + delta / (++c);
      M2 = M2 + delta * (v - mean);
    }
  }
  // The length guard above counts invalid entries too, so the number of
  // valid values may still be 0 or 1 here; dividing by (c - 1) would then
  // produce -0 or NaN instead of the intended 0.
  return c > 1 ? M2 / (c - 1) : 0;
};
// Compute the sample standard deviation of an array of numbers:
// the square root of stats.variance for the same arguments.
stats.stdev = function(values, f) {
  var variance = stats.variance(values, f);
  return Math.sqrt(variance);
};
// Compute the mode skewness, (mean - median) / stdev, of an array of numbers.
// f: optional accessor forwarded to stats.mean, stats.median and stats.stdev.
// Returns 0 when the standard deviation is 0.
stats.modeskew = function(values, f) {
  var center = stats.mean(values, f);
  var middle = stats.median(values, f);
  var spread = stats.stdev(values, f);
  if (spread === 0) return 0;
  return (center - middle) / spread;
};
// Find the minimum value in an array: the first element of stats.extent.
// f: optional accessor forwarded to stats.extent.
stats.min = function(values, f) {
  var range = stats.extent(values, f);
  return range[0];
};
// Find the maximum value in an array: the second element of stats.extent.
// f: optional accessor forwarded to stats.extent.
stats.max = function(values, f) {
  var range = stats.extent(values, f);
  return range[1];
};
// Find the minimum and maximum of an array of values.
// f: optional accessor (resolved through util.$) applied to each entry first.
// Returns [min, max] over the entries accepted by util.isValid; both are
// undefined when no entry is valid.
stats.extent = function(values, f) {
  f = util.$(f);
  var lo, hi, item, idx = 0, len = values.length;
  // Seed both bounds with the first valid value.
  while (idx < len) {
    item = f ? f(values[idx]) : values[idx];
    if (util.isValid(item)) { lo = hi = item; break; }
    ++idx;
  }
  // Continue from the seed position, widening the bounds as needed.
  while (idx < len) {
    item = f ? f(values[idx]) : values[idx];
    if (util.isValid(item)) {
      if (item < lo) lo = item;
      if (item > hi) hi = item;
    }
    ++idx;
  }
  return [lo, hi];
};
// Find the integer indices of the minimum and maximum values.
// f: optional accessor (resolved through util.$) applied to each entry first.
// Returns [minIndex, maxIndex]; both are -1 when no entry is valid.
// On ties the earliest index wins, because only strict comparisons update.
stats.extent.index = function(values, f) {
  f = util.$(f);
  var minAt = -1, maxAt = -1, lo, hi, item, idx = 0, len = values.length;
  // Seed bounds and positions with the first valid value.
  while (idx < len) {
    item = f ? f(values[idx]) : values[idx];
    if (util.isValid(item)) {
      lo = hi = item;
      minAt = maxAt = idx;
      break;
    }
    ++idx;
  }
  // Continue from the seed position, tracking where the bounds move.
  while (idx < len) {
    item = f ? f(values[idx]) : values[idx];
    if (util.isValid(item)) {
      if (item < lo) { lo = item; minAt = idx; }
      if (item > hi) { hi = item; maxAt = idx; }
    }
    ++idx;
  }
  return [minAt, maxAt];
};
// Compute the dot product of two arrays of numbers.
// Call as dot(x, y) with two equal-length arrays, or as dot(table, a, b)
// with two accessors evaluated against each entry of one array.
// Products that come out as NaN are left out of the sum.
// Throws when two arrays of different lengths are given.
stats.dot = function(values, a, b) {
  var total = 0, idx, prod;
  if (b) {
    // table + two accessors
    a = util.$(a);
    b = util.$(b);
    for (idx = 0; idx < values.length; ++idx) {
      prod = a(values[idx]) * b(values[idx]);
      if (prod !== prod) continue;
      total += prod;
    }
    return total;
  }
  // two plain arrays
  if (values.length !== a.length) {
    throw Error('Array lengths must match.');
  }
  for (idx = 0; idx < values.length; ++idx) {
    prod = values[idx] * a[idx];
    if (prod !== prod) continue;
    total += prod;
  }
  return total;
};
// Compute the vector distance between two arrays of numbers.
// Default is Euclidean (exp=2) distance, configurable via exp argument.
// Call as dist(x, y, exp) with two arrays, or as dist(table, a, b, exp)
// where a and b are accessors (b being a function or a string selects
// this second form).
stats.dist = function(values, a, b, exp) {
  var byAccessor = util.isFunction(b) || util.isString(b),
      other = byAccessor ? values : a,
      power = byAccessor ? exp : b,
      euclidean = power === 2 || power == null,
      len = values.length,
      acc = 0, diff, idx;
  if (byAccessor) {
    a = util.$(a);
    b = util.$(b);
  }
  for (idx = 0; idx < len; ++idx) {
    if (byAccessor) {
      diff = a(values[idx]) - b(other[idx]);
    } else {
      diff = values[idx] - other[idx];
    }
    acc += euclidean ? diff*diff : Math.pow(Math.abs(diff), power);
  }
  if (euclidean) return Math.sqrt(acc);
  return Math.pow(acc, 1/power);
};
// Compute the Cohen's d effect size between two arrays of numbers.
// Call as cohensd(x, y) with two arrays, or as cohensd(table, a, b) with
// two accessors. The difference of means is divided by the pooled
// standard deviation. Returns 0 when there are too few valid values or
// when the pooled deviation is 0.
stats.cohensd = function(values, a, b) {
  var X, Y;
  if (b) {
    X = values.map(util.$(a));
    Y = values.map(util.$(b));
  } else {
    X = values;
    Y = a;
  }
  var meanX = stats.mean(X),
      meanY = stats.mean(Y),
      nX = stats.count.valid(X),
      nY = stats.count.valid(Y);
  // if both arrays are size 1, or one is empty, there's no effect size
  if ((nX+nY-2) <= 0) return 0;
  // pool standard deviation
  var varX = stats.variance(X),
      varY = stats.variance(Y),
      pooled = Math.sqrt((((nX-1)*varX) + ((nY-1)*varY)) / (nX+nY-2));
  // if there is no variance, there's no effect size
  if (pooled === 0) return 0;
  return (meanX - meanY) / pooled;
};
// Computes the covariance between two arrays of numbers.
// Call as covariance(x, y) with two arrays, or as covariance(table, a, b)
// with two accessors.
// Throws when the inputs differ in length, or when a position is valid
// (per util.isValid) in one input but not in the other.
stats.covariance = function(values, a, b) {
  var X = b ? values.map(util.$(a)) : values,
      Y = b ? values.map(util.$(b)) : a,
      len = X.length,
      meanX = stats.mean(X),
      meanY = stats.mean(Y),
      acc = 0, pairs = 0, idx, okX, okY;
  if (len !== Y.length) {
    throw Error('Input lengths must match.');
  }
  for (idx = 0; idx < len; ++idx) {
    okX = util.isValid(X[idx]);
    okY = util.isValid(Y[idx]);
    if (!(okX && okY)) {
      // exactly one side valid: the inputs are misaligned
      if (okX || okY) throw Error('Valid values must align.');
      continue;
    }
    acc += (X[idx]-meanX) * (Y[idx]-meanY);
    pairs += 1;
  }
  return acc / (pairs-1);
};
// Compute ascending rank scores for an array of values.
// Ties are assigned their collective mean rank.
// f: optional accessor (resolved through util.$); util.identity is used
// when none is given.
// Returns an array r of the same length as values, where r[k] is the
// 1-based rank of values[k].
stats.rank = function(values, f) {
f = util.$(f) || util.identity;
// Pair each (accessed) value with its original position, then sort the
// pairs using the comparator util.comparator builds for 'val'.
var a = values.map(function(v, i) {
return {idx: i, val: f(v)};
})
.sort(util.comparator('val'));
// tie: start index (in sorted order) of the current run of equal values,
//      or -1 when not inside such a run.
// p:   previous sorted value; it starts as a fresh object so that it is
//      never === to the first value.
var n = values.length,
r = Array(n),
tie = -1, p = {}, i, v, mu;
for (i=0; i<n; ++i) {
v = a[i].val;
if (tie < 0 && p === v) {
// a run of equal values began at the previous position
tie = i - 1;
} else if (tie > -1 && p !== v) {
// the run covering sorted positions tie..i-1 just ended:
// overwrite its ranks (tie+1 .. i) with their mean
mu = 1 + (i-1 + tie) / 2;
for (; tie<i; ++tie) r[a[tie].idx] = mu;
tie = -1;
}
// provisional rank; replaced later if this entry belongs to a tie run
r[a[i].idx] = i + 1;
p = v;
}
// close a run that extends to the end of the sorted array
if (tie > -1) {
mu = 1 + (n-1 + tie) / 2;
for (; tie<n; ++tie) r[a[tie].idx] = mu;
}
return r;
};
// Compute the sample Pearson product-moment correlation of two arrays of numbers.
// Call as cor(x, y) with two arrays, or as cor(table, a, b) with two
// accessors. Computed as (dot(x,y) - n*mean(x)*mean(y)) divided by
// ((n-1)*stdev(x)*stdev(y)), with n taken from the first argument's length.
stats.cor = function(values, a, b) {
  var X, Y;
  if (b) {
    Y = values.map(util.$(b));
    X = values.map(util.$(a));
  } else {
    Y = a;
    X = values;
  }
  var product = stats.dot(X, Y),
      meanX = stats.mean(X),
      meanY = stats.mean(Y),
      sdX = stats.stdev(X),
      sdY = stats.stdev(Y),
      len = values.length,
      numer = product - len*meanX*meanY,
      denom = (len-1) * sdX * sdY;
  return numer / denom;
};
// Compute the Spearman rank correlation of two arrays of values.
// Call as cor.rank(x, y) with two arrays, or as cor.rank(table, a, b)
// with two accessors. Ranks come from stats.rank; the result is
// 1 - 6*sum(d^2) / (n*(n^2-1)), where d is the per-entry rank difference.
stats.cor.rank = function(values, a, b) {
  var rankA, rankB;
  if (b) {
    rankA = stats.rank(values, a);
    rankB = stats.rank(values, b);
  } else {
    rankA = stats.rank(values);
    rankB = stats.rank(a);
  }
  var len = values.length, sumSq = 0, idx = 0, gap;
  while (idx < len) {
    gap = rankA[idx] - rankB[idx];
    sumSq += gap * gap;
    ++idx;
  }
  return 1 - 6*sumSq / (len * (len*len-1));
};
// Compute the distance correlation of two arrays of numbers.
// http://en.wikipedia.org/wiki/Distance_correlation
// Call as cor.dist(x, y) with two arrays, or as cor.dist(table, a, b)
// with two accessors. Both inputs are converted with stats.dist.mat and
// the resulting flat matrices are combined element by element.
stats.cor.dist = function(values, a, b) {
  var X, Y;
  if (b) {
    X = values.map(util.$(a));
    Y = values.map(util.$(b));
  } else {
    X = values;
    Y = a;
  }
  var matA = stats.dist.mat(X),
      matB = stats.dist.mat(Y),
      size = matA.length,
      sumAA = 0, sumBB = 0, sumAB = 0, k = 0;
  while (k < size) {
    sumAA += matA[k]*matA[k];
    sumBB += matB[k]*matB[k];
    sumAB += matA[k]*matB[k];
    ++k;
  }
  return Math.sqrt(sumAB / Math.sqrt(sumAA*sumBB));
};
// Simple linear regression.
// Returns a "fit" object with slope (slope), intercept (intercept),
// r value (R), and sum-squared residual error (rss).
// Call as linearRegression(x, y) with two arrays, or as
// linearRegression(table, a, b) with two accessors.
// Positions where either value is invalid are left out of rss.
stats.linearRegression = function(values, a, b) {
  var X, Y;
  if (b) {
    X = values.map(util.$(a));
    Y = values.map(util.$(b));
  } else {
    X = values;
    Y = a;
  }
  var len = X.length,
      cov = stats.covariance(X, Y), // will throw err if valid vals don't align
      sdX = stats.stdev(X),
      sdY = stats.stdev(Y),
      slope = cov / (sdX*sdX),
      icept = stats.mean(Y) - slope * stats.mean(X),
      fit = {slope: slope, intercept: icept, R: cov / (sdX*sdY), rss: 0},
      idx, residual;
  for (idx = 0; idx < len; ++idx) {
    if (!util.isValid(X[idx]) || !util.isValid(Y[idx])) continue;
    residual = (slope*X[idx] + icept) - Y[idx];
    fit.rss += residual * residual;
  }
  return fit;
};
// Namespace for bootstrap
stats.bootstrap = {};
// Construct a bootstrapped confidence interval at a given percentile level
// Arguments are an array, an optional n (defaults to 1000),
// an optional alpha (defaults to 0.05), and an optional smoothing parameter
// An accessor (function or string) may be inserted as the second argument,
// in which case the remaining arguments shift right by one.
// Returns [lower, upper]: the alpha/2 and 1-alpha/2 quantiles of the
// means of n resamples drawn through gen.random.bootstrap.
stats.bootstrap.ci = function(values, a, b, c, d) {
  var hasAccessor = util.isFunction(a) || util.isString(a),
      X = hasAccessor ? values.map(util.$(a)) : values,
      N = hasAccessor ? b : a,
      alpha = hasAccessor ? c : b,
      smooth = hasAccessor ? d : c,
      sampler, means, k;
  N = N ? +N : 1000;
  alpha = alpha || 0.05;
  sampler = gen.random.bootstrap(X, smooth);
  means = Array(N);
  for (k = 0; k < N; ++k) {
    means[k] = stats.mean(sampler.samples(X.length));
  }
  means.sort(util.numcmp);
  var lower = stats.quantile(means, alpha/2),
      upper = stats.quantile(means, 1-(alpha/2));
  return [lower, upper];
};
// Namespace for z-tests
stats.z = {};
// Construct a z-confidence interval at a given significance level
// Arguments are an array and an optional alpha (defaults to 0.05).
// An accessor (function or string) may be inserted as the second argument,
// in which case alpha becomes the third.
// Returns [mean - z*SE, mean + z*SE], where SE is the standard deviation
// divided by the square root of the number of valid values.
stats.z.ci = function(values, a, b) {
  var hasAccessor = util.isFunction(a) || util.isString(a),
      X = hasAccessor ? values.map(util.$(a)) : values,
      alpha = (hasAccessor ? b : a) || 0.05,
      z;
  if (alpha === 0.05) {
    // critical value for the default level
    z = 1.96;
  } else {
    z = gen.random.normal(0, 1).icdf(1-(alpha/2));
  }
  var mu = stats.mean(X),
      SE = stats.stdev(X) / Math.sqrt(stats.count.valid(X));
  return [mu - (z*SE), mu + (z*SE)];
};
// Perform a z-test of means. Returns the p-value.
// If a single array is provided, performs a one-sample location test.
// If two arrays or a table and two accessors are provided, performs
// a two-sample location test. A paired test is performed if specified
// by the options hash.
// The options hash format is: {paired: boolean, nullh: number}.
// http://en.wikipedia.org/wiki/Z-test
// http://en.wikipedia.org/wiki/Paired_difference_test
// Accepted call shapes:
//   test(table, a, b, opt)  - table and two accessors
//   test(x, y, opt)         - two arrays
//   test(table, a, opt)     - table and one accessor
//   test(x, opt)            - one array
stats.z.test = function(values, a, b, opt) {
  var twoSample;
  if (util.isFunction(b) || util.isString(b)) {
    // table and accessors
    twoSample = (opt && opt.paired) ? ztestP : ztest2;
    return twoSample(opt, values, a, b);
  }
  if (util.isArray(a)) {
    // two arrays; the options hash arrives in the third position
    twoSample = (b && b.paired) ? ztestP : ztest2;
    return twoSample(b, values, a);
  }
  if (util.isFunction(a) || util.isString(a)) {
    // table and accessor
    return ztest1(b, values, a);
  }
  // one array
  return ztest1(a, values);
};
// Perform a one-sample z-test of means. Returns the p-value.
// opt: optional options hash; opt.nullh is the hypothesized mean
//      (0 when absent).
// X: the values; f: optional accessor forwarded to the stats helpers.
// When the standard error is exactly 0 the result is 1 if the mean equals
// the hypothesized value and 0 otherwise.
function ztest1(opt, X, f) {
  var hypothesized = (opt && opt.nullh) || 0;
  var stdNormal = gen.random.normal(0, 1);
  var offset = stats.mean(X,f) - hypothesized;
  var stderr = stats.stdev(X,f) / Math.sqrt(stats.count.valid(X,f));
  if (stderr === 0) {
    // Test not well defined when standard error is 0.
    return offset === 0 ? 1 : 0;
  }
  // Two-sided, so twice the one-sided cdf.
  return 2 * stdNormal.cdf(-Math.abs(offset / stderr));
}
// Perform a two sample paired z-test of means. Returns the p-value.
// opt: optional options hash; opt.nullh is the hypothesized mean of the
//      pairwise differences (0 when absent).
// Call with (opt, x, y) for two arrays or (opt, table, a, b) for a table
// and two accessors.
// Throws when the two inputs differ in length.
function ztestP(opt, values, a, b) {
  var X = b ? values.map(util.$(a)) : values,
      Y = b ? values.map(util.$(b)) : a,
      n1 = stats.count(X),
      n2 = stats.count(Y),
      diffs = [], i;
  if (n1 !== n2) {
    throw Error('Array lengths must match.');
  }
  for (i=0; i<n1; ++i) {
    // Only valid differences should contribute to the test statistic
    if (util.isValid(X[i]) && util.isValid(Y[i])) {
      diffs.push(X[i] - Y[i]);
    }
  }
  // Hand the null hypothesis over as an options hash: the one-sample test
  // reads it from opt.nullh, so passing the bare number (as was done
  // before) caused it to be ignored and 0 to be tested instead.
  return stats.z.test(diffs, {nullh: opt && opt.nullh || 0});
}
// Perform a two sample z-test of means. Returns the p-value.
// opt: optional options hash; opt.nullh is the hypothesized difference of
//      means (0 when absent).
// Call with (opt, x, y) for two arrays or (opt, table, a, b) for a table
// and two accessors.
// When the standard error is exactly 0 the result is 1 if the adjusted
// mean difference is 0 and 0 otherwise.
function ztest2(opt, values, a, b) {
  var X, Y;
  if (b) {
    X = values.map(util.$(a));
    Y = values.map(util.$(b));
  } else {
    X = values;
    Y = a;
  }
  var nX = stats.count.valid(X),
      nY = stats.count.valid(Y),
      stdNormal = gen.random.normal(0, 1),
      shift = stats.mean(X) - stats.mean(Y) - (opt && opt.nullh || 0),
      stderr = Math.sqrt(stats.variance(X)/nX + stats.variance(Y)/nY);
  if (stderr === 0) {
    // Not well defined when pooled standard error is 0.
    return shift === 0 ? 1 : 0;
  }
  // Two-tailed, so twice the one-sided cdf.
  return 2 * stdNormal.cdf(-Math.abs(shift / stderr));
}
// Construct a mean-centered distance matrix for an array of numbers.
// X: input array of n numbers.
// Returns a flat array of n*n entries, addressed as A[i*n+j], holding the
// pairwise absolute differences |X[i]-X[j]| after subtracting the row mean
// and column mean and adding back the overall mean.
stats.dist.mat = function(X) {
// R accumulates per-row sums (presumably gen.zeros(n) is an array of n
// zeros - confirm against the generate module); M will hold the total.
var n = X.length,
m = n*n,
A = Array(m),
R = gen.zeros(n),
M = 0, v, i, j;
// Fill the symmetric matrix of absolute differences, visiting each
// unordered pair once and adding the distance to both row sums.
for (i=0; i<n; ++i) {
A[i*n+i] = 0;
for (j=i+1; j<n; ++j) {
A[i*n+j] = (v = Math.abs(X[i] - X[j]));
A[j*n+i] = v;
R[i] += v;
R[j] += v;
}
}
// Turn row sums into row means, and the running total into the mean
// over all n*n entries.
for (i=0; i<n; ++i) {
M += R[i];
R[i] /= n;
}
M /= m;
// Center: the matrix is symmetric, so row means double as column means.
// Only the upper triangle is computed; the result is mirrored below.
for (i=0; i<n; ++i) {
for (j=i; j<n; ++j) {
A[i*n+j] += M - R[i] - R[j];
A[j*n+i] = A[i*n+j];
}
}
return A;
};
// Compute the Shannon entropy (log base 2) of an array of counts.
// f: optional accessor (resolved through util.$) applied to each entry first.
// Counts are normalized by their total; zero proportions contribute
// nothing. Returns 0 when the counts sum to 0.
stats.entropy = function(counts, f) {
  f = util.$(f);
  var len = counts.length, total = 0, acc = 0, idx, share;
  for (idx = 0; idx < len; ++idx) {
    total += (f ? f(counts[idx]) : counts[idx]);
  }
  if (total === 0) return 0;
  for (idx = 0; idx < len; ++idx) {
    share = (f ? f(counts[idx]) : counts[idx]) / total;
    if (!share) continue;
    acc += share * Math.log(share);
  }
  // convert from natural log to log base 2
  return -acc / Math.LN2;
};
// Compute the mutual information between two discrete variables.
// Returns an array of the form [MI, MI_distance]
// MI_distance is defined as 1 - I(a,b) / H(a,b).
// http://en.wikipedia.org/wiki/Mutual_information
// Call as mutual(x, y, z) with three parallel arrays (two variables and
// the count for each pairing), or as mutual(table, a, b, counts) with
// three accessors. Logarithms are base 2.
stats.mutual = function(values, a, b, counts) {
  var xs, ys, zs;
  if (counts) {
    xs = values.map(util.$(a));
    ys = values.map(util.$(b));
    zs = values.map(util.$(counts));
  } else {
    xs = values;
    ys = a;
    zs = b;
  }
  var margX = {},
      margY = {},
      len = zs.length,
      total = 0, info = 0, joint = 0, scale, ratio, idx;
  // start every marginal at zero before accumulating
  for (idx = 0; idx < len; ++idx) {
    margX[xs[idx]] = 0;
    margY[ys[idx]] = 0;
  }
  // marginal totals per x value and per y value, plus the grand total
  for (idx = 0; idx < len; ++idx) {
    margX[xs[idx]] += zs[idx];
    margY[ys[idx]] += zs[idx];
    total += zs[idx];
  }
  // normalizes by the total and converts natural log to base 2
  scale = 1 / (total * Math.LN2);
  for (idx = 0; idx < len; ++idx) {
    if (zs[idx] === 0) continue;
    ratio = (total * zs[idx]) / (margX[xs[idx]] * margY[ys[idx]]);
    info += zs[idx] * scale * Math.log(ratio);
    joint += zs[idx] * scale * Math.log(zs[idx]/total);
  }
  return [info, 1 + info/joint];
};
// Compute the mutual information between two discrete variables:
// the first element of the pair returned by stats.mutual.
stats.mutual.info = function(values, a, b, counts) {
  var pair = stats.mutual(values, a, b, counts);
  return pair[0];
};
// Compute the mutual information distance between two discrete variables.
// MI_distance is defined as 1 - I(a,b) / H(a,b).
// This is the second element of the pair returned by stats.mutual.
stats.mutual.dist = function(values, a, b, counts) {
  var pair = stats.mutual(values, a, b, counts);
  return pair[1];
};
// Compute a profile of summary statistics for a variable.
// values: the input array.
// f: optional accessor; resolved through util.$ like the other functions in
//    this module, so a function is used as given.
// Returns an object with: type, unique (value -> occurrence count), count,
// valid, missing, distinct, min, max, mean, stdev, median, q1, q3, modeskew.
// String values take part in the numeric statistics through their length.
stats.profile = function(values, f) {
  f = util.$(f);
  var owns = Object.prototype.hasOwnProperty,
      mean = 0,
      valid = 0,
      missing = 0,
      distinct = 0,
      min = null,
      max = null,
      M2 = 0,
      vals = [],
      u = {}, delta, sd, i, v, x;

  // compute summary stats
  for (i=0; i<values.length; ++i) {
    v = f ? f(values[i]) : values[i];

    // update unique values; test own properties only, since `in` would also
    // match inherited names such as 'toString' and corrupt the counts
    if (owns.call(u, v)) {
      u[v] += 1;
    } else {
      u[v] = 1;
      distinct += 1;
    }

    if (v == null) {
      ++missing;
    } else if (util.isValid(v)) {
      // update stats
      x = (typeof v === 'string') ? v.length : v;
      if (min===null || x < min) min = x;
      if (max===null || x > max) max = x;
      // running update of the mean and the sum of squared deviations
      delta = x - mean;
      mean = mean + delta / (++valid);
      M2 = M2 + delta * (x - mean);
      vals.push(x);
    }
  }
  // sample variance is 0 with fewer than two valid values; dividing by
  // (valid - 1) there would give NaN or -0
  M2 = valid > 1 ? M2 / (valid - 1) : 0;
  sd = Math.sqrt(M2);

  // sort values for median and iqr
  vals.sort(util.cmp);

  return {
    type:     type(values, f),
    unique:   u,
    count:    values.length,
    valid:    valid,
    missing:  missing,
    distinct: distinct,
    min:      min,
    max:      max,
    mean:     mean,
    stdev:    sd,
    median:   (v = stats.quantile(vals, 0.5)),
    q1:       stats.quantile(vals, 0.25),
    q3:       stats.quantile(vals, 0.75),
    modeskew: sd === 0 ? 0 : (mean - v) / sd
  };
};
// Compute profiles for all variables in a data set.
// data: array of records.
// fields: optional list of field names; defaults to util.keys of the
//         first record.
// Returns an array with one stats.profile result per field, each tagged
// with its field name; the array itself carries a __summary__ = true flag.
stats.summary = function(data, fields) {
  var names = fields || util.keys(data[0]);
  var profiles = names.map(function(name) {
    var p = stats.profile(data, util.$(name));
    p.field = name;
    return p;
  });
  profiles.__summary__ = true;
  return profiles;
};