tdigest

JavaScript implementation of Dunning's T-Digest for streaming quantile approximation

//
// TDigest:
//
// approximate distribution percentiles from a stream of reals
//
var RBTree = require('bintrees').RBTree;

function TDigest(delta, K, CX) {
    // allocate a TDigest structure.
    //
    // delta is the compression factor, the max fraction of mass that
    // can be owned by one centroid (bigger, up to 1.0, means more
    // compression). delta=false switches off TDigest behavior and treats
    // the distribution as discrete, with no merging and exact values
    // reported.
    //
    // K is a size threshold that triggers recompression as the TDigest
    // grows during input. (Set it to 0 to disable automatic recompression)
    //
    // CX specifies how often to update cached cumulative totals used
    // for quantile estimation during ingest (see cumulate()). Set to
    // 0 to use exact quantiles for each new point.
    //
    this.discrete = (delta === false);
    this.delta = delta || 0.01;
    this.K = (K === undefined) ? 25 : K;
    this.CX = (CX === undefined) ? 1.1 : CX;
    this.centroids = new RBTree(compare_centroid_means);
    this.nreset = 0;
    this.reset();
}

TDigest.prototype.reset = function() {
    // prepare to digest new points.
    //
    this.centroids.clear();
    this.n = 0;
    this.nreset += 1;
    this.last_cumulate = 0;
};

TDigest.prototype.size = function() {
    return this.centroids.size;
};

TDigest.prototype.toArray = function(everything) {
    // return {mean,n} of centroids as an array ordered by mean.
    //
    var result = [];
    if (everything) {
        this._cumulate(true); // be sure cumns are exact
        this.centroids.each(function(c) { result.push(c); });
    } else {
        this.centroids.each(function(c) {
            result.push({mean: c.mean, n: c.n});
        });
    }
    return result;
};

TDigest.prototype.summary = function() {
    var approx = (this.discrete) ? "exact " : "approximating ";
    var s = [approx + this.n + " samples using " + this.size() + " centroids",
             "min = "+this.percentile(0),
             "Q1 = "+this.percentile(0.25),
             "Q2 = "+this.percentile(0.5),
             "Q3 = "+this.percentile(0.75),
             "max = "+this.percentile(1.0)];
    return s.join('\n');
};

function compare_centroid_means(a, b) {
    // order two centroids by mean.
    //
    return (a.mean > b.mean) ? 1 : (a.mean < b.mean) ? -1 : 0;
}

function compare_centroid_mean_cumns(a, b) {
    // order two centroids by mean_cumn.
    //
    return (a.mean_cumn - b.mean_cumn);
}

TDigest.prototype.push = function(x, n) {
    // incorporate value or array of values x, having count n into the
    // TDigest. n defaults to 1.
    //
    n = n || 1;
    x = Array.isArray(x) ? x : [x];
    for (var i = 0; i < x.length; i++) {
        this._digest(x[i], n);
    }
};

TDigest.prototype.push_centroid = function(c) {
    // incorporate centroid or array of centroids c
    //
    c = Array.isArray(c) ? c : [c];
    for (var i = 0; i < c.length; i++) {
        this._digest(c[i].mean, c[i].n);
    }
};

TDigest.prototype._cumulate = function(exact) {
    // update cumulative counts for each centroid
    //
    // exact: falsey means only cumulate after sufficient
    // growth. During ingest, these counts are used as quantile
    // estimates, and they work well even when somewhat out of
    // date. (this is a departure from the publication, you may set CX
    // to 0 to disable).
    //
    if (this.n === this.last_cumulate ||
        !exact && this.CX && this.CX > (this.n / this.last_cumulate)) {
        return;
    }
    var cumn = 0;
    this.centroids.each(function(c) {
        c.mean_cumn = cumn + c.n / 2; // half of n at the mean
        cumn = c.cumn = cumn + c.n;
    });
    this.n = this.last_cumulate = cumn;
};
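
// A minimal usage sketch of the ingest API above (illustrative, not
// part of the library; summary() output varies with the input):
//
//     var TDigest = require('tdigest').TDigest;
//     var td = new TDigest();          // delta=0.01, K=25, CX=1.1
//     td.push(42);                     // a single value
//     td.push([1.5, 2.5, 3.5], 2);     // an array, each value with count 2
//     console.log(td.summary());       // min/Q1/Q2/Q3/max estimates
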
TDigest.prototype.find_nearest = function(x) {
    // find the centroid closest to x. The assumption of unique means
    // and a unique nearest centroid departs from the paper, see
    // _digest() below.
    //
    if (this.size() === 0) {
        return null;
    }
    var iter = this.centroids.lowerBound({mean: x}); // x <= iter || iter==null
    var c = (iter.data() === null) ? iter.prev() : iter.data();
    if (c.mean === x || this.discrete) {
        return c; // c is either x or a neighbor (discrete: no distance func)
    }
    var prev = iter.prev();
    if (prev && Math.abs(prev.mean - x) < Math.abs(c.mean - x)) {
        return prev;
    } else {
        return c;
    }
};

TDigest.prototype._new_centroid = function(x, n, cumn) {
    // create and insert a new centroid into the digest (don't update
    // cumulatives).
    //
    var c = {mean: x, n: n, cumn: cumn};
    this.centroids.insert(c);
    this.n += n;
    return c;
};

TDigest.prototype._addweight = function(nearest, x, n) {
    // add weight at location x to nearest centroid. adding x to
    // nearest will not shift its relative position in the tree and
    // require reinsertion.
    //
    if (x !== nearest.mean) {
        nearest.mean += n * (x - nearest.mean) / (nearest.n + n);
    }
    nearest.cumn += n;
    nearest.mean_cumn += n / 2;
    nearest.n += n;
    this.n += n;
};

TDigest.prototype._digest = function(x, n) {
    // incorporate value x, having count n into the TDigest.
    //
    var min = this.centroids.min();
    var max = this.centroids.max();
    var nearest = this.find_nearest(x);
    if (nearest && nearest.mean === x) {
        // accumulate exact matches into the centroid without
        // limit. this is a departure from the paper, made so
        // centroids remain unique and code can be simple.
        this._addweight(nearest, x, n);
    } else if (nearest === min) {
        this._new_centroid(x, n, 0); // new point around min boundary
    } else if (nearest === max) {
        this._new_centroid(x, n, this.n); // new point around max boundary
    } else if (this.discrete) {
        this._new_centroid(x, n, nearest.cumn); // never merge
    } else {
        // consider a merge based on nearest centroid's capacity. if
        // there's not room for all of n, don't bother merging any of
        // it into nearest, as we'll have to make a new centroid
        // anyway for the remainder (departure from the paper).
        var p = nearest.mean_cumn / this.n;
        var max_n = Math.floor(4 * this.n * this.delta * p * (1 - p));
        if (max_n - nearest.n >= n) {
            this._addweight(nearest, x, n);
        } else {
            this._new_centroid(x, n, nearest.cumn);
        }
    }
    this._cumulate(false);
    if (!this.discrete && this.K && this.size() > this.K / this.delta) {
        // re-process the centroids and hope for some compression.
        this.compress();
    }
};

TDigest.prototype.bound_mean = function(x) {
    // find centroids lower and upper such that lower.mean < x <
    // upper.mean or lower.mean === x === upper.mean. Don't call
    // this for x out of bounds.
    //
    var iter = this.centroids.upperBound({mean: x}); // x < iter
    var lower = iter.prev(); // lower <= x
    var upper = (lower.mean === x) ? lower : iter.next();
    return [lower, upper];
};

TDigest.prototype.p_rank = function(x_or_xlist) {
    // return approximate percentile-ranks (0..1) for data value x,
    // or for each x in a list. calculated according to
    // https://en.wikipedia.org/wiki/Percentile_rank
    //
    // (Note that in continuous mode, boundary sample values will
    // report half their centroid weight inward from 0/1 as the
    // percentile-rank. X values outside the observed range return
    // 0/1)
    //
    // this triggers cumulate() if cumn's are out of date.
    //
    var xs = Array.isArray(x_or_xlist) ? x_or_xlist : [x_or_xlist];
    var ps = xs.map(this._p_rank, this);
    return Array.isArray(x_or_xlist) ? ps : ps[0];
};
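
// Worked instance of the capacity bound applied in _digest() above
// (hypothetical numbers, for illustration only): with delta=0.01 and
// this.n=1000 points digested, a centroid near the median has
// p = mean_cumn / n of about 0.5, so
//
//     max_n = floor(4 * 1000 * 0.01 * 0.5 * (1 - 0.5)) = 10
//
// and it may hold at most 10 points, while a centroid near either
// tail (p close to 0 or 1) gets a max_n near 0 and forces new
// centroids, which is what preserves accuracy at extreme quantiles.
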
TDigest.prototype._p_rank = function(x) {
    if (this.size() === 0) {
        return undefined;
    } else if (x < this.centroids.min().mean) {
        return 0.0;
    } else if (x > this.centroids.max().mean) {
        return 1.0;
    }
    // find centroids that bracket x and interpolate x's cumn from
    // their cumn's.
    this._cumulate(true); // be sure cumns are exact
    var bound = this.bound_mean(x);
    var lower = bound[0], upper = bound[1];
    if (this.discrete) {
        return lower.cumn / this.n;
    } else {
        var cumn = lower.mean_cumn;
        if (lower !== upper) {
            cumn += (x - lower.mean) * (upper.mean_cumn - lower.mean_cumn) /
                (upper.mean - lower.mean);
        }
        return cumn / this.n;
    }
};

TDigest.prototype.bound_mean_cumn = function(cumn) {
    // find centroids lower and upper such that lower.mean_cumn < cumn <
    // upper.mean_cumn or lower.mean_cumn === cumn === upper.mean_cumn.
    // Don't call this for cumn out of bounds.
    //
    // XXX because mean and mean_cumn give rise to the same sort order
    // (up to identical means), use the mean rbtree for our search.
    this.centroids._comparator = compare_centroid_mean_cumns;
    var iter = this.centroids.upperBound({mean_cumn: cumn}); // cumn < iter
    this.centroids._comparator = compare_centroid_means;

    var lower = iter.prev(); // lower <= cumn
    var upper = (lower && lower.mean_cumn === cumn) ? lower : iter.next();
    return [lower, upper];
};

TDigest.prototype.percentile = function(p_or_plist) {
    // for a fraction p (0..1), or for each p in a list of ps, return
    // the smallest data value q at which at least a fraction p of the
    // observations are <= q.
    //
    // for discrete distributions, this selects q using the Nearest
    // Rank Method
    // (https://en.wikipedia.org/wiki/Percentile#The_Nearest_Rank_method)
    // (in scipy, same as percentile(..., interpolation='higher'))
    //
    // for continuous distributions, interpolates data values between
    // count-weighted bracketing means.
    //
    // this triggers cumulate() if cumn's are out of date.
    //
    var ps = Array.isArray(p_or_plist) ? p_or_plist : [p_or_plist];
    var qs = ps.map(this._percentile, this);
    return Array.isArray(p_or_plist) ? qs : qs[0];
};

TDigest.prototype._percentile = function(p) {
    if (this.size() === 0) {
        return undefined;
    }
    this._cumulate(true); // be sure cumns are exact
    var h = this.n * p;
    var bound = this.bound_mean_cumn(h);
    var lower = bound[0], upper = bound[1];

    if (upper === lower || lower === null || upper === null) {
        return (lower || upper).mean;
    } else if (!this.discrete) {
        return lower.mean + (h - lower.mean_cumn) * (upper.mean - lower.mean) /
            (upper.mean_cumn - lower.mean_cumn);
    } else if (h <= lower.cumn) {
        return lower.mean;
    } else {
        return upper.mean;
    }
};

function pop_random(choices) {
    // remove and return an item randomly chosen from the array of choices
    // (mutates choices)
    //
    var idx = Math.floor(Math.random() * choices.length);
    return choices.splice(idx, 1)[0];
}

TDigest.prototype.compress = function() {
    // TDigests experience worst case compression (none) when input
    // increases monotonically. Improve on any bad luck by
    // reconsuming digest centroids as if they were weighted points
    // while shuffling their order (and hope for the best).
    //
    if (this.compressing) {
        return;
    }
    var points = this.toArray();
    this.reset();
    this.compressing = true;
    while (points.length > 0) {
        this.push_centroid(pop_random(points));
    }
    this._cumulate(true);
    this.compressing = false;
};
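
// Sketch of the two query directions defined above (illustrative;
// each pushed value becomes its own centroid here, so the answers
// follow directly from the interpolation rules):
//
//     var td = new TDigest();
//     td.push([1, 2, 3, 4, 5]);
//     td.percentile(0.5);          // -> 3, the median value
//     td.p_rank(4);                // -> 0.7, percentile-rank of 4
//     td.p_rank([0, 6]);           // -> [0, 1], out-of-range values clamp
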
function Digest(config) {
    // allocate a distribution digest structure. This is an extension
    // of a TDigest structure that starts in exact histogram (discrete)
    // mode, and automatically switches to TDigest mode for large
    // samples that appear to be from a continuous distribution.
    //
    this.config = config || {};
    this.mode = this.config.mode || 'auto'; // disc, cont, auto
    TDigest.call(this, this.mode === 'cont' ? config.delta : false);
    this.digest_ratio = this.config.ratio || 0.9;
    this.digest_thresh = this.config.thresh || 1000;
    this.n_unique = 0;
}
Digest.prototype = Object.create(TDigest.prototype);
Digest.prototype.constructor = Digest;

Digest.prototype.push = function(x_or_xlist) {
    TDigest.prototype.push.call(this, x_or_xlist);
    this.check_continuous();
};

Digest.prototype._new_centroid = function(x, n, cumn) {
    this.n_unique += 1;
    TDigest.prototype._new_centroid.call(this, x, n, cumn);
};

Digest.prototype._addweight = function(nearest, x, n) {
    if (nearest.n === 1) {
        this.n_unique -= 1;
    }
    TDigest.prototype._addweight.call(this, nearest, x, n);
};

Digest.prototype.check_continuous = function() {
    // while in 'auto' mode, if there are many unique elements, assume
    // they are from a continuous distribution and switch to 'cont'
    // mode (tdigest behavior). Return true on transition from
    // discrete to continuous.
    //
    if (this.mode !== 'auto' || this.size() < this.digest_thresh) {
        return false;
    }
    if (this.n_unique / this.size() > this.digest_ratio) {
        this.mode = 'cont';
        this.discrete = false;
        this.delta = this.config.delta || 0.01;
        this.compress();
        return true;
    }
    return false;
};

module.exports = {
    'TDigest': TDigest,
    'Digest': Digest
};
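
// Sketch of the Digest wrapper's auto mode (illustrative; the exact
// switch point depends on the thresh/ratio config above):
//
//     var Digest = require('tdigest').Digest;
//     var d = new Digest();       // starts in exact (discrete) mode
//     d.push([1, 2, 2, 3]);
//     d.percentile(0.5);          // -> 2, exact nearest-rank result
//     // once >= 1000 centroids exist and more than 90% of the
//     // distinct values have been seen only once, check_continuous()
//     // flips the mode to 'cont' and t-digest approximation takes over.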