UNPKG

tdigest

Version:

javascript implementation of Dunning's T-Digest for streaming quantile approximation

908 lines 40.4 kB
(function (factory) { if (typeof define === 'function' && define.amd) { // AMD. Register as an anonymous module. define([], factory); } else if (typeof exports === 'object') { // Node. Does not work with strict CommonJS, but // only CommonJS-like enviroments that support module.exports, // like Node. module.exports = factory(); } else { // Browser globals this.tdigest = factory(); } }(function () { var global = this, define; function _require(id) { var module = _require.cache[id]; if (!module) { var exports = {}; module = _require.cache[id] = { id: id, exports: exports }; _require.modules[id].call(exports, module, exports); } return module.exports; } _require.cache = []; _require.modules = [ function (module, exports) { module.exports = { RBTree: _require(2), BinTree: _require(1) }; }, function (module, exports) { var TreeBase = _require(3); function Node(data) { this.data = data; this.left = null; this.right = null; } Node.prototype.get_child = function (dir) { return dir ? this.right : this.left; }; Node.prototype.set_child = function (dir, val) { if (dir) { this.right = val; } else { this.left = val; } }; function BinTree(comparator) { this._root = null; this._comparator = comparator; this.size = 0; } BinTree.prototype = new TreeBase(); // returns true if inserted, false if duplicate // returns true if inserted, false if duplicate BinTree.prototype.insert = function (data) { if (this._root === null) { // empty tree this._root = new Node(data); this.size++; return true; } var dir = 0; // setup // setup var p = null; // parent // parent var node = this._root; // search down // search down while (true) { if (node === null) { // insert new node at the bottom node = new Node(data); p.set_child(dir, node); ret = true; this.size++; return true; } // stop if found // stop if found if (this._comparator(node.data, data) === 0) { return false; } dir = this._comparator(node.data, data) < 0; // update helpers // update helpers p = node; node = node.get_child(dir); } }; // returns true if removed, false if not found // returns true if removed, false if not found BinTree.prototype.remove = function (data) { if (this._root === null) { return false; } var head = new Node(undefined); // fake tree root // fake tree root var node = head; node.right = this._root; var p = null; // parent // parent var found = null; // found item // found item var dir = 1; while (node.get_child(dir) !== null) { p = node; node = node.get_child(dir); var cmp = this._comparator(data, node.data); dir = cmp > 0; if (cmp === 0) { found = node; } } if (found !== null) { found.data = node.data; p.set_child(p.right === node, node.get_child(node.left === null)); this._root = head.right; this.size--; return true; } else { return false; } }; module.exports = BinTree; }, function (module, exports) { var TreeBase = _require(3); function Node(data) { this.data = data; this.left = null; this.right = null; this.red = true; } Node.prototype.get_child = function (dir) { return dir ? this.right : this.left; }; Node.prototype.set_child = function (dir, val) { if (dir) { this.right = val; } else { this.left = val; } }; function RBTree(comparator) { this._root = null; this._comparator = comparator; this.size = 0; } RBTree.prototype = new TreeBase(); // returns true if inserted, false if duplicate // returns true if inserted, false if duplicate RBTree.prototype.insert = function (data) { var ret = false; if (this._root === null) { // empty tree this._root = new Node(data); ret = true; this.size++; } else { var head = new Node(undefined); // fake tree root // fake tree root var dir = 0; var last = 0; // setup // setup var gp = null; // grandparent // grandparent var ggp = head; // grand-grand-parent // grand-grand-parent var p = null; // parent // parent var node = this._root; ggp.right = this._root; // search down // search down while (true) { if (node === null) { // insert new node at the bottom node = new Node(data); p.set_child(dir, node); ret = true; this.size++; } else if (is_red(node.left) && is_red(node.right)) { // color flip node.red = true; node.left.red = false; node.right.red = false; } // fix red violation // fix red violation if (is_red(node) && is_red(p)) { var dir2 = ggp.right === gp; if (node === p.get_child(last)) { ggp.set_child(dir2, single_rotate(gp, !last)); } else { ggp.set_child(dir2, double_rotate(gp, !last)); } } var cmp = this._comparator(node.data, data); // stop if found // stop if found if (cmp === 0) { break; } last = dir; dir = cmp < 0; // update helpers // update helpers if (gp !== null) { ggp = gp; } gp = p; p = node; node = node.get_child(dir); } // update root // update root this._root = head.right; } // make root black // make root black this._root.red = false; return ret; }; // returns true if removed, false if not found // returns true if removed, false if not found RBTree.prototype.remove = function (data) { if (this._root === null) { return false; } var head = new Node(undefined); // fake tree root // fake tree root var node = head; node.right = this._root; var p = null; // parent // parent var gp = null; // grand parent // grand parent var found = null; // found item // found item var dir = 1; while (node.get_child(dir) !== null) { var last = dir; // update helpers // update helpers gp = p; p = node; node = node.get_child(dir); var cmp = this._comparator(data, node.data); dir = cmp > 0; // save found node // save found node if (cmp === 0) { found = node; } // push the red node down // push the red node down if (!is_red(node) && !is_red(node.get_child(dir))) { if (is_red(node.get_child(!dir))) { var sr = single_rotate(node, dir); p.set_child(last, sr); p = sr; } else if (!is_red(node.get_child(!dir))) { var sibling = p.get_child(!last); if (sibling !== null) { if (!is_red(sibling.get_child(!last)) && !is_red(sibling.get_child(last))) { // color flip p.red = false; sibling.red = true; node.red = true; } else { var dir2 = gp.right === p; if (is_red(sibling.get_child(last))) { gp.set_child(dir2, double_rotate(p, last)); } else if (is_red(sibling.get_child(!last))) { gp.set_child(dir2, single_rotate(p, last)); } // ensure correct coloring // ensure correct coloring var gpc = gp.get_child(dir2); gpc.red = true; node.red = true; gpc.left.red = false; gpc.right.red = false; } } } } } // replace and remove if found // replace and remove if found if (found !== null) { found.data = node.data; p.set_child(p.right === node, node.get_child(node.left === null)); this.size--; } // update root and make it black // update root and make it black this._root = head.right; if (this._root !== null) { this._root.red = false; } return found !== null; }; function is_red(node) { return node !== null && node.red; } function single_rotate(root, dir) { var save = root.get_child(!dir); root.set_child(!dir, save.get_child(dir)); save.set_child(dir, root); root.red = true; save.red = false; return save; } function double_rotate(root, dir) { root.set_child(!dir, single_rotate(root.get_child(!dir), !dir)); return single_rotate(root, dir); } module.exports = RBTree; }, function (module, exports) { function TreeBase() { } // removes all nodes from the tree // removes all nodes from the tree TreeBase.prototype.clear = function () { this._root = null; this.size = 0; }; // returns node data if found, null otherwise // returns node data if found, null otherwise TreeBase.prototype.find = function (data) { var res = this._root; while (res !== null) { var c = this._comparator(data, res.data); if (c === 0) { return res.data; } else { res = res.get_child(c > 0); } } return null; }; // returns iterator to node if found, null otherwise // returns iterator to node if found, null otherwise TreeBase.prototype.findIter = function (data) { var res = this._root; var iter = this.iterator(); while (res !== null) { var c = this._comparator(data, res.data); if (c === 0) { iter._cursor = res; return iter; } else { iter._ancestors.push(res); res = res.get_child(c > 0); } } return null; }; // Returns an iterator to the tree node at or immediately after the item // Returns an iterator to the tree node at or immediately after the item TreeBase.prototype.lowerBound = function (item) { var cur = this._root; var iter = this.iterator(); var cmp = this._comparator; while (cur !== null) { var c = cmp(item, cur.data); if (c === 0) { iter._cursor = cur; return iter; } iter._ancestors.push(cur); cur = cur.get_child(c > 0); } for (var i = iter._ancestors.length - 1; i >= 0; --i) { cur = iter._ancestors[i]; if (cmp(item, cur.data) < 0) { iter._cursor = cur; iter._ancestors.length = i; return iter; } } iter._ancestors.length = 0; return iter; }; // Returns an iterator to the tree node immediately after the item // Returns an iterator to the tree node immediately after the item TreeBase.prototype.upperBound = function (item) { var iter = this.lowerBound(item); var cmp = this._comparator; while (iter.data() !== null && cmp(iter.data(), item) === 0) { iter.next(); } return iter; }; // returns null if tree is empty // returns null if tree is empty TreeBase.prototype.min = function () { var res = this._root; if (res === null) { return null; } while (res.left !== null) { res = res.left; } return res.data; }; // returns null if tree is empty // returns null if tree is empty TreeBase.prototype.max = function () { var res = this._root; if (res === null) { return null; } while (res.right !== null) { res = res.right; } return res.data; }; // returns a null iterator // call next() or prev() to point to an element // returns a null iterator // call next() or prev() to point to an element TreeBase.prototype.iterator = function () { return new Iterator(this); }; // calls cb on each node's data, in order // calls cb on each node's data, in order TreeBase.prototype.each = function (cb) { var it = this.iterator(), data; while ((data = it.next()) !== null) { cb(data); } }; // calls cb on each node's data, in reverse order // calls cb on each node's data, in reverse order TreeBase.prototype.reach = function (cb) { var it = this.iterator(), data; while ((data = it.prev()) !== null) { cb(data); } }; function Iterator(tree) { this._tree = tree; this._ancestors = []; this._cursor = null; } Iterator.prototype.data = function () { return this._cursor !== null ? this._cursor.data : null; }; // if null-iterator, returns first node // otherwise, returns next node // if null-iterator, returns first node // otherwise, returns next node Iterator.prototype.next = function () { if (this._cursor === null) { var root = this._tree._root; if (root !== null) { this._minNode(root); } } else { if (this._cursor.right === null) { // no greater node in subtree, go up to parent // if coming from a right child, continue up the stack var save; do { save = this._cursor; if (this._ancestors.length) { this._cursor = this._ancestors.pop(); } else { this._cursor = null; break; } } while (this._cursor.right === save); } else { // get the next node from the subtree this._ancestors.push(this._cursor); this._minNode(this._cursor.right); } } return this._cursor !== null ? this._cursor.data : null; }; // if null-iterator, returns last node // otherwise, returns previous node // if null-iterator, returns last node // otherwise, returns previous node Iterator.prototype.prev = function () { if (this._cursor === null) { var root = this._tree._root; if (root !== null) { this._maxNode(root); } } else { if (this._cursor.left === null) { var save; do { save = this._cursor; if (this._ancestors.length) { this._cursor = this._ancestors.pop(); } else { this._cursor = null; break; } } while (this._cursor.left === save); } else { this._ancestors.push(this._cursor); this._maxNode(this._cursor.left); } } return this._cursor !== null ? this._cursor.data : null; }; Iterator.prototype._minNode = function (start) { while (start.left !== null) { this._ancestors.push(start); start = start.left; } this._cursor = start; }; Iterator.prototype._maxNode = function (start) { while (start.right !== null) { this._ancestors.push(start); start = start.right; } this._cursor = start; }; module.exports = TreeBase; }, function (module, exports) { // // TDigest: // // approximate distribution percentiles from a stream of reals // var RBTree = _require(0).RBTree; function TDigest(delta, K, CX) { // allocate a TDigest structure. // // delta is the compression factor, the max fraction of mass that // can be owned by one centroid (bigger, up to 1.0, means more // compression). delta=false switches off TDigest behavior and treats // the distribution as discrete, with no merging and exact values // reported. // // K is a size threshold that triggers recompression as the TDigest // grows during input. (Set it to 0 to disable automatic recompression) // // CX specifies how often to update cached cumulative totals used // for quantile estimation during ingest (see cumulate()). Set to // 0 to use exact quantiles for each new point. // this.discrete = delta === false; this.delta = delta || 0.01; this.K = K === undefined ? 25 : K; this.CX = CX === undefined ? 1.1 : CX; this.centroids = new RBTree(compare_centroid_means); this.nreset = 0; this.reset(); } TDigest.prototype.reset = function () { // prepare to digest new points. // this.centroids.clear(); this.n = 0; this.nreset += 1; this.last_cumulate = 0; }; TDigest.prototype.size = function () { return this.centroids.size; }; TDigest.prototype.toArray = function (everything) { // return {mean,n} of centroids as an array ordered by mean. // var result = []; if (everything) { this._cumulate(true); // be sure cumns are exact // be sure cumns are exact this.centroids.each(function (c) { result.push(c); }); } else { this.centroids.each(function (c) { result.push({ mean: c.mean, n: c.n }); }); } return result; }; TDigest.prototype.summary = function () { var approx = this.discrete ? 'exact ' : 'approximating '; var s = [ approx + this.n + ' samples using ' + this.size() + ' centroids', 'min = ' + this.percentile(0), 'Q1 = ' + this.percentile(0.25), 'Q2 = ' + this.percentile(0.5), 'Q3 = ' + this.percentile(0.75), 'max = ' + this.percentile(1) ]; return s.join('\n'); }; function compare_centroid_means(a, b) { // order two centroids by mean. // return a.mean > b.mean ? 1 : a.mean < b.mean ? -1 : 0; } function compare_centroid_mean_cumns(a, b) { // order two centroids by mean_cumn. // return a.mean_cumn - b.mean_cumn; } TDigest.prototype.push = function (x, n) { // incorporate value or array of values x, having count n into the // TDigest. n defaults to 1. // n = n || 1; x = Array.isArray(x) ? x : [x]; for (var i = 0; i < x.length; i++) { this._digest(x[i], n); } }; TDigest.prototype.push_centroid = function (c) { // incorporate centroid or array of centroids c // c = Array.isArray(c) ? c : [c]; for (var i = 0; i < c.length; i++) { this._digest(c[i].mean, c[i].n); } }; TDigest.prototype._cumulate = function (exact) { // update cumulative counts for each centroid // // exact: falsey means only cumulate after sufficient // growth. During ingest, these counts are used as quantile // estimates, and they work well even when somewhat out of // date. (this is a departure from the publication, you may set CX // to 0 to disable). // if (this.n === this.last_cumulate || !exact && this.CX && this.CX > this.n / this.last_cumulate) { return; } var cumn = 0; var self = this; this.centroids.each(function (c) { c.mean_cumn = cumn + c.n / 2; // half of n at the mean // half of n at the mean cumn = c.cumn = cumn + c.n; }); this.n = this.last_cumulate = cumn; }; TDigest.prototype.find_nearest = function (x) { // find the centroid closest to x. The assumption of // unique means and a unique nearest centroid departs from the // paper, see _digest() below // if (this.size() === 0) { return null; } var iter = this.centroids.lowerBound({ mean: x }); // x <= iter || iter==null // x <= iter || iter==null var c = iter.data() === null ? iter.prev() : iter.data(); if (c.mean === x || this.discrete) { return c; // c is either x or a neighbor (discrete: no distance func) } // c is either x or a neighbor (discrete: no distance func) var prev = iter.prev(); if (prev && Math.abs(prev.mean - x) < Math.abs(c.mean - x)) { return prev; } else { return c; } }; TDigest.prototype._new_centroid = function (x, n, cumn) { // create and insert a new centroid into the digest (don't update // cumulatives). // var c = { mean: x, n: n, cumn: cumn }; this.centroids.insert(c); this.n += n; return c; }; TDigest.prototype._addweight = function (nearest, x, n) { // add weight at location x to nearest centroid. adding x to // nearest will not shift its relative position in the tree and // require reinsertion. // if (x !== nearest.mean) { nearest.mean += n * (x - nearest.mean) / (nearest.n + n); } nearest.cumn += n; nearest.mean_cumn += n / 2; nearest.n += n; this.n += n; }; TDigest.prototype._digest = function (x, n) { // incorporate value x, having count n into the TDigest. // var min = this.centroids.min(); var max = this.centroids.max(); var nearest = this.find_nearest(x); if (nearest && nearest.mean === x) { // accumulate exact matches into the centroid without // limit. this is a departure from the paper, made so // centroids remain unique and code can be simple. this._addweight(nearest, x, n); } else if (nearest === min) { this._new_centroid(x, n, 0); // new point around min boundary } else // new point around min boundary if (nearest === max) { this._new_centroid(x, n, this.n); // new point around max boundary } else // new point around max boundary if (this.discrete) { this._new_centroid(x, n, nearest.cumn); // never merge } else { // never merge // conider a merge based on nearest centroid's capacity. if // there's not room for all of n, don't bother merging any of // it into nearest, as we'll have to make a new centroid // anyway for the remainder (departure from the paper). var p = nearest.mean_cumn / this.n; var max_n = Math.floor(4 * this.n * this.delta * p * (1 - p)); if (max_n - nearest.n >= n) { this._addweight(nearest, x, n); } else { this._new_centroid(x, n, nearest.cumn); } } this._cumulate(false); if (!this.discrete && this.K && this.size() > this.K / this.delta) { // re-process the centroids and hope for some compression. this.compress(); } }; TDigest.prototype.bound_mean = function (x) { // find centroids lower and upper such that lower.mean < x < // upper.mean or lower.mean === x === upper.mean. Don't call // this for x out of bounds. // var iter = this.centroids.upperBound({ mean: x }); // x < iter // x < iter var lower = iter.prev(); // lower <= x // lower <= x var upper = lower.mean === x ? lower : iter.next(); return [ lower, upper ]; }; TDigest.prototype.p_rank = function (x_or_xlist) { // return approximate percentile-ranks (0..1) for data value x. // or list of x. calculated according to // https://en.wikipedia.org/wiki/Percentile_rank // // (Note that in continuous mode, boundary sample values will // report half their centroid weight inward from 0/1 as the // percentile-rank. X values outside the observed range return // 0/1) // // this triggers cumulate() if cumn's are out of date. // var xs = Array.isArray(x_or_xlist) ? x_or_xlist : [x_or_xlist]; var ps = xs.map(this._p_rank, this); return Array.isArray(x_or_xlist) ? ps : ps[0]; }; TDigest.prototype._p_rank = function (x) { if (this.size() === 0) { return undefined; } else if (x < this.centroids.min().mean) { return 0; } else if (x > this.centroids.max().mean) { return 1; } // find centroids that bracket x and interpolate x's cumn from // their cumn's. // find centroids that bracket x and interpolate x's cumn from // their cumn's. this._cumulate(true); // be sure cumns are exact // be sure cumns are exact var bound = this.bound_mean(x); var lower = bound[0], upper = bound[1]; if (this.discrete) { return lower.cumn / this.n; } else { var cumn = lower.mean_cumn; if (lower !== upper) { cumn += (x - lower.mean) * (upper.mean_cumn - lower.mean_cumn) / (upper.mean - lower.mean); } return cumn / this.n; } }; TDigest.prototype.bound_mean_cumn = function (cumn) { // find centroids lower and upper such that lower.mean_cumn < x < // upper.mean_cumn or lower.mean_cumn === x === upper.mean_cumn. Don't call // this for cumn out of bounds. // // XXX because mean and mean_cumn give rise to the same sort order // (up to identical means), use the mean rbtree for our search. this.centroids._comparator = compare_centroid_mean_cumns; var iter = this.centroids.upperBound({ mean_cumn: cumn }); // cumn < iter // cumn < iter this.centroids._comparator = compare_centroid_means; var lower = iter.prev(); // lower <= cumn // lower <= cumn var upper = lower && lower.mean_cumn === cumn ? lower : iter.next(); return [ lower, upper ]; }; TDigest.prototype.percentile = function (p_or_plist) { // for percentage p (0..1), or for each p in a list of ps, return // the smallest data value q at which at least p percent of the // observations <= q. // // for discrete distributions, this selects q using the Nearest // Rank Method // (https://en.wikipedia.org/wiki/Percentile#The_Nearest_Rank_method) // (in scipy, same as percentile(...., interpolation='higher') // // for continuous distributions, interpolates data values between // count-weighted bracketing means. // // this triggers cumulate() if cumn's are out of date. // var ps = Array.isArray(p_or_plist) ? p_or_plist : [p_or_plist]; var qs = ps.map(this._percentile, this); return Array.isArray(p_or_plist) ? qs : qs[0]; }; TDigest.prototype._percentile = function (p) { if (this.size() === 0) { return undefined; } this._cumulate(true); // be sure cumns are exact // be sure cumns are exact var min = this.centroids.min(); var max = this.centroids.max(); var h = this.n * p; var bound = this.bound_mean_cumn(h); var lower = bound[0], upper = bound[1]; if (upper === lower || lower === null || upper === null) { return (lower || upper).mean; } else if (!this.discrete) { return lower.mean + (h - lower.mean_cumn) * (upper.mean - lower.mean) / (upper.mean_cumn - lower.mean_cumn); } else if (h <= lower.cumn) { return lower.mean; } else { return upper.mean; } }; function pop_random(choices) { // remove and return an item randomly chosen from the array of choices // (mutates choices) // var idx = Math.floor(Math.random() * choices.length); return choices.splice(idx, 1)[0]; } TDigest.prototype.compress = function () { // TDigests experience worst case compression (none) when input // increases monotonically. Improve on any bad luck by // reconsuming digest centroids as if they were weighted points // while shuffling their order (and hope for the best). // if (this.compressing) { return; } var points = this.toArray(); this.reset(); this.compressing = true; while (points.length > 0) { this.push_centroid(pop_random(points)); } this._cumulate(true); this.compressing = false; }; function Digest(config) { // allocate a distribution digest structure. This is an extension // of a TDigest structure that starts in exact histogram (discrete) // mode, and automatically switches to TDigest mode for large // samples that appear to be from a continuous distribution. // this.config = config || {}; this.mode = this.config.mode || 'auto'; // disc, cont, auto // disc, cont, auto TDigest.call(this, this.mode === 'cont' ? config.delta : false); this.digest_ratio = this.config.ratio || 0.9; this.digest_thresh = this.config.thresh || 1000; this.n_unique = 0; } Digest.prototype = Object.create(TDigest.prototype); Digest.prototype.constructor = Digest; Digest.prototype.push = function (x_or_xlist) { TDigest.prototype.push.call(this, x_or_xlist); this.check_continuous(); }; Digest.prototype._new_centroid = function (x, n, cumn) { this.n_unique += 1; TDigest.prototype._new_centroid.call(this, x, n, cumn); }; Digest.prototype._addweight = function (nearest, x, n) { if (nearest.n === 1) { this.n_unique -= 1; } TDigest.prototype._addweight.call(this, nearest, x, n); }; Digest.prototype.check_continuous = function () { // while in 'auto' mode, if there are many unique elements, assume // they are from a continuous distribution and switch to 'cont' // mode (tdigest behavior). Return true on transition from // disctete to continuous. if (this.mode !== 'auto' || this.size() < this.digest_thresh) { return false; } if (this.n_unique / this.size() > this.digest_ratio) { this.mode = 'cont'; this.discrete = false; this.delta = this.config.delta || 0.01; this.compress(); return true; } return false; }; module.exports = { 'TDigest': TDigest, 'Digest': Digest }; } ]; return _require(4); }));