tdigest
Version:
javascript implementation of Dunning's T-Digest for streaming quantile approximation
908 lines • 40.4 kB
JavaScript
(function (factory) {
if (typeof define === 'function' && define.amd) {
// AMD. Register as an anonymous module.
define([], factory);
} else if (typeof exports === 'object') {
// Node. Does not work with strict CommonJS, but
// only CommonJS-like enviroments that support module.exports,
// like Node.
module.exports = factory();
} else {
// Browser globals
this.tdigest = factory();
}
}(function () {
var global = this, define;
function _require(id) {
var module = _require.cache[id];
if (!module) {
var exports = {};
module = _require.cache[id] = {
id: id,
exports: exports
};
_require.modules[id].call(exports, module, exports);
}
return module.exports;
}
_require.cache = [];
_require.modules = [
function (module, exports) {
module.exports = {
RBTree: _require(2),
BinTree: _require(1)
};
},
function (module, exports) {
var TreeBase = _require(3);
function Node(data) {
this.data = data;
this.left = null;
this.right = null;
}
Node.prototype.get_child = function (dir) {
return dir ? this.right : this.left;
};
Node.prototype.set_child = function (dir, val) {
if (dir) {
this.right = val;
} else {
this.left = val;
}
};
function BinTree(comparator) {
this._root = null;
this._comparator = comparator;
this.size = 0;
}
BinTree.prototype = new TreeBase(); // returns true if inserted, false if duplicate
// returns true if inserted, false if duplicate
BinTree.prototype.insert = function (data) {
if (this._root === null) {
// empty tree
this._root = new Node(data);
this.size++;
return true;
}
var dir = 0; // setup
// setup
var p = null; // parent
// parent
var node = this._root; // search down
// search down
while (true) {
if (node === null) {
// insert new node at the bottom
node = new Node(data);
p.set_child(dir, node);
ret = true;
this.size++;
return true;
} // stop if found
// stop if found
if (this._comparator(node.data, data) === 0) {
return false;
}
dir = this._comparator(node.data, data) < 0; // update helpers
// update helpers
p = node;
node = node.get_child(dir);
}
}; // returns true if removed, false if not found
// returns true if removed, false if not found
BinTree.prototype.remove = function (data) {
if (this._root === null) {
return false;
}
var head = new Node(undefined); // fake tree root
// fake tree root
var node = head;
node.right = this._root;
var p = null; // parent
// parent
var found = null; // found item
// found item
var dir = 1;
while (node.get_child(dir) !== null) {
p = node;
node = node.get_child(dir);
var cmp = this._comparator(data, node.data);
dir = cmp > 0;
if (cmp === 0) {
found = node;
}
}
if (found !== null) {
found.data = node.data;
p.set_child(p.right === node, node.get_child(node.left === null));
this._root = head.right;
this.size--;
return true;
} else {
return false;
}
};
module.exports = BinTree;
},
function (module, exports) {
var TreeBase = _require(3);
function Node(data) {
this.data = data;
this.left = null;
this.right = null;
this.red = true;
}
Node.prototype.get_child = function (dir) {
return dir ? this.right : this.left;
};
Node.prototype.set_child = function (dir, val) {
if (dir) {
this.right = val;
} else {
this.left = val;
}
};
function RBTree(comparator) {
this._root = null;
this._comparator = comparator;
this.size = 0;
}
RBTree.prototype = new TreeBase(); // returns true if inserted, false if duplicate
// returns true if inserted, false if duplicate
RBTree.prototype.insert = function (data) {
var ret = false;
if (this._root === null) {
// empty tree
this._root = new Node(data);
ret = true;
this.size++;
} else {
var head = new Node(undefined); // fake tree root
// fake tree root
var dir = 0;
var last = 0; // setup
// setup
var gp = null; // grandparent
// grandparent
var ggp = head; // grand-grand-parent
// grand-grand-parent
var p = null; // parent
// parent
var node = this._root;
ggp.right = this._root; // search down
// search down
while (true) {
if (node === null) {
// insert new node at the bottom
node = new Node(data);
p.set_child(dir, node);
ret = true;
this.size++;
} else if (is_red(node.left) && is_red(node.right)) {
// color flip
node.red = true;
node.left.red = false;
node.right.red = false;
} // fix red violation
// fix red violation
if (is_red(node) && is_red(p)) {
var dir2 = ggp.right === gp;
if (node === p.get_child(last)) {
ggp.set_child(dir2, single_rotate(gp, !last));
} else {
ggp.set_child(dir2, double_rotate(gp, !last));
}
}
var cmp = this._comparator(node.data, data); // stop if found
// stop if found
if (cmp === 0) {
break;
}
last = dir;
dir = cmp < 0; // update helpers
// update helpers
if (gp !== null) {
ggp = gp;
}
gp = p;
p = node;
node = node.get_child(dir);
} // update root
// update root
this._root = head.right;
} // make root black
// make root black
this._root.red = false;
return ret;
}; // returns true if removed, false if not found
// returns true if removed, false if not found
RBTree.prototype.remove = function (data) {
if (this._root === null) {
return false;
}
var head = new Node(undefined); // fake tree root
// fake tree root
var node = head;
node.right = this._root;
var p = null; // parent
// parent
var gp = null; // grand parent
// grand parent
var found = null; // found item
// found item
var dir = 1;
while (node.get_child(dir) !== null) {
var last = dir; // update helpers
// update helpers
gp = p;
p = node;
node = node.get_child(dir);
var cmp = this._comparator(data, node.data);
dir = cmp > 0; // save found node
// save found node
if (cmp === 0) {
found = node;
} // push the red node down
// push the red node down
if (!is_red(node) && !is_red(node.get_child(dir))) {
if (is_red(node.get_child(!dir))) {
var sr = single_rotate(node, dir);
p.set_child(last, sr);
p = sr;
} else if (!is_red(node.get_child(!dir))) {
var sibling = p.get_child(!last);
if (sibling !== null) {
if (!is_red(sibling.get_child(!last)) && !is_red(sibling.get_child(last))) {
// color flip
p.red = false;
sibling.red = true;
node.red = true;
} else {
var dir2 = gp.right === p;
if (is_red(sibling.get_child(last))) {
gp.set_child(dir2, double_rotate(p, last));
} else if (is_red(sibling.get_child(!last))) {
gp.set_child(dir2, single_rotate(p, last));
} // ensure correct coloring
// ensure correct coloring
var gpc = gp.get_child(dir2);
gpc.red = true;
node.red = true;
gpc.left.red = false;
gpc.right.red = false;
}
}
}
}
} // replace and remove if found
// replace and remove if found
if (found !== null) {
found.data = node.data;
p.set_child(p.right === node, node.get_child(node.left === null));
this.size--;
} // update root and make it black
// update root and make it black
this._root = head.right;
if (this._root !== null) {
this._root.red = false;
}
return found !== null;
};
function is_red(node) {
return node !== null && node.red;
}
function single_rotate(root, dir) {
var save = root.get_child(!dir);
root.set_child(!dir, save.get_child(dir));
save.set_child(dir, root);
root.red = true;
save.red = false;
return save;
}
function double_rotate(root, dir) {
root.set_child(!dir, single_rotate(root.get_child(!dir), !dir));
return single_rotate(root, dir);
}
module.exports = RBTree;
},
function (module, exports) {
function TreeBase() {
} // removes all nodes from the tree
// removes all nodes from the tree
TreeBase.prototype.clear = function () {
this._root = null;
this.size = 0;
}; // returns node data if found, null otherwise
// returns node data if found, null otherwise
TreeBase.prototype.find = function (data) {
var res = this._root;
while (res !== null) {
var c = this._comparator(data, res.data);
if (c === 0) {
return res.data;
} else {
res = res.get_child(c > 0);
}
}
return null;
}; // returns iterator to node if found, null otherwise
// returns iterator to node if found, null otherwise
TreeBase.prototype.findIter = function (data) {
var res = this._root;
var iter = this.iterator();
while (res !== null) {
var c = this._comparator(data, res.data);
if (c === 0) {
iter._cursor = res;
return iter;
} else {
iter._ancestors.push(res);
res = res.get_child(c > 0);
}
}
return null;
}; // Returns an iterator to the tree node at or immediately after the item
// Returns an iterator to the tree node at or immediately after the item
TreeBase.prototype.lowerBound = function (item) {
var cur = this._root;
var iter = this.iterator();
var cmp = this._comparator;
while (cur !== null) {
var c = cmp(item, cur.data);
if (c === 0) {
iter._cursor = cur;
return iter;
}
iter._ancestors.push(cur);
cur = cur.get_child(c > 0);
}
for (var i = iter._ancestors.length - 1; i >= 0; --i) {
cur = iter._ancestors[i];
if (cmp(item, cur.data) < 0) {
iter._cursor = cur;
iter._ancestors.length = i;
return iter;
}
}
iter._ancestors.length = 0;
return iter;
}; // Returns an iterator to the tree node immediately after the item
// Returns an iterator to the tree node immediately after the item
TreeBase.prototype.upperBound = function (item) {
var iter = this.lowerBound(item);
var cmp = this._comparator;
while (iter.data() !== null && cmp(iter.data(), item) === 0) {
iter.next();
}
return iter;
}; // returns null if tree is empty
// returns null if tree is empty
TreeBase.prototype.min = function () {
var res = this._root;
if (res === null) {
return null;
}
while (res.left !== null) {
res = res.left;
}
return res.data;
}; // returns null if tree is empty
// returns null if tree is empty
TreeBase.prototype.max = function () {
var res = this._root;
if (res === null) {
return null;
}
while (res.right !== null) {
res = res.right;
}
return res.data;
}; // returns a null iterator
// call next() or prev() to point to an element
// returns a null iterator
// call next() or prev() to point to an element
TreeBase.prototype.iterator = function () {
return new Iterator(this);
}; // calls cb on each node's data, in order
// calls cb on each node's data, in order
TreeBase.prototype.each = function (cb) {
var it = this.iterator(), data;
while ((data = it.next()) !== null) {
cb(data);
}
}; // calls cb on each node's data, in reverse order
// calls cb on each node's data, in reverse order
TreeBase.prototype.reach = function (cb) {
var it = this.iterator(), data;
while ((data = it.prev()) !== null) {
cb(data);
}
};
function Iterator(tree) {
this._tree = tree;
this._ancestors = [];
this._cursor = null;
}
Iterator.prototype.data = function () {
return this._cursor !== null ? this._cursor.data : null;
}; // if null-iterator, returns first node
// otherwise, returns next node
// if null-iterator, returns first node
// otherwise, returns next node
Iterator.prototype.next = function () {
if (this._cursor === null) {
var root = this._tree._root;
if (root !== null) {
this._minNode(root);
}
} else {
if (this._cursor.right === null) {
// no greater node in subtree, go up to parent
// if coming from a right child, continue up the stack
var save;
do {
save = this._cursor;
if (this._ancestors.length) {
this._cursor = this._ancestors.pop();
} else {
this._cursor = null;
break;
}
} while (this._cursor.right === save);
} else {
// get the next node from the subtree
this._ancestors.push(this._cursor);
this._minNode(this._cursor.right);
}
}
return this._cursor !== null ? this._cursor.data : null;
}; // if null-iterator, returns last node
// otherwise, returns previous node
// if null-iterator, returns last node
// otherwise, returns previous node
Iterator.prototype.prev = function () {
if (this._cursor === null) {
var root = this._tree._root;
if (root !== null) {
this._maxNode(root);
}
} else {
if (this._cursor.left === null) {
var save;
do {
save = this._cursor;
if (this._ancestors.length) {
this._cursor = this._ancestors.pop();
} else {
this._cursor = null;
break;
}
} while (this._cursor.left === save);
} else {
this._ancestors.push(this._cursor);
this._maxNode(this._cursor.left);
}
}
return this._cursor !== null ? this._cursor.data : null;
};
Iterator.prototype._minNode = function (start) {
while (start.left !== null) {
this._ancestors.push(start);
start = start.left;
}
this._cursor = start;
};
Iterator.prototype._maxNode = function (start) {
while (start.right !== null) {
this._ancestors.push(start);
start = start.right;
}
this._cursor = start;
};
module.exports = TreeBase;
},
function (module, exports) {
//
// TDigest:
//
// approximate distribution percentiles from a stream of reals
//
var RBTree = _require(0).RBTree;
function TDigest(delta, K, CX) {
// allocate a TDigest structure.
//
// delta is the compression factor, the max fraction of mass that
// can be owned by one centroid (bigger, up to 1.0, means more
// compression). delta=false switches off TDigest behavior and treats
// the distribution as discrete, with no merging and exact values
// reported.
//
// K is a size threshold that triggers recompression as the TDigest
// grows during input. (Set it to 0 to disable automatic recompression)
//
// CX specifies how often to update cached cumulative totals used
// for quantile estimation during ingest (see cumulate()). Set to
// 0 to use exact quantiles for each new point.
//
this.discrete = delta === false;
this.delta = delta || 0.01;
this.K = K === undefined ? 25 : K;
this.CX = CX === undefined ? 1.1 : CX;
this.centroids = new RBTree(compare_centroid_means);
this.nreset = 0;
this.reset();
}
TDigest.prototype.reset = function () {
// prepare to digest new points.
//
this.centroids.clear();
this.n = 0;
this.nreset += 1;
this.last_cumulate = 0;
};
TDigest.prototype.size = function () {
return this.centroids.size;
};
TDigest.prototype.toArray = function (everything) {
// return {mean,n} of centroids as an array ordered by mean.
//
var result = [];
if (everything) {
this._cumulate(true); // be sure cumns are exact
// be sure cumns are exact
this.centroids.each(function (c) {
result.push(c);
});
} else {
this.centroids.each(function (c) {
result.push({
mean: c.mean,
n: c.n
});
});
}
return result;
};
TDigest.prototype.summary = function () {
var approx = this.discrete ? 'exact ' : 'approximating ';
var s = [
approx + this.n + ' samples using ' + this.size() + ' centroids',
'min = ' + this.percentile(0),
'Q1 = ' + this.percentile(0.25),
'Q2 = ' + this.percentile(0.5),
'Q3 = ' + this.percentile(0.75),
'max = ' + this.percentile(1)
];
return s.join('\n');
};
function compare_centroid_means(a, b) {
// order two centroids by mean.
//
return a.mean > b.mean ? 1 : a.mean < b.mean ? -1 : 0;
}
function compare_centroid_mean_cumns(a, b) {
// order two centroids by mean_cumn.
//
return a.mean_cumn - b.mean_cumn;
}
TDigest.prototype.push = function (x, n) {
// incorporate value or array of values x, having count n into the
// TDigest. n defaults to 1.
//
n = n || 1;
x = Array.isArray(x) ? x : [x];
for (var i = 0; i < x.length; i++) {
this._digest(x[i], n);
}
};
TDigest.prototype.push_centroid = function (c) {
// incorporate centroid or array of centroids c
//
c = Array.isArray(c) ? c : [c];
for (var i = 0; i < c.length; i++) {
this._digest(c[i].mean, c[i].n);
}
};
TDigest.prototype._cumulate = function (exact) {
// update cumulative counts for each centroid
//
// exact: falsey means only cumulate after sufficient
// growth. During ingest, these counts are used as quantile
// estimates, and they work well even when somewhat out of
// date. (this is a departure from the publication, you may set CX
// to 0 to disable).
//
if (this.n === this.last_cumulate || !exact && this.CX && this.CX > this.n / this.last_cumulate) {
return;
}
var cumn = 0;
var self = this;
this.centroids.each(function (c) {
c.mean_cumn = cumn + c.n / 2; // half of n at the mean
// half of n at the mean
cumn = c.cumn = cumn + c.n;
});
this.n = this.last_cumulate = cumn;
};
TDigest.prototype.find_nearest = function (x) {
// find the centroid closest to x. The assumption of
// unique means and a unique nearest centroid departs from the
// paper, see _digest() below
//
if (this.size() === 0) {
return null;
}
var iter = this.centroids.lowerBound({ mean: x }); // x <= iter || iter==null
// x <= iter || iter==null
var c = iter.data() === null ? iter.prev() : iter.data();
if (c.mean === x || this.discrete) {
return c; // c is either x or a neighbor (discrete: no distance func)
}
// c is either x or a neighbor (discrete: no distance func)
var prev = iter.prev();
if (prev && Math.abs(prev.mean - x) < Math.abs(c.mean - x)) {
return prev;
} else {
return c;
}
};
TDigest.prototype._new_centroid = function (x, n, cumn) {
// create and insert a new centroid into the digest (don't update
// cumulatives).
//
var c = {
mean: x,
n: n,
cumn: cumn
};
this.centroids.insert(c);
this.n += n;
return c;
};
TDigest.prototype._addweight = function (nearest, x, n) {
// add weight at location x to nearest centroid. adding x to
// nearest will not shift its relative position in the tree and
// require reinsertion.
//
if (x !== nearest.mean) {
nearest.mean += n * (x - nearest.mean) / (nearest.n + n);
}
nearest.cumn += n;
nearest.mean_cumn += n / 2;
nearest.n += n;
this.n += n;
};
TDigest.prototype._digest = function (x, n) {
// incorporate value x, having count n into the TDigest.
//
var min = this.centroids.min();
var max = this.centroids.max();
var nearest = this.find_nearest(x);
if (nearest && nearest.mean === x) {
// accumulate exact matches into the centroid without
// limit. this is a departure from the paper, made so
// centroids remain unique and code can be simple.
this._addweight(nearest, x, n);
} else if (nearest === min) {
this._new_centroid(x, n, 0); // new point around min boundary
} else // new point around min boundary
if (nearest === max) {
this._new_centroid(x, n, this.n); // new point around max boundary
} else // new point around max boundary
if (this.discrete) {
this._new_centroid(x, n, nearest.cumn); // never merge
} else {
// never merge
// conider a merge based on nearest centroid's capacity. if
// there's not room for all of n, don't bother merging any of
// it into nearest, as we'll have to make a new centroid
// anyway for the remainder (departure from the paper).
var p = nearest.mean_cumn / this.n;
var max_n = Math.floor(4 * this.n * this.delta * p * (1 - p));
if (max_n - nearest.n >= n) {
this._addweight(nearest, x, n);
} else {
this._new_centroid(x, n, nearest.cumn);
}
}
this._cumulate(false);
if (!this.discrete && this.K && this.size() > this.K / this.delta) {
// re-process the centroids and hope for some compression.
this.compress();
}
};
TDigest.prototype.bound_mean = function (x) {
// find centroids lower and upper such that lower.mean < x <
// upper.mean or lower.mean === x === upper.mean. Don't call
// this for x out of bounds.
//
var iter = this.centroids.upperBound({ mean: x }); // x < iter
// x < iter
var lower = iter.prev(); // lower <= x
// lower <= x
var upper = lower.mean === x ? lower : iter.next();
return [
lower,
upper
];
};
TDigest.prototype.p_rank = function (x_or_xlist) {
// return approximate percentile-ranks (0..1) for data value x.
// or list of x. calculated according to
// https://en.wikipedia.org/wiki/Percentile_rank
//
// (Note that in continuous mode, boundary sample values will
// report half their centroid weight inward from 0/1 as the
// percentile-rank. X values outside the observed range return
// 0/1)
//
// this triggers cumulate() if cumn's are out of date.
//
var xs = Array.isArray(x_or_xlist) ? x_or_xlist : [x_or_xlist];
var ps = xs.map(this._p_rank, this);
return Array.isArray(x_or_xlist) ? ps : ps[0];
};
TDigest.prototype._p_rank = function (x) {
if (this.size() === 0) {
return undefined;
} else if (x < this.centroids.min().mean) {
return 0;
} else if (x > this.centroids.max().mean) {
return 1;
} // find centroids that bracket x and interpolate x's cumn from
// their cumn's.
// find centroids that bracket x and interpolate x's cumn from
// their cumn's.
this._cumulate(true); // be sure cumns are exact
// be sure cumns are exact
var bound = this.bound_mean(x);
var lower = bound[0], upper = bound[1];
if (this.discrete) {
return lower.cumn / this.n;
} else {
var cumn = lower.mean_cumn;
if (lower !== upper) {
cumn += (x - lower.mean) * (upper.mean_cumn - lower.mean_cumn) / (upper.mean - lower.mean);
}
return cumn / this.n;
}
};
TDigest.prototype.bound_mean_cumn = function (cumn) {
// find centroids lower and upper such that lower.mean_cumn < x <
// upper.mean_cumn or lower.mean_cumn === x === upper.mean_cumn. Don't call
// this for cumn out of bounds.
//
// XXX because mean and mean_cumn give rise to the same sort order
// (up to identical means), use the mean rbtree for our search.
this.centroids._comparator = compare_centroid_mean_cumns;
var iter = this.centroids.upperBound({ mean_cumn: cumn }); // cumn < iter
// cumn < iter
this.centroids._comparator = compare_centroid_means;
var lower = iter.prev(); // lower <= cumn
// lower <= cumn
var upper = lower && lower.mean_cumn === cumn ? lower : iter.next();
return [
lower,
upper
];
};
TDigest.prototype.percentile = function (p_or_plist) {
// for percentage p (0..1), or for each p in a list of ps, return
// the smallest data value q at which at least p percent of the
// observations <= q.
//
// for discrete distributions, this selects q using the Nearest
// Rank Method
// (https://en.wikipedia.org/wiki/Percentile#The_Nearest_Rank_method)
// (in scipy, same as percentile(...., interpolation='higher')
//
// for continuous distributions, interpolates data values between
// count-weighted bracketing means.
//
// this triggers cumulate() if cumn's are out of date.
//
var ps = Array.isArray(p_or_plist) ? p_or_plist : [p_or_plist];
var qs = ps.map(this._percentile, this);
return Array.isArray(p_or_plist) ? qs : qs[0];
};
TDigest.prototype._percentile = function (p) {
if (this.size() === 0) {
return undefined;
}
this._cumulate(true); // be sure cumns are exact
// be sure cumns are exact
var min = this.centroids.min();
var max = this.centroids.max();
var h = this.n * p;
var bound = this.bound_mean_cumn(h);
var lower = bound[0], upper = bound[1];
if (upper === lower || lower === null || upper === null) {
return (lower || upper).mean;
} else if (!this.discrete) {
return lower.mean + (h - lower.mean_cumn) * (upper.mean - lower.mean) / (upper.mean_cumn - lower.mean_cumn);
} else if (h <= lower.cumn) {
return lower.mean;
} else {
return upper.mean;
}
};
function pop_random(choices) {
// remove and return an item randomly chosen from the array of choices
// (mutates choices)
//
var idx = Math.floor(Math.random() * choices.length);
return choices.splice(idx, 1)[0];
}
TDigest.prototype.compress = function () {
// TDigests experience worst case compression (none) when input
// increases monotonically. Improve on any bad luck by
// reconsuming digest centroids as if they were weighted points
// while shuffling their order (and hope for the best).
//
if (this.compressing) {
return;
}
var points = this.toArray();
this.reset();
this.compressing = true;
while (points.length > 0) {
this.push_centroid(pop_random(points));
}
this._cumulate(true);
this.compressing = false;
};
function Digest(config) {
// allocate a distribution digest structure. This is an extension
// of a TDigest structure that starts in exact histogram (discrete)
// mode, and automatically switches to TDigest mode for large
// samples that appear to be from a continuous distribution.
//
this.config = config || {};
this.mode = this.config.mode || 'auto'; // disc, cont, auto
// disc, cont, auto
TDigest.call(this, this.mode === 'cont' ? config.delta : false);
this.digest_ratio = this.config.ratio || 0.9;
this.digest_thresh = this.config.thresh || 1000;
this.n_unique = 0;
}
Digest.prototype = Object.create(TDigest.prototype);
Digest.prototype.constructor = Digest;
Digest.prototype.push = function (x_or_xlist) {
TDigest.prototype.push.call(this, x_or_xlist);
this.check_continuous();
};
Digest.prototype._new_centroid = function (x, n, cumn) {
this.n_unique += 1;
TDigest.prototype._new_centroid.call(this, x, n, cumn);
};
Digest.prototype._addweight = function (nearest, x, n) {
if (nearest.n === 1) {
this.n_unique -= 1;
}
TDigest.prototype._addweight.call(this, nearest, x, n);
};
Digest.prototype.check_continuous = function () {
// while in 'auto' mode, if there are many unique elements, assume
// they are from a continuous distribution and switch to 'cont'
// mode (tdigest behavior). Return true on transition from
// disctete to continuous.
if (this.mode !== 'auto' || this.size() < this.digest_thresh) {
return false;
}
if (this.n_unique / this.size() > this.digest_ratio) {
this.mode = 'cont';
this.discrete = false;
this.delta = this.config.delta || 0.01;
this.compress();
return true;
}
return false;
};
module.exports = {
'TDigest': TDigest,
'Digest': Digest
};
}
];
return _require(4);
}));