// hyperloglog: HyperLogLog Distinct Value Estimator
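// Precompute alpha_m * m^2, used in the raw estimate E = alpha_m * m^2 / sum(2^-M[j]).
// 0.7213 / (1 + 1.079 / m) is the standard approximation of the bias-correction
// constant alpha_m, valid for m >= 128 buckets.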
function compute_alpha_times_bucket_count_squared(bucket_count) {
return 0.7213 / (1 + 1.079 / bucket_count) * bucket_count * bucket_count;
}
// Create a HyperLogLog counter with 2^n buckets, for n from 0 to 32.
// Each bucket takes one byte (only ~6 bits are strictly needed per register
// with 64-bit hashing, but whole bytes keep the indexing simple).
// The limit of 2^32 comes from using the first 32-bit word of the hash
// for the bucket index. Theoretically we could scale past that, but it would
// mean more than 4 GB per HLL, which is unlikely to be wanted.
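// For example, HyperLogLog(12) allocates 4096 one-byte buckets (4 kB) and has
// a standard error of about 1.04 / sqrt(4096), i.e. roughly 1.6%.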
function HyperLogLog(n) {
var bucket_count = Math.pow(2, n);
var alpha_times_bucket_count_squared = compute_alpha_times_bucket_count_squared(bucket_count);
var buckets = Buffer.alloc(bucket_count); // zero-filled; new Buffer() is deprecated
// Maintain some running counts so that returning cardinality is cheap.
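// Every bucket starts at 0 and contributes 2^-0 = 1 to the sum of inverses.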
var sum_of_inverses = bucket_count;
var count_zero_buckets = bucket_count;
var self = {
add: function add(unique_hash) {
if (unique_hash === null) {
return; // nothing to add
}
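// The bucket index is the top n bits of the first 32-bit word of the hash.
// JS shift counts are taken mod 32, so n === 0 needs an explicit case.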
var bucket = n === 0 ? 0 : unique_hash[0] >>> (32 - n);
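// rho: the 1-based position of the lowest set bit in the low 64 bits of the
// hash (words 3 then 2), i.e. the number of trailing zeroes plus one.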
var trailing_zeroes = 1;
count_zeroes:
for (var i = 3; i >= 2; --i) {
var data = unique_hash[i];
for (var j = 32; j; --j) {
if (data & 0x1) {
break count_zeroes;
}
++trailing_zeroes;
data = data >>> 1;
}
}
// Maintain a running sum of inverses for quick cardinality checking.
var old_value = buckets[bucket];
var new_value = Math.max(trailing_zeroes, old_value);
sum_of_inverses += Math.pow(2, -new_value) - Math.pow(2, -old_value);
if (new_value !== 0 && old_value === 0) {
--count_zero_buckets;
}
buckets[bucket] = new_value;
return self;
},
count: function count() {
// add() and merge() maintain sum_of_inverses and count_zero_buckets
// incrementally, so the estimate is O(1) instead of a scan over all buckets.
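// Raw estimate: E = alpha_m * m^2 / sum over buckets of 2^(-M[j]).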
var estimate = alpha_times_bucket_count_squared / sum_of_inverses;
// Small-range correction: while empty buckets remain and the raw estimate is
// below 5m/2, linear counting is more accurate.
if (count_zero_buckets > 0 && estimate < 5/2 * bucket_count) {
estimate = bucket_count * Math.log(bucket_count / count_zero_buckets);
}
return Math.floor(estimate + 0.5);
},
relative_error: function relative_error() {
// Standard error of the estimate: 1.04 / sqrt(m).
return 1.04 / Math.sqrt(bucket_count);
},
output: function output() {
return {
n: n,
buckets: buckets
}
},
merge: function merge(data) {
if (n > data.n) {
// Fold this HLL down to the size of the incoming one.
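// Folding by taking the max over each group of registers is exact here:
// the register value (rho) comes from the low 64 bits of the hash, which
// do not overlap the index bits taken from the first 32-bit word.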
var new_bucket_count = Math.pow(2, data.n);
var old_buckets_per_new_bucket = Math.pow(2, n - data.n);
var new_buckets = Buffer.alloc(new_bucket_count);
for (var i = 0; i < new_bucket_count; ++i) {
var new_bucket_value = data.buckets[i];
for (var j = 0; j < old_buckets_per_new_bucket; ++j) {
new_bucket_value = Math.max(new_bucket_value, buckets[i * old_buckets_per_new_bucket + j]);
}
new_buckets[i] = new_bucket_value;
}
buckets = new_buckets;
n = data.n;
bucket_count = Math.pow(2, n);
alpha_times_bucket_count_squared = compute_alpha_times_bucket_count_squared(bucket_count);
} else {
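// The incoming HLL has at least as many buckets as this one: map each
// incoming bucket down to its containing local bucket and keep the max.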
var new_buckets_per_existing = Math.pow(2, data.n - n);
for (var i = data.buckets.length - 1; i >= 0; --i) {
var existing_bucket_index = (i / new_buckets_per_existing) | 0;
buckets[existing_bucket_index] = Math.max(buckets[existing_bucket_index], data.buckets[i]);
}
}
// Recompute running totals
sum_of_inverses = 0;
count_zero_buckets = 0;
for (var i = 0; i < bucket_count; ++i) {
var bucket = buckets[i];
if (bucket === 0) {
++count_zero_buckets;
}
sum_of_inverses += Math.pow(2, -bucket);
}
}
};
return self;
}
module.exports = HyperLogLog;
module.exports.hash = require('murmurhash3').murmur128Sync;
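// A minimal usage sketch (not part of the module API); the demo values are
// illustrative. It runs only when this file is executed directly, e.g.
// `node hyperloglog.js`, and exercises add(), count() and relative_error().
if (require.main === module) {
var hash = HyperLogLog.hash;
var hll = HyperLogLog(12); // 4096 buckets, ~1.6% standard error
hll.add(hash('alpha')).add(hash('beta')).add(hash('alpha')); // add() chains; repeats don't inflate the count
console.log('estimated distinct values:', hll.count()); // ~2
console.log('standard error:', hll.relative_error()); // 0.01625
}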