UNPKG

hyperloglog-lite

Version:

HyperLogLog Distinct Value Estimator with an alternative implementation to murmurhash 128bit based on murmurhash-native instead of murmurhashv3

155 lines (126 loc) 4.81 kB
const murmurHashNode = require('murmurhash32-node')

// Seeds used to derive four independent 32 bit words from one input value.
const HASH_SEEDS = [0, 2, 4, 8]

/**
 * Hash a value with the 32 bit MurmurHash v2 binding of murmurhash32-node.
 *
 * @param {*} value - Value to hash; passed straight through to `bit32.v2`.
 * @param {number} [seed] - Hash seed; any falsy value is treated as 0.
 * @returns {*} Whatever `murmurHashNode.bit32.v2` returns (presumably a
 *   32 bit integer - confirm against the murmurhash32-node docs).
 */
function hashMurmur32Bit (value, seed) {
  return murmurHashNode.bit32.v2(value, seed || 0)
}

/**
 * Build a simulated 128 bit hash by hashing the same value four times with
 * different seeds.
 *
 * @param {*} value - Value to hash.
 * @returns {Array} Four hash words, in seed order 0, 2, 4, 8. This is the
 *   shape `HyperLogLog#add` expects.
 */
function getSimulated128BitHash (value) {
  return HASH_SEEDS.map(function (seed) {
    return hashMurmur32Bit(value, seed)
  })
}

/**
 * Compute alpha_m * m^2, the numerator of the raw HyperLogLog estimate,
 * using alpha_m = 0.7213 / (1 + 1.079 / m).
 *
 * @param {number} bucket_count - Number of buckets (m).
 * @returns {number}
 */
function compute_alpha_times_bucket_count_squared (bucket_count) {
  return 0.7213 / (1 + 1.079 / bucket_count) * bucket_count * bucket_count
}

/**
 * Compute the value stored in a bucket for a hash: one plus the number of
 * trailing zero bits, scanning word 3 first and continuing into word 2.
 * The result is between 1 and 65.
 *
 * @param {Array<number>} unique_hash - Four-word hash.
 * @returns {number}
 */
function count_trailing_zeroes_plus_one (unique_hash) {
  let rank = 1
  for (let word = 3; word >= 2; --word) {
    let bits = unique_hash[word]
    for (let remaining = 32; remaining; --remaining) {
      if (bits & 0x1) {
        return rank
      }
      ++rank
      bits = bits >>> 1
    }
  }
  return rank
}

/**
 * Create a HyperLogLog counter of 2^n buckets.
 *
 * 2^0 to 2^32 buckets are addressable, using one byte per bucket. The limit
 * of 2^32 comes from using the first 32 bit word of the hash for the bucket
 * index. Theoretically that could be scaled up, but it would mean more than
 * 4GB per HLL, which is unlikely to be useful.
 *
 * @param {number} n - log2 of the number of buckets.
 * @returns {object} Counter with `add`, `count`, `relative_error`, `output`
 *   and `merge` methods.
 */
function HyperLogLog (n) {
  let bucket_count = Math.pow(2, n)
  let alpha_times_bucket_count_squared =
    compute_alpha_times_bucket_count_squared(bucket_count)

  // Buffer.alloc zero-fills, replacing the deprecated `new Buffer(size)`
  // followed by an explicit fill(0).
  let buckets = Buffer.alloc(bucket_count)

  // Maintain some running counts so that returning cardinality is cheap.
  // With every bucket at 0, each contributes 2^-0 = 1 to the sum.
  let sum_of_inverses = bucket_count
  let count_zero_buckets = bucket_count

  const self = {
    /**
     * Record one hashed value.
     *
     * @param {Array<number>|null|undefined} unique_hash - Four-word hash as
     *   produced by `HyperLogLog.hash`; null/undefined is ignored.
     * @returns {object} The counter itself, for chaining.
     */
    add: function add (unique_hash) {
      if (unique_hash == null) {
        return self // nothing to add
      }

      // JavaScript masks shift counts to 5 bits, so `x >>> 32` yields `x`
      // rather than 0. A single-bucket counter (n = 0) must always use
      // bucket 0 instead of an out-of-range index.
      const bucket = bucket_count === 1 ? 0 : unique_hash[0] >>> (32 - n)
      const trailing_zeroes = count_trailing_zeroes_plus_one(unique_hash)

      // Maintain a running sum of inverses for quick cardinality checking.
      const old_value = buckets[bucket]
      const new_value = Math.max(trailing_zeroes, old_value)
      sum_of_inverses += Math.pow(2, -new_value) - Math.pow(2, -old_value)
      if (new_value !== 0 && old_value === 0) {
        --count_zero_buckets
      }

      buckets[bucket] = new_value
      return self
    },

    /**
     * Estimate the number of distinct values added so far.
     *
     * @returns {number} Estimate rounded to the nearest integer.
     */
    count: function count () {
      // The running totals are kept up to date by add() and merge(), so no
      // pass over the buckets is needed here.
      let estimate = alpha_times_bucket_count_squared / sum_of_inverses

      // Apply small cardinality correction
      if (count_zero_buckets > 0 && estimate < 5 / 2 * bucket_count) {
        estimate = bucket_count * Math.log(bucket_count / count_zero_buckets)
      }

      return Math.floor(estimate + 0.5)
    },

    /**
     * Estimate the relative error for this HLL.
     *
     * @returns {number} 1.04 / sqrt(bucket count).
     */
    relative_error: function relative_error () {
      return 1.04 / Math.sqrt(bucket_count)
    },

    /**
     * Export the counter state in the shape accepted by `merge`.
     *
     * @returns {{n: number, buckets: Buffer}} The live bucket buffer is
     *   returned, not a copy.
     */
    output: function output () {
      return {
        n: n,
        buckets: buckets,
      }
    },

    /**
     * Merge the state of another counter into this one.
     *
     * If the incoming counter is smaller, this counter is folded down to the
     * incoming size; otherwise the incoming buckets are folded into the
     * existing ones.
     *
     * @param {{n: number, buckets: Buffer}} data - State as returned by
     *   `output`.
     */
    merge: function merge (data) {
      if (n > data.n) {
        // Fold this HLL down to the size of the incoming one.
        const new_bucket_count = Math.pow(2, data.n)
        const old_buckets_per_new_bucket = Math.pow(2, n - data.n)
        const new_buckets = Buffer.alloc(new_bucket_count)

        for (let i = 0; i < new_bucket_count; ++i) {
          let new_bucket_value = data.buckets[i]
          for (let j = 0; j < old_buckets_per_new_bucket; ++j) {
            new_bucket_value = Math.max(
              new_bucket_value, buckets[i * old_buckets_per_new_bucket + j])
          }
          new_buckets[i] = new_bucket_value
        }

        buckets = new_buckets
        n = data.n

        bucket_count = Math.pow(2, n)
        alpha_times_bucket_count_squared =
          compute_alpha_times_bucket_count_squared(bucket_count)
      } else {
        const new_buckets_per_existing = Math.pow(2, data.n - n)
        for (let i = data.buckets.length - 1; i >= 0; --i) {
          const existing_bucket_index = (i / new_buckets_per_existing) | 0
          buckets[existing_bucket_index] = Math.max(
            buckets[existing_bucket_index], data.buckets[i])
        }
      }

      // Recompute running totals
      sum_of_inverses = 0
      count_zero_buckets = 0
      for (let i = 0; i < bucket_count; ++i) {
        const bucket = buckets[i]
        if (bucket === 0) {
          ++count_zero_buckets
        }
        sum_of_inverses += Math.pow(2, -bucket)
      }
    },
  }

  return self
}

module.exports = HyperLogLog
module.exports.hash = getSimulated128BitHash