UNPKG

clustring

Version:
210 lines (169 loc) 5.15 kB
'use strict';

// Resolve the CommonJS `exports` object when it exists. Falling back to a
// throwaway object lets these definitions also be evaluated in environments
// that do not provide `exports` (for example when loaded as an ES module).
const moduleExports = typeof exports === 'undefined' ? {} : exports;

Object.defineProperty(moduleExports, '__esModule', { value: true });

/**
 * Cede control to the event loop for one tick, from within an async function.
 *
 * Usage:
 *
 *     // ... slow stuff...
 *     await tick() // will cede control
 *     // ... slow stuff...
 *
 * @returns {Promise<void>} Resolves from a zero-delay `setTimeout` callback.
 */
function tick() {
  return new Promise(resolve => setTimeout(resolve, 0));
}

/**
 * Store `count` under `str` as an own, enumerable data property of `target`.
 *
 * Plain assignment (`target[str] = count`) would hit the inherited
 * `__proto__` setter when `str === '__proto__'` and silently drop the entry.
 *
 * @param {Object} target - Object receiving the entry.
 * @param {string} str - Property name.
 * @param {*} count - Value to store.
 */
function setOwnCount(target, str, count) {
  Object.defineProperty(target, str, {
    value: count,
    writable: true,
    enumerable: true,
    configurable: true
  });
}

/**
 * Groups the strings of a bucket by the key that `keyer` computes for each.
 *
 * `progress` goes from 0 to 1 while `cluster()` runs; `cancel()` makes a
 * running `cluster()` reject with `Error('canceled')` the next time it
 * resumes after ceding control to the event loop.
 */
class KeyClusterer {
  /**
   * @param {Object} bucket - Maps each string to its count.
   * @param {Function} keyer - Called with a string; returns its grouping key.
   * @param {Object} options - `tickMs` and `nIterationsBetweenTickChecks`.
   */
  constructor(bucket, keyer, options) {
    this.bucket = bucket;
    this.keyer = keyer;
    this.options = options;
    this.progress = 0;
    this.canceled = false;
  }

  /** Request cancellation of a running `cluster()` call. */
  cancel() {
    this.canceled = true;
  }

  /**
   * Build one bin per distinct key and return the bins holding 2+ strings.
   *
   * Each bin is `{ key, name, count, bucket }`: `bucket` maps the bin's
   * strings to their counts, `count` is their sum, and `name` is the string
   * with the highest count (ties broken by `localeCompare`).
   *
   * @returns {Promise<Array<Object>>} Bins in order of first appearance.
   * @throws {Error} 'canceled' if `cancel()` was called while ceding control.
   */
  async cluster() {
    const { bucket, keyer } = this;
    const { tickMs, nIterationsBetweenTickChecks } = this.options;

    const bins = [];
    // Prototype-less dictionary: keys such as "constructor" or "__proto__"
    // must not resolve to members inherited from Object.prototype.
    const keyToBin = Object.create(null);
    const strs = Object.keys(bucket);
    let lastTickAt = Date.now();

    for (let i = 0; i < strs.length; i++) {
      const str = strs[i];

      // Cheap bitmask test first; only look at the clock every so often.
      if (((i + 1) & nIterationsBetweenTickChecks) === 0) {
        if (Date.now() - lastTickAt >= tickMs) {
          this.progress = i / strs.length;
          await tick();

          // We can only be canceled while we aren't executing. So now that
          // we're back from our tick is the only time we need to check.
          if (this.canceled) {
            throw new Error('canceled');
          }

          lastTickAt = Date.now();
        }
      }

      const count = bucket[str];
      const key = keyer(str);
      let bin = keyToBin[key];

      if (!bin) {
        bin = { key, name: str, count: 0, bucket: {} };
        keyToBin[key] = bin;
        bins.push(bin);
      } else {
        // Maybe change name. We do it in this loop so we're O(n)
        const maxCount = bin.bucket[bin.name];
        if (count > maxCount || (count === maxCount && str.localeCompare(bin.name) < 0)) {
          bin.name = str;
        }
      }

      bin.count += count;
      setOwnCount(bin.bucket, str, count);
    }

    this.progress = 1;
    return bins.filter(b => Object.keys(b.bucket).length > 1);
  }
}

/**
 * Create a clusterer that groups strings sharing the same `keyer` result.
 *
 * @param {Object} bucket - Maps each string to its count.
 * @param {Function} keyer - Called with a string; returns its grouping key.
 * @param {Object} [options] - Overrides for the defaults below.
 * @param {number} [options.tickMs=8] - Minimum elapsed milliseconds before
 *   control is ceded to the event loop.
 * @param {number} [options.nIterationsBetweenTickChecks=0xfff] - Bitmask
 *   deciding how often the clock is checked.
 * @returns {KeyClusterer} Call `.cluster()` on it to run.
 */
function clusterByKey(bucket, keyer, options = {}) {
  options = Object.assign({
    tickMs: 8,
    nIterationsBetweenTickChecks: 0xfff // must be power of two, minus one
  }, options);
  return new KeyClusterer(bucket, keyer, options);
}

/**
 * Groups the strings of a bucket by pairwise distance: every not-yet-used
 * string `b` with `distance(a, b) <= radius` joins the bin seeded by `a`.
 *
 * `progress` goes from 0 to 1 while `cluster()` runs; `cancel()` makes a
 * running `cluster()` reject with `Error('canceled')` the next time it
 * resumes after ceding control to the event loop.
 */
class KnnClusterer {
  /**
   * @param {Object} bucket - Maps each string to its count.
   * @param {Function} distance - Called with two strings; returns a number.
   * @param {number} radius - Largest distance that still clusters two strings.
   * @param {Object} options - `tickMs` and `nIterationsBetweenTickChecks`.
   */
  constructor(bucket, distance, radius, options) {
    this.bucket = bucket;
    this.distance = distance;
    this.radius = radius;
    this.options = options;
    this.progress = 0;
    this.canceled = false;
  }

  /** Request cancellation of a running `cluster()` call. */
  cancel() {
    this.canceled = true;
  }

  /**
   * Compare each string against the later ones and collect the bins.
   *
   * Each bin is `{ name, count, bucket }`: `bucket` maps the bin's strings
   * to their counts, `count` is their sum, and `name` is the string with the
   * highest count (ties broken by `localeCompare`). A string that clusters
   * with nothing appears in no bin.
   *
   * @returns {Promise<Array<Object>>} Bins in order of creation.
   * @throws {Error} 'canceled' if `cancel()` was called while ceding control.
   */
  async cluster() {
    const { bucket, distance, radius } = this;
    const { tickMs, nIterationsBetweenTickChecks } = this.options;

    // Strings already placed in a cluster. A Set (rather than `in` on a plain
    // object) keeps names like "constructor" from looking already-used.
    const usedStrs = new Set();
    const strs = Object.keys(bucket);
    const nStrs = strs.length;
    const bins = [];
    let lastTickAt = Date.now();

    // `i` counts pair slots visited (or skipped). Each unordered pair is
    // visited once, so the total is n * (n - 1) / 2.
    let i = 0;
    const nComparisons = Math.max(0, (nStrs * (nStrs - 1)) / 2);

    for (let ai = 0; ai < nStrs; ai++) {
      const a = strs[ai];

      if (usedStrs.has(a)) {
        // Account for the inner iterations we are skipping.
        i += nStrs - ai - 1;
        continue;
      }

      const aCount = bucket[a];
      let bin = null; // set iff any b clusters with a

      for (let bi = ai + 1; bi < nStrs; bi++) {
        i += 1;

        // Cheap bitmask test first; only look at the clock every so often.
        if ((i & nIterationsBetweenTickChecks) === 0) {
          if (Date.now() - lastTickAt >= tickMs) {
            this.progress = (i - 1) / nComparisons;
            await tick();

            // We can only be canceled while we aren't executing. So now that
            // we're back from our tick is the only time we need to check.
            if (this.canceled) {
              throw new Error('canceled');
            }

            lastTickAt = Date.now();
          }
        }

        const b = strs[bi];
        if (usedStrs.has(b)) continue;

        const d = distance(a, b);
        if (d <= radius) {
          if (!bin) {
            bin = { name: a, count: aCount, bucket: { [a]: aCount } };
            bins.push(bin);
          }

          const maxCount = bin.bucket[bin.name];
          const bCount = bucket[b];
          if (bCount > maxCount || (bCount === maxCount && b.localeCompare(bin.name) < 0)) {
            bin.name = b;
          }

          bin.count += bCount;
          setOwnCount(bin.bucket, b, bCount);
          usedStrs.add(b);
        }
      }
    }

    this.progress = 1;
    return bins;
  }
}

/**
 * Create a clusterer that groups strings lying within `radius` of each other
 * according to `distance`.
 *
 * @param {Object} bucket - Maps each string to its count.
 * @param {Function} distance - Called with two strings; returns a number.
 * @param {number} radius - Largest distance that still clusters two strings.
 * @param {Object} [options] - Overrides for the defaults below.
 * @param {number} [options.tickMs=8] - Minimum elapsed milliseconds before
 *   control is ceded to the event loop.
 * @param {number} [options.nIterationsBetweenTickChecks=0xfff] - Bitmask
 *   deciding how often the clock is checked.
 * @returns {KnnClusterer} Call `.cluster()` on it to run.
 */
function clusterByKnn(bucket, distance, radius, options = {}) {
  options = Object.assign({
    tickMs: 8,
    nIterationsBetweenTickChecks: 0xfff // must be power of two, minus one
  }, options);
  return new KnnClusterer(bucket, distance, radius, options);
}

moduleExports.clusterByKey = clusterByKey;
moduleExports.clusterByKnn = clusterByKnn;
//# sourceMappingURL=index.js.map