clustring
Version:
Algorithms for clustering strings
210 lines (169 loc) • 5.15 kB
JavaScript
;
// Define a non-enumerable `__esModule: true` flag on the CommonJS exports
// object (the marker transpiler interop helpers conventionally look for).
Object.defineProperty(exports, '__esModule', { value: true });
/**
 * Yield to the event loop once.
 *
 * Awaiting the returned promise from inside an async function suspends that
 * function until a zero-delay timer fires, giving other queued work a chance
 * to run in between.
 *
 * Example:
 *
 *   // ... expensive work ...
 *   await tick(); // other tasks may run here
 *   // ... more expensive work ...
 *
 * @returns {Promise<undefined>} Settles after a `setTimeout(..., 0)` fires.
 */
function tick() {
  return new Promise((wake) => {
    setTimeout(wake, 0);
  });
}
/**
 * Clusters the strings of a bucket by a computed key: every string for which
 * `keyer` yields the same key (compared as a property key) lands in the same
 * bin.
 *
 * `cluster()` does the work. While running it periodically cedes control to
 * the event loop, which is when a caller can read `progress` or `cancel()`.
 */
class KeyClusterer {
  /**
   * @param {Object<string, number>} bucket Maps each string to its count.
   * @param {function(string): *} keyer Computes the grouping key of a string.
   * @param {{tickMs: number, nIterationsBetweenTickChecks: number}} options
   *   `tickMs` is the minimum elapsed time (ms) before control is ceded;
   *   `nIterationsBetweenTickChecks` is a bit mask (a power of two, minus
   *   one) selecting the iterations on which the clock is consulted.
   */
  constructor(bucket, keyer, options) {
    this.bucket = bucket;
    this.keyer = keyer;
    this.options = options;
    /** Fraction of strings processed so far; updated before each tick. */
    this.progress = 0;
    this.canceled = false;
  }

  /** Request that a running `cluster()` reject after its next tick. */
  cancel() {
    this.canceled = true;
  }

  /**
   * Group the bucket's strings by key.
   *
   * @returns {Promise<Array<{key: *, name: string, count: number, bucket: Object<string, number>}>>}
   *   One bin per key shared by at least two strings, in order of first
   *   appearance. `name` is the member with the highest count (ties broken by
   *   `localeCompare`), `count` is the sum of member counts and `bucket` maps
   *   each member to its own count.
   * @throws {Error} With message 'canceled' if `cancel()` was called before a
   *   tick completed.
   */
  async cluster() {
    const { bucket, keyer } = this;
    const { tickMs, nIterationsBetweenTickChecks } = this.options;
    const bins = [];
    // Null-prototype dictionary: a key such as "constructor" or "toString"
    // must not resolve to an inherited Object.prototype member. Keys are
    // still coerced to property keys, exactly as with a plain object.
    const keyToBin = Object.create(null);
    const strs = Object.keys(bucket);
    let lastTickAt = Date.now();

    for (let i = 0; i < strs.length; i++) {
      const str = strs[i];

      // Only look at the clock on iterations selected by the mask.
      if (((i + 1) & nIterationsBetweenTickChecks) === 0) {
        if (Date.now() - lastTickAt >= tickMs) {
          this.progress = i / strs.length;
          await tick();
          // We can only be canceled while we aren't executing, so right after
          // the tick is the only place the flag needs checking.
          if (this.canceled) {
            throw new Error('canceled');
          }
          lastTickAt = Date.now();
        }
      }

      const count = bucket[str];
      const key = keyer(str);
      let bin = keyToBin[key];
      if (bin === undefined) {
        bin = {
          key: key,
          name: str,
          count: 0,
          bucket: {}
        };
        keyToBin[key] = bin;
        bins.push(bin);
      } else {
        // Maybe change name. Doing it inside this loop keeps the pass O(n).
        const maxCount = bin.bucket[bin.name];
        if (count > maxCount || (count === maxCount && str.localeCompare(bin.name) < 0)) {
          bin.name = str;
        }
      }
      bin.count += count;
      bin.bucket[str] = count;
    }

    this.progress = 1;
    // A bin with a single member is not a cluster.
    return bins.filter((bin) => Object.keys(bin.bucket).length > 1);
  }
}
/**
 * Build a KeyClusterer for `bucket`, filling in default tick settings for any
 * option the caller did not supply. The clusterer is returned unstarted.
 *
 * @param {Object} bucket Passed through to the KeyClusterer.
 * @param {Function} keyer Passed through to the KeyClusterer.
 * @param {Object} [options] Overrides for `tickMs` (default 8) and
 *   `nIterationsBetweenTickChecks` (default 0xfff).
 * @returns {KeyClusterer}
 */
function clusterByKey(bucket, keyer, options = {}) {
  const settings = Object.assign(
    {
      tickMs: 8,
      // must be power of two, minus one
      nIterationsBetweenTickChecks: 0xfff
    },
    options
  );
  return new KeyClusterer(bucket, keyer, settings);
}
/**
 * Clusters the strings of a bucket by pairwise distance: each string that is
 * not yet in a cluster collects every later, still-unclustered string whose
 * distance to it is at most `radius`.
 *
 * `cluster()` does the work. While running it periodically cedes control to
 * the event loop, which is when a caller can read `progress` or `cancel()`.
 */
class KnnClusterer {
  /**
   * @param {Object<string, number>} bucket Maps each string to its count.
   * @param {function(string, string): number} distance Distance between two
   *   strings.
   * @param {number} radius Largest distance at which two strings cluster.
   * @param {{tickMs: number, nIterationsBetweenTickChecks: number}} options
   *   `tickMs` is the minimum elapsed time (ms) before control is ceded;
   *   `nIterationsBetweenTickChecks` is a bit mask (a power of two, minus
   *   one) selecting the iterations on which the clock is consulted.
   */
  constructor(bucket, distance, radius, options) {
    this.bucket = bucket;
    this.distance = distance;
    this.radius = radius;
    this.options = options;
    /** Fraction of string pairs accounted for; updated before each tick. */
    this.progress = 0;
    this.canceled = false;
  }

  /** Request that a running `cluster()` reject after its next tick. */
  cancel() {
    this.canceled = true;
  }

  /**
   * Group the bucket's strings by distance.
   *
   * @returns {Promise<Array<{name: string, count: number, bucket: Object<string, number>}>>}
   *   One bin per cluster of at least two strings. `name` is the member with
   *   the highest count (ties broken by `localeCompare`), `count` is the sum
   *   of member counts and `bucket` maps each member to its own count.
   * @throws {Error} With message 'canceled' if `cancel()` was called before a
   *   tick completed.
   */
  async cluster() {
    const { bucket, distance, radius } = this;
    const { tickMs, nIterationsBetweenTickChecks } = this.options;
    // Strings already placed in a cluster. A Set is used because `in` on a
    // plain object also matches inherited names such as "toString".
    const usedStrs = new Set();
    const strs = Object.keys(bucket);
    const nStrs = strs.length;
    const bins = [];
    // Each unordered pair is visited (or skipped) exactly once, so the pair
    // counter below tops out at n * (n - 1) / 2.
    const nComparisons = Math.max(0, (nStrs * (nStrs - 1)) / 2);
    let lastTickAt = Date.now();
    let i = 0; // pairs accounted for so far, including skipped ones

    for (let ai = 0; ai < nStrs; ai++) {
      const a = strs[ai];
      if (usedStrs.has(a)) {
        // Skip the whole row but keep the pair counter in step.
        i += nStrs - ai - 1;
        continue;
      }
      const aCount = bucket[a];
      let bin = null; // set iff any b clusters with a

      for (let bi = ai + 1; bi < nStrs; bi++) {
        i += 1;

        // Only look at the clock on iterations selected by the mask.
        if ((i & nIterationsBetweenTickChecks) === 0) {
          if (Date.now() - lastTickAt >= tickMs) {
            this.progress = (i - 1) / nComparisons;
            await tick();
            // We can only be canceled while we aren't executing, so right
            // after the tick is the only place the flag needs checking.
            if (this.canceled) {
              throw new Error('canceled');
            }
            lastTickAt = Date.now();
          }
        }

        const b = strs[bi];
        if (usedStrs.has(b)) continue;

        const d = distance(a, b);
        if (d <= radius) {
          if (!bin) {
            bin = {
              name: a,
              count: aCount,
              bucket: {
                [a]: aCount
              }
            };
            bins.push(bin);
          }
          const maxCount = bin.bucket[bin.name];
          const bCount = bucket[b];
          if (bCount > maxCount || (bCount === maxCount && b.localeCompare(bin.name) < 0)) {
            bin.name = b;
          }
          bin.count += bCount;
          bin.bucket[b] = bCount;
          usedStrs.add(b);
        }
      }
    }

    this.progress = 1;
    return bins;
  }
}
/**
 * Build a KnnClusterer for `bucket`, filling in default tick settings for any
 * option the caller did not supply. The clusterer is returned unstarted.
 *
 * @param {Object} bucket Passed through to the KnnClusterer.
 * @param {Function} distance Passed through to the KnnClusterer.
 * @param {number} radius Passed through to the KnnClusterer.
 * @param {Object} [options] Overrides for `tickMs` (default 8) and
 *   `nIterationsBetweenTickChecks` (default 0xfff).
 * @returns {KnnClusterer}
 */
function clusterByKnn(bucket, distance, radius, options = {}) {
  const settings = Object.assign(
    {
      tickMs: 8,
      // must be power of two, minus one
      nIterationsBetweenTickChecks: 0xfff
    },
    options
  );
  return new KnnClusterer(bucket, distance, radius, settings);
}
// Public API: the two factory functions, attached to the CommonJS exports
// object.
exports.clusterByKey = clusterByKey;
exports.clusterByKnn = clusterByKnn;
//# sourceMappingURL=index.js.map