clustering-tfjs
Version: (not shown in this extract)
High-performance TypeScript clustering algorithms (K-Means, Spectral, Agglomerative) with TensorFlow.js acceleration and scikit-learn compatibility
186 lines (185 loc) • 8.82 kB
JavaScript
;
// tsc-emitted interop helper: re-exports property `key` of module `source`
// on `target` under the name `alias` (defaults to `key`). When property
// descriptors are supported, a live getter binding is installed unless the
// source already exposes an accessor from a genuine ES module; the legacy
// fallback is a one-time value copy.
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (target, source, key, alias) {
        if (alias === undefined) alias = key;
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        // Replace the descriptor with a live getter when there is none, when a
        // plain (writable/configurable) data property was found, or when the
        // accessor comes from a non-ES-module object.
        var needsLiveGetter = !descriptor ||
            ("get" in descriptor ? !source.__esModule : descriptor.writable || descriptor.configurable);
        if (needsLiveGetter) {
            descriptor = { enumerable: true, get: function () { return source[key]; } };
        }
        Object.defineProperty(target, alias, descriptor);
    }
    : function (target, source, key, alias) {
        if (alias === undefined) alias = key;
        target[alias] = source[key];
    });
// tsc-emitted interop helper: attaches the original module object as the
// namespace's `default` export. With Object.create available the property is
// enumerable but non-writable/non-configurable; the legacy branch is a plain
// assignment.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? (function (namespace, moduleValue) {
        Object.defineProperty(namespace, "default", { enumerable: true, value: moduleValue });
    })
    : function (namespace, moduleValue) {
        namespace["default"] = moduleValue;
    });
// tsc-emitted interop helper: converts a CommonJS module object into an
// ES-module-style namespace. Objects already flagged __esModule pass through
// untouched; otherwise every own key except "default" is re-exported with a
// live binding (__createBinding) and the original module becomes `default`
// (__setModuleDefault).
var __importStar = (this && this.__importStar) || (function () {
    // Resolves the key-enumeration strategy on first use, then rebinds
    // `ownKeys` so subsequent calls skip the feature test.
    var ownKeys = function (o) {
        ownKeys = Object.getOwnPropertyNames || function (obj) {
            var keys = [];
            for (var key in obj) {
                if (Object.prototype.hasOwnProperty.call(obj, key)) keys[keys.length] = key;
            }
            return keys;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) {
            var keys = ownKeys(mod);
            for (var i = 0; i < keys.length; i++) {
                if (keys[i] !== "default") __createBinding(result, mod, keys[i]);
            }
        }
        __setModuleDefault(result, mod);
        return result;
    };
})();
// Mark this compiled CommonJS output as an ES-module interop target so
// downstream consumers (and __importStar) treat it as an ES module.
Object.defineProperty(exports, "__esModule", { value: true });
// Public API of this module (function declarations are hoisted, so these
// assignments are valid before the definitions below).
exports.compute_rbf_affinity = compute_rbf_affinity;
exports.compute_knn_affinity = compute_knn_affinity;
exports.compute_affinity_matrix = compute_affinity_matrix;
// Project-local TensorFlow.js adapter (namespace import) and the pairwise
// Euclidean distance helper used by compute_rbf_affinity.
const tf = __importStar(require("../tf-adapter"));
const pairwise_distance_1 = require("./pairwise_distance");
/**
 * Computes the RBF (Gaussian) kernel affinity matrix for the given points.
 *
 * A[i, j] = exp(-gamma * ||x_i - x_j||^2)
 *
 * • The diagonal is forced to exactly 1 (distance to self is 0).
 * • The result is symmetric by construction.
 *
 * @param points tf.Tensor2D of shape (nSamples, nFeatures).
 * @param gamma kernel width; defaults to 1 / nFeatures, mirroring the
 *        scikit-learn default used inside SpectralClustering so results match
 *        reference fixtures.
 * @returns tf.Tensor2D (nSamples, nSamples) affinity matrix.
 *
 * Wrapped in `tf.tidy` so all intermediate tensors are disposed once the
 * result has been returned.
 */
function compute_rbf_affinity(points, gamma) {
    return tf.tidy(() => {
        const featureCount = points.shape[1];
        const effectiveGamma = gamma ?? 1.0 / featureCount;
        // Pairwise Euclidean distances, then the Gaussian kernel on their squares.
        const distances = (0, pairwise_distance_1.pairwiseEuclideanMatrix)(points); // (n, n)
        const kernel = distances.square().mul(-effectiveGamma).exp();
        // Average with the transpose to cancel any numerical asymmetry…
        const symmetric = kernel.add(kernel.transpose()).div(2);
        // …then zero the diagonal and write back exact ones.
        const identity = tf.eye(symmetric.shape[0]);
        const offDiagonalMask = tf.scalar(1).sub(identity);
        return symmetric.mul(offDiagonalMask).add(identity);
    });
}
/**
 * Builds a (k-)nearest-neighbour adjacency / affinity matrix.
 *
 * For each sample the `k` closest neighbours are connected with affinity
 * value **1**. Self-loops are included to ensure connectivity, matching
 * sklearn's behavior. The final matrix is **symmetrised** as 0.5 * (A + Aᵀ),
 * so edges present in only one direction receive weight 0.5.
 *
 * The result is returned as a dense `tf.Tensor2D` containing zeros for
 * non-connected pairs. While a sparse representation would be more memory
 * efficient, downstream TensorFlow.js ops (e.g. eigen-decomposition) currently
 * expect dense tensors.
 *
 * @param points tf.Tensor2D of shape (nSamples, nFeatures).
 * @param k number of neighbours per sample; positive integer < nSamples.
 * @param includeSelf when true (default) the k neighbours include the sample
 *        itself; when false the self index is filtered out.
 * @returns dense tf.Tensor2D (nSamples, nSamples).
 * @throws Error when k is not a positive integer, when the input is empty,
 *         or when k >= nSamples.
 */
function compute_knn_affinity(points, k, includeSelf = true) {
    if (!Number.isInteger(k) || k < 1) {
        throw new Error('k (nNeighbors) must be a positive integer.');
    }
    const nSamples = points.shape[0];
    if (nSamples === 0) {
        throw new Error('Input points tensor must contain at least one sample.');
    }
    if (k >= nSamples) {
        throw new Error('k (nNeighbors) must be smaller than the number of samples.');
    }
    /* --------------------------------------------------------------------- */
    /* Implementation note – memory-efficient block-wise distance scanning    */
    /* --------------------------------------------------------------------- */
    // A naive implementation constructs the full pair-wise distance matrix
    // (n×n) and then selects the k closest entries per row — O(n²) memory.
    // Instead we process the data in row-blocks: for each block of b rows we
    // compute distances to *all* samples (b×n), a peak footprint of O(b·n).
    //
    // NOTE(review): tf.keep returns the very tensor it is given, so disposing
    // `pointsKept` below also disposes the caller's `points`. This mirrors the
    // original ownership behaviour — confirm against tf-adapter semantics.
    const pointsKept = tf.keep(points);
    const squaredNormsKept = tf.keep(pointsKept.square().sum(1)); // (n)
    const coords = [];
    // Empirically chosen – small enough to fit typical accelerator memory while
    // large enough to utilise BLAS throughput.
    const BLOCK_SIZE = 1024;
    try {
        for (let start = 0; start < nSamples; start += BLOCK_SIZE) {
            const b = Math.min(BLOCK_SIZE, nSamples - start);
            tf.tidy(() => {
                // Slice current block (b,d)
                const block = pointsKept.slice([start, 0], [b, -1]);
                // Efficient squared Euclidean distances using the identity
                // ‖x − y‖² = ‖x‖² + ‖y‖² − 2·xᵀy
                const blockNorms = squaredNormsKept.slice([start], [b]).reshape([b, 1]); // (b,1)
                const allNormsRow = squaredNormsKept.reshape([1, nSamples]); // (1,n)
                const cross = block.matMul(pointsKept.transpose()); // (b,n)
                const distsSquared = blockNorms.add(allNormsRow).sub(cross.mul(2)); // (b,n)
                // sqrt is unnecessary: squared distances preserve ordering.
                const negDists = distsSquared.neg(); // Want k smallest ⇒ largest of negative values.
                // includeSelf=false needs one extra candidate so the self index
                // can be filtered out afterwards.
                const topK = includeSelf ? k : k + 1;
                const { indices } = tf.topk(negDists, topK);
                // Sort each row's indices ascending for a deterministic order of
                // equal-distance neighbours.
                const indArr = indices.arraySync();
                for (let i = 0; i < b; i++) {
                    const rowGlobal = start + i;
                    indArr[i].sort((a, b) => a - b);
                    let neighbours;
                    if (includeSelf) {
                        // The k neighbours already include the sample itself.
                        neighbours = indArr[i];
                    }
                    else {
                        // Remove the self index, then trim back to exactly k.
                        neighbours = indArr[i].filter((idx) => idx !== rowGlobal).slice(0, k);
                    }
                    for (const nb of neighbours) {
                        coords.push([rowGlobal, nb]);
                    }
                }
            }); // tidy – dispose temporaries for this block
        }
    }
    finally {
        // Fix: a throw inside the loop (e.g. backend OOM in matMul/topk or an
        // arraySync failure) previously leaked both kept tensors; `finally`
        // guarantees they are released on every exit path.
        pointsKept.dispose();
        squaredNormsKept.dispose();
    }
    if (coords.length === 0) {
        return tf.zeros([nSamples, nSamples]);
    }
    // Scatter ones into a dense zero matrix – scatterND accepts plain JS
    // arrays for indices/shape; the backend converts them on the fly.
    return tf.tidy(() => {
        const values = tf.ones([coords.length]);
        const dense = tf.scatterND(coords, values, [
            nSamples,
            nSamples,
        ]);
        // Symmetrise: A = 0.5 * (A + Aᵀ) to match sklearn
        // This gives 0.5 for edges that only appear in one direction
        return dense.add(dense.transpose()).mul(0.5);
    });
}
/**
 * Convenience wrapper that dispatches to the appropriate affinity builder
 * based on the provided `affinity` option.
 *
 * @param points tf.Tensor2D of samples.
 * @param options `{ affinity, gamma?, nNeighbors? }` — 'rbf' routes to the
 *        Gaussian kernel; anything else builds the k-NN graph.
 * @returns tf.Tensor2D affinity matrix.
 */
function compute_affinity_matrix(points, options) {
    switch (options.affinity) {
        case 'rbf':
            return compute_rbf_affinity(points, options.gamma);
        default:
            // nearest neighbours - include self-loops for connectivity
            return compute_knn_affinity(points, options.nNeighbors, true);
    }
}