clusternova
HDBSCAN clustering algorithm implementation in TypeScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.findCentralElements = exports.cosine = exports.manhattan = exports.euclidean = void 0;
/**
* HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) implementation
* @template T - Type of data points, must include 'id' and 'vector' properties
*/
class HDBSCAN {
/**
* Creates a new HDBSCAN instance
* @param X - Array of data points to cluster
* @param mpts - Minimum points required to form a dense region (minimum cluster size)
* @param distanceFunction - Optional function to calculate distance between points (defaults to cosine distance)
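* @example
* // A minimal usage sketch (hypothetical data): three labelled points,
* // a minimum cluster size of 2, and the exported euclidean distance
* // in place of the default cosine distance.
* const points = [
*     { id: "a", vector: [0, 0] },
*     { id: "b", vector: [0, 1] },
*     { id: "c", vector: [10, 10] },
* ];
* const hdbscan = new HDBSCAN(points, 2, euclidean);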
*/
constructor(X, mpts, distanceFunction) {
this.X = X;
this.allNearestNeighbors = new Map();
this.mpts = mpts;
this.coreDistances = new Map();
this.mrg = new Map();
this.mstEdges = [];
this.distanceFunction = distanceFunction ?? cosine;
this.idToObject = new Map();
this.X.forEach((p) => {
this.idToObject.set(p.id, p);
});
}
/**
* Runs the HDBSCAN clustering algorithm
* @returns Object containing clusters and outliers
* @returns {T[][]} clusters - Array of clusters, where each cluster is an array of data points
* @returns {T[]} outliers - Array of data points that don't belong to any cluster
* @throws {Error} If an error occurs during clustering
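* @example
* // Continuing the constructor sketch above: run() maps the clustered ids
* // back to the original objects before returning them.
* const { clusters, outliers } = hdbscan.run();
* clusters.forEach((cluster) => console.log(cluster.map((p) => p.id)));
* console.log("outliers:", outliers.map((p) => p.id));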
*/
run() {
if (this.X.length === 0) {
return { clusters: [], outliers: [] };
}
try {
this._computeAllNearestNeighbors();
this._computeCoreDistances();
this._constructMRG();
this._computeMST();
const { clusters, outliers } = this._extractHDBSCANHierarchy();
// Transform IDs into original objects
const clustersWithOriginalObjs = clusters.map((cluster) => cluster.map((id) => this.idToObject.get(id)));
const outliersWithOriginalObjs = outliers.map((id) => this.idToObject.get(id));
return {
clusters: clustersWithOriginalObjs,
outliers: outliersWithOriginalObjs,
};
}
catch (e) {
console.error("Error in HDBSCAN:", e);
// Rethrow so callers can handle the failure themselves
throw e;
}
}
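// Computes the distance between every pair of points, reusing each symmetric distance rather than computing it twice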
_computeAllNearestNeighbors() {
this.X.forEach((from, fromIndex) => {
this.allNearestNeighbors.set(from.id, new Map());
this.X.forEach((to, toIndex) => {
// Skip the distance from a point to itself
if (fromIndex === toIndex) {
return;
}
if (this.allNearestNeighbors.get(to.id)?.has(from.id)) {
// Already calculated; just mirror it in the other direction
this.allNearestNeighbors
.get(from.id)
.set(to.id, this.allNearestNeighbors.get(to.id).get(from.id));
}
else {
// Calculate the distance from the current point to this one
const distance = this.distanceFunction(from.vector, to.vector);
this.allNearestNeighbors.get(from.id).set(to.id, distance);
}
});
});
}
_computeCoreDistances() {
// Iterate over all points in the dataset
this.allNearestNeighbors.forEach((edgesAtPoint, id) => {
const distances = Array.from(edgesAtPoint.values());
// Sort the array of distances to find the mpts-th smallest distance
// TODO: potentially optimize with quickselect
distances.sort((a, b) => a - b);
if (distances.length < this.mpts) {
// error check if mpts is greater than the number of points
throw new Error("mpts is greater than the number of points in the dataset");
}
// The core distance is the distance to the mpts-th nearest neighbor
const coreDistance = distances[this.mpts - 1]; // Adjusted for the zero-based array index
this.coreDistances.set(id, coreDistance);
});
}
_constructMRG() {
// Prepare graph nodes
this.X.forEach((point) => {
this.mrg.set(point.id, new Map()); // Each point has a map to others with distances
});
// Now compute the mutual reachability distance for each pair of points and populate the graph
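// d_mreach(a, b) = max(coreDistance(a), coreDistance(b), d(a, b))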
this.allNearestNeighbors.forEach((edgesAtFrom, fromId) => {
edgesAtFrom.forEach((distanceTo, toId) => {
const mutualReachabilityDistance = Math.max(this.coreDistances.get(fromId), this.coreDistances.get(toId), distanceTo);
this.mrg.get(fromId).set(toId, mutualReachabilityDistance);
//don't need to set the other direction since we're iterating over all edges so the other direction will be added
});
});
}
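// Computes a minimum spanning tree of the mutual reachability graph using Prim's algorithm with a binary min-heap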
_computeMST() {
// Start from the first point (you could start from any)
const startId = this.X[0].id;
const pq = new PriorityQueue((a, b) => a.weight < b.weight); // Min-Heap priority queue
// Initialize the priority queue with all edges from the starting vertex
const edgesFromStart = this.mrg.get(startId);
edgesFromStart.forEach((weight, id) => {
pq.enqueue({ from: startId, to: id, weight });
});
// Set to keep track of vertices included in the MST
const inMST = new Set();
inMST.add(startId);
// Building the MST
while (!pq.isEmpty()) {
const result = pq.dequeue();
if (result === null) {
throw new Error("Priority queue dequeued null value");
}
const { from, to, weight } = result;
// Check if the 'to' vertex is already included in the MST
if (!inMST.has(to)) {
inMST.add(to);
//add edge to mstEdges
this.mstEdges.push({ from, to, weight });
// Add all edges from the 'to' vertex to the priority queue
const edgesFromTo = this.mrg.get(to);
edgesFromTo.forEach((nextWeight, nextId) => {
if (!inMST.has(nextId)) {
pq.enqueue({ from: to, to: nextId, weight: nextWeight });
}
});
}
}
//sort the mstEdges by weight in ascending order
this.mstEdges.sort((a, b) => a.weight - b.weight);
}
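// Builds the condensed cluster hierarchy from the sorted MST edges, scores each cluster by its stability, and selects the most stable non-overlapping clusters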
_extractHDBSCANHierarchy() {
// Initialize the union-find structure
const uf = new UnionFind(this.X.map((point) => point.id));
// Array to store the hierarchy steps, where each element is an object:
const hierarchy = [];
// map from point ID to its current highest index in hierarchy:
const pointToHierarchyIndex = new Map();
// To start, each point is in its own group (this is effectively our version of the MST_ext step)
// Map of current groups (both clusters and noise points); key: id of the group's root, value: array of ids in the group
const currentGroups = new Map();
for (const key of this.X.map((point) => point.id)) {
currentGroups.set(key, [key]);
}
// Merge clusters based on sorted edges
this.mstEdges.forEach((edge) => {
const { from, to, weight } = edge;
const rootFrom = uf.find(from);
const rootTo = uf.find(to);
const sizeFrom = currentGroups.get(rootFrom).length;
const sizeTo = currentGroups.get(rootTo).length;
const newSize = sizeFrom + sizeTo;
// Merge the two groups in the union-find structure
uf.union(from, to);
// Find the root of the newly merged group
const newRoot = uf.find(from);
const newElements = currentGroups
.get(rootFrom)
.concat(currentGroups.get(rootTo));
if (newSize >= this.mpts && sizeFrom < this.mpts && sizeTo < this.mpts) {
// merge two noise groups to form a new cluster!
// push the new cluster to the hierarchy
hierarchy.push({
childrenClusters: null,
elements: newElements,
lambdaPs: new Array(newElements.length).fill(1 / weight),
lambdaMin: null,
lambdaMax: 1 / weight,
});
// update the root of the new cluster in the pointToHierarchyIndex map
pointToHierarchyIndex.set(newRoot, hierarchy.length - 1);
}
else if (newSize >= this.mpts &&
sizeFrom >= this.mpts &&
sizeTo >= this.mpts) {
// merge two clusters to form a new cluster!
// push the new cluster to the hierarchy
hierarchy.push({
childrenClusters: [
pointToHierarchyIndex.get(rootFrom),
pointToHierarchyIndex.get(rootTo),
],
elements: newElements,
lambdaPs: new Array(newElements.length).fill(1 / weight),
lambdaMin: null,
lambdaMax: 1 / weight,
});
//update the lambdaMin of the two children clusters
hierarchy[pointToHierarchyIndex.get(rootFrom)].lambdaMin = 1 / weight;
hierarchy[pointToHierarchyIndex.get(rootTo)].lambdaMin = 1 / weight;
// update the root of the new cluster in the pointToHierarchyIndex map
pointToHierarchyIndex.set(newRoot, hierarchy.length - 1);
}
else if (newSize >= this.mpts) {
// merge a noise group with a cluster so a cluster grows bigger
if (pointToHierarchyIndex.get(newRoot) === undefined) {
// Union by rank can make the noise group's root the root of the merged group,
// so carry over the existing cluster's hierarchy index from the other group
const existingIndex = pointToHierarchyIndex.get(rootFrom) ?? pointToHierarchyIndex.get(rootTo);
pointToHierarchyIndex.set(newRoot, existingIndex);
}
// find the existing cluster and modify it:
const updateCluster = hierarchy[pointToHierarchyIndex.get(newRoot)];
updateCluster.elements = newElements;
const mergeSize = sizeFrom < this.mpts ? sizeFrom : sizeTo; // the size of the group that was noise
for (let i = 0; i < mergeSize; i++) {
updateCluster.lambdaPs.push(1 / weight);
}
}
currentGroups.set(newRoot, newElements);
currentGroups.delete(newRoot === rootFrom ? rootTo : rootFrom); // the other root was merged into newRoot, so its entry is no longer needed
});
// Remove the last element of the hierarchy array because we don't care about the root
hierarchy.pop();
// creating and condensing the hierarchy tree is done! Now we calculate stabilities of every cluster:
const stabilities = new Array(hierarchy.length).fill(0); // index is the index in the hierarchy array, value is the stability
for (let i = 0; i < hierarchy.length; i++) {
for (let j = 0; j < hierarchy[i].lambdaPs.length; j++) {
stabilities[i] += hierarchy[i].lambdaPs[j] - (hierarchy[i].lambdaMin ?? 0);
}
}
// Now we loop through the clusters in reverse topological order (children before parents) and set s_hat such that:
// s_hat(cluster_i) = stabilities(cluster_i) if cluster_i is a leaf node
// s_hat(cluster_i) = max(stabilities(cluster_i), s_hat(left_child) + s_hat(right_child)) otherwise
// We select a cluster when its own stability is at least the sum of its children's s_hat values
const isSelected = new Array(hierarchy.length).fill(false); //index is the index in the hierarchy array, boolean value is whether we select it as a cluster
const s_hat = new Array(hierarchy.length).fill(0); // index is the index in the hierarchy array, value is the s_hat value
for (let i = 0; i < hierarchy.length; i++) {
if (hierarchy[i].childrenClusters === null) {
//cluster_i is leaf node
s_hat[i] = stabilities[i];
isSelected[i] = true;
}
else {
const i_left_child_index = hierarchy[i].childrenClusters[0]; // children are pushed before their parent, so these indices exist
const i_right_child_index = hierarchy[i].childrenClusters[1];
const i_left_child = s_hat[i_left_child_index];
const i_right_child = s_hat[i_right_child_index];
if (stabilities[i] < i_left_child + i_right_child) {
s_hat[i] = i_left_child + i_right_child;
isSelected[i] = false;
}
else {
s_hat[i] = stabilities[i];
isSelected[i] = true;
// Deselect the entire subtree below this cluster, not just its direct
// children, so that the selected clusters never overlap
const stack = [i_left_child_index, i_right_child_index];
while (stack.length > 0) {
const childIndex = stack.pop();
isSelected[childIndex] = false;
const grandChildren = hierarchy[childIndex].childrenClusters;
if (grandChildren !== null) {
stack.push(...grandChildren);
}
}
}
}
}
const clusters = []; // Each cluster is an array of string ids (string[][])
const outliers = []; // List of outlier ids (string[])
// Finally, loop through the hierarchy to collect the clusters that we selected
for (let i = 0; i < hierarchy.length; i++) {
if (isSelected[i]) {
clusters.push(hierarchy[i].elements);
}
}
// find the outliers now
const allIds = new Set(this.X.map((point) => point.id));
clusters.forEach((cluster) => {
cluster.forEach((id) => {
allIds.delete(id);
});
});
outliers.push(...allIds);
return { clusters, outliers };
}
}
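/**
* Euclidean (L2) distance between two equal-length vectors
* @param pointA - First vector
* @param pointB - Second vector
* @returns sqrt(sum((pointA[i] - pointB[i])^2))
* @throws {Error} If the vectors have different lengths
*/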
function euclidean(pointA, pointB) {
if (pointA.length !== pointB.length) {
throw new Error("unequal dimension in input data");
}
let sum = 0;
for (let i = 0; i < pointA.length; i++) {
const diff = pointA[i] - pointB[i];
sum += diff * diff;
}
return Math.sqrt(sum);
}
exports.euclidean = euclidean;
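/**
* Manhattan (L1) distance between two equal-length vectors
* @param pointA - First vector
* @param pointB - Second vector
* @returns sum(|pointA[i] - pointB[i]|)
* @throws {Error} If the vectors have different lengths
*/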
function manhattan(pointA, pointB) {
if (pointA.length !== pointB.length) {
throw new Error("unequal dimension in input data");
}
let sum = 0;
for (let i = 0; i < pointA.length; i++) {
sum += Math.abs(pointA[i] - pointB[i]);
}
return sum;
}
exports.manhattan = manhattan;
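/**
* Cosine distance (1 - cosine similarity) between two equal-length vectors;
* zero-norm vectors are treated as maximally dissimilar (distance 1)
* @param pointA - First vector
* @param pointB - Second vector
* @returns A value in [0, 2], where 0 means identical direction
* @throws {Error} If the vectors have different lengths
*/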
function cosine(pointA, pointB) {
if (pointA.length !== pointB.length) {
throw new Error("unequal dimension in input data");
}
let dotProduct = 0.0;
let normA = 0.0;
let normB = 0.0;
for (let i = 0; i < pointA.length; i++) {
dotProduct += pointA[i] * pointB[i];
normA += pointA[i] * pointA[i];
normB += pointB[i] * pointB[i];
}
if (normA === 0 || normB === 0) {
return 1;
}
const similarity = dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
return 1 - similarity;
}
exports.cosine = cosine;
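/**
* Binary-heap priority queue; the comparator decides the ordering
* (e.g. (a, b) => a.weight < b.weight yields a min-heap by weight)
*/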
class PriorityQueue {
constructor(comparator) {
this._heap = [];
this._comparator = comparator;
}
enqueue(value) {
this._heap.push(value);
this._siftUp();
}
dequeue() {
if (this.isEmpty()) {
return null;
}
const poppedValue = this._heap[0];
const bottomValue = this._heap.pop();
if (this._heap.length > 0 && bottomValue !== undefined) {
this._heap[0] = bottomValue;
this._siftDown();
}
return poppedValue;
}
isEmpty() {
return this._heap.length === 0;
}
_siftUp() {
let nodeIdx = this._heap.length - 1;
while (nodeIdx > 0 &&
this._comparator(this._heap[nodeIdx], this._heap[Math.floor((nodeIdx - 1) / 2)])) {
this._swap(nodeIdx, Math.floor((nodeIdx - 1) / 2));
nodeIdx = Math.floor((nodeIdx - 1) / 2);
}
}
_siftDown() {
let nodeIdx = 0;
while ((2 * nodeIdx + 1 < this._heap.length &&
this._comparator(this._heap[2 * nodeIdx + 1], this._heap[nodeIdx])) ||
(2 * nodeIdx + 2 < this._heap.length &&
this._comparator(this._heap[2 * nodeIdx + 2], this._heap[nodeIdx]))) {
const smallerChildIdx = 2 * nodeIdx + 2 < this._heap.length &&
this._comparator(this._heap[2 * nodeIdx + 2], this._heap[2 * nodeIdx + 1])
? 2 * nodeIdx + 2
: 2 * nodeIdx + 1;
this._swap(nodeIdx, smallerChildIdx);
nodeIdx = smallerChildIdx;
}
}
_swap(i, j) {
[this._heap[i], this._heap[j]] = [this._heap[j], this._heap[i]];
}
}
/**
* Finds the n most central elements in a cluster of vectors
* @template T Type of cluster elements extending {id: string; vector: number[]}
* @param cluster Array of objects containing at least {id, vector} properties
* @param n Number of central elements to return (must be >= 1)
* @param distanceFunction Optional distance function (defaults to cosine)
* @returns Array of input objects with additional distance property, representing the n most central elements, sorted by distance from centroid
* @throws Error if n < 1 or cluster is empty
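* @example
* // A minimal sketch with hypothetical data: the two points closest to the
* // centroid [1, 1], by euclidean distance.
* const central = findCentralElements(
*     [
*         { id: "a", vector: [0, 0] },
*         { id: "b", vector: [1, 1] },
*         { id: "c", vector: [2, 2] },
*     ],
*     2,
*     euclidean
* );
* // central[0].id === "b" (it sits exactly on the centroid, distance 0)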
*/
function findCentralElements(cluster, n, distanceFunction = cosine) {
if (n < 1) {
throw new Error("Number of central elements must be at least 1");
}
if (cluster.length === 0) {
throw new Error("Cannot find central elements of empty cluster");
}
const vectors = cluster.map((p) => p.vector);
const dimensions = vectors[0].length;
const centroid = new Array(dimensions).fill(0);
for (const vector of vectors) {
if (vector.length !== dimensions) {
throw new Error("All vectors must have the same dimensions");
}
for (let i = 0; i < dimensions; i++) {
centroid[i] += vector[i];
}
}
for (let i = 0; i < dimensions; i++) {
centroid[i] /= vectors.length;
}
// Find n closest points to centroid
return cluster
.map((point) => ({
...point,
distance: distanceFunction(point.vector, centroid),
}))
.sort((a, b) => a.distance - b.distance)
.slice(0, n);
}
exports.findCentralElements = findCentralElements;
// Union-find (disjoint-set) data structure with path compression and union by rank
class UnionFind {
constructor(elements) {
this.parent = new Map();
this.rank = new Map();
elements.forEach((e) => {
this.parent.set(e, e); // Each element is the parent of itself
this.rank.set(e, 0); // Rank of each element is 0 initially
});
}
find(item) {
if (this.parent.get(item) !== item) {
this.parent.set(item, this.find(this.parent.get(item)));
}
return this.parent.get(item);
}
union(item1, item2) {
const root1 = this.find(item1);
const root2 = this.find(item2);
if (root1 === root2)
return;
const rank1 = this.rank.get(root1);
const rank2 = this.rank.get(root2);
if (rank1 > rank2) {
this.parent.set(root2, root1);
}
else if (rank1 < rank2) {
this.parent.set(root1, root2);
}
else {
this.parent.set(root2, root1);
this.rank.set(root1, rank1 + 1);
}
}
}
exports.default = HDBSCAN;
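// Example usage (a sketch, not part of the module: the package name
// "clusternova" comes from the header above, and the data points are
// hypothetical):
//
// const { default: HDBSCAN, euclidean, findCentralElements } = require("clusternova");
//
// const points = [
//     { id: "p1", vector: [0, 0] },
//     { id: "p2", vector: [0.1, 0] },
//     { id: "p3", vector: [0, 0.1] },
//     { id: "p4", vector: [5, 5] },
// ];
// const { clusters, outliers } = new HDBSCAN(points, 2, euclidean).run();
// const representatives = clusters.map((c) => findCentralElements(c, 1)[0]);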