@graphty/algorithms
Version:
Graph algorithms library for browser environments implemented in TypeScript
375 lines • 13.3 kB
JavaScript
/**
* TeraHAC - Hierarchical Agglomerative Clustering for Large Graphs
*
* This algorithm performs hierarchical clustering on graphs by iteratively
* merging the closest clusters. Optimized for scalability to handle large
* graphs efficiently.
*
* Based on: "Scaling Hierarchical Agglomerative Clustering to Trillion-Edge Graphs"
* Google Research 2024
*
* @param graph - Input graph to cluster
* @param config - Configuration options
* @returns Hierarchical clustering result
*/
export function teraHAC(graph, config = {}) {
const { linkage = "average", numClusters, distanceThreshold, maxNodes = 10000, useGraphDistance = true, } = config;
const nodes = Array.from(graph.nodes());
const nodeCount = nodes.length;
if (nodeCount === 0) {
throw new Error("Cannot cluster empty graph");
}
if (nodeCount > maxNodes) {
console.warn(`Graph has ${String(nodeCount)} nodes, which exceeds maxNodes (${String(maxNodes)}). Performance may be degraded.`);
}
// Initialize each node as its own cluster
const clusters = new Map();
let nextClusterId = nodeCount;
for (let i = 0; i < nodeCount; i++) {
const node = nodes[i];
if (!node) {
continue;
}
const clusterId = i.toString();
clusters.set(clusterId, {
id: clusterId,
members: new Set([node.id]),
distance: 0,
size: 1,
});
}
// Calculate initial distance matrix
const distanceMatrix = calculateDistanceMatrix(graph, nodes, useGraphDistance);
const mergeDistances = [];
// Priority queue for efficient nearest neighbor finding
const mergeCandidates = initializeMergeCandidates(clusters, distanceMatrix);
let dendrogram;
// Perform agglomerative clustering
while (clusters.size > 1) {
// Find closest pair of clusters
const { cluster1Id, cluster2Id, distance } = findClosestPair(mergeCandidates);
// Check stopping criteria
if (numClusters && clusters.size <= numClusters) {
break;
}
if (distanceThreshold && distance > distanceThreshold) {
break;
}
// Get the two clusters to merge
const cluster1 = clusters.get(cluster1Id);
const cluster2 = clusters.get(cluster2Id);
if (!cluster1 || !cluster2) {
continue;
}
// Create new merged cluster
const newClusterId = (nextClusterId++).toString();
const mergedMembers = new Set([...cluster1.members, ...cluster2.members]);
const newCluster = {
id: newClusterId,
members: mergedMembers,
left: cluster1,
right: cluster2,
distance,
size: cluster1.size + cluster2.size,
};
// Remove old clusters
clusters.delete(cluster1Id);
clusters.delete(cluster2Id);
// Add new cluster
clusters.set(newClusterId, newCluster);
mergeDistances.push(distance);
// Update merge candidates
updateMergeCandidates(mergeCandidates, cluster1Id, cluster2Id, newClusterId, clusters, distanceMatrix, linkage);
dendrogram = newCluster;
}
// If we have multiple remaining clusters, create a virtual root
if (clusters.size > 1) {
const remainingClusters = Array.from(clusters.values());
let root = remainingClusters[0];
if (!root) {
dendrogram = undefined;
}
else {
for (let i = 1; i < remainingClusters.length; i++) {
const currentCluster = remainingClusters[i];
if (!currentCluster) {
continue;
}
const newRoot = {
id: (nextClusterId++).toString(),
members: new Set([...root.members, ...currentCluster.members]),
left: root,
right: currentCluster,
distance: Infinity,
size: root.size + currentCluster.size,
};
root = newRoot;
}
dendrogram = root;
}
dendrogram = root;
}
dendrogram ?? (dendrogram = Array.from(clusters.values())[0]);
// Extract flat clustering
const finalNumClusters = numClusters ?? clusters.size;
const flatClusters = dendrogram ? extractFlatClustering(dendrogram, finalNumClusters) : new Map();
if (!dendrogram) {
throw new Error("Failed to create dendrogram");
}
return {
dendrogram,
clusters: flatClusters,
distances: mergeDistances,
numClusters: finalNumClusters,
};
}
/**
* Calculate distance matrix between all pairs of nodes
*/
function calculateDistanceMatrix(graph, nodes, useGraphDistance) {
const n = nodes.length;
const matrix = Array.from({ length: n }, () => new Array(n).fill(Infinity));
if (useGraphDistance) {
// Use graph-based distances (shortest path)
for (let i = 0; i < n; i++) {
const node = nodes[i];
if (!node) {
continue;
}
const distances = bfsShortestPaths(graph, node.id);
for (let j = 0; j < n; j++) {
if (i !== j) {
const targetNode = nodes[j];
if (targetNode && i < matrix.length && j < n) {
const row = matrix[i];
if (row && j < row.length) {
const distance = distances.get(targetNode.id);
row[j] = distance ?? Infinity;
}
}
}
else if (i < matrix.length) {
const row = matrix[i];
if (row && j < row.length) {
row[j] = 0;
}
}
}
}
}
else {
// Use simple edge-based distances
for (let i = 0; i < n; i++) {
for (let j = i + 1; j < n; j++) {
const node1 = nodes[i];
const node2 = nodes[j];
if (node1 && node2) {
const hasEdge = graph.hasEdge(node1.id, node2.id);
const distance = hasEdge ? 1 : 2; // Connected: 1, not connected: 2
if (i < matrix.length) {
const rowI = matrix[i];
if (rowI && j < rowI.length) {
rowI[j] = distance;
}
}
if (j < matrix.length) {
const rowJ = matrix[j];
if (rowJ && i < rowJ.length) {
rowJ[i] = distance;
}
}
}
}
if (i < matrix.length) {
const row = matrix[i];
if (row && i < row.length) {
row[i] = 0;
}
}
}
}
return matrix;
}
/**
* BFS-based shortest path calculation from a source node
*/
function bfsShortestPaths(graph, source) {
const distances = new Map();
const queue = [[source, 0]];
const visited = new Set();
visited.add(source);
distances.set(source, 0);
while (queue.length > 0) {
const item = queue.shift();
if (!item) {
break;
}
const [current, distance] = item;
for (const neighbor of graph.neighbors(current)) {
if (!visited.has(neighbor)) {
visited.add(neighbor);
distances.set(neighbor, distance + 1);
queue.push([neighbor, distance + 1]);
}
}
}
return distances;
}
/**
* Initialize merge candidates priority queue
*/
function initializeMergeCandidates(clusters, distanceMatrix) {
const candidates = [];
const clusterIds = Array.from(clusters.keys());
for (let i = 0; i < clusterIds.length; i++) {
for (let j = i + 1; j < clusterIds.length; j++) {
const id1 = clusterIds[i];
const id2 = clusterIds[j];
if (!id1 || !id2) {
continue;
}
const row = distanceMatrix[parseInt(id1)];
if (!row) {
continue;
}
const distance = row[parseInt(id2)] ?? Infinity;
// Include all candidates, even disconnected ones (with finite but large distance)
candidates.push({
cluster1Id: id1,
cluster2Id: id2,
distance: distance === Infinity ? 100 : distance,
});
}
}
// Sort by distance (ascending)
candidates.sort((a, b) => a.distance - b.distance);
return candidates;
}
/**
* Find the closest pair of clusters
*/
function findClosestPair(mergeCandidates) {
// Return the first (closest) valid candidate
const candidate = mergeCandidates.shift();
if (!candidate) {
throw new Error("No merge candidates available");
}
return candidate;
}
/**
* Update merge candidates after a merge operation
*/
function updateMergeCandidates(mergeCandidates, oldCluster1Id, oldCluster2Id, newClusterId, clusters, distanceMatrix, linkage) {
// Remove candidates involving the merged clusters
for (let i = mergeCandidates.length - 1; i >= 0; i--) {
const candidate = mergeCandidates[i];
if (!candidate) {
continue;
}
if (candidate.cluster1Id === oldCluster1Id || candidate.cluster1Id === oldCluster2Id ||
candidate.cluster2Id === oldCluster1Id || candidate.cluster2Id === oldCluster2Id) {
mergeCandidates.splice(i, 1);
}
}
// Add new candidates for the merged cluster
const newCluster = clusters.get(newClusterId);
if (!newCluster) {
return;
}
for (const [clusterId, cluster] of clusters) {
if (clusterId !== newClusterId) {
const distance = calculateClusterDistance(newCluster, cluster, distanceMatrix, linkage);
mergeCandidates.push({
cluster1Id: newClusterId,
cluster2Id: clusterId,
distance: distance === Infinity ? 100 : distance,
});
}
}
// Re-sort candidates
mergeCandidates.sort((a, b) => a.distance - b.distance);
}
/**
* Calculate distance between two clusters based on linkage criterion
*/
function calculateClusterDistance(cluster1, cluster2, distanceMatrix, linkage) {
const members1 = Array.from(cluster1.members);
const members2 = Array.from(cluster2.members);
const distances = [];
// Calculate all pairwise distances between cluster members
for (const member1 of members1) {
for (const member2 of members2) {
const idx1 = parseInt(member1.toString());
const idx2 = parseInt(member2.toString());
if (idx1 < distanceMatrix.length && idx2 < distanceMatrix.length) {
const row = distanceMatrix[idx1];
if (row) {
const distance = row[idx2];
if (distance !== undefined) {
distances.push(distance);
}
}
}
}
}
if (distances.length === 0) {
return Infinity;
}
// Apply linkage criterion
switch (linkage) {
case "single":
return Math.min(...distances);
case "complete":
return Math.max(...distances);
case "average":
return distances.reduce((sum, d) => sum + d, 0) / distances.length;
case "ward":
// Simplified Ward linkage (would need cluster centroids for full implementation)
return distances.reduce((sum, d) => sum + (d * d), 0) / distances.length;
default:
return distances.reduce((sum, d) => sum + d, 0) / distances.length;
}
}
/**
* Extract flat clustering from dendrogram
*/
function extractFlatClustering(dendrogram, numClusters) {
const clusters = new Map();
if (numClusters === 1) {
// Single cluster
const clusterId = 0;
for (const member of dendrogram.members) {
clusters.set(member, clusterId);
}
return clusters;
}
// Find clusters at the specified level
const clusterNodes = [];
const queue = [dendrogram];
while (queue.length > 0 && clusterNodes.length < numClusters) {
const current = queue.shift();
if (!current) {
break;
}
if (!current.left || !current.right || clusterNodes.length + queue.length + 1 >= numClusters) {
// This is a leaf or we need to keep this level
clusterNodes.push(current);
}
else {
// Continue decomposing
queue.push(current.left, current.right);
}
}
// Assign cluster IDs
for (let i = 0; i < clusterNodes.length; i++) {
const cluster = clusterNodes[i];
if (!cluster) {
continue;
}
for (const member of cluster.members) {
clusters.set(member, i);
}
}
return clusters;
}
//# sourceMappingURL=terahac.js.map