UNPKG

@graphty/algorithms

Version:

Graph algorithms library for browser environments implemented in TypeScript

546 lines (459 loc) 15.9 kB
/** * Hierarchical Clustering Algorithm * * Builds a hierarchy of clusters through agglomeration (bottom-up) * or division (top-down). Useful for understanding multi-scale * structure in graphs. */ import type {Graph} from "../core/graph.js"; /** * Convert Graph to adjacency set representation for clustering */ function graphToAdjacencySet(graph: Graph): Map<string, Set<string>> { const adjacency = new Map<string, Set<string>>(); // Initialize all nodes for (const node of graph.nodes()) { adjacency.set(String(node.id), new Set()); } // Add edges for (const edge of graph.edges()) { const source = String(edge.source); const target = String(edge.target); const sourceSet = adjacency.get(source); if (sourceSet) { sourceSet.add(target); } // For undirected graphs, add reverse edge if (!graph.isDirected) { const targetSet = adjacency.get(target); if (targetSet) { targetSet.add(source); } } } return adjacency; } export interface ClusterNode<T> { id: string; members: Set<T>; left?: ClusterNode<T> | undefined; right?: ClusterNode<T> | undefined; distance: number; height: number; // For forest roots, store the individual trees trees?: ClusterNode<T>[] | undefined; } export interface HierarchicalClusteringResult<T> { root: ClusterNode<T>; dendrogram: ClusterNode<T>[]; clusters: Map<number, Set<T>[]>; } export type LinkageMethod = "single" | "complete" | "average" | "ward"; /** * Compute distance between two nodes based on graph structure */ function computeGraphDistance<T>( graph: Map<T, Set<T>>, distances: Map<T, Map<T, number>>, ): void { // Use BFS to compute shortest path distances for (const start of graph.keys()) { const dist = new Map<T, number>(); const queue: T[] = [start]; dist.set(start, 0); while (queue.length > 0) { const node = queue.shift(); if (!node) { continue; } const neighbors = graph.get(node); if (neighbors) { for (const neighbor of neighbors) { if (!dist.has(neighbor)) { dist.set(neighbor, (dist.get(node) ?? 0) + 1); queue.push(neighbor); } } } } distances.set(start, dist); } } /** * Compute distance between two clusters based on linkage method */ function clusterDistance<T>( cluster1: ClusterNode<T>, cluster2: ClusterNode<T>, distances: Map<T, Map<T, number>>, linkage: LinkageMethod, ): number { const allDistances: number[] = []; for (const node1 of cluster1.members) { const node1Distances = distances.get(node1); if (!node1Distances) { continue; } for (const node2 of cluster2.members) { const dist = node1Distances.get(node2); if (dist !== undefined) { allDistances.push(dist); } } } if (allDistances.length === 0) { return Infinity; } switch (linkage) { case "single": return Math.min(... allDistances); case "complete": return Math.max(... allDistances); case "average": return allDistances.reduce((a, b) => a + b, 0) / allDistances.length; case "ward": { // Simplified Ward's method - minimize within-cluster variance const size1 = cluster1.members.size; const size2 = cluster2.members.size; const avgDist = allDistances.reduce((a, b) => a + b, 0) / allDistances.length; return avgDist * ((size1 * size2) / (size1 + size2)); } default: return Math.min(... allDistances); } } /** * Internal implementation of agglomerative hierarchical clustering */ function hierarchicalClusteringImpl<T>( graph: Map<T, Set<T>>, linkage: LinkageMethod = "single", ): HierarchicalClusteringResult<T> { const nodes = Array.from(graph.keys()); const n = nodes.length; if (n === 0) { return { root: {id: "empty", members: new Set(), distance: 0, height: 0}, dendrogram: [], clusters: new Map(), }; } // Compute pairwise distances const distances = new Map<T, Map<T, number>>(); computeGraphDistance(graph, distances); // Initialize clusters - each node is its own cluster const clusters: ClusterNode<T>[] = nodes.map((node, i) => ({ id: `leaf-${String(i)}`, members: new Set([node]), distance: 0, height: 0, })); const dendrogram: ClusterNode<T>[] = [... clusters]; let clusterCount = n; // Distance matrix between clusters const clusterDistances = new Map<string, Map<string, number>>(); // Initialize cluster distances for (let i = 0; i < clusters.length; i++) { const cluster1 = clusters[i]; if (!cluster1) { continue; } const distMap = new Map<string, number>(); for (let j = i + 1; j < clusters.length; j++) { const cluster2 = clusters[j]; if (!cluster2) { continue; } const dist = clusterDistance(cluster1, cluster2, distances, linkage); distMap.set(cluster2.id, dist); } clusterDistances.set(cluster1.id, distMap); } // Active clusters const activeClusters = new Set(clusters); // Merge clusters until only one remains while (activeClusters.size > 1) { // Find closest pair of clusters let minDist = Infinity; let mergeCluster1: ClusterNode<T> | null = null; let mergeCluster2: ClusterNode<T> | null = null; for (const cluster1 of activeClusters) { const distMap = clusterDistances.get(cluster1.id); if (!distMap) { continue; } for (const cluster2 of activeClusters) { if (cluster1.id >= cluster2.id) { continue; } const dist = distMap.get(cluster2.id) ?? clusterDistances.get(cluster2.id)?.get(cluster1.id) ?? Infinity; if (dist < minDist) { minDist = dist; mergeCluster1 = cluster1; mergeCluster2 = cluster2; } } } if (!mergeCluster1 || !mergeCluster2 || minDist === Infinity) { // No valid merge found - create forest const trees = Array.from(activeClusters); const forestRoot: ClusterNode<T> = { id: `forest-${String(clusterCount++)}`, members: new Set(nodes), distance: Infinity, height: Math.max(... trees.map((t) => t.height)) + 1, trees, }; dendrogram.push(forestRoot); activeClusters.clear(); activeClusters.add(forestRoot); break; } // Create new cluster const newCluster: ClusterNode<T> = { id: `cluster-${String(clusterCount++)}`, members: new Set([... mergeCluster1.members, ... mergeCluster2.members]), left: mergeCluster1, right: mergeCluster2, distance: minDist, height: Math.max(mergeCluster1.height, mergeCluster2.height) + 1, }; dendrogram.push(newCluster); // Update distances to new cluster const newDistMap = new Map<string, number>(); for (const cluster of activeClusters) { if (cluster.id === mergeCluster1.id || cluster.id === mergeCluster2.id) { continue; } const dist = clusterDistance(newCluster, cluster, distances, linkage); newDistMap.set(cluster.id, dist); // Update reverse direction const clusterDistMap = clusterDistances.get(cluster.id); if (clusterDistMap) { clusterDistMap.set(newCluster.id, dist); } } clusterDistances.set(newCluster.id, newDistMap); // Remove merged clusters activeClusters.delete(mergeCluster1); activeClusters.delete(mergeCluster2); clusterDistances.delete(mergeCluster1.id); clusterDistances.delete(mergeCluster2.id); // Clean up references for (const distMap of clusterDistances.values()) { distMap.delete(mergeCluster1.id); distMap.delete(mergeCluster2.id); } activeClusters.add(newCluster); } // Root is the last remaining cluster const root = activeClusters.size > 0 ? Array.from(activeClusters)[0] : dendrogram[dendrogram.length - 1]; // Generate clusters at different heights const clustersByLevel = new Map<number, Set<T>[]>(); if (!root) { return { root: {id: "empty", members: new Set(), distance: 0, height: 0}, dendrogram: [], clusters: new Map(), }; } for (let h = 0; h <= root.height; h++) { clustersByLevel.set(h, cutDendrogram(root, h)); } return { root, dendrogram, clusters: clustersByLevel, }; } /** * Cut dendrogram at specific height to get clusters */ export function cutDendrogram<T>( root: ClusterNode<T>, height: number, ): Set<T>[] { const clusters: Set<T>[] = []; function traverse(node: ClusterNode<T>): void { if (node.height <= height || (!node.left && !node.right)) { clusters.push(node.members); } else { if (node.trees) { // Handle forest nodes for (const tree of node.trees) { traverse(tree); } } else { if (node.left) { traverse(node.left); } if (node.right) { traverse(node.right); } } } } traverse(root); return clusters; } /** * Cut dendrogram to get exactly k clusters */ export function cutDendrogramKClusters<T>( root: ClusterNode<T>, k: number, ): Set<T>[] { if (k <= 0) { return []; } if (k === 1) { return [root.members]; } // Binary search for the right height let low = 0; let high = root.height; while (low < high) { const mid = Math.floor((low + high) / 2); const clusters = cutDendrogram(root, mid); if (clusters.length === k) { return clusters; } else if (clusters.length < k) { high = mid; } else { low = mid + 1; } } return cutDendrogram(root, low); } /** * Compute modularity-based hierarchical clustering * Uses modularity gain to decide merges */ export function modularityHierarchicalClustering( graph: Graph, ): HierarchicalClusteringResult<string> { const adjacencySet = graphToAdjacencySet(graph); const nodes = Array.from(adjacencySet.keys()); const n = nodes.length; if (n === 0) { return { root: {id: "empty", members: new Set(), distance: 0, height: 0}, dendrogram: [], clusters: new Map(), }; } // Compute degrees const degrees = new Map<string, number>(); let totalEdges = 0; for (const [node, neighbors] of adjacencySet) { degrees.set(node, neighbors.size); totalEdges += neighbors.size; } // For undirected graphs totalEdges = totalEdges / 2; // Initialize clusters const clusters: ClusterNode<string>[] = nodes.map((node, i) => ({ id: `leaf-${String(i)}`, members: new Set([node]), distance: 0, height: 0, })); const dendrogram: ClusterNode<string>[] = [... clusters]; let clusterCount = n; // Active clusters const activeClusters = new Set(clusters); // Community assignments const communities = new Map<string, number>(); nodes.forEach((node, i) => communities.set(node, i)); // Merge clusters based on modularity gain while (activeClusters.size > 1) { let maxGain = -Infinity; let mergeCluster1: ClusterNode<string> | null = null; let mergeCluster2: ClusterNode<string> | null = null; for (const cluster1 of activeClusters) { for (const cluster2 of activeClusters) { if (cluster1.id >= cluster2.id) { continue; } // Calculate modularity gain let edgesBetween = 0; let degreeProduct = 0; for (const node1 of cluster1.members) { const neighbors = adjacencySet.get(node1); const degree1 = degrees.get(node1) ?? 0; for (const node2 of cluster2.members) { if (neighbors?.has(node2)) { edgesBetween++; } const degree2 = degrees.get(node2) ?? 0; degreeProduct += degree1 * degree2; } } const gain = (edgesBetween / totalEdges) - (degreeProduct / (4 * totalEdges * totalEdges)); if (gain > maxGain) { maxGain = gain; mergeCluster1 = cluster1; mergeCluster2 = cluster2; } } } if (!mergeCluster1 || !mergeCluster2) { break; } // Create new cluster const newCluster: ClusterNode<string> = { id: `cluster-${String(clusterCount++)}`, members: new Set([... mergeCluster1.members, ... mergeCluster2.members]), left: mergeCluster1, right: mergeCluster2, distance: -maxGain, // Use negative gain as distance height: Math.max(mergeCluster1.height, mergeCluster2.height) + 1, }; dendrogram.push(newCluster); // Remove old and add new activeClusters.delete(mergeCluster1); activeClusters.delete(mergeCluster2); activeClusters.add(newCluster); } // Root is the last remaining cluster const root = Array.from(activeClusters)[0] ?? dendrogram[dendrogram.length - 1]; // Generate clusters at different heights const clustersByLevel = new Map<number, Set<string>[]>(); if (!root) { return { root: {id: "empty", members: new Set(), distance: 0, height: 0}, dendrogram: [], clusters: new Map(), }; } for (let h = 0; h <= root.height; h++) { clustersByLevel.set(h, cutDendrogram(root, h)); } return { root, dendrogram, clusters: clustersByLevel, }; } /** * Agglomerative hierarchical clustering * Builds clusters bottom-up by merging closest pairs * * @param graph - Undirected graph - accepts Graph class or Map<T, Set<T>> * @param linkage - Linkage method for cluster distance * @returns Hierarchical clustering result * * Time Complexity: O(n³) naive, O(n² log n) with heap * Space Complexity: O(n²) */ export function hierarchicalClustering( graph: Graph, linkage: LinkageMethod = "single", ): HierarchicalClusteringResult<string> { // Convert Graph to adjacency set representation const adjacencySet = graphToAdjacencySet(graph); return hierarchicalClusteringImpl(adjacencySet, linkage); }