UNPKG

clustering-tfjs

Version:

High-performance TypeScript clustering algorithms (K-Means, Spectral, Agglomerative) with TensorFlow.js acceleration and scikit-learn compatibility

165 lines (164 loc) 6.61 kB
"use strict";
// --- TypeScript-emitted CommonJS interop helpers (generated code, logic kept as emitted) ---
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
      desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.findOptimalClusters = findOptimalClusters;
const tf = __importStar(require("../tf-adapter"));
const kmeans_1 = require("../clustering/kmeans");
const spectral_1 = require("../clustering/spectral");
const agglomerative_1 = require("../clustering/agglomerative");
const silhouette_1 = require("../validation/silhouette");
const davies_bouldin_1 = require("../validation/davies_bouldin");
const calinski_harabasz_1 = require("../validation/calinski_harabasz");
const tensor_utils_1 = require("./tensor-utils");

/**
 * Builds the clustering estimator for one candidate cluster count.
 *
 * `nClusters` is written after the user params are spread so that a stray
 * `nClusters` key in `algorithmParams` cannot override the k being evaluated.
 *
 * @param algorithm - One of 'kmeans', 'spectral' or 'agglomerative'
 * @param k - Number of clusters to configure the estimator with
 * @param algorithmParams - Extra constructor options forwarded to the estimator
 * @returns A new estimator instance
 * @throws {Error} If `algorithm` is not one of the supported names
 */
function createClusterer(algorithm, k, algorithmParams) {
    const params = { ...algorithmParams, nClusters: k };
    switch (algorithm) {
        case 'kmeans':
            return new kmeans_1.KMeans(params);
        case 'spectral':
            return new spectral_1.SpectralClustering(params);
        case 'agglomerative':
            return new agglomerative_1.AgglomerativeClustering(params);
        default:
            throw new Error(`Unknown algorithm: ${algorithm}`);
    }
}

/**
 * Default combined score: higher silhouette and Calinski-Harabasz are better,
 * lower Davies-Bouldin is better.
 *
 * Only metrics that were actually requested contribute. Metrics that were not
 * requested keep their placeholder values (0 / Infinity) in the evaluation
 * record, and subtracting the Infinity placeholder would turn every score into
 * -Infinity and make the ranking meaningless.
 *
 * @param evaluation - Evaluation record holding the metric values
 * @param metrics - Names of the metrics that were computed
 * @returns The combined score for the evaluation
 */
function defaultCombinedScore(evaluation, metrics) {
    let score = 0;
    if (metrics.includes('silhouette')) {
        score += evaluation.silhouette;
    }
    if (metrics.includes('calinskiHarabasz')) {
        score += evaluation.calinskiHarabasz;
    }
    if (metrics.includes('daviesBouldin')) {
        score -= evaluation.daviesBouldin;
    }
    return score;
}

/**
 * Automatically finds the optimal number of clusters for a dataset by evaluating
 * multiple k values using validation metrics.
 *
 * For every k in `[minClusters, min(maxClusters, nSamples - 1)]` the chosen
 * algorithm is fitted, the requested metrics are computed, and a combined score
 * is derived (via `scoringFunction` when supplied, otherwise
 * silhouette + calinskiHarabasz - daviesBouldin over the requested metrics).
 * Evaluations are returned sorted by combined score, best first.
 *
 * A tensor created internally from array input is always disposed, including
 * when an error is thrown part-way through. A tensor passed in by the caller is
 * never disposed here.
 *
 * @param X - Input data matrix (samples × features), as a tensor or nested array
 * @param options - Configuration options
 * @param options.minClusters - Smallest k to test (default 2, must be >= 2)
 * @param options.maxClusters - Largest k to test (default 10)
 * @param options.algorithm - 'kmeans' (default), 'spectral' or 'agglomerative'
 * @param options.algorithmParams - Extra options forwarded to the estimator constructor
 * @param options.metrics - Metrics to compute; default is all of
 *   'silhouette', 'daviesBouldin', 'calinskiHarabasz'
 * @param options.scoringFunction - Optional function mapping an evaluation record
 *   to its combined score
 * @returns Object containing optimal k and detailed results for all tested k values
 * @throws {Error} If `minClusters < 2`, `maxClusters < minClusters`, there are not
 *   enough samples for `minClusters`, or `algorithm` is unknown
 *
 * @example
 * ```typescript
 * import { findOptimalClusters } from 'clustering-tfjs';
 *
 * const data = [[1, 2], [1.5, 1.8], [5, 8], [8, 8], [1, 0.6], [9, 11]];
 * const result = await findOptimalClusters(data, { maxClusters: 5 });
 *
 * console.log(`Optimal number of clusters: ${result.optimal.k}`);
 * console.log(`Best silhouette score: ${result.optimal.silhouette}`);
 * ```
 */
async function findOptimalClusters(X, options = {}) {
    const {
        minClusters = 2,
        maxClusters = 10,
        algorithm = 'kmeans',
        algorithmParams = {},
        metrics = ['silhouette', 'daviesBouldin', 'calinskiHarabasz'],
        scoringFunction,
    } = options;

    // Validate inputs before allocating anything.
    if (minClusters < 2) {
        throw new Error('minClusters must be at least 2');
    }
    if (maxClusters < minClusters) {
        throw new Error('maxClusters must be greater than or equal to minClusters');
    }

    // Convert data to a tensor if needed; remember whether we own it.
    const isInputTensor = (0, tensor_utils_1.isTensor)(X);
    const dataTensor = isInputTensor ? X : tf.tensor2d(X);

    try {
        const nSamples = dataTensor.shape[0];

        // Cap the largest k at nSamples - 1.
        const effectiveMaxClusters = Math.min(maxClusters, nSamples - 1);
        if (effectiveMaxClusters < minClusters) {
            throw new Error(`Not enough samples (${nSamples}) for minimum clusters (${minClusters})`);
        }

        const evaluations = [];

        // Test each k value.
        for (let k = minClusters; k <= effectiveMaxClusters; k++) {
            const clusterer = createClusterer(algorithm, k, algorithmParams);

            // Fit and predict.
            const labelsTensor = await clusterer.fitPredict(dataTensor);
            const labelsAreTensor = (0, tensor_utils_1.isTensor)(labelsTensor);

            let evaluation;
            try {
                // Convert labels to a plain/typed array if a tensor was returned.
                const labels = labelsAreTensor ? await labelsTensor.data() : labelsTensor;

                // Placeholders for metrics that were not requested.
                let silhouette = 0;
                let daviesBouldin = Infinity;
                let calinskiHarabasz = 0;

                if (metrics.includes('silhouette')) {
                    silhouette = (0, silhouette_1.silhouetteScore)(dataTensor, labels);
                }
                if (metrics.includes('daviesBouldin')) {
                    daviesBouldin = (0, davies_bouldin_1.daviesBouldinEfficient)(dataTensor, labels);
                }
                if (metrics.includes('calinskiHarabasz')) {
                    calinskiHarabasz = (0, calinski_harabasz_1.calinskiHarabaszEfficient)(dataTensor, labels);
                }

                evaluation = {
                    k,
                    silhouette,
                    daviesBouldin,
                    calinskiHarabasz,
                    combinedScore: 0,
                    labels: Array.from(labels),
                };
            } finally {
                // Release the labels tensor even when a metric throws.
                if (labelsAreTensor) {
                    labelsTensor.dispose();
                }
            }

            // Use the custom scoring function when supplied, otherwise the default.
            evaluation.combinedScore = scoringFunction
                ? scoringFunction(evaluation)
                : defaultCombinedScore(evaluation, metrics);

            evaluations.push(evaluation);
        }

        // Sort by combined score (descending). Comparing instead of subtracting
        // avoids NaN when two scores are the same infinity.
        evaluations.sort((a, b) => {
            if (b.combinedScore > a.combinedScore) {
                return 1;
            }
            if (b.combinedScore < a.combinedScore) {
                return -1;
            }
            return 0;
        });

        return {
            optimal: evaluations[0],
            evaluations,
        };
    } finally {
        // Clean up the tensor only if we created it, on success and on failure.
        if (!isInputTensor) {
            dataTensor.dispose();
        }
    }
}