clustering-tfjs
Version:
High-performance TypeScript clustering algorithms (K-Means, Spectral, Agglomerative) with TensorFlow.js acceleration and scikit-learn compatibility
165 lines (164 loc) • 6.61 kB
JavaScript
;
/*
 * Re-export helper: makes `target[alias]` mirror `source[key]`.
 * Reuses a helper already installed on `this` when present. Otherwise, on
 * engines exposing Object.create, the property is defined through a descriptor
 * (a live getter unless the source's own descriptor can be reused as-is); on
 * older engines the value is simply copied.
 */
var __createBinding = (this && this.__createBinding) || (function () {
    if (!Object.create) {
        // Legacy fallback: plain one-time copy of the current value.
        return function (target, source, key, alias) {
            var name = alias === undefined ? key : alias;
            target[name] = source[key];
        };
    }
    return function (target, source, key, alias) {
        var name = alias === undefined ? key : alias;
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        var needsGetter;
        if (!descriptor) {
            needsGetter = true;
        }
        else if ("get" in descriptor) {
            // An accessor is reused only when the source is flagged as an ES module.
            needsGetter = !source.__esModule;
        }
        else {
            // A data property that can still change must be read live.
            needsGetter = descriptor.writable || descriptor.configurable;
        }
        if (needsGetter) {
            descriptor = { enumerable: true, get: function () { return source[key]; } };
        }
        Object.defineProperty(target, name, descriptor);
    };
})();
/*
 * Attaches `value` as the "default" member of a synthesized namespace object.
 * Reuses a helper already installed on `this` when present. On engines with
 * Object.create the member is defined as an enumerable property via a
 * descriptor; otherwise it is a plain assignment.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        return function (namespace, value) {
            Object.defineProperty(namespace, "default", { enumerable: true, value: value });
        };
    }
    return function (namespace, value) {
        namespace["default"] = value;
    };
})();
/*
 * Namespace-import interop (`import * as ns`) for CommonJS modules.
 * A module flagged with `__esModule` is returned untouched. Anything else is
 * wrapped in a fresh object that re-exports every own property except
 * "default" and exposes the original module itself as `default`.
 */
var __importStar = (this && this.__importStar) || (function () {
    // Key-listing strategy, resolved on first use and cached afterwards.
    var resolvedKeyLister;
    function listOwnKeys(obj) {
        if (!resolvedKeyLister) {
            resolvedKeyLister = Object.getOwnPropertyNames || function (o) {
                var found = [];
                for (var name in o) {
                    if (Object.prototype.hasOwnProperty.call(o, name)) {
                        found[found.length] = name;
                    }
                }
                return found;
            };
        }
        return resolvedKeyLister(obj);
    }
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            var names = listOwnKeys(mod);
            for (var idx = 0; idx < names.length; idx++) {
                var name = names[idx];
                if (name === "default") {
                    continue;
                }
                __createBinding(namespace, mod, name);
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.findOptimalClusters = findOptimalClusters;
const tf = __importStar(require("../tf-adapter"));
const kmeans_1 = require("../clustering/kmeans");
const spectral_1 = require("../clustering/spectral");
const agglomerative_1 = require("../clustering/agglomerative");
const silhouette_1 = require("../validation/silhouette");
const davies_bouldin_1 = require("../validation/davies_bouldin");
const calinski_harabasz_1 = require("../validation/calinski_harabasz");
const tensor_utils_1 = require("./tensor-utils");
/**
 * Automatically finds the optimal number of clusters for a dataset by evaluating
 * multiple k values using validation metrics.
 *
 * For every k from `minClusters` up to `min(maxClusters, nSamples - 1)` a fresh
 * clusterer is created and fitted, the requested metrics are computed, and a
 * combined score is derived. The evaluations are returned sorted by combined
 * score, best first.
 *
 * @param X - Input data matrix (samples × features). Anything that is not
 *   already a tensor is converted with `tf.tensor2d`; that internal tensor is
 *   always disposed before returning or throwing. A tensor supplied by the
 *   caller is never disposed here.
 * @param options - Configuration options
 * @param options.minClusters - Smallest k to test; integer >= 2 (default 2)
 * @param options.maxClusters - Largest k to test (default 10); capped at nSamples - 1
 * @param options.algorithm - 'kmeans' (default), 'spectral' or 'agglomerative'
 * @param options.algorithmParams - Extra constructor parameters forwarded to the
 *   clusterer. A `nClusters` entry is ignored: the k under test always wins.
 * @param options.metrics - Metrics to compute: any of 'silhouette',
 *   'daviesBouldin', 'calinskiHarabasz' (default: all three). A metric that is
 *   not requested keeps its placeholder in the evaluation (0, Infinity and 0
 *   respectively) and does not take part in the default combined score.
 * @param options.scoringFunction - Optional `(evaluation) => number` replacing
 *   the default combined score (`silhouette + calinskiHarabasz - daviesBouldin`
 *   over the requested metrics). Higher is better.
 * @returns Object containing optimal k and detailed results for all tested k values
 * @throws {Error} If `minClusters` is below 2 or not an integer, `maxClusters`
 *   is NaN or below `minClusters`, there are too few samples, or the algorithm
 *   name is unknown. Errors raised by the clusterer or the metrics propagate.
 *
 * @example
 * ```typescript
 * import { findOptimalClusters } from 'clustering-tfjs';
 *
 * const data = [[1, 2], [1.5, 1.8], [5, 8], [8, 8], [1, 0.6], [9, 11]];
 * const result = await findOptimalClusters(data, { maxClusters: 5 });
 *
 * console.log(`Optimal number of clusters: ${result.optimal.k}`);
 * console.log(`Best silhouette score: ${result.optimal.silhouette}`);
 * ```
 */
async function findOptimalClusters(X, options = {}) {
    const { minClusters = 2, maxClusters = 10, algorithm = 'kmeans', algorithmParams = {}, metrics = ['silhouette', 'daviesBouldin', 'calinskiHarabasz'], scoringFunction, } = options;
    // Validate inputs
    if (minClusters < 2) {
        throw new Error('minClusters must be at least 2');
    }
    if (maxClusters < minClusters) {
        throw new Error('maxClusters must be greater than or equal to minClusters');
    }
    // NaN slips through the comparisons above and would silently yield an
    // empty result; a fractional start would request fractional cluster counts.
    if (!Number.isInteger(minClusters)) {
        throw new Error('minClusters must be an integer');
    }
    if (Number.isNaN(maxClusters)) {
        throw new Error('maxClusters must be a number');
    }
    // Convert data to tensor if needed
    const isInputTensor = (0, tensor_utils_1.isTensor)(X);
    const dataTensor = isInputTensor ? X : tf.tensor2d(X);
    try {
        const nSamples = dataTensor.shape[0];
        // Adjust maxClusters if it exceeds number of samples
        const effectiveMaxClusters = Math.min(maxClusters, nSamples - 1);
        if (effectiveMaxClusters < minClusters) {
            throw new Error(`Not enough samples (${nSamples}) for minimum clusters (${minClusters})`);
        }
        const evaluations = [];
        // Test each k value
        for (let k = minClusters; k <= effectiveMaxClusters; k++) {
            const clusterer = createClusterer(algorithm, k, algorithmParams);
            // Fit and predict
            const labelsTensor = await clusterer.fitPredict(dataTensor);
            const labelsAreTensor = (0, tensor_utils_1.isTensor)(labelsTensor);
            let evaluation;
            try {
                // Convert labels to array if it's a tensor
                const labels = labelsAreTensor ? await labelsTensor.data() : labelsTensor;
                // Placeholders for metrics that were not requested
                let silhouette = 0;
                let daviesBouldin = Infinity;
                let calinskiHarabasz = 0;
                if (metrics.includes('silhouette')) {
                    silhouette = (0, silhouette_1.silhouetteScore)(dataTensor, labels);
                }
                if (metrics.includes('daviesBouldin')) {
                    daviesBouldin = (0, davies_bouldin_1.daviesBouldinEfficient)(dataTensor, labels);
                }
                if (metrics.includes('calinskiHarabasz')) {
                    calinskiHarabasz = (0, calinski_harabasz_1.calinskiHarabaszEfficient)(dataTensor, labels);
                }
                evaluation = {
                    k,
                    silhouette,
                    daviesBouldin,
                    calinskiHarabasz,
                    combinedScore: 0,
                    labels: Array.from(labels),
                };
            }
            finally {
                // Release the labels tensor even when a metric throws
                if (labelsAreTensor) {
                    labelsTensor.dispose();
                }
            }
            // Use custom scoring function or default
            evaluation.combinedScore = scoringFunction
                ? scoringFunction(evaluation)
                : defaultCombinedScore(evaluation, metrics);
            evaluations.push(evaluation);
        }
        // Sort by combined score (descending)
        evaluations.sort((a, b) => b.combinedScore - a.combinedScore);
        return {
            optimal: evaluations[0],
            evaluations,
        };
    }
    finally {
        // Clean up tensor if we created it, on success and on failure alike
        if (!isInputTensor) {
            dataTensor.dispose();
        }
    }
}
/**
 * Builds the clusterer for one candidate k.
 * `nClusters` is applied after the user params so the k under test cannot be overridden.
 */
function createClusterer(algorithm, k, algorithmParams) {
    const params = { ...algorithmParams, nClusters: k };
    switch (algorithm) {
        case 'kmeans':
            return new kmeans_1.KMeans(params);
        case 'spectral':
            return new spectral_1.SpectralClustering(params);
        case 'agglomerative':
            return new agglomerative_1.AgglomerativeClustering(params);
        default:
            throw new Error(`Unknown algorithm: ${algorithm}`);
    }
}
/**
 * Default combined score: higher silhouette and calinski, lower davies = better.
 * Only requested metrics contribute, so an unrequested Davies-Bouldin
 * placeholder (Infinity) cannot collapse every score to -Infinity.
 */
function defaultCombinedScore(evaluation, metrics) {
    let score = 0;
    if (metrics.includes('silhouette')) {
        score += evaluation.silhouette;
    }
    if (metrics.includes('calinskiHarabasz')) {
        score += evaluation.calinskiHarabasz;
    }
    if (metrics.includes('daviesBouldin')) {
        score -= evaluation.daviesBouldin;
    }
    return score;
}