UNPKG

ml-kmeans

Version:
138 lines 5.36 kB
import { squaredEuclidean } from 'ml-distance-euclidean';
import { Matrix } from 'ml-matrix';
import Random from 'ml-random';

/**
 * Choose K different random points from the original data
 * @ignore
 * @param {Array<Array<number>>} data - Points in the format to cluster [x,y,z,...]
 * @param {number} K - number of clusters
 * @param {number} seed - seed for random number generation
 * @return {Array<Array<number>>} - Initial random points
 */
export function random(data, K, seed) {
  const generator = new Random(seed);
  return generator.choice(data, { size: K });
}

/**
 * Chooses the most distant points to a first random pick.
 *
 * The first center is a random data point, the second one is the point
 * farthest from the first, and every further center is the not-yet-chosen
 * point whose minimum distance to the already chosen centers is the largest.
 * @ignore
 * @param {Array<Array<number>>} data - Points in the format to cluster [x,y,z,...]
 * @param {number} K - number of clusters
 * @param {Array<Array<number>>} distanceMatrix - matrix with the distance values,
 * indexed as distanceMatrix[i][j] with i and j being indices into `data`
 * @param {number} seed - seed for random number generation
 * @return {Array<Array<number>>} - Initial points (entries of `data`)
 */
export function mostDistant(data, K, distanceMatrix, seed) {
  const generator = new Random(seed);
  // Holds indices into `data`; mapped to the actual points on return.
  const ans = new Array(K);

  // chooses a random point as initial cluster
  ans[0] = Math.floor(generator.random() * data.length);

  if (K > 1) {
    // chooses the more distant point
    const maxDist = { dist: -1, index: -1 };
    for (let l = 0; l < data.length; ++l) {
      if (distanceMatrix[ans[0]][l] > maxDist.dist) {
        maxDist.dist = distanceMatrix[ans[0]][l];
        maxDist.index = l;
      }
    }
    ans[1] = maxDist.index;

    // chooses the set of points that maximises the min distance
    for (let k = 2; k < K; ++k) {
      let center = { dist: -1, index: -1 };
      for (let m = 0; m < data.length; ++m) {
        // A point that is already a center cannot be picked again.
        if (ans.includes(m)) {
          continue;
        }

        // minimum distance to centers
        let minDist = Number.MAX_VALUE;
        for (let n = 0; n < k; ++n) {
          // `n` is a position in `ans`; the distance matrix is indexed by
          // data index, so the center's data index ans[n] must be used.
          const dist = distanceMatrix[ans[n]][m];
          if (dist < minDist) {
            minDist = dist;
          }
        }

        if (minDist !== Number.MAX_VALUE && minDist > center.dist) {
          center = { dist: minDist, index: m };
        }
      }
      // Stays -1 when no unchosen point is left (K larger than data.length).
      ans[k] = center.index;
    }
  }

  return ans.map((index) => data[index]);
}

// Implementation inspired from scikit
/**
 * k-means++ style seeding: the first center is drawn at random, each further
 * center is the best of `localTrials` candidates sampled with probability
 * proportional to the squared distance to the closest center chosen so far.
 * @ignore
 * @param {Array<Array<number>>} X - Points in the format to cluster [x,y,z,...]
 * @param {number} K - number of clusters
 * @param {object} [options] - Options object
 * @param {number} [options.seed] - seed passed to the random number generator
 * @param {number} [options.localTrials] - number of candidates tried per center;
 * falls back to 2 + floor(log(K)) when not provided (or falsy)
 * @return {Array<Array<number>>} - Initial centers (rows of X)
 */
export function kmeanspp(X, K, options = {}) {
  const m = new Matrix(X);
  const nSamples = m.rows;
  const generator = new Random(options.seed);

  // Set the number of trials
  const centers = [];
  const localTrials = options.localTrials || 2 + Math.floor(Math.log(K));

  // Pick the first center at random from the dataset
  const firstCenterIdx = generator.randInt(nSamples);
  centers.push(m.getRow(firstCenterIdx));

  // Init closest distances (1 x nSamples row of squared distances)
  let closestDistSquared = new Matrix(1, m.rows);
  for (let i = 0; i < m.rows; i++) {
    closestDistSquared.set(0, i, squaredEuclidean(m.getRow(i), centers[0]));
  }

  // The last cumulative sum is the total; dividing by it turns the squared
  // distances into sampling weights that add up to 1.
  let cumulative = cumSum(closestDistSquared.getRow(0));
  let probabilities = Matrix.mul(
    closestDistSquared,
    1 / cumulative[nSamples - 1],
  );

  // Iterate over the remaining centers
  for (let i = 1; i < K; i++) {
    const candidateIdx = generator.choice(nSamples, {
      replace: true,
      size: localTrials,
      probabilities: probabilities.getRow(0),
    });

    const candidates = m.selection(candidateIdx, range(m.columns));
    const distanceToCandidates = euclideanDistances(candidates, m);

    let bestCandidate = Infinity;
    let bestPot = Infinity;
    let bestDistSquared = closestDistSquared;

    for (let j = 0; j < localTrials; j++) {
      // Closest squared distances if candidate j were added as a center.
      const newDistSquared = Matrix.min(closestDistSquared, [
        distanceToCandidates.getRow(j),
      ]);
      const newPot = newDistSquared.sum();
      if (newPot < bestPot) {
        bestCandidate = candidateIdx[j];
        bestPot = newPot;
        bestDistSquared = newDistSquared;
      }
    }

    centers[i] = m.getRow(bestCandidate);
    closestDistSquared = bestDistSquared;
    cumulative = cumSum(closestDistSquared.getRow(0));
    probabilities = Matrix.mul(
      closestDistSquared,
      1 / cumulative[nSamples - 1],
    );
  }

  return centers;
}

/**
 * Pairwise squared Euclidean distances between the rows of A and the rows of B.
 * Note: despite the name, the values are squared distances (no square root).
 * @ignore
 * @param {Matrix} A - first set of points, one per row
 * @param {Matrix} B - second set of points, one per row
 * @return {Matrix} - A.rows x B.rows matrix of squared distances
 */
function euclideanDistances(A, B) {
  const result = new Matrix(A.rows, B.rows);

  // Extract the rows of B once instead of once per row of A.
  const rowsB = [];
  for (let j = 0; j < B.rows; j++) {
    rowsB.push(B.getRow(j));
  }

  for (let i = 0; i < A.rows; i++) {
    const rowA = A.getRow(i);
    for (let j = 0; j < rowsB.length; j++) {
      result.set(i, j, squaredEuclidean(rowA, rowsB[j]));
    }
  }
  return result;
}

/**
 * Builds the array [0, 1, ..., l - 1].
 * @ignore
 * @param {number} l - number of elements
 * @return {Array<number>}
 */
function range(l) {
  return Array.from({ length: l }, (_, i) => i);
}

/**
 * Cumulative sum: element i of the result is arr[0] + ... + arr[i].
 * @ignore
 * @param {Array<number>} arr - values to accumulate
 * @return {Array<number>}
 */
function cumSum(arr) {
  const sums = [arr[0]];
  for (let i = 1; i < arr.length; i++) {
    sums[i] = sums[i - 1] + arr[i];
  }
  return sums;
}
//# sourceMappingURL=initialization.js.map