node-kmeans
Version:
Node.js asynchronous implementation of the clustering algorithm k-means
236 lines (211 loc) • 6.9 kB
JavaScript
/*
* node-kmeans
* Copyright(c) 2012 Philmod <philippe.modard@gmail.com>
* MIT Licensed
*/
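/*
Asynchronous implementation of the k-means clustering algorithm.
The clusterize function takes as input a list of N input vectors and an
options object holding the number k of clusters (and, optionally, a custom
distance function). It passes to its callback an Array of k objects, one per
cluster, each with three attributes:
- centroid: the vector at the center of the cluster
- cluster: an Array of the input vectors assigned to the cluster
- clusterInd: an Array of the indexes of those vectors in the input list
The callback receives an error if:
- k is missing or smaller than 1
- N < k
The number of different input vectors should also be at least k; note that
this last condition is not checked by the code in this file.
*/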
const _ = require('underscore');
/**
 * Euclidean distance between two vectors of equal length.
 *
 * A length mismatch is reported by *returning* (not throwing) an Error.
 *
 * @param {Array} a first vector
 * @param {Array} b second vector
 * @return {number|Error} the distance, or an Error when the lengths differ
 * @api private
 */
function euclidianDistance(a, b) {
  const dimensions = a.length;
  if (dimensions === b.length) {
    let sumOfSquares = 0.0;
    let index = 0;
    while (index < dimensions) {
      const delta = a[index] - b[index];
      sumOfSquares += delta ** 2;
      index += 1;
    }
    return Math.sqrt(sumOfSquares);
  }
  return new Error('The vectors must have the same length');
}
/**
 * One cluster of a k-means run: its centroid, the input vectors currently
 * assigned to it and the distance from its centroid to every input vector.
 *
 * Input vectors may be arrays of numbers or bare numbers (one dimension).
 * The centroid is always stored as an array, and bare numbers are wrapped in
 * a one-element array before being averaged or handed to the distance
 * function, so both arguments of the distance function are always arrays.
 *
 * The `self` argument of the methods below is the owning object; it must
 * expose `v` (the input vectors), `indexes` (indexes of the input vectors not
 * yet used as a seed) and `distanceFunction`.
 *
 * @api private
 */
class Group {
  constructor() {
    // A new group counts as "moved" until defineCentroid() says otherwise.
    this.centroidMoved = true;
  }

  /**
   * Forget the vectors assigned to this group.
   */
  initCluster() {
    this.cluster = []; // assigned vectors
    this.clusterInd = []; // indexes of those vectors in the input list
  }

  /**
   * Define the centroid:
   * - if one exists and the cluster is not empty, recompute it as the mean
   * - otherwise seed it with a randomly chosen, not yet used, input vector
   *   (its index is removed from `self.indexes`)
   * - if no unused input vector is left, keep the current centroid
   *
   * Also updates `centroidOld` and `centroidMoved`.
   *
   * @param {Object} self owner exposing `v` and `indexes`
   * @return {Group} this
   */
  defineCentroid(self) {
    this.centroidOld = this.centroid || [];
    if (this.centroid && this.cluster.length > 0) {
      this.calculateCentroid();
    } else if (self.indexes.length > 0) { // random selection
      const i = Math.floor(Math.random() * self.indexes.length);
      this.centroidIndex = self.indexes[i];
      self.indexes.splice(i, 1);
      const seed = self.v[this.centroidIndex];
      // Copy the seed; a bare number becomes a one-dimensional centroid.
      this.centroid = Array.isArray(seed) ? [...seed] : [seed];
    }
    // When every input vector has already been used as a seed there is
    // nothing to pick from: the centroid is left as it is instead of being
    // overwritten with [undefined].

    // Centroid has moved if old value != new value
    this.centroidMoved = !_.isEqual(this.centroid, this.centroidOld);
    return this;
  }

  /**
   * Set the centroid to the component-wise mean of the assigned vectors.
   * Bare numbers in the cluster are treated as one-dimensional vectors.
   *
   * @return {Group} this
   */
  calculateCentroid() {
    const count = this.cluster.length;
    const sums = [];
    for (const item of this.cluster) {
      const vector = Array.isArray(item) ? item : [item];
      for (let j = 0; j < vector.length; ++j) {
        sums[j] = sums[j] === undefined ? vector[j] : sums[j] + vector[j];
      }
    }
    this.centroid = sums.map((sum) => sum / count); // average
    return this;
  }

  /**
   * Compute the distance from the centroid to every input vector and store
   * it in `distances` (same order as `self.v`).
   *
   * @param {Object} self owner exposing `v` and `distanceFunction`
   * @return {Group} this
   */
  distanceObjects(self) {
    if (!this.distances) {
      this.distances = [];
    }
    for (let i = 0, max = self.v.length; i < max; ++i) {
      const item = self.v[i];
      // The centroid is an array, so hand the other operand over as one too.
      const vector = Array.isArray(item) ? item : [item];
      this.distances[i] = self.distanceFunction(this.centroid, vector);
    }
    return this;
  }
}
/**
 * A k-means run over a list of input vectors.
 *
 * Constructing an instance validates the arguments and starts the run; the
 * result (see output()) is handed to `callback` once an iteration ends with
 * no centroid having moved. Iterations are chained with process.nextTick.
 */
class Clusterize {
  /**
   * @param {Array} vector input vectors (arrays of numbers, or bare numbers)
   * @param {Object} options
   * @param {number} options.k number of clusters (at least 1)
   * @param {Function} [options.distance] distance function declaring exactly
   *   two parameters; euclidianDistance is used when absent
   * @param {Function} callback called as callback(error) or callback(null, result)
   * @throws {Error} when an argument is missing, when `callback` is not a
   *   function, or when checkV() rejects the input vectors
   */
  constructor(vector, options, callback) {
    if (!(vector && options && callback)) {
      throw new Error('Provide 3 arguments: vector, options, callback');
    }
    if (typeof callback !== 'function') {
      throw new Error('Provide a callback function');
    }

    // From here on, problems are reported through the callback.
    const fail = (message) => callback(new Error(message));

    if (!options.k || options.k < 1) {
      return fail('Provide a correct number k of clusters');
    }
    const isBinaryFunction = typeof options.distance === 'function'
      && options.distance.length === 2;
    if (options.distance && !isBinaryFunction) {
      return fail('options.distance must be a function with two arguments');
    }
    if (!_.isArray(vector)) {
      return fail('Provide an array of data');
    }

    this.options = options;
    this.v = this.checkV(vector);
    this.k = this.options.k;
    this.distanceFunction = this.options.distance || euclidianDistance;

    if (this.v.length < this.k) {
      return fail('The number of points must be greater than\nthe number k of clusters');
    }

    this.initialize(); // build the groups and the pool of seed indexes

    // Number of centroids that moved during the last iteration;
    // -1 means that no iteration has run yet.
    let movedCount = -1;
    const step = () => {
      if (movedCount === 0) {
        return callback(null, this.output()); // converged
      }
      for (const group of this.groups) {
        group.defineCentroid(this); // new centroid
        group.distanceObjects(this); // distances from that centroid to every item
      }
      this.clustering(); // attach every item to its closest centroid
      movedCount = this.groups.filter((group) => group.centroidMoved).length;
      return process.nextTick(step);
    };
    return step();
  }

  /**
   * Validate the input vectors and coerce every value to a number, in place.
   * The dimension is taken from the first element (1 when it is not an array).
   *
   * @param {Array} v input vectors
   * @return {Array} the same array, with its values converted to numbers
   * @throws {Error} when an element has another dimension than the first one,
   *   or when a value converts to NaN
   */
  checkV(v) {
    const dim = _.isArray(v[0]) ? v[0].length : 1;

    // Coerce holder[key] in place, then reject it if it is not a number.
    const coerce = (holder, key) => {
      holder[key] = Number(holder[key]);
      if (Number.isNaN(holder[key])) {
        throw new Error('All the elements must be float type');
      }
    };

    for (let row = 0, rows = v.length; row < rows; row += 1) {
      if (_.isArray(v[row])) {
        if (v[row].length !== dim) {
          throw new Error('All the elements must have the same dimension');
        }
        for (let col = 0, cols = v[row].length; col < cols; col += 1) {
          coerce(v[row], col);
        }
      } else {
        if (dim !== 1) {
          throw new Error('All the elements must have the same dimension');
        }
        coerce(v, row);
      }
    }
    return v;
  }

  /**
   * Create the k groups and the list of indexes from which the initial
   * centroids are drawn.
   *
   * @return {Clusterize} this
   */
  initialize() {
    this.groups = [];
    for (let created = 0; created < this.k; created += 1) {
      this.groups.push(new Group(this));
    }
    this.indexes = [...this.v.keys()]; // one entry per input vector
    return this;
  }

  /**
   * Reset every group, then assign each input vector to the group with the
   * smallest distance (the first such group wins a tie).
   *
   * @return {Clusterize} this
   */
  clustering() {
    for (const group of this.groups) {
      group.initCluster();
    }
    for (const [position, item] of this.v.entries()) {
      let closest = this.groups[0];
      for (const candidate of this.groups.slice(1)) {
        if (candidate.distances[position] < closest.distances[position]) {
          closest = candidate;
        }
      }
      closest.cluster.push(item);
      closest.clusterInd.push(position);
    }
    return this;
  }

  /**
   * Build the result: for every group, its `centroid`, `cluster` and
   * `clusterInd` attributes.
   *
   * @return {Array} one object per group
   */
  output() {
    return this.groups.map((group) => _.pick(group, 'centroid', 'cluster', 'clusterInd'));
  }
}
/**
 * Start a k-means run; the outcome is delivered through `callback`.
 *
 * @param {Array} vector input vectors
 * @param {Object} options run options (`k`, optional `distance`)
 * @param {Function} callback called as callback(error) or callback(null, result)
 * @return {*} the value produced by `new Clusterize(vector, options, callback)`
 * @api public
 */
exports.clusterize = (vector, options, callback) => new Clusterize(vector, options, callback);

// The class itself is exposed as well.
exports._class = Clusterize;