UNPKG

silhouette-coefficient

Version:

Javascript implementation that calculates the Silhouette coefficient of a clustering.

129 lines (111 loc) 4.72 kB
const euclidianDistance = (a, b) => { let distance = 0; Object.keys(a).forEach((key) => { distance += Math.pow(a[key] - b[key], 2); }); return Math.sqrt(distance); }; /** * * The silhouette coefficient measures how similar a point is to its own cluster * and how dissimilar to others. It ranges from -1 to 1, where higher values indicate * better clustering quality. * * @param {number[][]} vectors - An array of points where each array index is an array of coordinates for said point. * @param {number[]} labels - An array of cluster labels corresponding to each point in the vector. * @param {Function} [distanceFormula] - A function that computes the distance between two points. * @returns {number} The silhouette coefficient of the clustering. */ function silhouetteCoefficient(vectors, labels, distanceFormula) { if (vectors.length !== labels.length) { throw new Error("Number of points and their corresponding labels must be the same length."); } // normalize label number to start from zero and not skip a number let labelNormalization = new Map(); let n = 0; for (let i = 0; i < labels.length; i++) { if (!labelNormalization.has(labels[i])) { labelNormalization.set(labels[i], n); n++; } labels[i] = labelNormalization.get(labels[i]); } // // convert data and labels into clusters let clusters = []; for (let i = 0; i < Math.max(...labels) + 1; i++) { clusters.push([]); } if (clusters.length <= 1) { throw new Error("The number of clusters must be more than one."); } const dataPointDimension = vectors[0].length; for (let i = 0; i < vectors.length; i++) { if (dataPointDimension != vectors[i].length) { throw new Error("Data points must all be of the same dimensions"); } clusters[labels[i]].push(vectors[i]); } // -------------------------- // If a cluster is empty then throw unexpected error because it shouldn't trigger for (let i = 0; i < clusters.length; i++) { if (!clusters[i].length) { // throw new Error("Unexpected condition error."); throw new Error( "Unexpected error. At least on cluster is empty. All clusters must have at least one data point.", ); } } if (!distanceFormula) { distanceFormula = euclidianDistance; } let sTotal = 0; let datapointsCounter = 0; // calculate mean silhouette score across all clusters for (let i = 0; i < clusters.length; i++) { let cluster = clusters[i]; datapointsCounter += cluster.length; // number of datapoints in this cluster. Needed to find the mean later // let s_c; // silhouette of the cluster if (cluster.length <= 1) { // s_c = 0; // if only one data point in cluster then the silhouette equals zero so it add nothing to the sum. so I continue with next cluster continue; } // iterate all data points in the cluster for (let j = 0; j < cluster.length; j++) { // cohesion-- let totalInnerClusterDistance = 0; for (let k = 0; k < cluster.length; k++) { if (j === k) { // if it's the same data point then skip because I need its distance with the other data points in the same cluster, not with itself continue; } totalInnerClusterDistance += distanceFormula(cluster[j], cluster[k]); // measure distance between each data point in the cluster } let a_i = totalInnerClusterDistance / (cluster.length - 1); // console.log("ai", a_i); // seperation-- let b_iAll = []; for (let c = 0; c < clusters.length; c++) { if (c === i) { // if it's the same cluster then skip because I search for seperation here. I searched innercluster distance in cohesion continue; } let clusterB = clusters[c]; let totalOuterClusterDistance = 0; for (let k = 0; k < clusterB.length; k++) { totalOuterClusterDistance += distanceFormula(cluster[j], clusterB[k]); } b_iAll.push(totalOuterClusterDistance / clusterB.length); } let b_i = Math.min(...b_iAll); // console.log("biall", b_iAll, "bi", b_i); // let s_i = (b_i - a_i) / Math.max(a_i, b_i); // console.log("s_i", (b_i - a_i) / Math.max(a_i, b_i)); sTotal += (b_i - a_i) / Math.max(a_i, b_i); // } // end iteration of all data points in the cluster } // end iteration of all clusters // console.log("sTotal", sTotal, "datapointscounter", datapointsCounter); return sTotal / datapointsCounter; } export { silhouetteCoefficient as default };