UNPKG

qminer

Version:

A C++ based data analytics platform for processing large-scale real-time streams containing structured and unstructured data

440 lines (417 loc) 19.1 kB
// JavaScript source code /** * Copyright (c) 2015, Jozef Stefan Institute, Quintelligence d.o.o. and contributors * All rights reserved. * * This source code is licensed under the FreeBSD license found in the * LICENSE file in the root directory of this source tree. */ module.exports = exports = function (pathQmBinary) { var qm = require(pathQmBinary); // This loads only c++ functions of qm var fs = qm.fs; var la = qm.la; var stat = qm.statistics; exports = qm.deprecated; var assert = require('assert'); var qm_util = require(__dirname + '/qm_util.js'); /** * @classdesc KMeans clustering * @class * @property {number} iter - The maximum number of iterations. * @property {number} k - The number of centroids. * @property {boolean} verbose - If false, the console output is supressed. * @property {Array} fitIdx - Array of indexes that should be used as starting centroids. Optional. * @property {model} fitStart - Model from another KMeans algorithm (obtained via getModel() method). Its centroids are used as starting centroids for this model. Optional. * @example * // import analytics and la modules * var analytics = require('qminer').analytics; * var la = require('qminer').la; * // create a KMeans object * var KMeans = new analytics.KMeans(); * // create the matrix to be fitted * var X = new la.Matrix([[1, -2, -1], [1, 1, -3]]); * // create the model * KMeans.fit(X); */ exports.KMeans = function (param) { // Fit params // var iter = param.iter == undefined ? 100 : param.iter; // var k = param.k == undefined ? 2 : param.k; // var verbose = param.verbose == undefined ? false : param.verbose; // var fitIdx = param.fitIdx == undefined ? undefined : param.fitIdx; // Model var C = undefined; var idxv = undefined; var norC2 = undefined; var iter = undefined; var k = undefined; var verbose = undefined; var fitIdx = undefined; var fitStart = undefined; var medoids = new la.Vector(); if (param != undefined && param instanceof fs.FIn) { C = new la.Matrix(); C.load(param); norC2 = new la.Vector(); norC2.load(param); idxv = new la.IntVector(); idxv.load(param); var fin_params = param.readJson(); iter = fin_params.iter; k = fin_params.k; verbose = fin_params.verbose; medoids.load(param); } else if (param == undefined || typeof param == 'object') { param = param == undefined ? {} : param; // Fit params iter = (param.iter == undefined) ? 100 : param.iter; k = (param.k == undefined) ? 2 : param.k; verbose = (param.verbose == undefined) ? false : param.verbose; fitIdx = param.fitIdx == undefined ? undefined : param.fitIdx; fitStart = param.fitStart == undefined ? undefined : param.fitStart; } else { throw "KMeans.constructor: parameter must be a JSON object or a fs.FIn!"; } param = { iter: iter, k: k, verbose: verbose }; /** * Permutes centroid with given mapping. * @param {object} mapping - object that contains the mapping. E.g. mapping[4]=2 means "map cluster 4 into cluster 2" */ this.permuteCentroids = function (mapping) { var cl_count = C.cols; var perm_matrix = la.zeros(cl_count, cl_count); for (var i = 0; i < cl_count; i++) { perm_matrix.put(i, mapping[i], 1); } var C_new = C.multiply(perm_matrix); var idxv_new = new la.Vector(idxv); for (var i = 0; i < idxv_new.length; i++) { idxv_new[i] = mapping[idxv[i]] } C = C_new; norC2 = la.square(C.colNorms()); idxv = idxv_new; if (medoids.length != 0) { var medoids_new = new la.Vector(medoids); for (var i = 0; i < medoids_new.length; i++) { medoids_new[i] = mapping[medoids[i]] } medoids = medoids_new; } } /** * Returns the model * @returns {Object} The model object whose keys are: C (centroids) and idxv (cluster ids of the training data). * @example * // import modules * var analytics = require('qminer').analytics; * var la = require('qminer').la; * // create the KMeans object * var KMeans = new analytics.KMeans({ iter: 1000 }); * // create a matrix to be fitted * var X = new la.Matrix([[1, -2, -1], [1, 1, -3]]); * // create the model * KMeans.fit(X); * // get the model * var model = KMeans.getModel(); */ this.getModel = function () { return { C: C, idxv: idxv }; } /** * Sets the parameters. * @param {Object} p - Object whose keys are: k (number of centroids), iter (maximum iterations) and verbose (if false, console output is supressed). * @returns {module:analytics.KMeans} Self. * @example * // import analytics module * var analytics = require('qminer').analytics; * // create a new KMeans object * var KMeans = new analytics.KMeans(); * // change the parameters of the KMeans object * KMeans.setParams({ iter: 1000, k: 5 }); */ this.setParams = function (p) { param = p; iter = param.iter == undefined ? iter : param.iter; k = param.k == undefined ? k : param.k; verbose = param.verbose == undefined ? verbose : param.verbose; fitIdx = param.fitIdx == undefined ? fitIdx : param.fitIdx; fitStart = param.fitStart == undefined ? undefined : param.fitStart; } /** * Returns the parameters. * @returns Object whose keys are: k (number of centroids), iter (maximum iterations) and verbose (if false, console output is supressed). * @example * // import analytics module * var analytics = require('qminer').analytics; * // create a new KMeans object * var KMeans = new analytics.KMeans({ iter: 1000, k: 5 }); * // get the parameters * var json = KMeans.getParams(); */ this.getParams = function () { return { iter: iter, k: k, verbose: verbose } } /** * Computes the centroids. * @param {(module:la.Matrix | module:la.SparseMatrix)} X - Matrix whose columns correspond to examples. * @param {module:la.IntVector} [recIds] - IDs of columns of X. The fit function stores the IDs of the medoids, which are used by the KMeans.explain function. * @returns {module:analytics.KMeans} Self. It stores the info about the new model. * @example * // import analytics module * var analytics = require('qminer').analytics; * // create a new KMeans object * var KMeans = new analytics.KMeans({ iter: 1000, k: 3 }); * // create a matrix to be fitted * var X = new la.Matrix([[1, -2, -1], [1, 1, -3]]); * // create the model with the matrix X * KMeans.fit(X); */ this.fit = function (X, recIds) { // select random k columns of X, returns a dense C++ matrix var selectCols = function (X, k) { if (fitStart) { assert(fitStart.C.cols == k, "Error: fitStart.C.cols is not of length k!"); var result = {}; result.C = fitStart.C; result.idx = la.randi(X.cols, k); // this assignment is irrelevant, really return result; } var idx; if (fitIdx == undefined) { idx = la.randi(X.cols, k); } else { assert(fitIdx.length == k, "Error: fitIdx is not of length k!"); assert(Math.max.apply(Math, fitIdx) < X.cols, "Error: fitIdx contains index greater than number of columns in matrix. Index out of range!"); idx = fitIdx; } var idxMat = new la.SparseMatrix({ cols: 0, rows: X.cols }); for (var i = 0; i < idx.length; i++) { var spVec = new la.SparseVector([[idx[i], 1.0]], X.cols); idxMat.push(spVec); } var C = X.multiply(idxMat); var result = {}; result.C = C; result.idx = idx; return result; }; // modified k-means algorithm that avoids empty centroids // A Modified k-means Algorithm to Avoid Empty Clusters, Malay K. Pakhira // http://www.academypublisher.com/ijrte/vol01/no01/ijrte0101220226.pdf var getCentroids = function (X, idx, oldC) { // select random k columns of X, returns a dense matrix // 1. construct a sparse matrix (coordinate representation) that encodes the closest centroids var idxvec = new la.IntVector(idx); var rangeV = la.rangeVec(0, X.cols - 1); var ones_cols = la.ones(X.cols); var idxMat = new la.SparseMatrix(idxvec, rangeV, ones_cols, X.cols); idxMat = idxMat.transpose(); var ones_n = la.ones(X.cols); // 2. compute the number of points that belong to each centroid, invert var colSum = idxMat.multiplyT(ones_n); for (var i = 0; i < colSum.length; i++) { var val = 1.0 / (1.0 + colSum.at(i)); // modification colSum.put(i, val); } // 3. compute the centroids //var w = new qm_util.clsStopwatch(); //w.tic(); var sD = colSum.spDiag(); var C = oldC; if (idxMat.cols == oldC.cols) C = ((X.multiply(idxMat)).plus(oldC)).multiply(sD); // modification return C; }; // X: column examples // k: number of centroids // iter: number of iterations assert(k <= X.cols, "k <= X.cols"); var w = new qm_util.clsStopwatch(); var norX2 = la.square(X.colNorms()); var initialCentroids = selectCols(X, k); C = initialCentroids.C; var idxvOld = initialCentroids.idx; //printArray(idxvOld); // DEBUG var ones_n = la.ones(X.cols).multiply(0.5); var ones_k = la.ones(k).multiply(0.5); w.tic(); for (var i = 0; i < iter; i++) { //console.say("iter: " + i); norC2 = la.square(C.colNorms()); //D = full(C'* X) - norC2' * (0.5* ones(1, n)) - (0.5 * ones(k,1) )* norX2'; var D = C.multiplyT(X).minus(norC2.outer(ones_n)).minus(ones_k.outer(norX2)); idxv = new la.IntVector(la.findMaxIdx(D)); if (verbose) { var energy = 0.0; for (var j = 0; j < X.cols; j++) { if (D.at(idxv[j], j) < 0) { energy += Math.sqrt(-2 * D.at(idxv[j], j)); } } console.log("energy: " + 1.0 / X.cols * energy); } if (qm_util.arraysIdentical(idxv, idxvOld)) { if (verbose) { console.log("converged at iter: " + i); //DEBUG } break; } idxvOld = new la.IntVector(idxv); C = getCentroids(X, idxv, C); //drag } if (verbose) { w.toc("end"); } norC2 = la.square(C.colNorms()); if (recIds != undefined) { assert(recIds.length == X.cols); var D = X.multiplyT(C).minus(ones_n.outer(norC2)).minus(norX2.outer(ones_k)); medoidIdx = la.findMaxIdx(D); medoids = new la.Vector(medoidIdx); for (var i = 0; i < medoids.length; i++) { medoids[i] = recIds[medoidIdx[i]]; } } }; /** * Returns an vector of cluster id assignments. * @param {(module:la.Matrix | module:la.SparseMatrix)} A - Matrix whose columns correspond to examples. * @returns {module:la.IntVector} Vector of cluster assignments. * @example * // import analytics module * var analytics = require('qminer').analytics; * // create a new KMeans object * var KMeans = new analytics.KMeans({ iter: 1000, k: 3 }); * // create a matrix to be fitted * var X = new la.Matrix([[1, -2, -1], [1, 1, -3]]); * // create the model with the matrix X * KMeans.fit(X); * // create the matrix of the prediction vectors * var pred = new la.Matrix([[2, -1, 1], [1, 0, -3]]); * // predict the values * var prediction = KMeans.predict(pred); */ this.predict = function (X) { var ones_n = la.ones(X.cols).multiply(0.5); var ones_k = la.ones(k).multiply(0.5); var norX2 = la.square(X.colNorms()); var D = C.multiplyT(X).minus(norC2.outer(ones_n)).minus(ones_k.outer(norX2)); return la.findMaxIdx(D); } /** * @typedef KMeansExplanation * @type {Object} * @property {number} medoidID - The ID of the nearest medoids * @property {module:la.IntVector} featureIDs - The IDs of features, sorted by contribution * @property {module:la.Vector} featureContributions - Weights of each feature contribution (sum to 1.0) */ /** * Returns the IDs of the nearest medoid for each example. * @param {(module:la.Matrix | module:la.SparseMatrix)} X - Matrix whose columns correspond to examples. * @returns {Array.<KMeansExplanation>} Object containing the vector of medoid IDs. * @example * // import analytics module * var analytics = require('qminer').analytics; * // import linear algebra module * var la = require('qminer').la; * // create a new KMeans object * var KMeans = new analytics.KMeans({ iter: 1000, k: 3 }); * // create a matrix to be fitted * var X = new la.Matrix([[1, -2, -1], [1, 1, -3]]); * // create the model with the matrix X using the column IDs [0,1,2] * KMeans.fit(X, [1234,1142,2355]); * // create the matrix of the prediction vectors * var test = new la.Matrix([[2, -1, 1], [1, 0, -3]]); * // predict/explain - return the closest medoids * var explanation = KMeans.explain(test); */ this.explain = function (X) { if (medoids == undefined) { return { medoidIDs: null }; } var ones_n = la.ones(X.cols).multiply(0.5); var ones_k = la.ones(k).multiply(0.5); var norX2 = la.square(X.colNorms()); var D = C.multiplyT(X).minus(norC2.outer(ones_n)).minus(ones_k.outer(norX2)); var centroids = la.findMaxIdx(D); var medoidIDs = new la.IntVector(centroids); assert(medoids.length == k); var result = []; for (var i = 0; i < centroids.length; i++) { var explanation = featureContrib(X.getCol(i), C.getCol(centroids[i])); result[i] = { medoidID: medoids[centroids[i]], featureIDs: explanation.featureIDs, featureContributions: explanation.featureContributions } } return result; } /** * Returns the weights and feature IDs that contributed to the distance between two vectors * @param {(module:la.Vector | module:la.SparseVector)} x - Vector * @param {(module:la.Vector | module:la.SparseVector)} y - Vector * @returns {Object} Feature IDs and feature contributions **/ function featureContrib(x, y) { var fx = x.constructor.name == 'SparseVector' ? x.full() : x; var fy = y.constructor.name == 'SparseVector' ? y.full() : y; var diff = fx.minus(fy); var nor2 = Math.pow(diff.norm(), 2); for (var i = 0; i < diff.length; i++) { diff[i] = Math.pow(diff[i], 2) / nor2; } var sorted = diff.sortPerm(false); // sort descending return { featureIDs: sorted.perm, featureContributions: sorted.vec }; } /** * Transforms the points to vectors of squared distances to centroids. * @param {(module:la.Matrix | module:la.SparseMatrix)} A - Matrix whose columns correspond to examples. * @returns {module:la.Matrix} Matrix where each column represents the squared distances to the centroid vectors. * @example * // import modules * var analytics = require('qminer').analytics; * var la = require('qminer').la; * // create a new KMeans object * var KMeans = new analytics.KMeans({ iter: 1000, k: 3 }); * // create a matrix to be fitted * var X = new la.Matrix([[1, -2, -1], [1, 1, -3]]); * // create the model with the matrix X * KMeans.fit(X); * // create the matrix of the transform vectors * var matrix = new la.Matrix([[-2, 0], [0, -3]]); * // get the transform values of matrix * // returns the matrix * // 10 17 * // 1 20 * // 10 1 * KMeans.transform(matrix); */ this.transform = function (X) { var ones_n = la.ones(X.cols).multiply(0.5); var ones_k = la.ones(k).multiply(0.5); var norX2 = la.square(X.colNorms()); var D = C.multiplyT(X).minus(norC2.outer(ones_n)).minus(ones_k.outer(norX2)); D = D.multiply(-2); return D; } /** * Saves KMeans internal state into (binary) file. * @param {module:fs.FOut} arg - The output stream. * @returns {module:fs.FOut} The output stream fout. */ this.save = function (fout) { if (!C) { throw new Error("KMeans.save() - model not created yet"); } C.save(fout); norC2.save(fout); idxv.save(fout); fout.writeJson({ iter: iter, k: k, verbose: verbose }); medoids.save(fout); return fout; } } //!ENDJSDOC return exports; }