UNPKG

qminer

Version:

A C++ based data analytics platform for processing large-scale real-time streams containing structured and unstructured data

1,161 lines (1,089 loc) 93.3 kB
"use strict"; /** * Copyright (c) 2015, Jozef Stefan Institute, Quintelligence d.o.o. and contributors * All rights reserved. * * This source code is licensed under the FreeBSD license found in the * LICENSE file in the root directory of this source tree. */ var sget = require(__dirname + '/third_party/sget/sget.js'); var override = require(__dirname + '/third_party/json-override/json-override.js'); var assert = require('assert'); var qm_util = require(__dirname + '/qm_util.js'); module.exports = exports = function (pathQmBinary) { var qm = require(pathQmBinary); // This loads only c++ functions of qm var fs = qm.fs; var la = qm.la; var stat = qm.statistics; exports = qm.analytics; //!STARTJSDOC /////////////////////////////////////////////////// ///////////// DATA PREPROCESSING ////////////// /////////////////////////////////////////////////// /** * PreprocessingF * @namespace * @desc Preprocessing functions for preparing labels in formats accepted * by learning modules in {@link module:analytics}. */ var preprocessing = preprocessing || {}; // namespacing: http://addyosmani.com/blog/essential-js-namespacing/ /** * Transforming arrays with labels to vector appropriate for binary classifiers. * @class * @classdesc * Transform given array of labels into binary vector with different * numeric value for elements when label matches specified label and * for other elements. By default, these values are +1 for matching * labels, and -1 for the rest. * @param {Array} y - Labels. * @param {string | number} positiveLabel - Positive label. * @param {number} [positiveId = 1] - Value when matching positive label. * @param {number} [negativeId = -1] - Value when not matching positive label. * @example * // import analytics module * var analytics = require('qminer').analytics; * // create binarizer with 'b' as positive label * var binarizer = new analytics.preprocessing.Binarizer('b'); */ preprocessing.Binarizer = function (positiveLabel, positiveId, negativeId) { if (positiveLabel == undefined) { throw "Binarizer needs positive label"; } this.positiveLabel = positiveLabel; this.positiveId = (positiveId == undefined) ? 1 : positiveId; this.negativeId = (negativeId == undefined) ? -1 : negativeId; this.fit = function () { // do nothing } /** * Transform given array of labels to binary numeric vector. * @param {(Array<number> | Array<string> | module:la.Vector | module:la.StrVector)} y - Labels. * @return {modul:la.Vector} Binarized vector. * @example * // import analytics module * var analytics = require('qminer').analytics; * // create binarizer with 'b' as positive label * var binarizer = new analytics.preprocessing.Binarizer('b'); * // get vector with binarized labels * var bins = binarizer.transform(['a','b','a','c']); */ this.transform = function (y) { var target = new la.Vector(); for (var i = 0; i < y.length; i++) { target.push(y[i] === this.positiveLabel ? this.positiveId : this.negativeId); } return target; } }; /** * Applies the model's `decisionFunction` method (if exists) on each column of matrix `X`. * @param {Object} model - The model, that has the `decisionFunction` method. * @param {module:la.SparseMatrix} X - The matrix. * @returns {module:la.Vector} The dense vector where the i-th value is the value the `model.decisionFunction` * returned for the sparse vector `X[i]`. * @example * // TODO */ preprocessing.applyModel = function (model, X) { if (model.decisionFunction == undefined) { throw "preprocessing.applyModel: model doesn't have a method called decisionFunction!"; } var target = new la.Vector(); for (var i = 0; i < X.cols; i++) { target.push(model.decisionFunction(X[i])); } return target; } // Exports preprocessing namespace exports.preprocessing = preprocessing; // SVM /** * Get the model. * @returns {Object} The `svmModel` object containing the property: * <br> 1. `svmModel.weights` - The weights of the model. Type {@link module:la.Vector}. * @example * // import analytics module * var analytics = require('qminer').analytics; * // create a SVC model * var SVC = new analytics.SVC(); * // get the properties of the model * var model = SVC.getModel(); */ exports.SVC.prototype.getModel = function() { return { weights: this.weights, bias: this.bias }; } /** * Get the model. * @returns {Object} The `svmModel` object containing the property: * <br> 1. `svmModel.weights` - The weights of the model. Type {@link module:la.Vector}. * @example * // import analytics module * var analytics = require('qminer').analytics; * // create a SVR model * var SVR = new analytics.SVR(); * // get the properties of the model * var model = SVR.getModel(); */ exports.SVR.prototype.getModel = function() { return { weights: this.weights, bias: this.bias }; } // Ridge Regression /** * Gets the model. * @returns {Object} The `ridgeRegModel` object containing the property: * <br> 1. `ridgeRegModel.weights` - The weights of the model. Type {@link module:la.Vector}. * @example * // import analytics module * var analytics = require('qminer').analytics; * // create the Ridge Regression model * var regmod = new analytics.RidgeReg(); * // get the model * var model = regmod.getModel(); */ exports.RidgeReg.prototype.getModel = function () { return { weights: this.weights }; } // Recursive Linear Regression /** * Gets the model. * @returns {Object} The `recLinRegModel` object containing the property: * <br> 1. `recLinRegModel.weights` - The weights of the model. Type {@link module:la.Vector}. * @example * // import analytics module * var analytics = require('qminer').analytics; * // create the Recursive Linear Regression model * var linreg = new analytics.RecLinReg({ dim: 10 }); * // get the model * var model = linreg.getModel(); // returns { weights: new require('qminer').la.Vector(); } */ exports.RecLinReg.prototype.getModel = function () { return { weights: this.weights } } /** * @typedef {Object} oneVsAllParam * An object used for the construction of {@link module:analytics.OneVsAll}. * @property {function} [model] - Constructor for binary model to be * used internaly. Constructor should expect only one parameter. * @property {Object} [modelParam] - Parameter for `oneVsAllParam.model` constructor. * @property {number} [categories] - Number of categories. * @property {boolean} [verbose = false] - If false, the console output is supressed. */ /** * @classdesc One vs All model for multiclass prediction. Builds binary model * for each category and predicts the one with the highest score. Binary model is * provided as part of the constructor. * @class * @param {module:analytics~oneVsAllParam} [arg] - Construction arguments. * @example * // import analytics module * var analytics = require('qminer').analytics; * // create a new OneVsAll object with the model analytics.SVC * var onevsall = new analytics.OneVsAll({ model: analytics.SVC, modelParam: { c: 10, maxTime: 1000 }, cats: 2 }); */ exports.OneVsAll = function (arg) { // remember parameters var model = arg.model; var modelParam = arg.modelParam; var cats = arg.cats; var verbose = arg.verbose == undefined ? false : arg.verbose; // trained models var models = [ ]; /** * Gets the parameters. * @returns {module:analytics~oneVsAllParam} The constructor parameters. * @example * // import analytics module * var analytics = require('qminer').analytics; * // create a new OneVsAll object with the model analytics.SVC * var onevsall = new analytics.OneVsAll({ model: analytics.SVC, modelParam: { c: 10, maxTime: 1000 }, cats: 2 }); * // get the parameters * // returns the JSon object * // { model: analytics.SVC, modelParam: { c: 10, maxTime: 1000 }, cats: 2, models: [] } * var params = onevsall.getParams(); */ this.getParams = function () { return { model: model, modelParam: modelParam, cats: cats, models: models } }; /** * Sets the parameters. * @param {module:analytics~OneVsAllParam} params - The constructor parameters. * @returns {module:analytics.OneVsAll} Self. The parameters are changed. * @example * // import analytics module * var analytics = require('qminer').analytics; * // create a new OneVsAll object with the model analytics.SVC * var onevsall = new analytics.OneVsAll({ model: analytics.SVC, modelParam: { c: 10, maxTime: 1000 }, cats: 2 }); * // set the parameters * var params = onevsall.setParams({ model: analytics.SVR, modelParam: { c: 12, maxTime: 10000}, cats: 3, verbose: true }); */ this.setParams = function (params) { model = params.model == undefined ? model : params.model; modelParam = params.modelParam == undefined ? modelParam : params.modelParam; cats = params.cats == undefined ? cats : params.cats; verbose = params.verbose == undefined ? verbose : params.verbose; } /** * Apply all models to the given vector and returns a vector of scores, one for each category. * Semantic of scores depend on the provided binary model. * @param {module:la.Vector | module:la.SparseVector | module:la.Matrix | module:la.SparseMatrix} X - * Feature vector or matrix with feature vectors as columns. * @returns {module:la.Vector | module:la.Matrix} The score and label of the input `X`: * <br>1. {@link module:la.Vector} of scores, if `X` is of type {@link module:la.Vector} or {@link module:la.SparseVector}. * <br>2. {@link module:la.Matrix} with columns corresponding to instances, and rows corresponding to labels, if `X` is of type {@link module:la.Matrix} or {@link module:la.SparseMatrix}. * @example * // import modules * var analytics = require('qminer').analytics; * var la = require('qminer').la; * // create a new OneVsAll object with the model analytics.SVC * var onevsall = new analytics.OneVsAll({ model: analytics.SVC, modelParam: { c: 10, maxTime: 1000 }, cats: 2 }); * // create the data (matrix and vector) used to fit the model * var matrix = new la.Matrix([[1, 2, 1, 1], [2, 1, -3, -4]]); * var vector = new la.Vector([0, 0, 1, 1]); * // fit the model * onevsall.fit(matrix, vector); * // create the vector for the decisionFunction * var test = new la.Vector([1, 2]); * // give the vector to the decision function * var prediction = onevsall.decisionFunction(test); // returns the vector of scores */ this.decisionFunction = function(X) { // check what is our input if (X instanceof la.Vector || X instanceof la.SparseVector) { // evaluate all models var scores = new la.Vector(); for (var cat = 0; cat < cats; cat++) { scores.push(models[cat].decisionFunction(X)); } return scores; } else if (X instanceof la.Matrix || X instanceof la.SparseMatrix) { // create matrix where cols are instances and rows are scores for categories var scores = new la.Matrix({rows: cats, cols: X.cols}); for (var i = 0; i < X.cols; i++) { var x_i = X.getCol(i); for (var cat = 0; cat < cats; cat++) { scores.put(cat, i, models[cat].decisionFunction(x_i)); } } return scores; } else { throw "analytics.OneVsAll.decisionFunction: Input data of unsupported type!"; } } /** * Apply all models to the given vector and returns category with the highest score. * @param {module:la.Vector | module:la.SparseVector | module:la.Matrix | module:la.SparseMatrix} X - * Feature vector or matrix with feature vectors as columns. * @returns {number | module:la.IntVector} * <br>1. number of the category with the higher score, if `X` is {@link module:la.Vector} or {@link module:la.SparseVector}. * <br>2. {@link module:la.IntVector} of categories with the higher score for each column of `X`, if `X` is {@link module:la.Matrix} or {@link module:la.SparseMatrix}. * @example * // import modules * var analytics = require('qminer').analytics; * var la = require('qminer').la; * // create a new OneVsAll object with the model analytics.SVC * var onevsall = new analytics.OneVsAll({ model: analytics.SVC, modelParam: { c: 10, maxTime: 1000 }, cats: 2 }); * // create the data (matrix and vector) used to fit the model * var matrix = new la.Matrix([[1, 2, 1, 1], [2, 1, -3, -4]]); * var vector = new la.Vector([0, 0, 1, 1]); * // fit the model * onevsall.fit(matrix, vector); * // create the vector for the prediction * var test = new la.Vector([1, 2]); * // get the prediction of the vector * var prediction = onevsall.predict(test); // returns 0 */ this.predict = function(X) { // evaluate all models var scores = this.decisionFunction(X); // select maximal one if (scores instanceof la.Vector) { return scores.getMaxIdx(); } else if (scores instanceof la.Matrix) { var predictions = new la.IntVector(); for (var i = 0; i < scores.cols; i++) { predictions.push(scores.getCol(i).getMaxIdx()); } return predictions; } else { throw "analytics.OneVsAll.predict: decisionFunction returns unsupported type!"; } } // X = feature matrix // y = target label from 0..cats /** * Apply all models to the given vector and returns category with the highest score. * @param {module:la.Matrix | module:la.SparseMatrix} X - training instance feature vectors. * @param {module:la.Vector} y - target category for each training instance. Categories must * be integer numbers between `0` and `oneVsAllParam.categories-1`. * @returns {module:analytics.OneVsAll} Self. The models have been fitted. * @example * // import modules * var analytics = require('qminer').analytics; * var la = require('qminer').la; * // create a new OneVsAll object with the model analytics.SVC * var onevsall = new analytics.OneVsAll({ model: analytics.SVC, modelParam: { c: 10, maxTime: 1000 }, cats: 2 }); * // create the data (matrix and vector) used to fit the model * var matrix = new la.Matrix([[1, 2, 1, 1], [2, 1, -3, -4]]); * var vector = new la.Vector([0, 0, 1, 1]); * // fit the model * onevsall.fit(matrix, vector); */ this.fit = function(X, y) { models = [ ]; // make model for each category for (var cat = 0; cat < cats; cat++) { if (verbose) { console.log("Fitting label", (cat + 1), "/", cats); }; // prepare targert vector for current category var target = (y instanceof la.Matrix) ? // we have a special bianary vector for each category, make it into -1/+1 (new exports.preprocessing.Binarizer(1)).transform(y.getRow(cat)) : // we have a vector with label for each element, get out -1/+1 vector (new exports.preprocessing.Binarizer(cat)).transform(y); // get the model var catModel = new model(modelParam); models.push(catModel.fit(X, target)); } if (verbose) { console.log("Done!"); }; return this; } }; /** * Threshold Model * @class * @classdesc The Threshold model. Uses the methods from the {@link module:analytics.metrics}. * @param {Object} [arg] - The constructor parameters. * @param {string} [arg.target] - Target type. Possible options are `"recall"` and `"precision"`. * @param {TODO} [arg.level] - TODO * @example * // TODO */ exports.ThresholdModel = function(params) { // what do we optimize this.target = params.target; if (this.target === "recall" || this.target === "precision") { this.level = params.level; } // threshold model this.model = null; // apply all models to the given vector and return distance to the class boundary // x = dense vector with prediction score for each class // result = traslated predictions based on thresholds /** * Apply all models to the given vector and returns the distance to the class boundary. * @param {number | module:la.Vector} x - The prediction score for each class. * @returns {number | module:la.Vector} * <br>1. value of the translated prediction based on the threshold, if `x` is `number`, * <br>2. {@link module:la.Vector} of translated prediction based on the threshold, if `x` is {@link module:la.Vector}. * @example * // TODO */ this.decisionFunction = function(x) { if (x instanceof Number) { // just transate based on the model's threshold return x - this.model; } else if (x instanceof la.Vector) { // each element is a new instance var scores = new la.Vector(); for (var i = 0; i < x.length; i++) { scores.push(x[i] - this.model); } return scores; } else { throw "analytics.ThresholdModel.decisionFunction: Input data of unsupported type!"; } } // return the most likely category // x = dense vector with prediction score for each class // result = array of positive label ids /** * Returns the most likely category. * @param {number | module:la.Vector} x - The prediction score for each class. * @returns {number | module:la.Vector} * <br>1. value of the positive label IDs, if `x` is `number`, * <br>2. {@link module:la.Vector} of the positive label IDs, if `x` is {@link module:la.Vector}. * @example * // TODO */ this.predict = function(x) { // evaluate all models var scores = this.decisionFunction(x) // check what we get if (scores instanceof la.Vector) { return res = new la.Vector(); for (var i = 0; i < scores.length; i++) { res.push(scores[i] > 0 ? 1 : -1); } return res; } else { return scores > 0 ? 1 : -1; } } // X = vector of predictions for each instance (output of decision_funcition) // y = target labels (1 or -1) /** * Fits the model. * @param {module:la.Vector} X - Prediction for each instance (output of descisionFunction). * @param {number} y - The target labels (1 or -1). * @example * // TODO */ this.fit = function(X, y) { if (this.target === "f1") { // find threshold that maximizes F1 measure this.model = exports.metrics.bestF1Threshold(y, X); } else if (this.target === "recall") { // find threshold that results in desired recall this.model = exports.metrics.desiredRecallThreshold(y, X, this.level); } else if (this.target === "precision") { // find threshold that results in desired precision this.model = exports.metrics.desiredPrecisionThreshold(y, X, this.level); } else { throw "Unknown threshold model target: " + this.target; } } } /** * Metrics * @namespace * @desc Classification and regression metrics. * @example <caption>Batch classification example</caption> * // import metrics module * var analytics = require('qminer').analytics; * * // true and predicted lables * var true_lables = [0, 1, 0, 0, 1]; * var pred_prob = [0.3, 0.5, 0.2, 0.5, 0.8]; * * // compute ROC curve * var roc = analytics.metrics.rocCurve(true_lables, pred_prob); * @example <caption>Online classification example</caption> * // import analytics module * var analytics = require('qminer').analytics; * // true and predicted lables * var true_lables = [0, 1, 0, 0, 1]; * var pred_prob = [0.3, 0.5, 0.2, 0.5, 0.8]; * * // create predictionCurve instance * var predictionCurve = new analytics.metrics.PredictionCurve(); * * // simulate data flow * for (var i in true_lables) { * // push new value * predictionCurve.push(true_lables[i], pred_prob[i]); *} * * var roc = predictionCurve.roc(); // get ROC * @example <caption>Batch regression example</caption> * // import analytics module * var analytics = require('qminer').analytics; * // true and predicted data * var true_vals = [1, 2, 3, 4, 5]; * var pred_vals = [3, 4, 5, 6, 7]; * * // use batch MAE method * analytics.metrics.meanAbsoluteError(true_vals, pred_vals); * @example <caption>Online regression example</caption> * // import analytics module * var analytics = require('qminer').analytics; * // true and predicted data * var true_vals = [1, 2, 3, 4, 5]; * var pred_vals = [3, 4, 5, 6, 7]; * * // create online MAE metric instance * var mae = new analytics.metrics.MeanAbsoluteError(); * * // simulate data flow * for (var i in true_vals) { * // push new value * mae.push(true_vals[i], pred_vals[i]); * } * // get updated error * mae.getError(); */ var metrics = metrics || {}; // namespacing: http://addyosmani.com/blog/essential-js-namespacing/ /////////////////////////////////////////////////// ///////////// CLASSIFICATION METRICS ////////////// /////////////////////////////////////////////////// /** * For evaluating provided categories from binary? classifiers. * @class * @classdesc Class implements several classification measures (precision, recall, F1, accuracy). * @param {(Array<number> | module:la.Vector)} yTrue - Ground truth (correct) lable(s). * @param {(Array<number> | module:la.Vector)} yPred - Predicted (estimated) lable(s). */ metrics.ClassificationScore = function (yTrue, yPred) { /** * Returns `Object` containing different classification measures. * @returns {Object} scores - Object with different classification socres. * @returns {number} scores.count - Count. * @returns {number} scores.TP - Number of true positives. * @returns {number} scores.TN - Number of true negative. * @returns {number} scores.FP - Number of false positives. * @returns {number} scores.FN - Number of false positives. * @returns {number} scores.all - Number of all results. * @returns {number} scores.accuracy - Accuracy score. Formula: `(tp + tn) / (tp + fp + fn + tn)`. * @returns {number} scores.precision - Precision score. Formula: `tp / (tp + fp)`. * @returns {number} scores.recall - Recall score. Formula: `tp / (tp + fn)`. * @returns {number} scores.f1 - F1 score. Formula: `2 * (precision * recall) / (precision + recall)`. */ this.scores = { count: 0, predictionCount: 0, TP: 0, TN: 0, FP: 0, FN: 0, all: function () { return this.TP + this.FP + this.TN + this.FN; }, precision: function () { return (this.FP == 0) ? 1 : this.TP / (this.TP + this.FP); }, recall: function () { return (this.FN == 0) ? 1 : this.TP / (this.TP + this.FN); }, f1: function () { return ((this.precision() + this.recall()) == 0) ? 0 : 2 * this.precision() * this.recall() / (this.precision() + this.recall()); }, accuracy: function () { return (this.TP + this.TN) / this.all(); } }; /** * Adds prediction to the current statistics. Labels can be either integers. * or integer array (when there are zero or more then one lables). * @param {number} correct - Correct lable. * @param {number} predicted - Predicted lable. */ this.push = function (correct, predicted) { var catCorrect = (correct > 0); var catPredicted = (predicted > 0); // update counts for correct categories if (catCorrect) { this.scores.count++; } // update counts for how many times category was predicted if (catPredicted) { this.scores.predictionCount++; } // update true/false positive/negative count if (catCorrect && catPredicted) { // both predicted and correct say true this.scores.TP++; } else if (catCorrect) { // this was only correct but not predicted this.scores.FN++; } else if (catPredicted) { // this was only predicted but not correct this.scores.FP++; } else { // both predicted and correct say false this.scores.TN++; } }; // initialize if we are passed the data if (arguments.length >= 2) { for (var i = 0; i < yTrue.length; i++) { this.push(yTrue[i], yPred[i]); } } // check if input parameters are of correct type and binary for (var i = 0; i < arguments.length; i++) { // check type var argumentType = arguments[i].constructor.name; if (argumentType !== "Array" && argumentType !== "Vector") { throw new TypeError('input param must be of type "Array" or "Vector", but is ' + argumentType + ' instead'); } } }; /** * Accuracy score is the proportion of true results (both true positives and true negatives) * among the total number of cases examined. * Formula: `(tp + tn) / (tp + fp + fn + tn)`. * @param {(Array<number> | module:la.Vector)} yTrue - Ground truth (correct) lables. * @param {(Array<number> | module:la.Vector)} yPred - Predicted (estimated) lables. * @returns {number} Accuracy value. */ metrics.accuracyScore = function (yTrue, yPred) { return new metrics.ClassificationScore(yTrue, yPred).scores.accuracy(); }; /** * Precision score is defined as the proportion of the true positives against all the * positive results (both true positives and false positives). * Formula: `tp / (tp + fp)`. * @param {(Array<number> | module:la.Vector)} yTrue - Ground truth (correct) lables. * @param {(Array<number> | module:la.Vector)} yPred - Predicted (estimated) lables. * @returns {number} Precission score. */ metrics.precisionScore = function (yTrue, yPred) { return new metrics.ClassificationScore(yTrue, yPred).scores.precision(); }; /** * Recall score is intuitively the ability of the classifier to find all the positive samples. * Formula: `tp / (tp + fn)`. * @param {(Array<number> | module:la.Vector)} yTrue - Ground truth (correct) lables. * @param {(Array<number> | module:la.Vector)} yPred - Predicted (estimated) lables. * @returns {number} Recall score. */ metrics.recallScore = function (yTrue, yPred) { return new metrics.ClassificationScore(yTrue, yPred).scores.recall(); }; /** * The F1 score can be interpreted as a weighted average of the precision and recall, where * an F1 score reaches its best value at 1 and worst score at 0. The relative contribution of * precision and recall to the F1 score are equal. * Formula: `2 * (precision * recall) / (precision + recall)`. * @param {(Array<number> | module:la.Vector)} yTrue - Ground truth (correct) lables. * @param {(Array<number> | module:la.Vector)} yPred - Predicted (estimated) lables. * @returns {number} F1 score. */ metrics.f1Score = function (yTrue, yPred) { return new metrics.ClassificationScore(yTrue, yPred).scores.f1(); }; /** * Class implements several prediction curve measures (ROC, AOC, Precision-Recall, ...). * @class * @classdesc Used for computing ROC curve and other related measures such as AUC. * @param {(Array<number> | module:la.Vector)} yTrue - Ground truth (correct) lable(s) of binary classification in range {-1, 1} or {0, 1}. * @param {(Array<number> | module:la.Vector)} yPred - Estimated probabilities. * @example * // import metrics module * var metrics = require('qminer').analytics.metrics; * * // true and predicted lables * var true_lables = [0, 1, 0, 0, 1]; * var pred_prob = [0.3, 0.5, 0.2, 0.5, 0.8]; * * // create predictionCurve instance * var predictionCurve = new metrics.PredictionCurve(); * * // simulate data flow * for (var i in true_lables) { * // push new value * predictionCurve.push(true_lables[i], pred_prob[i]); *} * * var roc = predictionCurve.roc(); // get ROC * var auc = predictionCurve.auc(); // get AUC * var pr = predictionCurve.precisionRecallCurve() // get precision-recall curve */ metrics.PredictionCurve = function (yTrue, yPred) { /** * Count of all examples. * @name module:analytics~metrics.PredictionCurve#length * @type number */ this.length = 0; /** * Count of all positive examples. * @name module:analytics~metrics.PredictionCurve#allPositives * @type number */ this.allPositives = 0; /** * Count of all negative examples. * @name module:analytics~metrics.PredictionCurve#allNegatives * @type number */ this.allNegatives = 0; // store of predictions and ground truths /** * Store of ground truths. * @name module:analytics~metrics.PredictionCurve#grounds * @type module:la.Vector */ this.grounds = new la.Vector(); /** * Store of predictions. * @name module:analytics~metrics.PredictionCurve#predictions * @type module:la.Vector */ this.predictions = new la.Vector(); /** * Add new measurement with ground score (1 or -1) and predicted value * or integer array (when there are zero or more then one lables). * @param {number} ground - Correct lable. * @param {number} predicted - Estimated probabilities. */ this.push = function (ground, predict) { // remember the scores this.grounds.push(ground) this.predictions.push(predict); // update counts this.length++; if (ground > 0) { this.allPositives++; } else { this.allNegatives++; } }; // initialize if we are given data if (arguments.length >= 2) { for (var i = 0; i < yTrue.length; i++) { this.push(yTrue[i], yPred[i]); } } // check if input parameters are of correct type and binary for (var i = 0; i < arguments.length; i++) { // check type var argumentType = arguments[i].constructor.name; if (argumentType !== "Array" && argumentType !== "Vector") { throw new TypeError('input param must be of type "Array" or "Vector", but is ' + argumentType + ' instead'); } } /** * Get Receiver Operating Characteristic (ROC) parametrization sampled on `sample` points. * @param {number} [sample=10] - Desired number of samples in output. * @returns {module:la.Matrix} A matrix with increasing false and true positive rates. */ this.roc = function (sample) { // default sample size is 10 sample = sample || 10; // sort according to predictions var perm = this.predictions.sortPerm(false); // maintaining the results as we go along var TP = 0, FP = 0, ROC = [[0, 0]]; // check input samples if (this.allNegatives == 0) throw new Error('No positive samples in yTrue, true positive value should be meaningless.'); if (this.allNegatives == this.length) throw new Error('No negative samples in yTrue, false positive value should be meaningless.'); // for figuring out when to dump a new ROC sample var unique = 1; for (var i = 1; i < perm.perm.length; i++) { if (Math.abs(perm.vec[i] - perm.vec[i - 1]) > 1e-8) { unique++; } } var next = Math.floor(unique / sample); // go over the sorted results for (var i = 0; i < perm.perm.length; i++) { // get the ground var ground = this.grounds[perm.perm[i]]; // update TP/FP counts according to the ground if (ground > 0) { TP++ } else { FP++; } // see if time to do next save if ((i < perm.perm.length - 1) && (Math.abs(perm.vec[i] - perm.vec[i + 1]) > 1e-8)) { next = next - 1; } if (next < 0) { // add new datapoint to the curve ROC.push([FP / this.allNegatives, TP / this.allPositives]); // setup next timer next = Math.floor(unique / sample); } } // add the last point ROC.push([1, 1]); // return ROC return ROC; } /** * Get Area Under the Curve (AUC) of the current curve. * @param {number} [sample=10] - Desired number of samples in output. * @returns {number} Area under ROC curve. */ this.auc = function (sample) { // default sample size is 10 sample = sample || 10; // get the curve var curve = this.roc(sample); // compute the area var result = 0; for (var i = 1; i < curve.length; i++) { // get edge points var left = curve[i - 1]; var right = curve[i]; // first the rectangle bellow result = result + (right[0] - left[0]) * left[1]; // an then the triangle above result = result + (right[0] - left[0]) * (right[1] - left[1]) / 2; } return result; } /** * evalPrecisionRecall. * @private * @param {callback} callback. */ this.evalPrecisionRecall = function (callback) { // sort according to predictions var perm = this.predictions.sortPerm(false); // maintaining the results as we go along var TP = 0, FP = 0, TN = this.allNegatives, FN = this.allPositives; // go over the sorted results for (var i = 0; i < perm.perm.length; i++) { // get the ground var ground = this.grounds[perm.perm[i]]; // update TP/FP counts according to the ground if (ground > 0) { TP++; FN--; } else { FP++; TN--; } // do the update if ((TP + FP) > 0 && (TP + FN) > 0 && TP > 0) { // compute current precision and recall var precision = TP / (TP + FP); var recall = TP / (TP + FN); // see if we need to update current bep callback.update(ground, perm.vec[i], precision, recall); } } return callback.finish(); } /** * Get precision recall curve sampled on `sample` points. * @param {number} [sample=10] - Desired number of samples in output. * @returns {module:la.Matrix} Precision-recall pairs. */ this.precisionRecallCurve = function (sample) { return this.evalPrecisionRecall(new function (sample, length) { // default sample size is 10 this.sample = sample || 10; // curve this.curve = [[0, 1]]; // for figuring out when to dump a new ROC sample this.next = Math.floor(length / (this.sample)); this.counter = this.next; // keep last value this.precision = 0; this.recall = 0; // handlers this.update = function (yTrue, yPred, precision, recall) { this.counter = this.counter - 1; if (this.counter <= 0) { // add to the curve this.curve.push([recall, precision]); // setup next timer this.counter = this.next; } // always remember last value this.precision = precision; this.recall = recall; } this.finish = function () { // add the last point this.curve.push([this.recall, this.precision]); return this.curve; } }(sample, this.length)); }; /** * Get break-even point, the value where precision and recall intersect. * @returns {number} Break-even point. */ this.breakEvenPoint = function () { return this.evalPrecisionRecall(new function () { this.minDiff = 1.0; this.bep = -1.0; this.update = function (yTrue, yPred, precision, recall) { var diff = Math.abs(precision - recall); if (diff < this.minDiff) { this.minDiff = diff; bep = (precision + recall) / 2; } } this.finish = function () { return this.bep; } }()); } /** * Gets threshold for prediction score, which results in the highest F1. * @returns {number} Threshold with highest F1 score. */ this.bestF1 = function () { return this.evalPrecisionRecall(new function () { this.maxF1 = 0.0; this.threshold = 0.0; this.update = function (yTrue, yPred, precision, recall) { var f1 = 2 * precision * recall / (precision + recall); if (f1 > this.maxF1) { this.maxF1 = f1; this.threshold = yPred; } } this.finish = function () { return this.threshold; } }()); } /** * Gets threshold for prediction score, nearest to specified recall. * @param {number} desiredRecall - Desired recall score. * @returns {number} Recal Score Threshold. Threshold for recall score, nearest to specified `recall`. */ this.desiredRecall = function (desiredRecall) { return this.evalPrecisionRecall(new function () { this.recallDiff = 1.0; this.threshold = 0.0; this.update = function (yTrue, yPred, precision, recall) { var diff = Math.abs(desiredRecall - recall); if (diff < this.recallDiff) { this.recallDiff = diff; this.threshold = yPred; } } this.finish = function () { return this.threshold; } }()); } /** * Gets threshold for prediction score, nearest to specified precision. * @param {number} desiredPrecision - Desired precision score. * @returns {number} Threshold for prediction score, nearest to specified `precision`. */ this.desiredPrecision = function (desiredPrecision) { return this.evalPrecisionRecall(new function () { this.precisionDiff = 1.0; this.threshold = 0.0; this.update = function (yTrue, yPred, precision, recall) { var diff = Math.abs(desiredPrecision - precision); if (diff < this.precisionDiff) { this.precisionDiff = diff; this.threshold = yPred; } } this.finish = function () { return this.threshold; } }()); } }; /** * Get ROC parametrization sampled on `sample` points. * @param {(Array<number> | module:la.Vector)} yTrue - Ground truth (correct) lables. * @param {(Array<number> | module:la.Vector)} yPred - Estimated probabilities. * @param {number} [sample=10] - Desired number of samples in output. * @returns {module:la.Matrix} A matrix with increasing false and true positive rates. * @example * // import metrics module * var metrics = require('qminer').analytics.metrics; * * // true and predicted lables * var true_lables = [0, 1, 0, 0, 1]; * var pred_prob = [0.3, 0.5, 0.2, 0.5, 0.8]; * * // compute ROC curve * var roc = metrics.rocCurve(true_lables, pred_prob); // output: [ [ 0, 0 ], [0, 0.5], [[ 0.34, 1 ],], [ 0.67, 0 ], [ 1, 1 ] ] */ metrics.rocCurve = function (yTrue, yPred, sample) { return new metrics.PredictionCurve(yTrue, yPred).roc(sample); }; /** * Get AUC of the current curve. * @param {(Array<number> | module:la.Vector)} yTrue - Ground truth (correct) lables. * @param {(Array<number> | module:la.Vector)} yPred - Estimated probabilities. * @param {number} [sample=10] - Desired number of samples in output. * @returns {number} Area under ROC curve. * @example * // import metrics module * var metrics = require('qminer').analytics.metrics; * * // true and predicted lables * var true_lables = [0, 1, 0, 0, 1]; * var pred_prob = [0.3, 0.5, 0.2, 0.5, 0.8]; * * // compute ROC curve * var auc = metrics.rocAucScore(true_lables, pred_prob); // output: 0.92 */ metrics.rocAucScore = function (yTrue, yPred, sample) { return new metrics.PredictionCurve(yTrue, yPred).auc(sample); }; /** * Get precision recall curve sampled on `sample` points. * @param {(Array<number> | module:la.Vector)} yTrue - Ground truth (correct) lables. * @param {(Array<number> | module:la.Vector)} yPred - Estimated probabilities. * @param {number} [sample=10] - Desired number of samples in output. * @returns {module:la.Matrix} Precision-recall pairs. */ metrics.precisionRecallCurve = function (yTrue, yPred, sample) { return new metrics.PredictionCurve(yTrue, yPred).precisionRecallCurve(sample); }; /** * Get break-even point, the value where precision and recall intersect. * @param {(Array<number> | module:la.Vector)} yTrue - Ground truth (correct) lables. * @param {(Array<number> | module:la.Vector)} yPred - Estimated probabilities. * @returns {number} Break-even point score. */ metrics.breakEventPointScore = function (yTrue, yPred) { return new metrics.PredictionCurve(yTrue, yPred).breakEvenPoint(); }; /** * Gets threshold for prediction score, which results in the highest F1. * @param {(Array<number> | module:la.Vector)} yTrue - Ground truth (correct) lables. * @param {(Array<number> | module:la.Vector)} yPred - Estimated probabilities. * @returns {number} Threshold with highest F1 score. */ metrics.bestF1Threshold = function (yTrue, yPred) { return new metrics.PredictionCurve(yTrue, yPred).bestF1(); }; /** * Gets threshold for recall score, nearest to specified recall. * @param {(Array<number> | module:la.Vector)} yTrue - Ground truth (correct) lables. * @param {(Array<number> | module:la.Vector)} yPred - Estimated probabilities. * @param {number} desiredRecall - Desired recall score. * @returns {number} Threshold for recall score, nearest to specified `recall`. */ metrics.desiredRecallThreshold = function (yTrue, yPred, desiredRecall) { return new metrics.PredictionCurve(yTrue, yPred).desiredRecall(desiredRecall); }; /** * Gets threshold for prediction score, nearest to specified precision. * @param {(Array<number> | module:la.Vector)} yTrue - Ground truth (correct) lables. * @param {(Array<number> | module:la.Vector)} yPred - Estimated probabilities. * @param {number} desiredPrecision - Desired precision score. * @returns {number} Threshold for prediction score, nearest to specified `precision`. */ metrics.desiredPrecisionThreshold = function (yTrue, yPred, desiredPrecision) { return new metrics.PredictionCurve(yTrue, yPred).desiredPrecision(desiredPrecision); }; /////////////////////////////////////////////////// //////////// ONLINE REGRESSION METRICS //////////// /////////////////////////////////////////////////// // Online regression metrics used for evaluating online models // Main object for online metrics model /** * createOnlineMetric * @ignore * @class * * This provides methods used for event handling. It's not meant to * be used directly. * */ function createOnlineMetric(callback) { var error = -1; this.metric = new callback(); // We can hide this later (just delete this) // check if input types are of correct type function checkPushParams() { for (var i = 0, j = arguments.length; i < j; i++) { var argumentType = arguments[i].constructor.name; if (argumentType !== "Number") { throw new TypeError('input param ' + i + ' must be of type "Number", but is ' + argumentType + ' instead'); } } } /** * Updates metric with ground truth target value `yTrue` and estimated target value `yPred`. * @ignore * @param {number} yTrue - Ground truth (correct) target value. * @param {number} yPred - Estimated target value. */ this.push = function (yTrue, yPred, ref_num) { // set default values of optional input parameters var yPred = yPred == null ? 0 : yPred; var ref_num = ref_num == null ? 0 : ref_num; // check if input types are of correct type checkPushParams(yTrue, yPred, ref_num); // calculate the error with provided function from the callback function error = this.metric.update(yTrue, yPred); } /** * Returns error value. * @ignore * @returns {number} Error value. */ this.getError = function () { return error; } /** * Save metric state to provided output stream `fout`. * @ignore * @param {module:fs.FOut} fout - The output stream. * @returns {module:fs.FOut} The output stream `fout`. */ this.save = function (fout) { fout.writeJson(this.metric.state); return fout; } /** * Load metric state from provided input stream `fin`. * @ignore * @param {module:fs.FIn} fin - The output stream. * @returns {module:fs.FIn} The output stream `fin`. */ this.load = function (fin) { this.metric.state = fin.readJson(); error = this.metric.state.error; return fin; } } // MEAN ERROR (ME) /** * Create new (online) mean error instance. * @class * @classdesc Online Mean Error (ME) instance. * @param {module:fs.FIn} [fin] - Saved state can be loaded via constructor. * @extends module:analytics~createOnlineMetric */ metrics.MeanError = function (fin) { function metric() { this.name = "Mean Error" this.shortName = "ME" this.state = { sumErr: 0, count: 0, error: 0 } // update function this.update = function (yTrue, yPred) { var