qminer
Version:
A C++ based data analytics platform for processing large-scale real-time streams containing structured and unstructured data
1,161 lines (1,089 loc) • 93.3 kB
JavaScript
"use strict";
/**
* Copyright (c) 2015, Jozef Stefan Institute, Quintelligence d.o.o. and contributors
* All rights reserved.
*
* This source code is licensed under the FreeBSD license found in the
* LICENSE file in the root directory of this source tree.
*/
var sget = require(__dirname + '/third_party/sget/sget.js');
var override = require(__dirname + '/third_party/json-override/json-override.js');
var assert = require('assert');
var qm_util = require(__dirname + '/qm_util.js');
module.exports = exports = function (pathQmBinary) {
// Load the module found at `pathQmBinary`; the aliases below are shorthands for its sub-objects.
var qm = require(pathQmBinary); // This loads only c++ functions of qm
var fs = qm.fs;
var la = qm.la;
var stat = qm.statistics;
// Rebind the local `exports` name to `qm.analytics`, so that every `exports.X = ...`
// assignment further down adds a member to that object.
exports = qm.analytics;
//!STARTJSDOC
///////////////////////////////////////////////////
///////////// DATA PREPROCESSING //////////////
///////////////////////////////////////////////////
/**
* Preprocessing
* @namespace
* @desc Preprocessing functions for preparing labels in formats accepted
* by learning modules in {@link module:analytics}.
*/
var preprocessing = preprocessing || {};
// Namespace pattern: keep `preprocessing` if it already holds a value, otherwise start from an empty object.
// namespacing: http://addyosmani.com/blog/essential-js-namespacing/
/**
* Transforming arrays with labels to vector appropriate for binary classifiers.
* @class
* @classdesc
* Maps every label of a given array to one of two numeric values: one value
* for labels equal to the positive label and another value for all remaining
* labels. By default, these values are +1 for the positive label and -1 for
* the rest.
* @param {string | number} positiveLabel - Positive label.
* @param {number} [positiveId = 1] - Value when matching positive label.
* @param {number} [negativeId = -1] - Value when not matching positive label.
* @example
* // import analytics module
* var analytics = require('qminer').analytics;
* // create binarizer with 'b' as positive label
* var binarizer = new analytics.preprocessing.Binarizer('b');
*/
preprocessing.Binarizer = function (positiveLabel, positiveId, negativeId) {
    // loose comparison on purpose: rejects both `undefined` and `null`
    if (positiveLabel == undefined) { throw "Binarizer needs positive label"; }

    this.positiveLabel = positiveLabel;
    // fall back to +1 / -1 when the ids are not given
    this.positiveId = (positiveId != undefined) ? positiveId : 1;
    this.negativeId = (negativeId != undefined) ? negativeId : -1;

    // Present so the binarizer offers the same `fit`/`transform` pair; there is nothing to learn.
    this.fit = function () {
        // do nothing
    };

    /**
    * Transform given array of labels to binary numeric vector.
    * @param {(Array<number> | Array<string> | module:la.Vector | module:la.StrVector)} y - Labels.
    * @return {module:la.Vector} Binarized vector.
    * @example
    * // import analytics module
    * var analytics = require('qminer').analytics;
    * // create binarizer with 'b' as positive label
    * var binarizer = new analytics.preprocessing.Binarizer('b');
    * // get vector with binarized labels
    * var bins = binarizer.transform(['a','b','a','c']);
    */
    this.transform = function (y) {
        var binarized = new la.Vector();
        for (var idx = 0; idx < y.length; idx += 1) {
            // strict comparison: the label 2 does not match the label '2'
            if (y[idx] === this.positiveLabel) {
                binarized.push(this.positiveId);
            } else {
                binarized.push(this.negativeId);
            }
        }
        return binarized;
    };
};
/**
* Applies the model's `decisionFunction` method (if exists) on each column of matrix `X`.
* @param {Object} model - The model, that has the `decisionFunction` method.
* @param {module:la.SparseMatrix} X - The matrix.
* @returns {module:la.Vector} The dense vector where the i-th value is the value the `model.decisionFunction`
* returned for the sparse vector `X[i]`.
* @example
* // TODO
*/
preprocessing.applyModel = function (model, X) {
    // loose comparison on purpose: a `null` decisionFunction is rejected as well
    if (model.decisionFunction == undefined) {
        throw "preprocessing.applyModel: model doesn't have a method called decisionFunction!";
    }
    // one score per column of X, in column order
    var scores = new la.Vector();
    var colIdx = 0;
    while (colIdx < X.cols) {
        scores.push(model.decisionFunction(X[colIdx]));
        colIdx += 1;
    }
    return scores;
};
// Exports preprocessing namespace: makes `Binarizer` and `applyModel` reachable as `exports.preprocessing.*`
exports.preprocessing = preprocessing;
// SVM
/**
* Get the model.
* @returns {Object} The `svmModel` object containing the properties:
* <br> 1. `svmModel.weights` - The weights of the model (the instance's `weights` property).
* <br> 2. `svmModel.bias` - The bias of the model (the instance's `bias` property).
* @example
* // import analytics module
* var analytics = require('qminer').analytics;
* // create a SVC model
* var SVC = new analytics.SVC();
* // get the properties of the model
* var model = SVC.getModel();
*/
exports.SVC.prototype.getModel = function () {
    var svmModel = {};
    svmModel.weights = this.weights;
    svmModel.bias = this.bias;
    return svmModel;
};
/**
* Get the model.
* @returns {Object} The `svmModel` object containing the properties:
* <br> 1. `svmModel.weights` - The weights of the model (the instance's `weights` property).
* <br> 2. `svmModel.bias` - The bias of the model (the instance's `bias` property).
* @example
* // import analytics module
* var analytics = require('qminer').analytics;
* // create a SVR model
* var SVR = new analytics.SVR();
* // get the properties of the model
* var model = SVR.getModel();
*/
exports.SVR.prototype.getModel = function () {
    var svmModel = {};
    svmModel.weights = this.weights;
    svmModel.bias = this.bias;
    return svmModel;
};
// Ridge Regression
/**
* Gets the model.
* @returns {Object} The `ridgeRegModel` object containing the property:
* <br> 1. `ridgeRegModel.weights` - The weights of the model. Type {@link module:la.Vector}.
* @example
* // import analytics module
* var analytics = require('qminer').analytics;
* // create the Ridge Regression model
* var regmod = new analytics.RidgeReg();
* // get the model
* var model = regmod.getModel();
*/
exports.RidgeReg.prototype.getModel = function () {
    // wrap the current weights in a fresh plain object
    var ridgeRegModel = {};
    ridgeRegModel.weights = this.weights;
    return ridgeRegModel;
};
// Recursive Linear Regression
/**
* Gets the model.
* @returns {Object} The `recLinRegModel` object containing the property:
* <br> 1. `recLinRegModel.weights` - The weights of the model. Type {@link module:la.Vector}.
* @example
* // import analytics module
* var analytics = require('qminer').analytics;
* // create the Recursive Linear Regression model
* var linreg = new analytics.RecLinReg({ dim: 10 });
* // get the model
* var model = linreg.getModel(); // returns { weights: new require('qminer').la.Vector(); }
*/
exports.RecLinReg.prototype.getModel = function () {
    // wrap the current weights in a fresh plain object
    var recLinRegModel = {};
    recLinRegModel.weights = this.weights;
    return recLinRegModel;
};
/**
* @typedef {Object} oneVsAllParam
* An object used for the construction of {@link module:analytics.OneVsAll}.
* @property {function} [model] - Constructor for binary model to be
* used internally. Constructor should expect only one parameter.
* @property {Object} [modelParam] - Parameter for `oneVsAllParam.model` constructor.
* @property {number} [cats] - Number of categories.
* @property {boolean} [verbose = false] - If false, the console output is suppressed.
*/
/**
* @classdesc One vs All model for multiclass prediction. Builds binary model
* for each category and predicts the one with the highest score. Binary model is
* provided as part of the constructor.
* @class
* @param {module:analytics~oneVsAllParam} [arg] - Construction arguments.
* @example
* // import analytics module
* var analytics = require('qminer').analytics;
* // create a new OneVsAll object with the model analytics.SVC
* var onevsall = new analytics.OneVsAll({ model: analytics.SVC, modelParam: { c: 10, maxTime: 1000 }, cats: 2 });
*/
exports.OneVsAll = function (arg) {
// remember parameters
var model = arg.model;
var modelParam = arg.modelParam;
var cats = arg.cats;
var verbose = arg.verbose == undefined ? false : arg.verbose;
// trained models
var models = [ ];
/**
* Gets the parameters.
* @returns {module:analytics~oneVsAllParam} The constructor parameters.
* @example
* // import analytics module
* var analytics = require('qminer').analytics;
* // create a new OneVsAll object with the model analytics.SVC
* var onevsall = new analytics.OneVsAll({ model: analytics.SVC, modelParam: { c: 10, maxTime: 1000 }, cats: 2 });
* // get the parameters
* // returns the JSon object
* // { model: analytics.SVC, modelParam: { c: 10, maxTime: 1000 }, cats: 2, models: [] }
* var params = onevsall.getParams();
*/
this.getParams = function () {
return { model: model, modelParam: modelParam, cats: cats, models: models }
};
/**
* Sets the parameters.
* @param {module:analytics~OneVsAllParam} params - The constructor parameters.
* @returns {module:analytics.OneVsAll} Self. The parameters are changed.
* @example
* // import analytics module
* var analytics = require('qminer').analytics;
* // create a new OneVsAll object with the model analytics.SVC
* var onevsall = new analytics.OneVsAll({ model: analytics.SVC, modelParam: { c: 10, maxTime: 1000 }, cats: 2 });
* // set the parameters
* var params = onevsall.setParams({ model: analytics.SVR, modelParam: { c: 12, maxTime: 10000}, cats: 3, verbose: true });
*/
this.setParams = function (params) {
model = params.model == undefined ? model : params.model;
modelParam = params.modelParam == undefined ? modelParam : params.modelParam;
cats = params.cats == undefined ? cats : params.cats;
verbose = params.verbose == undefined ? verbose : params.verbose;
}
/**
* Apply all models to the given vector and returns a vector of scores, one for each category.
* Semantic of scores depend on the provided binary model.
* @param {module:la.Vector | module:la.SparseVector | module:la.Matrix | module:la.SparseMatrix} X -
* Feature vector or matrix with feature vectors as columns.
* @returns {module:la.Vector | module:la.Matrix} The score and label of the input `X`:
* <br>1. {@link module:la.Vector} of scores, if `X` is of type {@link module:la.Vector} or {@link module:la.SparseVector}.
* <br>2. {@link module:la.Matrix} with columns corresponding to instances, and rows corresponding to labels, if `X` is of type {@link module:la.Matrix} or {@link module:la.SparseMatrix}.
* @example
* // import modules
* var analytics = require('qminer').analytics;
* var la = require('qminer').la;
* // create a new OneVsAll object with the model analytics.SVC
* var onevsall = new analytics.OneVsAll({ model: analytics.SVC, modelParam: { c: 10, maxTime: 1000 }, cats: 2 });
* // create the data (matrix and vector) used to fit the model
* var matrix = new la.Matrix([[1, 2, 1, 1], [2, 1, -3, -4]]);
* var vector = new la.Vector([0, 0, 1, 1]);
* // fit the model
* onevsall.fit(matrix, vector);
* // create the vector for the decisionFunction
* var test = new la.Vector([1, 2]);
* // give the vector to the decision function
* var prediction = onevsall.decisionFunction(test); // returns the vector of scores
*/
this.decisionFunction = function(X) {
// check what is our input
if (X instanceof la.Vector || X instanceof la.SparseVector) {
// evaluate all models
var scores = new la.Vector();
for (var cat = 0; cat < cats; cat++) {
scores.push(models[cat].decisionFunction(X));
}
return scores;
} else if (X instanceof la.Matrix || X instanceof la.SparseMatrix) {
// create matrix where cols are instances and rows are scores for categories
var scores = new la.Matrix({rows: cats, cols: X.cols});
for (var i = 0; i < X.cols; i++) {
var x_i = X.getCol(i);
for (var cat = 0; cat < cats; cat++) {
scores.put(cat, i, models[cat].decisionFunction(x_i));
}
}
return scores;
} else {
throw "analytics.OneVsAll.decisionFunction: Input data of unsupported type!";
}
}
/**
* Apply all models to the given vector and returns category with the highest score.
* @param {module:la.Vector | module:la.SparseVector | module:la.Matrix | module:la.SparseMatrix} X -
* Feature vector or matrix with feature vectors as columns.
* @returns {number | module:la.IntVector}
* <br>1. number of the category with the higher score, if `X` is {@link module:la.Vector} or {@link module:la.SparseVector}.
* <br>2. {@link module:la.IntVector} of categories with the higher score for each column of `X`, if `X` is {@link module:la.Matrix} or {@link module:la.SparseMatrix}.
* @example
* // import modules
* var analytics = require('qminer').analytics;
* var la = require('qminer').la;
* // create a new OneVsAll object with the model analytics.SVC
* var onevsall = new analytics.OneVsAll({ model: analytics.SVC, modelParam: { c: 10, maxTime: 1000 }, cats: 2 });
* // create the data (matrix and vector) used to fit the model
* var matrix = new la.Matrix([[1, 2, 1, 1], [2, 1, -3, -4]]);
* var vector = new la.Vector([0, 0, 1, 1]);
* // fit the model
* onevsall.fit(matrix, vector);
* // create the vector for the prediction
* var test = new la.Vector([1, 2]);
* // get the prediction of the vector
* var prediction = onevsall.predict(test); // returns 0
*/
this.predict = function(X) {
// evaluate all models
var scores = this.decisionFunction(X);
// select maximal one
if (scores instanceof la.Vector) {
return scores.getMaxIdx();
} else if (scores instanceof la.Matrix) {
var predictions = new la.IntVector();
for (var i = 0; i < scores.cols; i++) {
predictions.push(scores.getCol(i).getMaxIdx());
}
return predictions;
} else {
throw "analytics.OneVsAll.predict: decisionFunction returns unsupported type!";
}
}
// X = feature matrix
// y = target label from 0..cats
/**
* Apply all models to the given vector and returns category with the highest score.
* @param {module:la.Matrix | module:la.SparseMatrix} X - training instance feature vectors.
* @param {module:la.Vector} y - target category for each training instance. Categories must
* be integer numbers between `0` and `oneVsAllParam.categories-1`.
* @returns {module:analytics.OneVsAll} Self. The models have been fitted.
* @example
* // import modules
* var analytics = require('qminer').analytics;
* var la = require('qminer').la;
* // create a new OneVsAll object with the model analytics.SVC
* var onevsall = new analytics.OneVsAll({ model: analytics.SVC, modelParam: { c: 10, maxTime: 1000 }, cats: 2 });
* // create the data (matrix and vector) used to fit the model
* var matrix = new la.Matrix([[1, 2, 1, 1], [2, 1, -3, -4]]);
* var vector = new la.Vector([0, 0, 1, 1]);
* // fit the model
* onevsall.fit(matrix, vector);
*/
this.fit = function(X, y) {
models = [ ];
// make model for each category
for (var cat = 0; cat < cats; cat++) {
if (verbose) {
console.log("Fitting label", (cat + 1), "/", cats);
};
// prepare targert vector for current category
var target = (y instanceof la.Matrix) ?
// we have a special bianary vector for each category, make it into -1/+1
(new exports.preprocessing.Binarizer(1)).transform(y.getRow(cat)) :
// we have a vector with label for each element, get out -1/+1 vector
(new exports.preprocessing.Binarizer(cat)).transform(y);
// get the model
var catModel = new model(modelParam);
models.push(catModel.fit(X, target));
}
if (verbose) {
console.log("Done!");
};
return this;
}
};
/**
* Threshold Model
* @class
* @classdesc The Threshold model. Uses the methods from the {@link module:analytics.metrics}.
* @param {Object} [arg] - The constructor parameters.
* @param {string} [arg.target] - Target type. Possible options are `"recall"` and `"precision"`.
* @param {TODO} [arg.level] - TODO
* @example
* // TODO
*/
exports.ThresholdModel = function(params) {
// what do we optimize
this.target = params.target;
if (this.target === "recall" || this.target === "precision") {
this.level = params.level;
}
// threshold model
this.model = null;
// apply all models to the given vector and return distance to the class boundary
// x = dense vector with prediction score for each class
// result = traslated predictions based on thresholds
/**
* Apply all models to the given vector and returns the distance to the class boundary.
* @param {number | module:la.Vector} x - The prediction score for each class.
* @returns {number | module:la.Vector}
* <br>1. value of the translated prediction based on the threshold, if `x` is `number`,
* <br>2. {@link module:la.Vector} of translated prediction based on the threshold, if `x` is {@link module:la.Vector}.
* @example
* // TODO
*/
this.decisionFunction = function(x) {
if (x instanceof Number) {
// just transate based on the model's threshold
return x - this.model;
} else if (x instanceof la.Vector) {
// each element is a new instance
var scores = new la.Vector();
for (var i = 0; i < x.length; i++) {
scores.push(x[i] - this.model);
}
return scores;
} else {
throw "analytics.ThresholdModel.decisionFunction: Input data of unsupported type!";
}
}
// return the most likely category
// x = dense vector with prediction score for each class
// result = array of positive label ids
/**
* Returns the most likely category.
* @param {number | module:la.Vector} x - The prediction score for each class.
* @returns {number | module:la.Vector}
* <br>1. value of the positive label IDs, if `x` is `number`,
* <br>2. {@link module:la.Vector} of the positive label IDs, if `x` is {@link module:la.Vector}.
* @example
* // TODO
*/
this.predict = function(x) {
// evaluate all models
var scores = this.decisionFunction(x)
// check what we get
if (scores instanceof la.Vector) {
return res = new la.Vector();
for (var i = 0; i < scores.length; i++) {
res.push(scores[i] > 0 ? 1 : -1);
}
return res;
} else {
return scores > 0 ? 1 : -1;
}
}
// X = vector of predictions for each instance (output of decision_funcition)
// y = target labels (1 or -1)
/**
* Fits the model.
* @param {module:la.Vector} X - Prediction for each instance (output of descisionFunction).
* @param {number} y - The target labels (1 or -1).
* @example
* // TODO
*/
this.fit = function(X, y) {
if (this.target === "f1") {
// find threshold that maximizes F1 measure
this.model = exports.metrics.bestF1Threshold(y, X);
} else if (this.target === "recall") {
// find threshold that results in desired recall
this.model = exports.metrics.desiredRecallThreshold(y, X, this.level);
} else if (this.target === "precision") {
// find threshold that results in desired precision
this.model = exports.metrics.desiredPrecisionThreshold(y, X, this.level);
} else {
throw "Unknown threshold model target: " + this.target;
}
}
}
/**
* Metrics
* @namespace
* @desc Classification and regression metrics.
* @example <caption>Batch classification example</caption>
* // import metrics module
* var analytics = require('qminer').analytics;
*
* // true and predicted lables
* var true_lables = [0, 1, 0, 0, 1];
* var pred_prob = [0.3, 0.5, 0.2, 0.5, 0.8];
*
* // compute ROC curve
* var roc = analytics.metrics.rocCurve(true_lables, pred_prob);
* @example <caption>Online classification example</caption>
* // import analytics module
* var analytics = require('qminer').analytics;
* // true and predicted lables
* var true_lables = [0, 1, 0, 0, 1];
* var pred_prob = [0.3, 0.5, 0.2, 0.5, 0.8];
*
* // create predictionCurve instance
* var predictionCurve = new analytics.metrics.PredictionCurve();
*
* // simulate data flow
* for (var i in true_lables) {
* // push new value
* predictionCurve.push(true_lables[i], pred_prob[i]);
*}
*
* var roc = predictionCurve.roc(); // get ROC
* @example <caption>Batch regression example</caption>
* // import analytics module
* var analytics = require('qminer').analytics;
* // true and predicted data
* var true_vals = [1, 2, 3, 4, 5];
* var pred_vals = [3, 4, 5, 6, 7];
*
* // use batch MAE method
* analytics.metrics.meanAbsoluteError(true_vals, pred_vals);
* @example <caption>Online regression example</caption>
* // import analytics module
* var analytics = require('qminer').analytics;
* // true and predicted data
* var true_vals = [1, 2, 3, 4, 5];
* var pred_vals = [3, 4, 5, 6, 7];
*
* // create online MAE metric instance
* var mae = new analytics.metrics.MeanAbsoluteError();
*
* // simulate data flow
* for (var i in true_vals) {
* // push new value
* mae.push(true_vals[i], pred_vals[i]);
* }
* // get updated error
* mae.getError();
*/
var metrics = metrics || {};
// Namespace pattern: keep `metrics` if it already holds a value, otherwise start from an empty object.
// namespacing: http://addyosmani.com/blog/essential-js-namespacing/
///////////////////////////////////////////////////
///////////// CLASSIFICATION METRICS //////////////
///////////////////////////////////////////////////
/**
* For evaluating provided categories from binary classifiers.
* @class
* @classdesc Class implements several classification measures (precision, recall, F1, accuracy).
* @param {(Array<number> | module:la.Vector)} yTrue - Ground truth (correct) lable(s).
* @param {(Array<number> | module:la.Vector)} yPred - Predicted (estimated) lable(s).
*/
metrics.ClassificationScore = function (yTrue, yPred) {
    // Check the type of every passed argument BEFORE any of them is used, so that invalid
    // input is rejected with the descriptive TypeError instead of failing somewhere below.
    for (var argIdx = 0; argIdx < arguments.length; argIdx++) {
        var argument = arguments[argIdx];
        // null/undefined have no constructor; report them by their own name
        var argumentType = (argument == null) ? String(argument) :
            (argument.constructor ? argument.constructor.name : typeof argument);
        if (argumentType !== "Array" && argumentType !== "Vector") {
            throw new TypeError('input param must be of type "Array" or "Vector", but is ' + argumentType + ' instead');
        }
    }

    /**
    * `Object` containing the counts and the classification measures derived from them.
    * The measures (`all`, `accuracy`, `precision`, `recall`, `f1`) are functions computed
    * from the current counts on each call.
    * @returns {Object} scores - Object with different classification scores.
    * @returns {number} scores.count - Number of pushed examples with a positive correct label.
    * @returns {number} scores.predictionCount - Number of pushed examples with a positive predicted label.
    * @returns {number} scores.TP - Number of true positives.
    * @returns {number} scores.TN - Number of true negatives.
    * @returns {number} scores.FP - Number of false positives.
    * @returns {number} scores.FN - Number of false negatives.
    * @returns {function} scores.all - Number of all results.
    * @returns {function} scores.accuracy - Accuracy score. Formula: `(tp + tn) / (tp + fp + fn + tn)`.
    * @returns {function} scores.precision - Precision score. Formula: `tp / (tp + fp)`; `1` when there are no false positives.
    * @returns {function} scores.recall - Recall score. Formula: `tp / (tp + fn)`; `1` when there are no false negatives.
    * @returns {function} scores.f1 - F1 score. Formula: `2 * (precision * recall) / (precision + recall)`; `0` when both are `0`.
    */
    this.scores = {
        count: 0, predictionCount: 0,
        TP: 0, TN: 0, FP: 0, FN: 0,
        all: function () { return this.TP + this.FP + this.TN + this.FN; },
        precision: function () { return (this.FP == 0) ? 1 : this.TP / (this.TP + this.FP); },
        recall: function () { return (this.FN == 0) ? 1 : this.TP / (this.TP + this.FN); },
        f1: function () {
            var precision = this.precision();
            var recall = this.recall();
            return ((precision + recall) == 0) ? 0 : 2 * precision * recall / (precision + recall);
        },
        accuracy: function () { return (this.TP + this.TN) / this.all(); }
    };

    /**
    * Adds prediction to the current statistics. A label counts as positive when it is
    * greater than `0`; everything else counts as negative.
    * @param {number} correct - Correct label.
    * @param {number} predicted - Predicted label.
    */
    this.push = function (correct, predicted) {
        var catCorrect = (correct > 0);
        var catPredicted = (predicted > 0);
        // update counts for correct categories
        if (catCorrect) { this.scores.count++; }
        // update counts for how many times category was predicted
        if (catPredicted) { this.scores.predictionCount++; }
        // update true/false positive/negative count
        if (catCorrect && catPredicted) {
            // both predicted and correct say true
            this.scores.TP++;
        } else if (catCorrect) {
            // this was only correct but not predicted
            this.scores.FN++;
        } else if (catPredicted) {
            // this was only predicted but not correct
            this.scores.FP++;
        } else {
            // both predicted and correct say false
            this.scores.TN++;
        }
    };

    // initialize if we are passed the data; without it the instance is filled via `push`
    if (arguments.length >= 2) {
        for (var i = 0; i < yTrue.length; i++) {
            this.push(yTrue[i], yPred[i]);
        }
    }
};
/**
* Accuracy score is the proportion of true results (both true positives and true negatives)
* among the total number of cases examined.
* Formula: `(tp + tn) / (tp + fp + fn + tn)`.
* @param {(Array<number> | module:la.Vector)} yTrue - Ground truth (correct) lables.
* @param {(Array<number> | module:la.Vector)} yPred - Predicted (estimated) lables.
* @returns {number} Accuracy value.
*/
metrics.accuracyScore = function (yTrue, yPred) {
    // batch-evaluate all label pairs, then read the accuracy off the collected counts
    var evaluation = new metrics.ClassificationScore(yTrue, yPred);
    return evaluation.scores.accuracy();
};
/**
* Precision score is defined as the proportion of the true positives against all the
* positive results (both true positives and false positives).
* Formula: `tp / (tp + fp)`.
* @param {(Array<number> | module:la.Vector)} yTrue - Ground truth (correct) lables.
* @param {(Array<number> | module:la.Vector)} yPred - Predicted (estimated) lables.
* @returns {number} Precision score.
*/
metrics.precisionScore = function (yTrue, yPred) {
    // batch-evaluate all label pairs, then read the precision off the collected counts
    var evaluation = new metrics.ClassificationScore(yTrue, yPred);
    return evaluation.scores.precision();
};
/**
* Recall score is intuitively the ability of the classifier to find all the positive samples.
* Formula: `tp / (tp + fn)`.
* @param {(Array<number> | module:la.Vector)} yTrue - Ground truth (correct) lables.
* @param {(Array<number> | module:la.Vector)} yPred - Predicted (estimated) lables.
* @returns {number} Recall score.
*/
metrics.recallScore = function (yTrue, yPred) {
    // batch-evaluate all label pairs, then read the recall off the collected counts
    var evaluation = new metrics.ClassificationScore(yTrue, yPred);
    return evaluation.scores.recall();
};
/**
* The F1 score can be interpreted as a weighted average of the precision and recall, where
* an F1 score reaches its best value at 1 and worst score at 0. The relative contribution of
* precision and recall to the F1 score are equal.
* Formula: `2 * (precision * recall) / (precision + recall)`.
* @param {(Array<number> | module:la.Vector)} yTrue - Ground truth (correct) lables.
* @param {(Array<number> | module:la.Vector)} yPred - Predicted (estimated) lables.
* @returns {number} F1 score.
*/
metrics.f1Score = function (yTrue, yPred) {
    // batch-evaluate all label pairs, then read the F1 score off the collected counts
    var evaluation = new metrics.ClassificationScore(yTrue, yPred);
    return evaluation.scores.f1();
};
/**
* Class implements several prediction curve measures (ROC, AOC, Precision-Recall, ...).
* @class
* @classdesc Used for computing ROC curve and other related measures such as AUC.
* @param {(Array<number> | module:la.Vector)} yTrue - Ground truth (correct) lable(s) of binary classification in range {-1, 1} or {0, 1}.
* @param {(Array<number> | module:la.Vector)} yPred - Estimated probabilities.
* @example
* // import metrics module
* var metrics = require('qminer').analytics.metrics;
*
* // true and predicted lables
* var true_lables = [0, 1, 0, 0, 1];
* var pred_prob = [0.3, 0.5, 0.2, 0.5, 0.8];
*
* // create predictionCurve instance
* var predictionCurve = new metrics.PredictionCurve();
*
* // simulate data flow
* for (var i in true_lables) {
* // push new value
* predictionCurve.push(true_lables[i], pred_prob[i]);
*}
*
* var roc = predictionCurve.roc(); // get ROC
* var auc = predictionCurve.auc(); // get AUC
* var pr = predictionCurve.precisionRecallCurve() // get precision-recall curve
*/
metrics.PredictionCurve = function (yTrue, yPred) {
/**
* Count of all examples.
* @name module:analytics~metrics.PredictionCurve#length
* @type number
*/
this.length = 0;
/**
* Count of all positive examples.
* @name module:analytics~metrics.PredictionCurve#allPositives
* @type number
*/
this.allPositives = 0;
/**
* Count of all negative examples.
* @name module:analytics~metrics.PredictionCurve#allNegatives
* @type number
*/
this.allNegatives = 0;
// store of predictions and ground truths
/**
* Store of ground truths.
* @name module:analytics~metrics.PredictionCurve#grounds
* @type module:la.Vector
*/
this.grounds = new la.Vector();
/**
* Store of predictions.
* @name module:analytics~metrics.PredictionCurve#predictions
* @type module:la.Vector
*/
this.predictions = new la.Vector();
/**
* Add new measurement with ground score (1 or -1) and predicted value
* or integer array (when there are zero or more then one lables).
* @param {number} ground - Correct lable.
* @param {number} predicted - Estimated probabilities.
*/
this.push = function (ground, predict) {
// remember the scores
this.grounds.push(ground)
this.predictions.push(predict);
// update counts
this.length++;
if (ground > 0) {
this.allPositives++;
} else {
this.allNegatives++;
}
};
// initialize if we are given data
if (arguments.length >= 2) {
for (var i = 0; i < yTrue.length; i++) {
this.push(yTrue[i], yPred[i]);
}
}
// check if input parameters are of correct type and binary
for (var i = 0; i < arguments.length; i++) {
// check type
var argumentType = arguments[i].constructor.name;
if (argumentType !== "Array" && argumentType !== "Vector") {
throw new TypeError('input param must be of type "Array" or "Vector", but is ' + argumentType + ' instead');
}
}
/**
* Get Receiver Operating Characteristic (ROC) parametrization sampled on `sample` points.
* @param {number} [sample=10] - Desired number of samples in output.
* @returns {module:la.Matrix} A matrix with increasing false and true positive rates.
*/
this.roc = function (sample) {
// default sample size is 10
sample = sample || 10;
// sort according to predictions
var perm = this.predictions.sortPerm(false);
// maintaining the results as we go along
var TP = 0, FP = 0, ROC = [[0, 0]];
// check input samples
if (this.allNegatives == 0) throw new Error('No positive samples in yTrue, true positive value should be meaningless.');
if (this.allNegatives == this.length) throw new Error('No negative samples in yTrue, false positive value should be meaningless.');
// for figuring out when to dump a new ROC sample
var unique = 1;
for (var i = 1; i < perm.perm.length; i++) {
if (Math.abs(perm.vec[i] - perm.vec[i - 1]) > 1e-8) {
unique++;
}
}
var next = Math.floor(unique / sample);
// go over the sorted results
for (var i = 0; i < perm.perm.length; i++) {
// get the ground
var ground = this.grounds[perm.perm[i]];
// update TP/FP counts according to the ground
if (ground > 0) { TP++ } else { FP++; }
// see if time to do next save
if ((i < perm.perm.length - 1) && (Math.abs(perm.vec[i] - perm.vec[i + 1]) > 1e-8)) {
next = next - 1;
}
if (next < 0) {
// add new datapoint to the curve
ROC.push([FP / this.allNegatives, TP / this.allPositives]);
// setup next timer
next = Math.floor(unique / sample);
}
}
// add the last point
ROC.push([1, 1]);
// return ROC
return ROC;
}
/**
* Get Area Under the Curve (AUC) of the current curve.
* @param {number} [sample=10] - Desired number of samples in output.
* @returns {number} Area under ROC curve.
*/
this.auc = function (sample) {
// default sample size is 10
sample = sample || 10;
// get the curve
var curve = this.roc(sample);
// compute the area
var result = 0;
for (var i = 1; i < curve.length; i++) {
// get edge points
var left = curve[i - 1];
var right = curve[i];
// first the rectangle bellow
result = result + (right[0] - left[0]) * left[1];
// an then the triangle above
result = result + (right[0] - left[0]) * (right[1] - left[1]) / 2;
}
return result;
}
/**
* evalPrecisionRecall.
* @private
* @param {callback} callback.
*/
this.evalPrecisionRecall = function (callback) {
// sort according to predictions
var perm = this.predictions.sortPerm(false);
// maintaining the results as we go along
var TP = 0, FP = 0, TN = this.allNegatives, FN = this.allPositives;
// go over the sorted results
for (var i = 0; i < perm.perm.length; i++) {
// get the ground
var ground = this.grounds[perm.perm[i]];
// update TP/FP counts according to the ground
if (ground > 0) { TP++; FN--; } else { FP++; TN--; }
// do the update
if ((TP + FP) > 0 && (TP + FN) > 0 && TP > 0) {
// compute current precision and recall
var precision = TP / (TP + FP);
var recall = TP / (TP + FN);
// see if we need to update current bep
callback.update(ground, perm.vec[i], precision, recall);
}
}
return callback.finish();
}
/**
* Get precision recall curve sampled on `sample` points.
* @param {number} [sample=10] - Desired number of samples in output.
* @returns {module:la.Matrix} Precision-recall pairs.
*/
this.precisionRecallCurve = function (sample) {
return this.evalPrecisionRecall(new function (sample, length) {
// default sample size is 10
this.sample = sample || 10;
// curve
this.curve = [[0, 1]];
// for figuring out when to dump a new ROC sample
this.next = Math.floor(length / (this.sample));
this.counter = this.next;
// keep last value
this.precision = 0; this.recall = 0;
// handlers
this.update = function (yTrue, yPred, precision, recall) {
this.counter = this.counter - 1;
if (this.counter <= 0) {
// add to the curve
this.curve.push([recall, precision]);
// setup next timer
this.counter = this.next;
}
// always remember last value
this.precision = precision; this.recall = recall;
}
this.finish = function () {
// add the last point
this.curve.push([this.recall, this.precision]);
return this.curve;
}
}(sample, this.length));
};
/**
* Get break-even point, the value where precision and recall intersect.
* @returns {number} Break-even point.
*/
this.breakEvenPoint = function () {
    // Callback for evalPrecisionRecall that remembers the point where
    // precision and recall are closest to each other.
    return this.evalPrecisionRecall(new function () {
        // smallest |precision - recall| seen so far, and the score at that point;
        // bep stays at -1 when no update ever has a gap below 1
        this.minDiff = 1.0; this.bep = -1.0;
        this.update = function (yTrue, yPred, precision, recall) {
            var diff = Math.abs(precision - recall);
            if (diff < this.minDiff) {
                this.minDiff = diff;
                // must be stored on the callback object: a bare `bep` is an
                // undeclared variable, which throws a ReferenceError in strict
                // mode and would never reach finish() otherwise
                this.bep = (precision + recall) / 2;
            }
        };
        // report the mean of precision and recall at the closest point
        this.finish = function () { return this.bep; };
    }());
};
/**
* Gets threshold for prediction score, which results in the highest F1.
* @returns {number} Threshold with highest F1 score.
*/
this.bestF1 = function () {
    // callback for evalPrecisionRecall: keeps the prediction score at which F1 peaked
    var tracker = {
        maxF1: 0.0,
        threshold: 0.0,
        update: function (yTrue, yPred, precision, recall) {
            // harmonic mean of precision and recall
            var score = 2 * precision * recall / (precision + recall);
            if (score <= this.maxF1) { return; }
            this.maxF1 = score;
            this.threshold = yPred;
        },
        finish: function () {
            return this.threshold;
        }
    };
    return this.evalPrecisionRecall(tracker);
};
/**
* Gets threshold for prediction score, nearest to specified recall.
* @param {number} desiredRecall - Desired recall score.
* @returns {number} Threshold for prediction score, nearest to specified `desiredRecall`.
*/
this.desiredRecall = function (desiredRecall) {
    // callback for evalPrecisionRecall: keeps the prediction score whose
    // recall lies closest to the requested one
    var tracker = {
        recallDiff: 1.0,
        threshold: 0.0,
        update: function (yTrue, yPred, precision, recall) {
            var gap = Math.abs(desiredRecall - recall);
            // only a strictly smaller gap replaces the current threshold
            if (gap >= this.recallDiff) { return; }
            this.recallDiff = gap;
            this.threshold = yPred;
        },
        finish: function () {
            return this.threshold;
        }
    };
    return this.evalPrecisionRecall(tracker);
};
/**
* Gets threshold for prediction score, nearest to specified precision.
* @param {number} desiredPrecision - Desired precision score.
* @returns {number} Threshold for prediction score, nearest to specified `precision`.
*/
this.desiredPrecision = function (desiredPrecision) {
    // callback for evalPrecisionRecall: keeps the prediction score whose
    // precision lies closest to the requested one
    var tracker = {
        precisionDiff: 1.0,
        threshold: 0.0,
        update: function (yTrue, yPred, precision, recall) {
            var gap = Math.abs(desiredPrecision - precision);
            // only a strictly smaller gap replaces the current threshold
            if (gap >= this.precisionDiff) { return; }
            this.precisionDiff = gap;
            this.threshold = yPred;
        },
        finish: function () {
            return this.threshold;
        }
    };
    return this.evalPrecisionRecall(tracker);
};
};
/**
* Get ROC parametrization sampled on `sample` points.
* @param {(Array<number> | module:la.Vector)} yTrue - Ground truth (correct) labels.
* @param {(Array<number> | module:la.Vector)} yPred - Estimated probabilities.
* @param {number} [sample=10] - Desired number of samples in output.
* @returns {module:la.Matrix} A matrix with increasing false and true positive rates.
* @example
* // import metrics module
* var metrics = require('qminer').analytics.metrics;
*
* // true and predicted labels
* var true_labels = [0, 1, 0, 0, 1];
* var pred_prob = [0.3, 0.5, 0.2, 0.5, 0.8];
*
* // compute ROC curve
* var roc = metrics.rocCurve(true_labels, pred_prob); // output: [ [ 0, 0 ], [ 0, 0.5 ], [ 0.34, 1 ], [ 0.67, 0 ], [ 1, 1 ] ]
*/
metrics.rocCurve = function (yTrue, yPred, sample) {
    // wrap the labels and scores in a prediction curve, then sample its ROC
    var curve = new metrics.PredictionCurve(yTrue, yPred);
    return curve.roc(sample);
};
/**
* Get the area under the ROC curve (AUC) for the given labels and scores.
* @param {(Array<number> | module:la.Vector)} yTrue - Ground truth (correct) labels.
* @param {(Array<number> | module:la.Vector)} yPred - Estimated probabilities.
* @param {number} [sample=10] - Desired number of samples in output.
* @returns {number} Area under ROC curve.
* @example
* // import metrics module
* var metrics = require('qminer').analytics.metrics;
*
* // true and predicted labels
* var true_labels = [0, 1, 0, 0, 1];
* var pred_prob = [0.3, 0.5, 0.2, 0.5, 0.8];
*
* // compute area under the ROC curve
* var auc = metrics.rocAucScore(true_labels, pred_prob); // output: 0.92
*/
metrics.rocAucScore = function (yTrue, yPred, sample) {
    // wrap the labels and scores in a prediction curve, then ask for its AUC
    var curve = new metrics.PredictionCurve(yTrue, yPred);
    return curve.auc(sample);
};
/**
* Get precision recall curve sampled on `sample` points.
* @param {(Array<number> | module:la.Vector)} yTrue - Ground truth (correct) labels.
* @param {(Array<number> | module:la.Vector)} yPred - Estimated probabilities.
* @param {number} [sample=10] - Desired number of samples in output.
* @returns {Array.<Array.<number>>} Curve as `[recall, precision]` pairs; the first pair is always `[0, 1]`.
*/
metrics.precisionRecallCurve = function (yTrue, yPred, sample) {
    // wrap the labels and scores in a prediction curve, then sample precision/recall
    var curve = new metrics.PredictionCurve(yTrue, yPred);
    return curve.precisionRecallCurve(sample);
};
/**
* Get break-even point, the value where precision and recall intersect.
* @param {(Array<number> | module:la.Vector)} yTrue - Ground truth (correct) labels.
* @param {(Array<number> | module:la.Vector)} yPred - Estimated probabilities.
* @returns {number} Break-even point score.
*/
metrics.breakEventPointScore = function (yTrue, yPred) {
    // wrap the labels and scores in a prediction curve, then locate its break-even point
    var curve = new metrics.PredictionCurve(yTrue, yPred);
    return curve.breakEvenPoint();
};
/**
* Gets threshold for prediction score, which results in the highest F1.
* @param {(Array<number> | module:la.Vector)} yTrue - Ground truth (correct) labels.
* @param {(Array<number> | module:la.Vector)} yPred - Estimated probabilities.
* @returns {number} Threshold with highest F1 score.
*/
metrics.bestF1Threshold = function (yTrue, yPred) {
    // wrap the labels and scores in a prediction curve, then find the F1-optimal threshold
    var curve = new metrics.PredictionCurve(yTrue, yPred);
    return curve.bestF1();
};
/**
* Gets threshold for prediction score, nearest to specified recall.
* @param {(Array<number> | module:la.Vector)} yTrue - Ground truth (correct) labels.
* @param {(Array<number> | module:la.Vector)} yPred - Estimated probabilities.
* @param {number} desiredRecall - Desired recall score.
* @returns {number} Threshold for prediction score, nearest to specified `desiredRecall`.
*/
metrics.desiredRecallThreshold = function (yTrue, yPred, desiredRecall) {
    // wrap the labels and scores in a prediction curve, then look up the threshold for the recall
    var curve = new metrics.PredictionCurve(yTrue, yPred);
    return curve.desiredRecall(desiredRecall);
};
/**
* Gets threshold for prediction score, nearest to specified precision.
* @param {(Array<number> | module:la.Vector)} yTrue - Ground truth (correct) labels.
* @param {(Array<number> | module:la.Vector)} yPred - Estimated probabilities.
* @param {number} desiredPrecision - Desired precision score.
* @returns {number} Threshold for prediction score, nearest to specified `precision`.
*/
metrics.desiredPrecisionThreshold = function (yTrue, yPred, desiredPrecision) {
    // wrap the labels and scores in a prediction curve, then look up the threshold for the precision
    var curve = new metrics.PredictionCurve(yTrue, yPred);
    return curve.desiredPrecision(desiredPrecision);
};
///////////////////////////////////////////////////
//////////// ONLINE REGRESSION METRICS ////////////
///////////////////////////////////////////////////
// Online regression metrics used for evaluating online models
// Main object for online metrics model
/**
* createOnlineMetric
* @ignore
* @class
*
* This provides methods used for event handling. It's not meant to
* be used directly.
*
*/
function createOnlineMetric(callback) {
    // last error returned by the wrapped metric; -1 until the first push() or load()
    var error = -1;
    this.metric = new callback(); // We can hide this later (just delete this)

    // Names the type of `value` for error messages. null and undefined are
    // handled explicitly because they have no `constructor` to read.
    function describeType(value) {
        if (value == null) { return String(value); }
        var ctor = value.constructor;
        return (ctor && ctor.name) || typeof value;
    }

    // Throws a TypeError naming the first argument that is not a "Number".
    function checkPushParams() {
        for (var i = 0, j = arguments.length; i < j; i++) {
            var argumentType = describeType(arguments[i]);
            if (argumentType !== "Number") {
                throw new TypeError('input param ' + i + ' must be of type "Number", but is ' + argumentType + ' instead');
            }
        }
    }

    /**
    * Updates metric with ground truth target value `yTrue` and estimated target value `yPred`.
    * @ignore
    * @param {number} yTrue - Ground truth (correct) target value.
    * @param {number} [yPred=0] - Estimated target value.
    * @param {number} [ref_num=0] - Only type-checked here; it is not passed on to the metric.
    * @throws {TypeError} If any of the (defaulted) parameters is not a number.
    */
    this.push = function (yTrue, yPred, ref_num) {
        // set default values of optional input parameters
        yPred = yPred == null ? 0 : yPred;
        ref_num = ref_num == null ? 0 : ref_num;
        // validate before touching the metric so a bad call leaves its state unchanged
        checkPushParams(yTrue, yPred, ref_num);
        // calculate the error with provided function from the callback function
        error = this.metric.update(yTrue, yPred);
    };

    /**
    * Returns error value.
    * @ignore
    * @returns {number} Error value.
    */
    this.getError = function () {
        return error;
    };

    /**
    * Save metric state to provided output stream `fout`.
    * @ignore
    * @param {module:fs.FOut} fout - The output stream.
    * @returns {module:fs.FOut} The output stream `fout`.
    */
    this.save = function (fout) {
        fout.writeJson(this.metric.state);
        return fout;
    };

    /**
    * Load metric state from provided input stream `fin`.
    * @ignore
    * @param {module:fs.FIn} fin - The input stream.
    * @returns {module:fs.FIn} The input stream `fin`.
    */
    this.load = function (fin) {
        this.metric.state = fin.readJson();
        // keep the cached error in sync with the restored state
        error = this.metric.state.error;
        return fin;
    };
}
// MEAN ERROR (ME)
/**
* Create new (online) mean error instance.
* @class
* @classdesc Online Mean Error (ME) instance.
* @param {module:fs.FIn} [fin] - Saved state can be loaded via constructor.
* @extends module:analytics~createOnlineMetric
*/
metrics.MeanError = function (fin) {
function metric() {
this.name = "Mean Error"
this.shortName = "ME"
this.state = {
sumErr: 0,
count: 0,
error: 0
}
// update function
this.update = function (yTrue, yPred) {
var