UNPKG

@stdlib/ml

Version:

Machine learning algorithms.

433 lines (394 loc) 14.8 kB
/**
* @license Apache-2.0
*
* Copyright (c) 2018 The Stdlib Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*    http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

'use strict';

// MODULES //

var isPositiveInteger = require( '@stdlib/assert/is-positive-integer' ).isPrimitive;
var isMatrixLike = require( '@stdlib/assert/is-matrix-like' );
var isVectorLike = require( '@stdlib/assert/is-vector-like' );
var setReadOnly = require( '@stdlib/utils/define-nonenumerable-read-only-property' );
var format = require( '@stdlib/string/format' );
var minstd = require( '@stdlib/random/base/minstd-shuffle' );
var floor = require( '@stdlib/math/base/special/floor' );
var ln = require( '@stdlib/math/base/special/ln' );
var dcopy = require( '@stdlib/blas/base/dcopy' );
var createMatrix = require( './matrix.js' );
var copyMatrix = require( './copy_matrix.js' );
var createVector = require( './vector.js' );
var copyVector = require( './copy_vector.js' );
var validate = require( './validate.js' );
var INIT_DEFAULTS = require( './init_defaults.json' );
var initialization = require( './init.js' );
var statistics = require( './stats.js' );
var incrstatistics = require( './incrstats.js' );
var squaredEuclidean = require( './squared_euclidean.js' );
var squaredCosine = require( './squared_cosine.js' );
var squaredCorrelation = require( './squared_correlation.js' );
var closestCentroid = require( './find_closest_centroid.js' );
var updateCentroid = require( './update_centroid.js' );
var normalize = require( './normalize.js' );
var normalizeMatrix = require( './normalize_matrix.js' );
var standardize = require( './standardize.js' );
var standardizeMatrix = require( './standardize_matrix.js' );


// VARIABLES //

// Number of cluster statistics:
var NSTATS = 4; // [ n_obs, sum_squared_dist, mean_squared_dist, stdev_squared_dist ]


// FUNCTIONS //

/**
* Returns a results object.
*
* ## Notes
*
* -   The returned object has two properties, `centroids` (a `k x ndims` matrix) and `stats` (a `k x NSTATS` matrix), both created as "high-level" matrices.
*
* @private
* @param {PositiveInteger} k - number of clusters
* @param {PositiveInteger} ndims - number of dimensions
* @returns {Object} results object
*/
function createResults( k, ndims ) {
	var out = {};
	out.centroids = createMatrix( k, ndims, false ); // high-level
	out.stats = createMatrix( k, NSTATS, false ); // high-level
	return out;
}


// MAIN //

/**
* Returns an accumulator function which incrementally partitions data into `k` clusters.
*
* ## Notes
*
* -   The function supports two call signatures: `incrkmeans( centroids[, options] )`, where `centroids` is a `k x ndims` matrix, and `incrkmeans( k, ndims[, options] )`.
* -   The returned accumulator has a read-only `seed` property and a read-only `predict` method.
*
* @param {(PositiveInteger|ndarray)} k - number of clusters or a `k x ndims` matrix containing initial centroids
* @param {PositiveInteger} [ndims] - number of dimensions (should only be provided if provided a numeric `k` argument)
* @param {Options} [options] - function options
* @param {string} [options.metric="euclidean"] - distance metric
* @param {ArrayLikeObject} [options.init] - method for determining initial centroids
* @param {boolean} [options.normalize=true] - boolean indicating whether to normalize incoming data (only relevant for non-Euclidean distance metrics)
* @param {boolean} [options.copy=true] - boolean indicating whether to copy incoming data to prevent mutation during normalization
* @param {*} [options.seed] - PRNG seed
* @throws {TypeError} first argument must either be a positive integer or a matrix containing initial centroids
* @throws {TypeError} second argument must be a positive integer
* @throws {TypeError} options argument must be an object
* @throws {TypeError} must provide valid options
* @throws {RangeError} when using sampling to generate initial centroids, the sample size must be greater than or equal to the number of clusters
* @returns {Function} accumulator function
*
* @example
* var Float64Array = require( '@stdlib/array/float64' );
* var ndarray = require( '@stdlib/ndarray/ctor' );
*
* // Define initial centroid locations:
* var buffer = [
*     0.0, 0.0,
*     1.0, 1.0,
*     1.0, -1.0,
*     -1.0, -1.0,
*     -1.0, 1.0
* ];
* var shape = [ 5, 2 ];
* var strides = [ 2, 1 ];
* var offset = 0;
* var order = 'row-major';
*
* var centroids = ndarray( 'float64', buffer, shape, strides, offset, order );
*
* // Create a k-means accumulator:
* var accumulator = incrkmeans( centroids );
*
* var out = accumulator();
* // returns {...}
*
* // Create a data vector:
* buffer = new Float64Array( 2 );
* shape = [ 2 ];
* strides = [ 1 ];
*
* var vec = ndarray( 'float64', buffer, shape, strides, offset, order );
*
* // Provide data to the accumulator:
* vec.set( 0, 2.0 );
* vec.set( 1, 1.0 );
*
* out = accumulator( vec );
* // returns {...}
*
* vec.set( 0, -5.0 );
* vec.set( 1, 3.14 );
*
* out = accumulator( vec );
* // returns {...}
*
* // Retrieve the current cluster results:
* out = accumulator();
* // returns {...}
*/
function incrkmeans() {
	var clusterstats;
	var centroids;
	var incrstats;
	var options;
	var results;
	var vcopy;
	var stats;
	var ndims;
	var dist;
	var opts;
	var init;
	var err;
	var FLG;
	var k;

	// Resolve the call signature: either `( centroids[, options] )` or `( k, ndims[, options] )`...
	if ( isMatrixLike( arguments[ 0 ] ) ) {
		k = arguments[ 0 ].shape[ 0 ];
		ndims = arguments[ 0 ].shape[ 1 ];

		// Copy the provided centroids into an internal matrix so that later updates do not write to the caller's array:
		centroids = createMatrix( k, ndims, true ); // low-level
		centroids = copyMatrix( centroids, arguments[ 0 ] );
		if ( arguments.length > 1 ) {
			options = arguments[ 1 ];
			FLG = true;
		}
	} else if ( isPositiveInteger( arguments[ 0 ] ) ) {
		k = arguments[ 0 ];
		ndims = arguments[ 1 ];
		if ( !isPositiveInteger( ndims ) ) {
			throw new TypeError( format( 'invalid argument. Argument specifying number of dimensions must be a positive integer. Value: `%s`.', ndims ) );
		}
		if ( arguments.length > 2 ) {
			options = arguments[ 2 ];
			FLG = true;
		}
	} else {
		throw new TypeError( format( 'invalid argument. First argument must either be a positive integer specifying the number of clusters or a matrix containing initial centroids. Value: `%s`.', arguments[ 0 ] ) );
	}
	opts = {
		'metric': 'euclidean',
		'init': INIT_DEFAULTS[ 'kmeans++' ].slice(),
		'seed': minstd(),
		'normalize': true,
		'copy': true
	};
	opts.init[ 1 ] = k; // Note: this default applies to all initialization methods
	opts.init[ 2 ] = 2 + floor( ln( k ) ); // Note: from Arthur's and Vassilvitskii's paper "kmeans++: The Advantages of Careful Seeding" (see conclusion)

	// Only validate when an options argument was actually supplied:
	if ( FLG ) {
		err = validate( opts, options );
		if ( err ) {
			throw err;
		}
	}
	if ( opts.init[ 1 ] < k ) {
		throw new RangeError( format( 'invalid option. First `%s` parameter option must be greater than or equal to the number of clusters. Options: `%f`.', 'init', opts.init[ 1 ] ) );
	}
	// Initialize a results object:
	results = createResults( k, ndims );

	// Initialize an internal matrix for tabulating cluster statistics:
	stats = createMatrix( k, NSTATS, true ); // low-level

	// Initialize an internal cluster statistics accumulator:
	clusterstats = statistics( stats, k );

	// Initialize metric-related variables...
	if ( opts.metric === 'cosine' ) {
		dist = squaredCosine;

		// Initialize a scratch vector for copying input vectors:
		if ( opts.copy ) {
			vcopy = createVector( ndims, true ); // low-level
		}
	} else if ( opts.metric === 'correlation' ) {
		dist = squaredCorrelation;

		// Initialize an accumulator for computing the mean vector and associated standard deviation along each dimension:
		if ( opts.normalize ) {
			incrstats = incrstatistics( ndims );
		}
		// Initialize a scratch vector for copying input vectors:
		if ( opts.copy ) {
			vcopy = createVector( ndims, true ); // low-level
		}
	} else {
		dist = squaredEuclidean;
	}
	// Check if we need to compute initial centroids...
	if ( centroids === void 0 ) {
		// Initialize an internal matrix for storing centroids:
		centroids = createMatrix( k, ndims, true ); // low-level

		// Initialize an accumulator for computing initial centroids:
		init = initialization( centroids, stats, clusterstats, incrstats, dist, opts ); // eslint-disable-line max-len
	} else {
		// Update cluster results to include the initial centroids (why? so that, even if no data is provided, the `results` object contains the provided centroids):
		copyMatrix( results.centroids, centroids );
	}
	// Attach properties and methods to the accumulator:
	setReadOnly( accumulator, 'seed', opts.seed );
	setReadOnly( accumulator, 'predict', predict );

	return accumulator;

	/**
	* If provided a data point vector, the accumulator function returns updated cluster results. If not provided a data point vector, the accumulator function returns the current cluster results.
	*
	* ## Notes
	*
	* -   While initial centroids have yet to be computed, the function returns `null`.
	*
	* @private
	* @param {ndarray} [vec] - data vector
	* @throws {TypeError} must provide a 1-dimensional ndarray
	* @throws {Error} vector length must match centroid dimensions
	* @returns {(Object|null)} cluster results or null
	*/
	function accumulator( vec ) {
		var bool;
		var cbuf;
		var vbuf;
		var sbuf;
		var sv;
		var sc;
		var ov;
		var oc;
		var v;
		var N;
		var d;
		var c;
		if ( arguments.length === 0 ) {
			if ( init ) {
				return null;
			}
			return results;
		}
		v = vec; // Why? We mention `arguments` in the function and perform a subsequent reassignment.
		if ( !isVectorLike( v ) ) {
			throw new TypeError( format( 'invalid argument. Must provide a one-dimensional ndarray. Value: `%s`.', v ) );
		}
		if ( v.shape[ 0 ] !== ndims ) {
			// Note: the format string mirrors the column-mismatch message in `predict`:
			throw new Error( format( 'invalid argument. Vector length must match centroid dimensions. Expected: `%u`. Actual: `%u`.', ndims, v.shape[ 0 ] ) );
		}
		// Check if we need to update the data point mean vector...
		if ( incrstats ) {
			incrstats( v );
		}
		// Check if we have yet to compute initial centroids...
		if ( init ) {
			bool = init( v );
			if ( bool === false ) {
				return null;
			}
			// De-reference `init` so that it and its internal variables can be garbage collected:
			init = void 0;
		} else {
			// If required by the metric, normalize the data vector...
			if ( opts.normalize ) {
				if ( opts.metric === 'cosine' ) {
					if ( opts.copy ) {
						v = copyVector( vcopy, v );
					}
					normalize( ndims, v.data, v.strides[ 0 ], v.offset );
				} else if ( opts.metric === 'correlation' ) {
					if ( opts.copy ) {
						v = copyVector( vcopy, v );
					}
					sbuf = incrstats();

					// Magic numbers come from knowing that `sbuf` is an interleaved strided array...
					standardize( ndims, v.data, v.strides[ 0 ], v.offset, sbuf, 2, 0, sbuf, 2, 1 ); // eslint-disable-line max-len
				}
			}
			cbuf = centroids.data;
			sc = centroids.strides[ 0 ];

			vbuf = v.data;
			sv = v.strides[ 0 ];
			ov = v.offset;

			// Find the closest centroid by computing the distance from the provided data point to each centroid:
			c = closestCentroid( dist, k, ndims, cbuf, sc, 0, vbuf, sv, ov ); // Magic number `0` for offset as we know that the matrix view begins at the first buffer element

			// Compute the centroids buffer index offset to point to the closest centroid:
			oc = sc * c;

			// Update the closest centroid:
			N = stats.get( c, 0 ) + 1;
			updateCentroid( ndims, N, cbuf, 1, oc, vbuf, sv, ov ); // Magic number `1` as we know that the matrix is row-major single-segment contiguous

			// Recompute the distance based on the updated centroid position:
			d = dist( ndims, cbuf, 1, oc, vbuf, sv, ov ); // Magic number `1` as we know that the matrix is row-major single-segment contiguous

			// Update cluster statistics:
			clusterstats( c, d );
		}
		// Update the results object:
		dcopy( centroids.length, centroids.data, 1, results.centroids.data, 1 ); // Magic number `1` as we know that these matrices are row-major single-segment contiguous
		dcopy( stats.length, stats.data, 1, results.stats.data, 1 ); // Magic number `1` as we know that these matrices are row-major single-segment contiguous

		return results;
	}

	/**
	* Computes data point distances to centroids and returns centroid assignment predictions.
	*
	* ## Notes
	*
	* -   If called with a single argument, that argument is treated as the data matrix `X` and an output vector is allocated.
	* -   While initial centroids have yet to be computed, the function returns `null` (after validating its arguments).
	*
	* @private
	* @param {ndarray} [out] - output vector for storing centroid assignment predictions
	* @param {ndarray} X - matrix containing data points (`n x d`, where `n` is the number of data points and `d` is the number of dimensions)
	* @throws {TypeError} output argument must be a vector
	* @throws {TypeError} must provide a matrix
	* @throws {Error} vector length must match number of data points
	* @throws {Error} number of matrix columns must match centroid dimensions
	* @returns {(ndarray|null)} vector containing centroid (index) predictions or null
	*/
	function predict( out, X ) {
		var xbuf;
		var cbuf;
		var npts;
		var sx1;
		var sx2;
		var sc;
		var ox;
		var x;
		var o;
		var c;
		var i;
		if ( arguments.length > 1 ) {
			if ( !isVectorLike( out ) ) {
				throw new TypeError( format( 'invalid argument. Output argument must be a one-dimensional ndarray. Value: `%s`.', out ) );
			}
			o = out;
			x = X;
		} else {
			x = out;
		}
		if ( !isMatrixLike( x ) ) {
			throw new TypeError( format( 'invalid argument. Must provide a two-dimensional ndarray. Value: `%s`.', x ) );
		}
		if ( x.shape[ 1 ] !== ndims ) {
			throw new Error( format( 'invalid argument. Number of matrix columns must match centroid dimensions. Expected: `%u`. Actual: `%u`.', ndims, x.shape[ 1 ] ) );
		}
		if ( o === void 0 ) {
			o = createVector( x.shape[ 0 ], false ); // high-level
		} else if ( o.length !== x.shape[ 0 ] ) {
			throw new Error( format( 'invalid argument. Output vector length must match the number of data points. Expected: `%u`. Actual: `%u`.', x.shape[ 0 ], o.length ) );
		}
		if ( init ) {
			return null;
		}
		npts = x.shape[ 0 ];

		// If required by the metric, normalize the data vectors along the dimensions...
		if ( opts.normalize ) {
			if ( opts.metric === 'cosine' ) {
				if ( opts.copy ) {
					x = copyMatrix( createMatrix( npts, ndims, true ), x ); // low-level
				}
				x = normalizeMatrix( x );
			} else if ( opts.metric === 'correlation' ) {
				if ( opts.copy ) {
					x = copyMatrix( createMatrix( npts, ndims, true ), x ); // low-level
				}
				x = standardizeMatrix( x, incrstats() );
			}
		}
		cbuf = centroids.data;
		sc = centroids.strides[ 0 ];

		xbuf = x.data;
		sx1 = x.strides[ 0 ];
		sx2 = x.strides[ 1 ];
		ox = x.offset;

		// For each data point, find the closest centroid...
		for ( i = 0; i < npts; i++ ) {
			c = closestCentroid( dist, k, ndims, cbuf, sc, 0, xbuf, sx2, ox ); // Magic number `0` for offset as we know that the matrix view begins at the first buffer element

			// Update the output vector:
			o.set( i, c );

			// Compute the data point buffer index offset to point to the next data point:
			ox += sx1;
		}
		return o;
	}
}


// EXPORTS //

module.exports = incrkmeans;