@ai-on-browser/data-analysis-models
Version:
Data analysis model package without any dependencies
230 lines (215 loc) • 5.8 kB
JavaScript
/**
 * PROjected CLUStering algorithm
 *
 * Typical usage: construct, call `init(datas)` once, call `fit()` repeatedly
 * (one medoid-refinement iteration per call), then read results with
 * `predict()` or `outliers()`.
 */
export default class PROCLUS {
	// Fast Algorithms for Projected Clustering
	// https://dl.acm.org/doi/pdf/10.1145/304181.304188
	/**
	 * @param {number} k Number of clusters
	 * @param {number} a Number to multiply the number of clusters for sample size
	 * @param {number} b Number to multiply the number of clusters for final set size
	 * @param {number} l Average dimensions
	 * @param {number} [minDeviation] Minimum deviation to check the medoid is bad
	 */
	constructor(k, a, b, l, minDeviation = 0.1) {
		this._k = k
		this._a = a
		this._b = b
		this._l = l
		this._minDeviation = minDeviation
		// Full-dimensional Euclidean distance, used for the pairwise distance table.
		this._d = (a, b) => Math.sqrt(a.reduce((s, v, i) => s + (v - b[i]) ** 2, 0))
	}

	/**
	 * Draw up to `k` distinct indices from `0 .. n - 1` without replacement.
	 * @param {number} n Size of the index range
	 * @param {number} k Number of indices requested (at most `n` are returned)
	 * @returns {number[]} Sampled indices
	 */
	_sample(n, k) {
		const idx = []
		// The i-th draw is taken from a range shrunk by the i values already drawn.
		for (let i = 0; i < k && i < n; i++) {
			idx.push(Math.floor(Math.random() * (n - i)))
		}
		// Map each later draw back to the full range by skipping over the earlier
		// draws, which keeps all returned indices distinct.
		for (let i = idx.length - 1; i >= 0; i--) {
			for (let j = idx.length - 1; j > i; j--) {
				if (idx[i] <= idx[j]) {
					idx[j]++
				}
			}
		}
		return idx
	}

	/**
	 * Initialize model.
	 *
	 * Builds the pairwise distance table, samples `k * a` points, greedily picks
	 * up to `k * b` mutually distant candidates from that sample, and chooses
	 * the initial current medoids at random from those candidates.
	 * @param {Array<Array<number>>} datas Training data
	 */
	init(datas) {
		this._x = datas
		this._dists = []
		for (let i = 0; i < this._x.length; i++) {
			this._dists[i] = []
			this._dists[i][i] = 0
			for (let j = 0; j < i; j++) {
				this._dists[i][j] = this._dists[j][i] = this._d(this._x[i], this._x[j])
			}
		}

		const idx = this._sample(this._x.length, this._k * this._a)
		this._m = [idx[Math.floor(Math.random() * idx.length)]]
		// dist[c] is the distance from sampled candidate c to its nearest already-chosen candidate.
		const dist = Array(idx.length).fill(Infinity)
		for (let i = 1; i < this._k * this._b; i++) {
			let max_k = -1
			let max_d = 0
			const xi = this._m[this._m.length - 1]
			for (let k = 0; k < idx.length; k++) {
				dist[k] = Math.min(dist[k], this._dists[xi][idx[k]])
				if (max_d < dist[k]) {
					max_d = dist[k]
					max_k = idx[k]
				}
			}
			if (max_k < 0) {
				// Every sampled point is already chosen (or coincides with a chosen one).
				// Stop here instead of recording the -1 sentinel as a medoid candidate,
				// which would index outside the distance table on the next pass.
				break
			}
			this._m.push(max_k)
		}

		this._bestObjective = Infinity
		this._mcurrent = this._sample(this._m.length, this._k).map(i => this._m[i])
	}

	/**
	 * Fit model.
	 *
	 * Runs one iteration: evaluates the current medoids, keeps them if they
	 * improve the best objective so far, then replaces medoids of too-small
	 * best clusters with random candidates for the next iteration.
	 */
	fit() {
		const n = this._x.length

		// Locality of each medoid: points strictly closer to it than its nearest other medoid.
		const L = []
		for (let i = 0; i < this._k; i++) {
			let di = Infinity
			for (let k = 0; k < this._k; k++) {
				if (i === k) continue
				const d = this._dists[this._mcurrent[i]][this._mcurrent[k]]
				if (d < di) {
					di = d
				}
			}
			L[i] = []
			for (let k = 0; k < n; k++) {
				if (k === this._mcurrent[i]) continue
				if (this._dists[this._mcurrent[i]][k] < di) {
					L[i].push(k)
				}
			}
		}

		const D = this._findDimensions(this._mcurrent, L)
		const C = this._assignPoints(this._mcurrent, D)

		// Objective: per cluster, the summed absolute deviation from the medoid over
		// the cluster's dimensions divided by the dimension count; total divided by n.
		const w = Array(this._k).fill(0)
		for (let k = 0; k < this._k; k++) {
			for (let i = 0; i < C[k].length; i++) {
				for (const d of D[k]) {
					w[k] += Math.abs(this._x[C[k][i]][d] - this._x[this._mcurrent[k]][d])
				}
			}
			w[k] /= D[k].length
		}
		const of = w.reduce((s, v) => s + v, 0) / n

		if (of < this._bestObjective) {
			this._bestObjective = of
			this._mbest = this._mcurrent
			this._clusters = C
		}

		// Start the next iteration from a copy of the best medoids, swapping out the bad ones.
		this._mcurrent = this._mbest.concat()
		for (let k = 0; k < this._k; k++) {
			if (this._clusters[k].length < (n / this._k) * this._minDeviation) {
				// Bounded retry: look for a candidate that is not already a current medoid.
				for (let t = 0; t < 100; t++) {
					const i = this._m[Math.floor(Math.random() * this._m.length)]
					if (!this._mcurrent.includes(i)) {
						this._mcurrent[k] = i
						break
					}
				}
			}
		}
	}

	/**
	 * Select the dimensions associated with each medoid.
	 *
	 * For every medoid the average absolute deviation per dimension over its
	 * point set is standardized into Z-scores; each medoid gets its two lowest
	 * scoring dimensions, and `k * (l - 2)` further lowest scoring
	 * (medoid, dimension) pairs are added overall.
	 * @param {number[]} m Data indices of the medoids
	 * @param {Array<number[]>} L Data indices of the point set used for each medoid
	 * @returns {Array<number[]>} Selected dimension indices for each medoid
	 */
	_findDimensions(m, L) {
		const dim = this._x[0].length
		const D = []
		const Z = []
		for (let i = 0; i < this._k; i++) {
			const x = Array(dim).fill(0)
			for (const p of L[i]) {
				for (let d = 0; d < dim; d++) {
					x[d] += Math.abs(this._x[m[i]][d] - this._x[p][d])
				}
			}
			// An empty point set keeps an all-zero deviation instead of dividing by zero.
			if (L[i].length > 0) {
				for (let d = 0; d < dim; d++) {
					x[d] /= L[i].length
				}
			}
			const y = x.reduce((s, v) => s + v, 0) / dim
			const s = Math.sqrt(x.reduce((acc, v) => acc + (v - y) ** 2, 0) / (dim - 1))
			// With no spread (all deviations equal, or a single dimension) the Z-score is
			// undefined; use 0 so the sort below always sees a consistent comparator.
			const hasSpread = Number.isFinite(s) && s > 0
			D[i] = []
			for (let d = 0; d < dim; d++) {
				Z.push([i, d, hasSpread ? (x[d] - y) / s : 0])
			}
		}

		Z.sort((a, b) => a[2] - b[2])
		let extValues = this._k * (this._l - 2)
		for (let i = 0; i < Z.length; i++) {
			if (D[Z[i][0]].length < 2) {
				D[Z[i][0]].push(Z[i][1])
			} else if (extValues > 0) {
				D[Z[i][0]].push(Z[i][1])
				extValues--
			}
		}
		return D
	}

	/**
	 * Assign every data point to the medoid with the smallest summed absolute
	 * difference over that medoid's selected dimensions.
	 * @param {number[]} m Data indices of the medoids
	 * @param {Array<number[]>} D Selected dimension indices for each medoid
	 * @returns {Array<number[]>} Data indices of the members of each cluster
	 */
	_assignPoints(m, D) {
		const C = Array.from({ length: this._k }, () => [])
		for (let i = 0; i < this._x.length; i++) {
			let min_d = Infinity
			let min_k = -1
			for (let k = 0; k < this._k; k++) {
				const d = D[k].reduce((s, d) => s + Math.abs(this._x[i][d] - this._x[m[k]][d]), 0)
				if (d < min_d) {
					min_d = d
					min_k = k
				}
			}
			C[min_k].push(i)
		}
		return C
	}

	/**
	 * Returns predicted categories.
	 *
	 * Dimensions are recomputed from the best clusters found so far, so `fit`
	 * must have been called at least once.
	 * @returns {number[]} Predicted values
	 */
	predict() {
		this._D = this._findDimensions(this._mbest, this._clusters)
		const C = this._assignPoints(this._mbest, this._D)
		const p = []
		for (let k = 0; k < C.length; k++) {
			for (let i = 0; i < C[k].length; i++) {
				p[C[k][i]] = k
			}
		}
		return p
	}

	/**
	 * Returns a list of the data predicted as outliers or not.
	 *
	 * A point is flagged when its distance to its own medoid (over that medoid's
	 * dimensions) exceeds the distance from that medoid to its nearest other
	 * medoid measured over the same dimensions.
	 * @returns {boolean[]} Predicted values
	 */
	outliers() {
		this._D = this._findDimensions(this._mbest, this._clusters)
		const C = this._assignPoints(this._mbest, this._D)
		const outliers = Array(this._x.length).fill(false)
		for (let k = 0; k < this._k; k++) {
			let min_d = Infinity
			for (let j = 0; j < this._k; j++) {
				if (k === j) continue
				const d = this._D[k].reduce(
					(s, d) => s + Math.abs(this._x[this._mbest[k]][d] - this._x[this._mbest[j]][d]),
					0
				)
				if (d < min_d) {
					min_d = d
				}
			}
			for (let i = 0; i < C[k].length; i++) {
				const d = this._D[k].reduce((s, d) => s + Math.abs(this._x[this._mbest[k]][d] - this._x[C[k][i]][d]), 0)
				if (d > min_d) {
					outliers[C[k][i]] = true
				}
			}
		}
		return outliers
	}
}