@ai-on-browser/data-analysis-models
Version:
Data analysis model package without any dependencies
231 lines (206 loc) • 5.09 kB
JavaScript
import Matrix from '../util/matrix.js'
/**
 * Minimal k-means (Lloyd iteration) over a fixed data set.
 * Centroids are seeded from randomly chosen, distinct samples.
 */
class KMeans {
	/**
	 * @param {Array<Array<number>>} x Training data
	 * @param {number} k Requested number of clusters
	 */
	constructor(x, k) {
		this._x = x
		const n = this._x.length
		// Seeds are distinct samples, so there can never be more clusters than samples.
		this._k = Math.min(k, n)
		// The i-th draw comes from a range shrunk by i ...
		const idx = []
		for (let i = 0; i < this._k; i++) {
			idx.push(Math.floor(Math.random() * (n - i)))
		}
		// ... and every later draw is then shifted past the earlier ones it collides with,
		// which turns the draws into distinct sample indices.
		for (let i = idx.length - 1; i >= 0; i--) {
			for (let j = idx.length - 1; j > i; j--) {
				if (idx[i] <= idx[j]) {
					idx[j]++
				}
			}
		}
		this._c = idx.map(v => this._x[v])
		// Euclidean distance
		this._d = (a, b) => Math.sqrt(a.reduce((s, v, i) => s + (v - b[i]) ** 2, 0))
	}

	/**
	 * Current centroids
	 * @type {Array<Array<number>>}
	 */
	get centroids() {
		return this._c
	}

	/**
	 * Runs one update step: assigns every sample to its nearest centroid and moves each centroid
	 * to the mean of its samples.
	 * @returns {number} Sum of squared centroid displacements (0 once the centroids stop moving)
	 */
	fit() {
		const p = this.predict()
		const sums = this._c.map(c => Array.from(c, () => 0))
		const count = Array(this._k).fill(0)
		const n = this._x.length
		for (let i = 0; i < n; i++) {
			for (let j = 0; j < this._x[i].length; j++) {
				sums[p[i]][j] += this._x[i][j]
			}
			count[p[i]]++
		}
		let d = 0
		for (let k = 0; k < this._k; k++) {
			// A cluster without samples has no mean (0 / 0 would turn the centroid and the
			// returned displacement into NaN), so its centroid is left where it is.
			if (count[k] === 0) {
				continue
			}
			const mc = sums[k].map(v => v / count[k])
			d += this._c[k].reduce((s, v, j) => s + (v - mc[j]) ** 2, 0)
			this._c[k] = mc
		}
		return d
	}

	/**
	 * Assigns each training sample to its nearest centroid.
	 * @returns {number[]} Centroid index per sample (-1 if no centroid is closer than Infinity)
	 */
	predict() {
		const p = []
		const n = this._x.length
		for (let i = 0; i < n; i++) {
			let min_d = Infinity
			p[i] = -1
			for (let k = 0; k < this._k; k++) {
				const d = this._d(this._x[i], this._c[k])
				if (d < min_d) {
					min_d = d
					p[i] = k
				}
			}
		}
		return p
	}
}
// Critical values used by AndersonDarling.
// Rows: sample size n <= 10, <= 20, <= 50, <= 100, larger.
// Columns: significance level p = 15, 10, 5, 2.5, 1 (percent).
const cvTable = [
	[0.514, 0.578, 0.683, 0.779, 0.926],
	[0.528, 0.591, 0.704, 0.815, 0.969],
	[0.546, 0.616, 0.735, 0.861, 1.021],
	[0.559, 0.631, 0.754, 0.884, 1.047],
	[0.576, 0.656, 0.787, 0.918, 1.092],
]

/**
 * Anderson-Darling normality check with mean and variance estimated from the sample.
 *
 * The input array is not modified. Degenerate samples (for example zero variance) can make the
 * statistic NaN, in which case the result is `false`.
 * @param {number[]} data Sample values
 * @param {number} p Significance level in percent; one of 15, 10, 5, 2.5 or 1
 * @returns {boolean} `true` if the adjusted statistic does not exceed the critical value
 * @throws {RangeError} If `p` is not one of the tabulated significance levels
 */
const AndersonDarling = (data, p) => {
	// https://en.wikipedia.org/wiki/Anderson%E2%80%93Darling_test
	const col = [15, 10, 5, 2.5, 1].indexOf(p)
	if (col < 0) {
		// Without this check the lookup below yields undefined and every sample is reported as non-normal.
		throw new RangeError(`Unsupported significance level: ${p}. Use one of 15, 10, 5, 2.5 or 1.`)
	}
	// Sort a copy so the caller's array is not reordered as a side effect.
	const sorted = Array.from(data).sort((a, b) => a - b)
	const n = sorted.length
	const mean = sorted.reduce((s, v) => s + v, 0) / n
	const vari = sorted.reduce((s, v) => s + (v - mean) ** 2, 0) / (n - 1)
	const std = Math.sqrt(vari)
	const z = sorted.map(v => (v - mean) / std)
	// Logistic curve evaluated on the standardized values in place of an exact normal CDF.
	const zcdf = z.map(v => 1 / (1 + Math.exp(-1.7 * v)))
	let s = 0
	for (let i = 0; i < n; i++) {
		s += (2 * (i + 1) - 1) * (Math.log(zcdf[i]) + Math.log(1 - zcdf[n - i - 1]))
	}
	const a2 = -n - s / n
	// Small-sample adjustment applied when both mean and variance are estimated.
	const as2 = a2 * (1 + 4 / n - 25 / n ** 2)
	const row = n <= 10 ? 0 : n <= 20 ? 1 : n <= 50 ? 2 : n <= 100 ? 3 : 4
	return as2 <= cvTable[row][col]
}
/**
 * G-means
 *
 * Starts from a k-means split and keeps splitting every cluster whose projection onto its
 * principal direction fails an Anderson-Darling normality check.
 */
export default class GMeans {
	// https://qiita.com/nagomiso/items/fae8a63e06d7c03c7ded
	constructor() {
		this._centroids = []
		this._init_k = 2
	}

	/**
	 * Centroids
	 * @type {Array<Array<number>>}
	 */
	get centroids() {
		return this._centroids
	}

	/**
	 * Number of clusters.
	 * @type {number}
	 */
	get size() {
		return this._centroids.length
	}

	/**
	 * Euclidean distance between two points.
	 * @param {number[]} a First point
	 * @param {number[]} b Second point
	 * @returns {number} Distance
	 */
	_distance(a, b) {
		return Math.sqrt(a.reduce((acc, v, i) => acc + (v - b[i]) ** 2, 0))
	}

	/**
	 * Clear all clusters.
	 */
	clear() {
		this._centroids = []
	}

	/**
	 * Fit model.
	 *
	 * On a model without centroids the initial split counts as one iteration.
	 * @param {Array<Array<number>>} datas Training data
	 * @param {number} iterations Iteration count. A negative value runs until no cluster is split any more.
	 */
	fit(datas, iterations = -1) {
		// Decide "unlimited" before the budget is consumed; otherwise a budget of 0 drops to -1
		// after the initial split and is mistaken for "run until convergence".
		const unlimited = iterations < 0
		let remaining = iterations
		let clusters = null
		if (this._centroids.length === 0) {
			clusters = this._split_cluster(datas, this._init_k)
			remaining--
		} else {
			clusters = this._create_clusters(this, datas)
		}
		const centers = []
		while (clusters.length > 0 && (unlimited || remaining-- > 0)) {
			const new_clusters = []
			for (const c of clusters) {
				// Clusters this small are never split.
				if (c.size <= 3) {
					centers.push(c.centroid)
					continue
				}
				// Project the samples onto the principal direction of the cluster.
				const x = Matrix.fromArray(c.data)
				const [ev, m] = x.cov().eigenPowerIteration()
				m.mult(Math.sqrt((2 * ev) / Math.PI))
				const v = Matrix.mult(m, 2)
				const xd = x.dot(v)
				xd.div(v.norm())
				if (AndersonDarling(xd.value, 5)) {
					centers.push(c.centroid)
					continue
				}
				const children = this._split_cluster(c.data)
				if (children.length < 2) {
					// The samples could not be separated (e.g. identical points). Re-queueing the
					// same cluster would never terminate, so it is kept as it is.
					centers.push(c.centroid)
				} else {
					new_clusters.push(...children)
				}
			}
			clusters = new_clusters
		}
		// Clusters left over when the iteration budget ran out are kept unsplit.
		for (const c of clusters) {
			centers.push(c.centroid)
		}
		this._centroids = centers
	}

	/**
	 * Splits the data with k-means.
	 * @param {Array<Array<number>>} datas Data to split
	 * @param {number} k Requested number of clusters
	 * @returns {Array<{size: number, data: Array<Array<number>>, centroid: number[]}>} Non-empty clusters (at most `k`)
	 */
	_split_cluster(datas, k = 2) {
		if (datas.length === 0) {
			return []
		}
		// k-means seeds its centroids from distinct samples, so it cannot use more clusters than samples.
		const kmeans = new KMeans(datas, Math.min(k, datas.length))
		while (kmeans.fit() > 0);
		// A cluster that attracted no sample carries no information; drop it.
		return this._create_clusters(kmeans, datas).filter(c => c.size > 0)
	}

	/**
	 * Groups the data by the cluster index the model predicts for it.
	 * @param {{centroids: Array<Array<number>>, predict: function(Array<Array<number>>): number[]}} model Model providing centroids and predictions
	 * @param {Array<Array<number>>} datas Data to group
	 * @returns {Array<{size: number, data: Array<Array<number>>, centroid: number[]}>} One entry per centroid of the model
	 */
	_create_clusters(model, datas) {
		const k = model.centroids.length
		const p = model.predict(datas)
		const ds = Array.from({ length: k }, () => [])
		datas.forEach((d, i) => ds[p[i]].push(d))
		return ds.map((data, i) => ({
			size: data.length,
			data,
			centroid: model.centroids[i],
		}))
	}

	/**
	 * Returns predicted categories.
	 * @param {Array<Array<number>>} datas Sample data
	 * @returns {number[]} Predicted values
	 */
	predict(datas) {
		if (this._centroids.length === 0) {
			throw new Error('Call fit before predict.')
		}
		return datas.map(value => {
			let mind = Infinity
			let mini = -1
			for (let i = 0; i < this._centroids.length; i++) {
				const d = this._distance(value, this._centroids[i])
				if (d < mind) {
					mind = d
					mini = i
				}
			}
			return mini
		})
	}
}