@ai-on-browser/data-analysis-models
Version:
Data analysis model package without any dependencies
232 lines (214 loc) • 5.6 kB
JavaScript
/**
 * Natural logarithm of the gamma function, evaluated with closed forms.
 *
 * Integer arguments use Gamma(z) = (z - 1)!; every other argument is treated
 * as a half-integer m + 1/2 and uses Gamma(m + 1/2) = (2m - 1)!! * sqrt(pi) / 2^m.
 * No approximation is applied for arguments that are neither integers nor
 * half-integers, so callers are expected to pass only those.
 *
 * @param {number} z Argument of the gamma function
 * @returns {number} log(Gamma(z))
 */
const logGamma = z => {
	// https://ja.wikipedia.org/wiki/%E3%82%AC%E3%83%B3%E3%83%9E%E9%96%A2%E6%95%B0
	if (Number.isInteger(z)) {
		// log((z - 1)!) = log(2) + log(3) + ... + log(z - 1)
		let total = 0
		let k = 2
		while (k < z) {
			total += Math.log(k)
			k++
		}
		return total
	}

	// Half-integer branch: start from log(sqrt(pi)) - m * log(2) ...
	const m = z - 0.5
	let total = Math.log(Math.sqrt(Math.PI)) - Math.log(2) * m
	// ... then add the log of the double factorial (2m - 1)!!, largest factor first.
	let odd = 2 * m - 1
	while (odd > 0) {
		total += Math.log(odd)
		odd -= 2
	}
	return total
}
/**
 * Series part of the lower incomplete gamma function:
 *
 *   gammaStar(a, z) = exp(-z) * sum_{n >= 0} z^n / Gamma(a + n + 1)
 *
 * The sum is truncated after 1000 terms, or as soon as a term contributes
 * less than 1e-12 of the running total.
 *
 * @param {number} a Shape parameter
 * @param {number} z Evaluation point
 * @returns {number} Value of the truncated series
 */
const gammaStar = (a, z) => {
	const logz = Math.log(z)
	let sum = 0
	for (let n = 0; n < 1000; n++) {
		// z^0 is 1 for every z. Computing it as 0 * log(z) would yield
		// 0 * -Infinity = NaN at z = 0 and poison the whole sum.
		const logPower = n === 0 ? 0 : n * logz
		const term = Math.exp(logPower - logGamma(a + n + 1))
		sum += term
		if (term / sum < 1.0e-12) {
			break
		}
	}
	return Math.exp(-z) * sum
}
/**
 * Regularized lower incomplete gamma function, built as z^a * gammaStar(a, z).
 *
 * @param {number} a Shape parameter
 * @param {number} z Upper integration limit
 * @returns {number} Value of z^a * gammaStar(a, z)
 */
const regularizedIncompleteGamma = (a, z) => {
	// Incomplete gamma function of the first kind (lower incomplete gamma)
	// https://math-functions-1.watson.jp/sub1_spec_050.html#section010
	const series = gammaStar(a, z)
	const power = Math.pow(z, a)
	return series * power
}
/**
 * Cumulative distribution function of the chi-squared distribution,
 * expressed through the regularized incomplete gamma function.
 *
 * @param {number} x Evaluation point
 * @param {number} k Degrees of freedom
 * @returns {number} regularizedIncompleteGamma(k / 2, x / 2)
 */
const cumChiSquared = (x, k) => {
	const shape = k / 2
	const halfX = x / 2
	return regularizedIncompleteGamma(shape, halfX)
}
/**
 * Distribution Based Clustering of LArge Spatial Databases
 */
export default class DBCLASD {
	// A Distribution-Based Clustering Algorithm for Mining in Large Spatial Databases
	// https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=933cba585a12e56a8f60511ebeb74b8cb42634b1
	// https://github.com/spalaciob/py-dbclasd
	constructor() {
		// Euclidean distance between two points given as equal-length number arrays.
		this._d = (a, b) => Math.sqrt(a.reduce((s, v, i) => s + (v - b[i]) ** 2, 0))
	}

	/**
	 * Returns predicted categories.
	 *
	 * Each cluster is seeded with the 30 nearest neighbours of an unprocessed
	 * point and then grown by candidates that pass the chi-square test.
	 * Labels are indices into the reversed list of clusters, so when a point
	 * appears in several clusters the one created first wins. Points that end
	 * up in no cluster keep the label -1.
	 *
	 * @param {Array<Array<number>>} datas Training data
	 * @returns {number[]} Predicted values
	 */
	predict(datas) {
		const n = datas.length

		// Symmetric matrix of pairwise distances.
		const d = Array(n)
		for (let i = 0; i < n; i++) {
			d[i] = []
			d[i][i] = 0
			for (let j = 0; j < i; j++) {
				const v = this._d(datas[i], datas[j])
				d[i][j] = d[j][i] = v
			}
		}

		// nn[i]: every point (including i itself) ordered by distance from i.
		const nn = []
		for (let i = 0; i < n; i++) {
			nn[i] = d[i].map((v, j) => ({ d: v, i: j }))
			nn[i].sort((a, b) => a.d - b.d)
		}

		const processed = Array(n).fill(false)
		const clusters = []

		// Queue every still-unprocessed point within `radius` of point `p`.
		// Entry 0 of the neighbour list is skipped (normally `p` itself).
		const enqueueNeighbors = (p, radius, queue) => {
			for (let j = 1; j < n; j++) {
				if (nn[p][j].d > radius) {
					break
				}
				const q = nn[p][j].i
				if (!processed[q]) {
					processed[q] = true
					queue.push(q)
				}
			}
		}

		for (let i = 0; i < n; i++) {
			if (processed[i]) continue
			processed[i] = true

			let candidates = []
			const cluster = nn[i].slice(0, 30).map(v => v.i)

			// m: largest pairwise distance inside the cluster, used as query radius.
			let m = 0
			for (let s = 0; s < cluster.length; s++) {
				processed[cluster[s]] = true
				for (let t = 0; t < s; t++) {
					m = Math.max(m, d[cluster[s]][cluster[t]])
				}
			}
			for (const p1 of cluster) {
				enqueueNeighbors(p1, m, candidates)
			}

			// Retry rejected candidates as long as the cluster keeps growing.
			let change = true
			while (change) {
				change = false
				const unsuccess = []
				while (candidates.length > 0) {
					const p = candidates.shift()
					const tmpcluster = [...cluster, p]

					// Nearest-neighbour distance of every member of the trial cluster.
					const nndist = []
					for (let s = 0; s < tmpcluster.length; s++) {
						nndist[s] = Infinity
						for (let t = 0; t < tmpcluster.length; t++) {
							if (s === t) continue
							nndist[s] = Math.min(nndist[s], d[tmpcluster[s]][tmpcluster[t]])
						}
					}

					const area = this._area(datas, tmpcluster, nndist)
					if (this._chiSquareTest(datas, area, nndist)) {
						cluster.push(p)
						for (let s = 0; s < cluster.length; s++) {
							m = Math.max(m, d[cluster[s]][p])
						}
						enqueueNeighbors(p, m, candidates)
						change = true
					} else {
						unsuccess.push(p)
					}
				}
				candidates = unsuccess
			}

			clusters.push(cluster)
			// Rejected candidates stay available for later clusters.
			for (const q of candidates) {
				processed[q] = false
			}
		}

		// Assign in reverse creation order so earlier clusters overwrite later ones.
		clusters.reverse()
		const pred = Array(n).fill(-1)
		for (let k = 0; k < clusters.length; k++) {
			for (const idx of clusters[k]) {
				pred[idx] = k
			}
		}
		return pred
	}

	/**
	 * Chi-square test of the observed nearest-neighbour distances against the
	 * distribution expected for points spread over the given area.
	 *
	 * @param {Array<Array<number>>} datas Data points (only the dimension of the first point is read)
	 * @param {number} area Approximated area (volume) of the cluster
	 * @param {number[]} nndist Nearest-neighbour distance of each cluster member
	 * @returns {boolean} `true` when the chi-square CDF of the statistic is below 0.95
	 */
	_chiSquareTest(datas, area, nndist) {
		const d = datas[0].length
		const obsdd = nndist.concat()
		obsdd.sort((a, b) => a - b)

		// Loop-invariant parts of the log-volume of a d-dimensional ball.
		const logGammaTerm = logGamma(d / 2 + 1)
		const logPiTerm = (d / 2) * Math.log(Math.PI)

		let chi2 = 0
		for (let i = 0; i < obsdd.length; i++) {
			// Volume of the d-ball with radius obsdd[i]: pi^(d/2) / Gamma(d/2 + 1) * r^d
			const spvol = Math.exp(d * Math.log(obsdd[i]) - logGammaTerm + logPiTerm)
			// NOTE(review): the exponent is the rank (i + 1), not the number of
			// points in the cluster; confirm against the distribution in the paper.
			const expect = 1 - (1 - Math.max(0, Math.min(1, spvol / area))) ** (i + 1)
			chi2 += ((i + 1) / obsdd.length - expect) ** 2 / expect
		}
		return cumChiSquared(chi2, obsdd.length) < 0.95
	}

	/**
	 * Approximates the area (volume) covered by a cluster.
	 *
	 * A grid whose cell side equals the largest nearest-neighbour distance is
	 * centred on the bounding box of the cluster; the result is the number of
	 * occupied cells times the volume of one cell.
	 *
	 * @param {Array<Array<number>>} datas Data points
	 * @param {number[]} c Indices of the cluster members
	 * @param {number[]} nndist Nearest-neighbour distance of each cluster member
	 * @returns {number} Approximated area; 0 when the grid length is not positive
	 */
	_area(datas, c, nndist) {
		const gl = nndist.reduce((s, v) => Math.max(s, v), 0)
		if (!(gl > 0)) {
			// All members coincide: no grid can be built and nothing is covered.
			return 0
		}

		const dim = datas[0].length
		const min = Array(dim).fill(Infinity)
		const max = Array(dim).fill(-Infinity)
		for (const i of c) {
			for (let k = 0; k < dim; k++) {
				min[k] = Math.min(min[k], datas[i][k])
				max[k] = Math.max(max[k], datas[i][k])
			}
		}

		// Cell boundaries per dimension.
		const steps = []
		for (let k = 0; k < dim; k++) {
			const range = max[k] - min[k]
			const len = Math.ceil(range / gl)
			// The grid spans len * gl >= range; start half of the surplus
			// *before* the minimum so that the bounding box is centred in it.
			const surplus = Math.max(0, len * gl - range)
			const start = min[k] - surplus / 2
			steps[k] = []
			for (let i = 0; i <= len; i++) {
				steps[k][i] = start + i * gl
			}
		}

		// Collect the distinct cells that contain at least one member.
		const occupied = new Set()
		for (const i of c) {
			const cell = []
			for (let k = 0; k < dim; k++) {
				const x = datas[i][k]
				const edges = steps[k]
				// A value lying on the last boundary matches no half-open interval
				// and ends up in the extra cell with index edges.length - 1.
				let idx = 0
				while (idx < edges.length - 1 && !(edges[idx] <= x && x < edges[idx + 1])) {
					idx++
				}
				cell.push(idx)
			}
			occupied.add(cell.join(','))
		}
		return occupied.size * gl ** dim
	}
}