ml-bayes
Version:
Naive Bayes Document Classification Algorithm
140 lines (139 loc) • 5.53 kB
JavaScript
(function (root, factory) {
if(typeof module === "object" && module.exports) {
module.exports = factory();
} else {
root.Bayes = factory(root);
}
}(this, function() {
/**
 * Array#unique: returns a new array holding the first occurrence of each
 * distinct element, in original order. The receiver is not modified.
 *
 * Elements are compared by their property-key form, so `1` and `"1"` are
 * treated as the same element.
 *
 * The method is installed as a non-enumerable property so that it does not
 * show up when code elsewhere iterates an array with `for...in`.
 *
 * @returns {Array} De-duplicated copy of the array.
 */
Object.defineProperty(Array.prototype, 'unique', {
    value: function () {
        // Prototype-less lookup table: elements such as "hasOwnProperty",
        // "constructor" or "__proto__" are stored like any other key instead
        // of colliding with members inherited from Object.prototype.
        var seen = Object.create(null),
            result = [];
        for (var i = 0, l = this.length; i < l; ++i) {
            var item = this[i];
            if (item in seen) {
                continue;
            }
            result.push(item);
            seen[item] = 1;
        }
        return result;
    },
    enumerable: false,
    writable: true,
    configurable: true
});
/**
 * Creates an empty classifier with no labels and no training data.
 *
 * @constructor
 * @param {Object} [paramsObject] Optional settings.
 * @param {Function} [paramsObject.tokenize] Replaces the default tokenizer;
 *     called with the text passed to train/guess.
 * @param {Function} [paramsObject.log] Replaces the default (no-op) logger.
 */
function Bayes (paramsObject) {
    var options = paramsObject || {};

    // Labels in the order they were first registered.
    this.labelsArray = [];
    // stem -> (label -> count), filled in by incrementStem.
    this.stemKey = {};
    // stem -> count across all labels, filled in by incrementStem.
    this.stemCountKey = {};
    // label -> number of trained documents, filled in by incrementDocCount.
    this.docCountKey = {};

    // Start from the prototype defaults, then apply any truthy overrides.
    this.tokenize = this.defaultTokenize;
    if (options.tokenize) {
        this.tokenize = options.tokenize;
    }
    this.log = this.defaultLog;
    if (options.log) {
        this.log = options.log;
    }
}
/**
 * Serializes the classifier to a JSON string.
 *
 * The string contains the label list and the three count tables, plus the
 * source text of the current `tokenize` and `log` functions (obtained with
 * `toString()`), which `fromJSON` later turns back into functions.
 *
 * Note that this returns a string, not a plain object.
 *
 * @returns {string} JSON text describing the classifier state.
 */
Bayes.prototype.toJSON = function () {
    var snapshot = {};

    snapshot.labelsArray = this.labelsArray;
    snapshot.stemKey = this.stemKey;
    snapshot.stemCountKey = this.stemCountKey;
    snapshot.docCountKey = this.docCountKey;
    // Functions cannot be represented in JSON, so their source is stored.
    snapshot.tokenize = this.tokenize.toString();
    snapshot.log = this.log.toString();

    return JSON.stringify(snapshot);
};
/**
 * Restores classifier state from a string produced by `toJSON`.
 *
 * Count tables or the label list that are absent from the JSON are reset to
 * empty containers, and a missing `tokenize` / `log` entry falls back to the
 * prototype default, so a partial snapshot still yields a usable classifier.
 *
 * SECURITY: the `tokenize` and `log` entries are JavaScript source text and
 * are executed with `eval`. Only load snapshots from a trusted origin; a
 * tampered snapshot can run arbitrary code.
 *
 * @param {string} jsonString JSON text as returned by `toJSON`.
 * @throws {SyntaxError} If `jsonString` is not valid JSON or a stored
 *     function source does not parse.
 */
Bayes.prototype.fromJSON = function (jsonString) {
    var jsonObject = JSON.parse(jsonString);

    this.labelsArray = jsonObject.labelsArray || [];
    this.stemKey = jsonObject.stemKey || {};
    this.stemCountKey = jsonObject.stemCountKey || {};
    this.docCountKey = jsonObject.docCountKey || {};

    // The eval calls stay inline (direct eval) so the revived functions see
    // the same scope and `this` they always have.
    if (typeof jsonObject.tokenize === 'string') {
        eval('this.tokenize = ' + jsonObject.tokenize);
    } else {
        this.tokenize = this.defaultTokenize;
    }
    if (typeof jsonObject.log === 'string') {
        eval('this.log = ' + jsonObject.log);
    } else {
        this.log = this.defaultLog;
    }
};
/**
 * Default logger: does nothing. Used as `this.log` when the constructor is
 * not given a `log` option; `guess` calls `this.log` with one message string
 * per scored word.
 */
Bayes.prototype.defaultLog = function () {};
/**
 * Default tokenizer: lower-cases the text and returns its distinct words in
 * order of first appearance. A word is a maximal run of `\w` characters
 * (ASCII letters, digits and underscore); everything else is a separator.
 *
 * Text with no word characters (empty, whitespace or punctuation only)
 * yields an empty array rather than a single empty-string token.
 *
 * The body is deliberately self-contained: `toJSON` stores this function's
 * source text and `fromJSON` re-evaluates it, so it must not depend on
 * helpers defined elsewhere.
 *
 * @param {string} text Text to split.
 * @returns {string[]} Unique lower-case tokens.
 */
Bayes.prototype.defaultTokenize = function (text) {
    var words = text.toLowerCase().match(/\w+/g) || [];
    // Prototype-less table so tokens like "constructor" or "__proto__" are
    // de-duplicated like any other word.
    var seen = Object.create(null);
    var tokens = [];
    for (var i = 0; i < words.length; i++) {
        if (!(words[i] in seen)) {
            seen[words[i]] = true;
            tokens.push(words[i]);
        }
    }
    return tokens;
};
/**
 * Adds a label to the list of known labels unless it is already present.
 *
 * @param {*} label Label to register (matched with `indexOf`, i.e. strictly).
 * @returns {boolean} `true` if the label was added, `false` if it was
 *     already registered.
 */
Bayes.prototype.registerLabel = function (label) {
    var alreadyKnown = this.labelsArray.indexOf(label) !== -1;
    if (alreadyKnown) {
        return false;
    }
    this.labelsArray.push(label);
    return true;
};
/**
 * Returns the registered labels.
 *
 * @returns {Array} The classifier's own label array (not a copy), so
 *     changes made to it by the caller affect the classifier.
 */
Bayes.prototype.getLabels = function () {
    var labels = this.labelsArray;
    return labels;
};
/**
 * Returns how many times a stem has been recorded under a label.
 *
 * Only counts stored directly on the tables are considered, so stems or
 * labels that happen to match inherited object members ("constructor",
 * "toString", "__proto__", ...) report 0 until they are actually trained.
 *
 * @param {string} stem Token to look up.
 * @param {string} label Label to look up.
 * @returns {number} Recorded count, or 0 when there is none.
 */
Bayes.prototype.getStemLabelCount = function (stem, label) {
    var hasOwn = Object.prototype.hasOwnProperty;
    if (!hasOwn.call(this.stemKey, stem)) {
        return 0;
    }
    var perLabel = this.stemKey[stem];
    if (!perLabel || !hasOwn.call(perLabel, label)) {
        return 0;
    }
    // Falsy stored values are reported as 0.
    return perLabel[label] ? perLabel[label] : 0;
};
/**
 * Returns how many times a stem has been recorded under every label other
 * than the given one.
 *
 * @param {string} stem Token to look up.
 * @param {string} label Label to exclude from the sum.
 * @returns {number} Sum of `getStemLabelCount(stem, other)` over all other
 *     registered labels.
 */
Bayes.prototype.getStemInverseLabelCount = function (stem, label) {
    var sum = 0;
    this.getLabels().forEach(function (candidate) {
        var contribution = 0;
        if (candidate !== label) {
            contribution = this.getStemLabelCount(stem, candidate);
        }
        sum = sum + contribution;
    }, this);
    return sum;
};
/**
 * Returns how many times a stem has been recorded across all labels.
 *
 * Only counts stored directly on the table are considered, so a stem that
 * matches an inherited object member ("constructor", "__proto__", ...)
 * reports 0 until it has actually been trained.
 *
 * @param {string} stem Token to look up.
 * @returns {number} Recorded total, or 0 when there is none.
 */
Bayes.prototype.getStemTotalCount = function (stem) {
    if (!Object.prototype.hasOwnProperty.call(this.stemCountKey, stem)) {
        return 0;
    }
    // Falsy stored values are reported as 0.
    return this.stemCountKey[stem] ? this.stemCountKey[stem] : 0;
};
/**
 * Returns how many documents have been trained under a label.
 *
 * Only counts stored directly on the table are considered, so a label that
 * matches an inherited object member ("constructor", "__proto__", ...)
 * reports 0 until a document has actually been trained for it.
 *
 * @param {string} label Label to look up.
 * @returns {number} Recorded document count, or 0 when there is none.
 */
Bayes.prototype.getDocCount = function (label) {
    if (!Object.prototype.hasOwnProperty.call(this.docCountKey, label)) {
        return 0;
    }
    // Falsy stored values are reported as 0.
    return this.docCountKey[label] ? this.docCountKey[label] : 0;
};
/**
 * Returns how many documents have been trained under every label other
 * than the given one.
 *
 * @param {string} label Label to exclude from the sum.
 * @returns {number} Sum of `getDocCount(other)` over all other registered
 *     labels.
 */
Bayes.prototype.getDocInverseCount = function (label) {
    var sum = 0;
    this.getLabels().forEach(function (candidate) {
        var contribution = 0;
        if (candidate !== label) {
            contribution = this.getDocCount(candidate);
        }
        sum = sum + contribution;
    }, this);
    return sum;
};
/**
 * Records one more occurrence of a stem under a label: bumps the stem's
 * overall total and its per-label count.
 *
 * Counts are read and written strictly as own properties of the tables.
 * Without that, a stem such as "constructor" or "__proto__" would resolve to
 * a member inherited from Object.prototype, and the per-label count would be
 * written onto the global `Object` function or onto `Object.prototype`.
 *
 * @param {string} stem Token being recorded.
 * @param {string} label Label the token was seen under.
 */
Bayes.prototype.incrementStem = function (stem, label) {
    var hasOwn = Object.prototype.hasOwnProperty;

    // Creates/overwrites an ordinary own data property. defineProperty is
    // used because plain assignment to the key "__proto__" would change the
    // object's prototype instead of storing a value.
    function setOwn(target, key, value) {
        Object.defineProperty(target, key, {
            value: value,
            enumerable: true,
            writable: true,
            configurable: true
        });
    }

    // Adds 1 to counter[key]; a missing or falsy entry becomes 1.
    function bump(counter, key) {
        var current = hasOwn.call(counter, key) ? counter[key] : 0;
        setOwn(counter, key, current ? current + 1 : 1);
    }

    bump(this.stemCountKey, stem);

    if (!hasOwn.call(this.stemKey, stem) || !this.stemKey[stem]) {
        setOwn(this.stemKey, stem, {});
    }
    bump(this.stemKey[stem], label);
};
/**
 * Records one more trained document for a label.
 *
 * The count is read and written strictly as an own property of the table,
 * so a label that matches an inherited object member ("constructor",
 * "__proto__", ...) is counted like any other label.
 *
 * @param {string} label Label whose document count is incremented.
 */
Bayes.prototype.incrementDocCount = function (label) {
    var known = Object.prototype.hasOwnProperty.call(this.docCountKey, label);
    var current = known ? this.docCountKey[label] : 0;
    // defineProperty rather than assignment: assigning to the key
    // "__proto__" would not store a value at all.
    Object.defineProperty(this.docCountKey, label, {
        value: current ? current + 1 : 1,
        enumerable: true,
        writable: true,
        configurable: true
    });
};
/**
 * Trains the classifier with one document.
 *
 * Registers the label, tokenizes the text with `this.tokenize`, records
 * every returned token under the label, then counts the document itself.
 *
 * @param {string} text Document text (passed as-is to the tokenizer).
 * @param {string} label Label the document belongs to.
 */
Bayes.prototype.train = function (text, label) {
    this.registerLabel(label);

    var tokens = this.tokenize(text);
    tokens.forEach(function (token) {
        this.incrementStem(token, label);
    }, this);

    this.incrementDocCount(label);
};
/**
 * Scores a text against every registered label.
 *
 * For each label, every token that has been seen in training contributes a
 * "wordicity": the share of the token's per-document frequency that belongs
 * to this label, blended with a neutral 0.5 using a weight of 1 against the
 * token's total count. The per-token values are combined in log space into
 * a score between 0 and 1. Tokens never seen in training are ignored, so a
 * text with no known tokens scores 0.5 for every label.
 *
 * Ratios whose denominator is 0 (only one label has documents, or a label
 * was registered but never trained) are treated as 0, so the scores are
 * never NaN.
 *
 * `this.log` is called once per scored token and label.
 *
 * @param {string} text Text to classify (passed as-is to the tokenizer).
 * @returns {Object} Map of label -> score.
 */
Bayes.prototype.guess = function (text) {
    var that = this,
        words = this.tokenize(text),
        scores = {};

    // count / total, or 0 when there is nothing to divide by.
    function ratio(count, total) {
        return total ? count / total : 0;
    }

    this.getLabels().forEach(function (label) {
        var logSum = 0,
            // Per-label document counts do not depend on the word.
            docCount = that.getDocCount(label),
            docInverseCount = that.getDocInverseCount(label);

        words.forEach(function (word) {
            var stemTotalCount = that.getStemTotalCount(word);
            if (stemTotalCount === 0) {
                return; // unseen token: carries no evidence
            }
            var wordProbability = ratio(that.getStemLabelCount(word, label), docCount),
                wordInverseProbability = ratio(that.getStemInverseLabelCount(word, label), docInverseCount),
                evidence = wordProbability + wordInverseProbability,
                // No usable evidence either way: stay neutral.
                wordicity = evidence ? wordProbability / evidence : 0.5;

            // Pull the raw value towards 0.5; rarely seen tokens move least.
            wordicity = ( (1 * 0.5) + (stemTotalCount * wordicity) ) / ( 1 + stemTotalCount );
            // Keep the logarithms below finite.
            if (wordicity === 0) {
                wordicity = 0.01;
            } else if (wordicity === 1) {
                wordicity = 0.99;
            }
            logSum += (Math.log(1 - wordicity) - Math.log(wordicity));
            that.log(label + "icity of " + word + ": " + wordicity);
        });

        scores[label] = 1 / ( 1 + Math.exp(logSum) );
    });
    return scores;
};
/**
 * Picks the label with the highest score from a score map such as the one
 * returned by `guess`.
 *
 * Only the map's own entries are considered; enumerable properties inherited
 * through the prototype chain are ignored. A label wins only with a score
 * strictly greater than 0, and the first of several equal top scores wins.
 *
 * @param {Object} scores Map of label -> numeric score.
 * @returns {{label: (string|null), score: number}} The winning label and its
 *     score, or `{label: null, score: 0}` when no entry scores above 0.
 */
Bayes.prototype.extractWinner = function (scores) {
    var bestScore = 0;
    var bestLabel = null;
    for (var label in scores) {
        if (!Object.prototype.hasOwnProperty.call(scores, label)) {
            continue;
        }
        if (scores[label] > bestScore) {
            bestScore = scores[label];
            bestLabel = label;
        }
    }
    return {label: bestLabel, score: bestScore};
};
return Bayes;
}));