UNPKG

classificator

Version:

Naive Bayes classifier with verbose informations for node.js

405 lines (335 loc) 12.2 kB
const Decimal = require('decimal.js').default; // handles arbitrary-precision arithmetics. /* Expose our naive-bayes generator function */ module.exports = function(options) { return new Naivebayes(options); }; // keys we use to serialize a classifier's state const STATE_KEYS = (module.exports.STATE_KEYS = [ 'categories', 'docCount', 'totalDocuments', 'vocabulary', 'vocabularySize', 'wordCount', 'wordFrequencyCount', 'options', ]); const DEFAULT_ALPHA = 1; const DEFAULT_FIT_PRIOR = true; /** * Initializes a NaiveBayes instance from a JSON state representation. * Use this with classifier.toJson(). * * @param {String|Object} jsonStrOrObject state representation obtained by classifier.toJson() * @return {NaiveBayes} Classifier */ module.exports.fromJson = (jsonStrOrObject) => { let parameters; try { switch (typeof jsonStrOrObject) { case 'string': parameters = JSON.parse(jsonStrOrObject); break; case 'object': parameters = jsonStrOrObject; break; default: throw new Error(''); } } catch (e) { console.error(e); throw new Error('Naivebays.fromJson expects a valid JSON string or an object.'); } // init a new classifier const classifier = new Naivebayes(parameters.options); // override the classifier's state STATE_KEYS.forEach((k) => { if (typeof parameters[k] === 'undefined') { throw new Error( `Naivebayes.fromJson: JSON string is missing an expected property: [${k}].` ); } classifier[k] = parameters[k]; }); return classifier; }; /** * Given an input string, tokenize it into an array of word tokens. * This is the default tokenization function used if user does not provide one in `options`. * * @param {String} text * @return {Array} */ const defaultTokenizer = (text) => { // remove punctuation from text - remove anything that isn't a word char or a space const rgxPunctuation = /[^(a-zA-ZA-Яa-я0-9_)+\s]/g; const sanitized = text.replace(rgxPunctuation, ' '); // tokens = tokens.filter(function(token) { // return token.length >= _that.config.minimumLength; // }); return sanitized.split(/\s+/); }; /** * Naive-Bayes Classifier * * This is a naive-bayes classifier that uses Laplace Smoothing. * * Takes an (optional) options object containing: * - `tokenizer` => custom tokenization function * */ function Naivebayes(options) { // set options object this.options = {}; if (typeof options !== 'undefined') { if (!options || typeof options !== 'object' || Array.isArray(options)) { throw TypeError( `NaiveBayes got invalid 'options': ${options}'. Pass in an object.` ); } this.options = options; } this.tokenizer = this.options.tokenizer || defaultTokenizer; this.alpha = this.options.alpha || DEFAULT_ALPHA; this.fitPrior = this.options.fitPrior === undefined ? DEFAULT_FIT_PRIOR : this.options.fitPrior; // initialize our vocabulary and its size this.vocabulary = {}; this.vocabularySize = 0; // number of documents we have learned from this.totalDocuments = 0; // document frequency table for each of our categories //= > for each category, how often were documents mapped to it this.docCount = {}; // for each category, how many words total were mapped to it this.wordCount = {}; // word frequency table for each category //= > for each category, how frequent was a given word mapped to it this.wordFrequencyCount = {}; // hashmap of our category names this.categories = {}; } /** * Initialize each of our data structure entries for this new category * * @param {String} categoryName */ Naivebayes.prototype.initializeCategory = function(categoryName) { if (!this.categories[categoryName]) { this.docCount[categoryName] = 0; this.wordCount[categoryName] = 0; this.wordFrequencyCount[categoryName] = {}; this.categories[categoryName] = true; } return this; }; /** * Properly remove a category, unlearning all words that were associated to it. * * @param {String} categoryName */ Naivebayes.prototype.removeCategory = function(categoryName) { if (!this.categories[categoryName]) { return this; } // update the total number of documents we have learned from this.totalDocuments -= this.docCount[categoryName]; Object.keys(this.wordFrequencyCount[categoryName]).forEach((token) => { this.vocabulary[token]--; if (this.vocabulary[token] === 0) this.vocabularySize--; }); delete this.docCount[categoryName]; delete this.wordCount[categoryName]; delete this.wordFrequencyCount[categoryName]; delete this.categories[categoryName]; return this; }; /** * train our naive-bayes classifier by telling it what `category` * the `text` corresponds to. * * @param {String} text * @param {String} category Category to learn as being text */ Naivebayes.prototype.learn = function(text, category) { // initialize category data structures if we've never seen this category this.initializeCategory(category); // update our count of how many documents mapped to this category this.docCount[category]++; // update the total number of documents we have learned from this.totalDocuments++; // normalize the text into a word array const tokens = this.tokenizer(text); // get a frequency count for each token in the text const frequencyTable = this.frequencyTable(tokens); Object.keys(frequencyTable).forEach((token) => { const frequencyInText = frequencyTable[token]; // add this word to our vocabulary if not already existing if (!this.vocabulary[token] || this.vocabulary[token] === 0) { this.vocabularySize++; this.vocabulary[token] = 1; // this.vocabulary[token] = frequencyInText; } else if (this.vocabulary[token] > 0) { this.vocabulary[token]++; // this.vocabulary[token] += frequencyInText; } // update the frequency information for this word in this category if (!this.wordFrequencyCount[category][token]) { this.wordFrequencyCount[category][token] = frequencyInText; } else this.wordFrequencyCount[category][token] += frequencyInText; // update the count of all words we have seen mapped to this category this.wordCount[category] += frequencyInText; }); return this; }; /** * untrain our naive-bayes classifier by telling it what `category` * the `text` to remove corresponds to. * * @param {String} text * @param {String} category Category to unlearn as being text */ Naivebayes.prototype.unlearn = function(text, category) { // update our count of how many documents mapped to this category this.docCount[category]--; if (this.docCount[category] === 0) { delete this.docCount[category]; } // update the total number of documents we have learned from this.totalDocuments--; // normalize the text into a word array const tokens = this.tokenizer(text); // get a frequency count for each token in the text const frequencyTable = this.frequencyTable(tokens); /* Update our vocabulary and our word frequency count for this category */ Object.keys(frequencyTable).forEach((token) => { const frequencyInText = frequencyTable[token]; // add this word to our vocabulary if not already existing if (this.vocabulary[token] && this.vocabulary[token] > 0) { this.vocabulary[token] -= frequencyInText; if (this.vocabulary[token] === 0) this.vocabularySize--; } this.wordFrequencyCount[category][token] -= frequencyInText; if (this.wordFrequencyCount[category][token] === 0) { delete this.wordFrequencyCount[category][token]; } // update the count of all words we have seen mapped to this category this.wordCount[category] -= frequencyInText; if (this.wordCount[category] === 0) { delete this.wordCount[category]; delete this.wordFrequencyCount[category]; } }); return this; }; /** * Determine what category `text` belongs to. * * @param {String} text * * @return {Object} The predicted category, and the likelihoods stats. */ Naivebayes.prototype.categorize = function(text) { const tokens = this.tokenizer(text); const frequencyTable = this.frequencyTable(tokens); const categories = Object.keys(this.categories); const likelihoods = []; // iterate through our categories to find the one with max probability for this text categories.forEach((category) => { // start by calculating the overall probability of this category //= > out of all documents we've ever looked at, how many were // mapped to this category let categoryLikelihood; if (this.fitPrior) { categoryLikelihood = this.docCount[category] / this.totalDocuments; } else { categoryLikelihood = 1; } // take the log to avoid underflow // let logLikelihood = Math.log(categoryLikelihood); let logLikelihood = Decimal(categoryLikelihood); logLikelihood = logLikelihood.naturalLogarithm(); // now determine P( w | c ) for each word `w` in the text Object.keys(frequencyTable).forEach((token) => { if (this.vocabulary[token] && this.vocabulary[token] > 0) { const termFrequencyInText = frequencyTable[token]; const tokenProbability = this.tokenProbability(token, category); // determine the log of the P( w | c ) for this word // logLikelihood += termFrequencyInText * Math.log(tokenProbability); let logTokenProbability = Decimal(tokenProbability); logTokenProbability = logTokenProbability.naturalLogarithm(); logLikelihood = logLikelihood.plus(termFrequencyInText * logTokenProbability); } }); if (logLikelihood == Number.NEGATIVE_INFINITY) { console.warn(`[Classificator] category ${category} had -Infinity odds`); } likelihoods.push({ category, logLikelihood }); }); const logsumexp = (likelihoods) => { let sum = new Decimal(0); likelihoods.forEach((likelihood) => { const x = Decimal(likelihood.logLikelihood); const a = Decimal.exp(x); sum = sum.plus(a); }); return sum.naturalLogarithm(); }; const logProbX = logsumexp(likelihoods); likelihoods.forEach((likelihood) => { likelihood.logProba = Decimal(likelihood.logLikelihood).minus(logProbX); likelihood.proba = likelihood.logProba.naturalExponential(); likelihood.logProba = likelihood.logProba.toNumber(); likelihood.proba = likelihood.proba.toNumber(); likelihood.logLikelihood = likelihood.logLikelihood.toNumber(); }); // sort to have first element with biggest probability likelihoods.sort((a, b) => b.proba - a.proba); return { likelihoods, predictedCategory: likelihoods[0].category }; }; /** * Calculate probability that a `token` belongs to a `category` * * @param {String} token * @param {String} category * @return {Number} probability */ Naivebayes.prototype.tokenProbability = function(token, category) { // how many times this word has occurred in documents mapped to this category const wordFrequencyCount = this.wordFrequencyCount[category][token] || 0; // what is the count of all words that have ever been mapped to this category const wordCount = this.wordCount[category]; // use laplace Add-1 Smoothing equation return (wordFrequencyCount + this.alpha) / (wordCount + this.alpha * this.vocabularySize); }; /** * Build a frequency hashmap where * - the keys are the entries in `tokens` * - the values are the frequency of each entry in `tokens` * * @param {Array} tokens Normalized word array * @return {Object} */ Naivebayes.prototype.frequencyTable = function(tokens) { const frequencyTable = Object.create(null); tokens.forEach((token) => { if (!frequencyTable[token]) frequencyTable[token] = 1; else frequencyTable[token]++; }); return frequencyTable; }; /** * Dump the classifier's state as a JSON string. * @return {String} Representation of the classifier. */ Naivebayes.prototype.toJson = function() { const state = {}; STATE_KEYS.forEach(k => (state[k] = this[k])); return JSON.stringify(state); };