bravey
Version:
A simple JavaScript NLP-like library to help you create your own bot.
253 lines (213 loc) • 10 kB
HTML
<html lang="en">
<head>
<meta charset="utf-8">
<title>JSDoc: Source: classifiers/DocumentClassifier.js</title>
<script src="scripts/prettify/prettify.js"> </script>
<script src="scripts/prettify/lang-css.js"> </script>
<!--[if lt IE 9]>
<script src="//html5shiv.googlecode.com/svn/trunk/html5.js"></script>
<![endif]-->
<link type="text/css" rel="stylesheet" href="styles/prettify-tomorrow.css">
<link type="text/css" rel="stylesheet" href="styles/jsdoc-default.css">
</head>
<body>
<div id="main">
<h1 class="page-title">Source: classifiers/DocumentClassifier.js</h1>
<section>
<article>
<pre class="prettyprint source linenums"><code>/**
* The Bravey document classifier, based on Naive Bayes.
* @constructor
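* @param {Object} [extensions] - Optional hooks used while training and classifying.
* @param {function} [extensions.stemmer] - Called with each token; its return value is used as the stem.
* @param {function} [extensions.filter] - Called with the token list; the list it returns replaces it before stemming.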
*/
Bravey.DocumentClassifier = function(extensions) {
  extensions = extensions || {};

  // Occurrence counters, addressed through the prefixed keys built by
  // stemKey / docCountKey / stemCountKey. A missing key means "zero".
  var storage = {};

  // Labels in registration order. They are kept in an array rather than in a
  // comma-joined string so that a label containing "," stays a single label
  // and is not registered again on every training call.
  var registeredLabels = [];

  // Key of the counter "how many times was this stem seen under this label".
  var stemKey = function(stem, label) {
    return 'stem:' + stem + '::label:' + label;
  };

  // Key of the counter "how many documents were added with this label".
  var docCountKey = function(label) {
    return 'docCount:' + label;
  };

  // Key of the counter "how many times was this stem seen under any label".
  var stemCountKey = function(stem) {
    return 'stemCount:' + stem;
  };

  // Reads a counter; counters that were never incremented read as 0.
  var readCount = function(key) {
    return storage[key] || 0;
  };

  // Adds one to a counter and returns the new value.
  var increment = function(key) {
    var next = readCount(key) + 1;
    storage[key] = next;
    return next;
  };

  // Division that yields 0 instead of NaN/Infinity when there is nothing to
  // divide by (for example: no documents exist outside the current label).
  var ratio = function(numerator, denominator) {
    return denominator > 0 ? numerator / denominator : 0;
  };

  // Returns a copy of the registered labels, so callers cannot alter the registry.
  var getLabels = function() {
    return registeredLabels.slice();
  };

  // Registers a label once. Missing (null/undefined) and empty labels are
  // counted by train() but never listed, so they never show up in the scores.
  var registerLabel = function(label) {
    if (label == null) return true;
    var name = String(label);
    if (name === '' || registeredLabels.indexOf(name) !== -1) return true;
    registeredLabels.push(name);
    return true;
  };

  // Occurrences of a stem under one label.
  var stemLabelCount = function(stem, label) {
    return readCount(stemKey(stem, label));
  };

  // Occurrences of a stem under every registered label except the given one.
  var stemInverseLabelCount = function(stem, label) {
    var labels = getLabels();
    var total = 0;
    for (var i = 0; i < labels.length; i++) {
      if (labels[i] === label) continue;
      total += stemLabelCount(stem, labels[i]);
    }
    return total;
  };

  // Occurrences of a stem under any label.
  var stemTotalCount = function(stem) {
    return readCount(stemCountKey(stem));
  };

  // Number of documents added with the given label.
  var docCount = function(label) {
    return readCount(docCountKey(label));
  };

  // Number of documents added with any registered label except the given one.
  var docInverseCount = function(label) {
    var labels = getLabels();
    var total = 0;
    for (var i = 0; i < labels.length; i++) {
      if (labels[i] === label) continue;
      total += docCount(labels[i]);
    }
    return total;
  };

  // Counts one occurrence of a stem, both globally and under the label.
  var incrementStem = function(stem, label) {
    increment(stemCountKey(stem));
    increment(stemKey(stem, label));
  };

  // Counts one more document for the label and returns the new count.
  var incrementDocCount = function(label) {
    return increment(docCountKey(label));
  };

  // Cleans and tokenizes the text with Bravey.Text, applies the optional
  // extensions.filter to the token list, then passes each token through the
  // optional extensions.stemmer. Returns the resulting list of stems.
  var extractStems = function(text) {
    var words = Bravey.Text.tokenize(Bravey.Text.clean(text));
    if (extensions.filter) words = extensions.filter(words);
    var stems = [];
    for (var i = 0; i < words.length; i++) {
      // The stemmer is called with the token only (no index/array arguments).
      stems.push(extensions.stemmer ? extensions.stemmer(words[i]) : words[i]);
    }
    return stems;
  };

  // Learns one document: registers its label and updates the counters.
  var train = function(text, label) {
    registerLabel(label);
    var stems = extractStems(text);
    for (var i = 0; i < stems.length; i++) {
      incrementStem(stems[i], label);
    }
    incrementDocCount(label);
  };

  // Scores the text against every registered label.
  // Returns an object mapping each label to its score.
  var guess = function(text) {
    var stems = extractStems(text);
    var labels = getLabels();
    var scores = {};

    // Stems and their overall totals do not depend on the label, so they are
    // computed once here instead of once per label.
    var stemTotals = [];
    for (var s = 0; s < stems.length; s++) {
      stemTotals.push(stemTotalCount(stems[s]));
    }

    for (var j = 0; j < labels.length; j++) {
      var label = labels[j];
      var labelDocs = docCount(label);
      var otherDocs = docInverseCount(label);
      var logSum = 0;

      for (var k = 0; k < stems.length; k++) {
        var seen = stemTotals[k];
        // Stems that were never trained carry no evidence for any label.
        if (seen === 0) continue;

        var wordProbability = ratio(stemLabelCount(stems[k], label), labelDocs);
        // When no other label has documents this used to be 0/0 = NaN, which
        // turned the whole score into NaN; it now counts as "never seen elsewhere".
        var wordInverseProbability = ratio(stemInverseLabelCount(stems[k], label), otherDocs);

        var evidence = wordProbability + wordInverseProbability;
        // Without any evidence either way the stem is treated as neutral (0.5).
        var wordicity = evidence > 0 ? wordProbability / evidence : 0.5;
        // Blend with a neutral 0.5 of weight 1 so rarely seen stems weigh less.
        wordicity = (0.5 + (seen * wordicity)) / (1 + seen);
        // Keep the logarithms below finite.
        if (wordicity === 0) {
          wordicity = 0.01;
        } else if (wordicity === 1) {
          wordicity = 0.99;
        }
        logSum += Math.log(1 - wordicity) - Math.log(wordicity);
      }
      scores[label] = 1 / (1 + Math.exp(logSum));
    }
    return scores;
  };

  // Picks the label with the highest score. Only scores above 0 can win and
  // ties keep the earlier label; with no candidate the label is null, score 0.
  var extractWinner = function(scores) {
    var bestScore = 0;
    var bestLabel = null;
    for (var label in scores) {
      if (scores[label] > bestScore) {
        bestScore = scores[label];
        bestLabel = label;
      }
    }
    return {
      label: bestLabel,
      score: bestScore
    };
  };

  /**
   * Add a document to the classifier.
   * @param {string} text - The text to be classified.
   * @param {string} label - The related label
   * @returns {string} The classified text.
   */
  this.addDocument = function(text, label) {
    train(text, label);
    return text;
  };

  /**
   * Classify a document.
   * @param {string} text - The document to be classified.
   * @returns {DocumentClassification} The document class.
   */
  this.classifyDocument = function(text) {
    var scores = guess(text);
    var winner = extractWinner(scores);
    return {
      scores: scores,
      winner: winner
    };
  };

  // Every classifier starts with a "none" label backed by one empty document.
  this.addDocument("", "none");
};
/**
Describes a document classification.
@typedef DocumentClassification
@type {Object}
@property {Object.<string, number>} scores The score of each known document label, keyed by label name.
@property {number} winner.score The score of the winning label (0 when there is no winner).
@property {?string} winner.label The name of the winning label, or null when no label scores above zero.
*/</code></pre>
</article>
</section>
</div>
<nav>
<h2><a href="index.html">Home</a></h2><h3>Classes</h3><ul><li><a href="Bravey.ApiAiAdapter.html">ApiAiAdapter</a></li><li><a href="Bravey.ContextManager.html">ContextManager</a></li><li><a href="Bravey.DocumentClassifier.html">DocumentClassifier</a></li><li><a href="Bravey.EMailEntityRecognizer.html">EMailEntityRecognizer</a></li><li><a href="Bravey.Filter.BasicFilter.html">BasicFilter</a></li><li><a href="Bravey.FreeTextEntityRecognizer.html">FreeTextEntityRecognizer</a></li><li><a href="Bravey.Language.EN.DateEntityRecognizer.html">DateEntityRecognizer</a></li><li><a href="Bravey.Language.EN.FreeTextEntityRecognizer.html">FreeTextEntityRecognizer</a></li><li><a href="Bravey.Language.EN.NumberEntityRecognizer.html">NumberEntityRecognizer</a></li><li><a href="Bravey.Language.EN.Stemmer.html">Stemmer</a></li><li><a href="Bravey.Language.EN.TimeEntityRecognizer.html">TimeEntityRecognizer</a></li><li><a href="Bravey.Language.EN.TimePeriodEntityRecognizer.html">TimePeriodEntityRecognizer</a></li><li><a href="Bravey.Language.IT.DateEntityRecognizer.html">DateEntityRecognizer</a></li><li><a href="Bravey.Language.IT.FreeTextEntityRecognizer.html">FreeTextEntityRecognizer</a></li><li><a href="Bravey.Language.IT.NumberEntityRecognizer.html">NumberEntityRecognizer</a></li><li><a href="Bravey.Language.IT.Stemmer.html">Stemmer</a></li><li><a href="Bravey.Language.IT.TimeEntityRecognizer.html">TimeEntityRecognizer</a></li><li><a href="Bravey.Language.IT.TimePeriodEntityRecognizer.html">TimePeriodEntityRecognizer</a></li><li><a href="Bravey.Language.PT.DateEntityRecognizer.html">DateEntityRecognizer</a></li><li><a href="Bravey.Language.PT.FreeTextEntityRecognizer.html">FreeTextEntityRecognizer</a></li><li><a href="Bravey.Language.PT.NumberEntityRecognizer.html">NumberEntityRecognizer</a></li><li><a href="Bravey.Language.PT.Stemmer.html">Stemmer</a></li><li><a href="Bravey.Language.PT.TimeEntityRecognizer.html">TimeEntityRecognizer</a></li><li><a 
href="Bravey.Language.PT.TimePeriodEntityRecognizer.html">TimePeriodEntityRecognizer</a></li><li><a href="Bravey.Nlp.Fuzzy.html">Fuzzy</a></li><li><a href="Bravey.Nlp.Sequential.html">Sequential</a></li><li><a href="Bravey.NumberEntityRecognizer.html">NumberEntityRecognizer</a></li><li><a href="Bravey.RegexEntityRecognizer.html">RegexEntityRecognizer</a></li><li><a href="Bravey.SessionManager.InMemorySessionManager.html">InMemorySessionManager</a></li><li><a href="Bravey.StringEntityRecognizer.html">StringEntityRecognizer</a></li><li><a href="Bravey.Text.RegexMap.html">RegexMap</a></li></ul><h3>Namespaces</h3><ul><li><a href="Bravey.html">Bravey</a></li><li><a href="Bravey.Data.html">Data</a></li><li><a href="Bravey.Date.html">Date</a></li><li><a href="Bravey.File.html">File</a></li><li><a href="Bravey.Filter.html">Filter</a></li><li><a href="Bravey.Language.html">Language</a></li><li><a href="Bravey.Language.EN.html">EN</a></li><li><a href="Bravey.Language.IT.html">IT</a></li><li><a href="Bravey.Language.PT.html">PT</a></li><li><a href="Bravey.Nlp.html">Nlp</a></li><li><a href="Bravey.SessionManager.html">SessionManager</a></li><li><a href="Bravey.Text.html">Text</a></li></ul><h3><a href="global.html">Global</a></h3>
</nav>
<br class="clear">
<footer>
Documentation generated by <a href="https://github.com/jsdoc3/jsdoc">JSDoc 3.4.3</a>
</footer>
<script> prettyPrint(); </script>
<script src="scripts/linenumber.js"> </script>
</body>
</html>