UNPKG

bravey

Version:

A simple JavaScript NLP-like library to help you create your own bot.

253 lines (213 loc) 10 kB
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <!--
    Source view for classifiers/DocumentClassifier.js, generated by JSDoc
    (see the footer for the generator version). The page shows the file as a
    syntax-highlighted listing next to the site-wide class/namespace index.
  -->
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>JSDoc: Source: classifiers/DocumentClassifier.js</title>

  <script src="scripts/prettify/prettify.js"></script>
  <script src="scripts/prettify/lang-css.js"></script>
  <!-- Legacy IE only: explicit https URL so the shim also resolves when the docs are opened from file:. -->
  <!--[if lt IE 9]>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.7.3/html5shiv.min.js"></script>
  <![endif]-->
  <link rel="stylesheet" href="styles/prettify-tomorrow.css">
  <link rel="stylesheet" href="styles/jsdoc-default.css">
</head>

<body>

<div id="main">

  <h1 class="page-title">Source: classifiers/DocumentClassifier.js</h1>

  <section>
    <article>
      <!--
        Whitespace inside the pre element is rendered literally, so the listing
        below is kept flush left with one source line per markup line. Its text
        mirrors the JavaScript source file and must not be edited here; only
        the characters that are special in HTML are escaped.
      -->
      <pre class="prettyprint source linenums"><code>/**
 * The Bravey document classifier, based on Naive Bayes.
 * @constructor
 * @param {string} [extensions.stemmer] - A stemmer instance to be used for classifying.
 */
Bravey.DocumentClassifier = function(extensions) {
  extensions = extensions || {};

  var storage = {};

  var stemKey = function(stem, label) {
    return 'stem:' + stem + '::label:' + label;
  };

  var docCountKey = function(label) {
    return 'docCount:' + label;
  };

  var stemCountKey = function(stem) {
    return 'stemCount:' + stem;
  };

  var log = function(text) {
    //console.log(text);
  };

  var getLabels = function() {
    var labels = storage['registeredLabels'];
    if (!labels) labels = '';
    return labels.split(',').filter(function(a) {
      return a.length;
    });
  };

  var registerLabel = function(label) {
    var labels = getLabels();
    if (labels.indexOf(label) === -1) {
      labels.push(label);
      storage['registeredLabels'] = labels.join(',');
    }
    return true;
  };

  var stemLabelCount = function(stem, label) {
    var count = parseInt(storage[stemKey(stem, label)]);
    if (!count) count = 0;
    return count;
  };

  var stemInverseLabelCount = function(stem, label) {
    var labels = getLabels();
    var total = 0;
    for (var i = 0, length = labels.length; i &lt; length; i++) {
      if (labels[i] === label)
        continue;
      total += parseInt(stemLabelCount(stem, labels[i]));
    }
    return total;
  };

  var stemTotalCount = function(stem) {
    var count = parseInt(storage[stemCountKey(stem)]);
    if (!count) count = 0;
    return count;
  };

  var docCount = function(label) {
    var count = parseInt(storage[docCountKey(label)]);
    if (!count) count = 0;
    return count;
  };

  var docInverseCount = function(label) {
    var labels = getLabels();
    var total = 0;
    for (var i = 0, length = labels.length; i &lt; length; i++) {
      if (labels[i] === label)
        continue;
      total += parseInt(docCount(labels[i]));
    }
    return total;
  };

  var increment = function(key) {
    var count = parseInt(storage[key]);
    if (!count) count = 0;
    storage[key] = parseInt(count) + 1;
    return count + 1;
  };

  var incrementStem = function(stem, label) {
    increment(stemCountKey(stem));
    increment(stemKey(stem, label));
  };

  var incrementDocCount = function(label) {
    return increment(docCountKey(label));
  };

  var train = function(text, label) {
    registerLabel(label);
    var words = Bravey.Text.tokenize(Bravey.Text.clean(text));
    if (extensions.filter) words = extensions.filter(words);
    var length = words.length;
    for (var i = 0; i &lt; length; i++)
      incrementStem(extensions.stemmer ? extensions.stemmer(words[i]) : words[i], label);
    incrementDocCount(label);
  };

  var guess = function(text) {
    var words = Bravey.Text.tokenize(Bravey.Text.clean(text));
    if (extensions.filter) words = extensions.filter(words);
    var length = words.length;
    var labels = getLabels();
    var totalDocCount = 0;
    var docCounts = {};
    var docInverseCounts = {};
    var scores = {};
    var labelProbability = {};

    for (var j = 0; j &lt; labels.length; j++) {
      var label = labels[j];
      docCounts[label] = docCount(label);
      docInverseCounts[label] = docInverseCount(label);
      totalDocCount += parseInt(docCounts[label]);
    }

    for (var j = 0; j &lt; labels.length; j++) {
      var label = labels[j];
      var logSum = 0;
      labelProbability[label] = docCounts[label] / totalDocCount;

      for (var i = 0; i &lt; length; i++) {
        var word = extensions.stemmer ? extensions.stemmer(words[i]) : words[i];
        var _stemTotalCount = stemTotalCount(word);
        if (_stemTotalCount === 0) {
          continue;
        } else {
          var wordProbability = stemLabelCount(word, label) / docCounts[label];
          var wordInverseProbability = stemInverseLabelCount(word, label) / docInverseCounts[label];
          var wordicity = wordProbability / (wordProbability + wordInverseProbability);

          wordicity = ((1 * 0.5) + (_stemTotalCount * wordicity)) / (1 + _stemTotalCount);
          if (wordicity === 0)
            wordicity = 0.01;
          else if (wordicity === 1)
            wordicity = 0.99;
        }

        logSum += (Math.log(1 - wordicity) - Math.log(wordicity));
        log(label + "icity of " + word + ": " + wordicity);
      }
      scores[label] = 1 / (1 + Math.exp(logSum));
    }
    return scores;
  };

  var extractWinner = function(scores) {
    var bestScore = 0;
    var bestLabel = null;
    for (var label in scores) {
      if (scores[label] &gt; bestScore) {
        bestScore = scores[label];
        bestLabel = label;
      }
    }
    return {
      label: bestLabel,
      score: bestScore
    };
  };

  /**
   * Add a document to the classifier.
   * @param {string} text - The text to be classified.
   * @param {string} label - The related label
   * @returns {text} The classified text.
   */
  this.addDocument = function(text, label) {
    train(text, label);
    return text;
  }

  /**
   * Classify a document.
   * @param {string} text - The document to be classified.
   * @returns {DocumentClassification} The document class.
   */
  this.classifyDocument = function(text) {
    var scores = guess(text);
    var winner = extractWinner(scores);
    return {
      scores: scores,
      winner: winner
    };
  }

  this.addDocument("", "none");

}

/**
 Describes a document classification.
 @typedef DocumentClassification
 @type {Object}
 @property {number[]} scores The related scores for each known document label.
 @property {number} winner.score The score of the winning label.
 @property {string} winner.label The name of the winning label.
*/</code></pre>
    </article>
  </section>

</div>

<!-- Site-wide index: every documented class and namespace, then the globals page. -->
<nav>
  <h2><a href="index.html">Home</a></h2>
  <h3>Classes</h3>
  <ul>
    <li><a href="Bravey.ApiAiAdapter.html">ApiAiAdapter</a></li>
    <li><a href="Bravey.ContextManager.html">ContextManager</a></li>
    <li><a href="Bravey.DocumentClassifier.html">DocumentClassifier</a></li>
    <li><a href="Bravey.EMailEntityRecognizer.html">EMailEntityRecognizer</a></li>
    <li><a href="Bravey.Filter.BasicFilter.html">BasicFilter</a></li>
    <li><a href="Bravey.FreeTextEntityRecognizer.html">FreeTextEntityRecognizer</a></li>
    <li><a href="Bravey.Language.EN.DateEntityRecognizer.html">DateEntityRecognizer</a></li>
    <li><a href="Bravey.Language.EN.FreeTextEntityRecognizer.html">FreeTextEntityRecognizer</a></li>
    <li><a href="Bravey.Language.EN.NumberEntityRecognizer.html">NumberEntityRecognizer</a></li>
    <li><a href="Bravey.Language.EN.Stemmer.html">Stemmer</a></li>
    <li><a href="Bravey.Language.EN.TimeEntityRecognizer.html">TimeEntityRecognizer</a></li>
    <li><a href="Bravey.Language.EN.TimePeriodEntityRecognizer.html">TimePeriodEntityRecognizer</a></li>
    <li><a href="Bravey.Language.IT.DateEntityRecognizer.html">DateEntityRecognizer</a></li>
    <li><a href="Bravey.Language.IT.FreeTextEntityRecognizer.html">FreeTextEntityRecognizer</a></li>
    <li><a href="Bravey.Language.IT.NumberEntityRecognizer.html">NumberEntityRecognizer</a></li>
    <li><a href="Bravey.Language.IT.Stemmer.html">Stemmer</a></li>
    <li><a href="Bravey.Language.IT.TimeEntityRecognizer.html">TimeEntityRecognizer</a></li>
    <li><a href="Bravey.Language.IT.TimePeriodEntityRecognizer.html">TimePeriodEntityRecognizer</a></li>
    <li><a href="Bravey.Language.PT.DateEntityRecognizer.html">DateEntityRecognizer</a></li>
    <li><a href="Bravey.Language.PT.FreeTextEntityRecognizer.html">FreeTextEntityRecognizer</a></li>
    <li><a href="Bravey.Language.PT.NumberEntityRecognizer.html">NumberEntityRecognizer</a></li>
    <li><a href="Bravey.Language.PT.Stemmer.html">Stemmer</a></li>
    <li><a href="Bravey.Language.PT.TimeEntityRecognizer.html">TimeEntityRecognizer</a></li>
    <li><a href="Bravey.Language.PT.TimePeriodEntityRecognizer.html">TimePeriodEntityRecognizer</a></li>
    <li><a href="Bravey.Nlp.Fuzzy.html">Fuzzy</a></li>
    <li><a href="Bravey.Nlp.Sequential.html">Sequential</a></li>
    <li><a href="Bravey.NumberEntityRecognizer.html">NumberEntityRecognizer</a></li>
    <li><a href="Bravey.RegexEntityRecognizer.html">RegexEntityRecognizer</a></li>
    <li><a href="Bravey.SessionManager.InMemorySessionManager.html">InMemorySessionManager</a></li>
    <li><a href="Bravey.StringEntityRecognizer.html">StringEntityRecognizer</a></li>
    <li><a href="Bravey.Text.RegexMap.html">RegexMap</a></li>
  </ul>
  <h3>Namespaces</h3>
  <ul>
    <li><a href="Bravey.html">Bravey</a></li>
    <li><a href="Bravey.Data.html">Data</a></li>
    <li><a href="Bravey.Date.html">Date</a></li>
    <li><a href="Bravey.File.html">File</a></li>
    <li><a href="Bravey.Filter.html">Filter</a></li>
    <li><a href="Bravey.Language.html">Language</a></li>
    <li><a href="Bravey.Language.EN.html">EN</a></li>
    <li><a href="Bravey.Language.IT.html">IT</a></li>
    <li><a href="Bravey.Language.PT.html">PT</a></li>
    <li><a href="Bravey.Nlp.html">Nlp</a></li>
    <li><a href="Bravey.SessionManager.html">SessionManager</a></li>
    <li><a href="Bravey.Text.html">Text</a></li>
  </ul>
  <h3><a href="global.html">Global</a></h3>
</nav>

<br class="clear">

<footer>
  Documentation generated by <a href="https://github.com/jsdoc3/jsdoc">JSDoc 3.4.3</a>
</footer>

<!--
  Runs once the listing above is in the DOM. prettyPrint() is presumably
  provided by scripts/prettify/prettify.js loaded in the head; linenumber.js
  presumably post-processes the numbered listing. Verify both against the
  template scripts before reordering.
-->
<script> prettyPrint(); </script>
<script src="scripts/linenumber.js"></script>
</body>
</html>