node-nlp
Version:
Library for NLU (Natural Language Understanding) done in Node.js
352 lines (318 loc) • 13.3 kB
HTML
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>JSDoc: Source: nlp/nlp-classifier.js</title>
<script src="scripts/prettify/prettify.js"> </script>
<script src="scripts/prettify/lang-css.js"> </script>
<!--[if lt IE 9]>
<script src="//html5shiv.googlecode.com/svn/trunk/html5.js"></script>
<![endif]-->
<link type="text/css" rel="stylesheet" href="styles/prettify-tomorrow.css">
<link type="text/css" rel="stylesheet" href="styles/jsdoc-default.css">
</head>
<body>
<div id="main">
<h1 class="page-title">Source: nlp/nlp-classifier.js</h1>
<section>
<article>
<pre class="prettyprint source linenums"><code>/*
* Copyright (c) AXA Shared Services Spain S.A.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
const LogisticRegressionClassifier = require('../classifiers/logistic-regression-classifier');
const NlpUtil = require('./nlp-util');
const BinaryNeuralNetworkClassifier = require('../classifiers/binary-neural-network-classifier');
/**
 * Class for the NLP Classifier.
 * In the settings you can specify:
 * - language (optional): Language used to pick the default stemmer when no
 *     stemmer is provided. Defaults to 'en'.
 * - classifier (optional): The Machine Learning Classifier instance. If not
 *     provided, then a default Logistic Regression Classifier is created.
 * - useNeural (optional): Whether the neural classifier is trained and
 *     consulted in addition to the main classifier. Defaults to true.
 * - neuralClassifier (optional): The neural classifier instance. If not
 *     provided, a default Binary Neural Network Classifier is created.
 * - stemmer (optional): The language stemmer (also tokenizes). If not
 *     provided, the default stemmer for the language is used.
 * - keepStopWords (optional): Forwarded to the stemmer as the second
 *     argument of tokenizeAndStem. Defaults to true.
 */
class NlpClassifier {
  /**
   * Constructor of the class.
   * Missing settings are filled in with defaults directly on the provided
   * settings object (it is not copied).
   * @param {Object} settings Settings for this instance.
   */
  constructor(settings) {
    this.settings = settings || {};
    if (!this.settings.language) {
      this.settings.language = 'en';
    }
    if (!this.settings.classifier) {
      this.settings.classifier = new LogisticRegressionClassifier();
    }
    if (this.settings.useNeural === undefined) {
      this.settings.useNeural = true;
    }
    if (!this.settings.neuralClassifier) {
      this.settings.neuralClassifier = new BinaryNeuralNetworkClassifier();
    }
    if (!this.settings.stemmer) {
      this.settings.stemmer = NlpUtil.getStemmer(this.settings.language);
    }
    if (this.settings.keepStopWords === undefined) {
      this.settings.keepStopWords = true;
    }
    // Training documents: { intent, utterance } where utterance is a token array.
    this.docs = [];
    // Feature dictionary: token -> number of times it was added through add().
    this.features = {};
  }

  /**
   * Generate the vector of features.
   * @param {String|String[]} utterance Input utterance. Anything that is not
   *     a string (e.g. an already tokenized array) is returned untouched.
   * @returns {String[]} Vector of features.
   */
  tokenizeAndStem(utterance) {
    if (typeof utterance !== 'string') {
      return utterance;
    }
    return this.settings.stemmer.tokenizeAndStem(
      utterance,
      this.settings.keepStopWords
    );
  }

  /**
   * Gets the position of an utterance for an intent.
   * Utterances are compared by their tokens joined with a single space.
   * @param {String|String[]} srcUtterance Utterance to be found.
   * @param {String} intent Intent of the utterance, falsy to search globally.
   * @returns {Number} Position of the utterance, -1 if not found.
   */
  posUtterance(srcUtterance, intent) {
    const utteranceStr = this.tokenizeAndStem(srcUtterance).join(' ');
    return this.docs.findIndex(
      doc =>
        doc.utterance.join(' ') === utteranceStr &&
        (!intent || doc.intent === intent)
    );
  }

  /**
   * Indicates if an utterance already exists, at the given intent or globally.
   * @param {String|String[]} utterance Utterance to be checked.
   * @param {String} intent Intent to check, undefined to search globally.
   * @returns {boolean} True if the utterance exists, false otherwise.
   */
  existsUtterance(utterance, intent) {
    return this.posUtterance(utterance, intent) !== -1;
  }

  /**
   * Adds a new utterance to an intent.
   * Nothing is added when the utterance yields no tokens or when the same
   * token sequence is already registered under any intent.
   * @param {String} srcUtterance Utterance to be added.
   * @param {String} srcIntent Intent for adding the utterance.
   * @throws {Error} If the utterance or the intent is not a string.
   */
  add(srcUtterance, srcIntent) {
    if (typeof srcUtterance !== 'string') {
      throw new Error('Utterance must be an string');
    }
    if (typeof srcIntent !== 'string') {
      throw new Error('Intent must be an string');
    }
    const intent = srcIntent.trim();
    const utterance = this.tokenizeAndStem(srcUtterance);
    if (utterance.length === 0 || this.existsUtterance(utterance)) {
      return;
    }
    this.docs.push({ intent, utterance });
    utterance.forEach(token => {
      // Only own properties count: a plain lookup would pick up inherited
      // members such as `constructor` or `toString` and corrupt the counter.
      const known = Object.prototype.hasOwnProperty.call(this.features, token);
      const current = known ? this.features[token] || 0 : 0;
      this.features[token] = current + 1;
    });
  }

  /**
   * Remove an utterance from the classifier.
   * The feature counters of its tokens are decremented, and features whose
   * counter drops to zero are deleted.
   * @param {String} srcUtterance Utterance to be removed.
   * @param {String} srcIntent Intent of the utterance, undefined to search all
   * @throws {Error} If the utterance is not a string.
   */
  remove(srcUtterance, srcIntent) {
    if (typeof srcUtterance !== 'string') {
      throw new Error('Utterance must be an string');
    }
    const intent = srcIntent ? srcIntent.trim() : undefined;
    const utterance = this.tokenizeAndStem(srcUtterance);
    if (utterance.length === 0) {
      return;
    }
    const pos = this.posUtterance(utterance, intent);
    if (pos === -1) {
      return;
    }
    this.docs.splice(pos, 1);
    utterance.forEach(token => {
      // Skip tokens that are not (or no longer) own features, so inherited
      // members are never turned into NaN counters.
      if (!Object.prototype.hasOwnProperty.call(this.features, token)) {
        return;
      }
      this.features[token] -= 1;
      if (this.features[token] <= 0) {
        delete this.features[token];
      }
    });
  }

  /**
   * Given an utterance, tokenize and stem the utterance and convert it
   * to a vector of binary values, where each position is a feature (a word
   * stemmed) and the value means if the utterance has this feature.
   * Positions follow the order of Object.keys over the feature dictionary.
   * The input utterance can be a string or an array of tokens.
   * @param {String|String[]} srcUtterance Utterance to be converted to
   *     features vector.
   * @returns {Number[]} Features vector of the utterance.
   */
  textToFeatures(srcUtterance) {
    const utterance = Array.isArray(srcUtterance)
      ? srcUtterance
      : this.tokenizeAndStem(srcUtterance);
    // A Set gives constant-time membership checks instead of one indexOf
    // scan per feature.
    const present = new Set(utterance);
    return Object.keys(this.features).map(key => (present.has(key) ? 1 : 0));
  }

  /**
   * Converts a list of tokens into the input object for the neural
   * classifier: one entry per known feature, with value 1 when the feature
   * is present in the tokens and 0 otherwise. Every token for which
   * Number.parseInt(token, 10) yields a number is collapsed into the single
   * '%number%' feature.
   * @param {String[]} tokens Tokens of the utterance.
   * @returns {Object} Map of feature name to 0 or 1.
   */
  tokensToNeural(tokens) {
    const toFeatureName = token =>
      Number.isNaN(Number.parseInt(token, 10)) ? String(token) : '%number%';
    // A Set (not a plain object) so that feature names like `constructor`
    // are not reported as present because of inherited object members.
    const present = new Set();
    for (const token of tokens) {
      present.add(toFeatureName(token));
    }
    const result = {};
    Object.keys(this.features).forEach(srcToken => {
      const token = toFeatureName(srcToken);
      result[token] = present.has(token) ? 1 : 0;
    });
    return result;
  }

  /**
   * Train the classifier with the existing utterances and intents.
   * The main classifier is cleared and fed again with every document. When
   * there is at least one observation it is trained and then, if useNeural
   * is set, the neural classifier is trained with the same corpus.
   * @returns {Promise<void>} Resolves when training has finished.
   */
  async train() {
    const { classifier } = this.settings;
    classifier.clear();
    const corpus = [];
    this.docs.forEach(doc => {
      const tokens = this.tokenizeAndStem(doc.utterance);
      corpus.push({
        input: this.tokensToNeural(tokens),
        output: doc.intent,
      });
      classifier.addObservation(this.textToFeatures(tokens), doc.intent);
    });
    if (classifier.observationCount > 0) {
      await classifier.train();
      if (this.settings.useNeural) {
        // Awaited so that an asynchronous trainBatch is completed (and its
        // errors surfaced) before train() resolves; harmless if synchronous.
        await this.settings.neuralClassifier.trainBatch(corpus);
      }
    }
  }

  /**
   * Indicates if every classification has a score of exactly 0.5.
   * An empty list is considered equal.
   * @param {Object[]} classifications Classifications with label and value.
   * @returns {boolean} True if all the values are 0.5, false otherwise.
   */
  isEqualClassification(classifications) {
    return classifications.every(item => item.value === 0.5);
  }

  /**
   * Scales the scores so that they sum 1. If the sum of the scores is not
   * greater than 0, the input array is returned as is.
   * @param {Object[]} classifications Classifications with label and value.
   * @returns {Object[]} Classifications with normalized values.
   */
  normalizeNeural(classifications) {
    const total = classifications.reduce((sum, item) => sum + item.value, 0);
    if (total > 0) {
      return classifications.map(({ label, value }) => ({
        label,
        value: value / total,
      }));
    }
    return classifications;
  }

  /**
   * Get all the labels and score for each label from this utterance.
   * When useNeural is set and the main classifier is not undecided (not all
   * scores 0.5), the neural classifier is consulted too: its normalized
   * result is returned unless both agree on the best label and the main
   * classifier scores that label at least as high.
   * @param {String} utterance Utterance to be classified.
   * @returns {Object[]} Sorted array of classifications, with label and score.
   */
  getClassifications(utterance) {
    const tokens = this.tokenizeAndStem(utterance);
    const classification = this.settings.classifier.getClassifications(
      this.textToFeatures(tokens)
    );
    if (
      !this.settings.useNeural ||
      this.isEqualClassification(classification)
    ) {
      return classification;
    }
    // NOTE(review): element 0 is treated as the best match below, so the
    // `true` flag presumably asks for the full sorted list - confirm.
    const rawNeural = this.settings.neuralClassifier.classify(
      this.tokensToNeural(tokens),
      true
    );
    if (!rawNeural || !rawNeural.length) {
      // Nothing to compare against: fall back to the main classifier
      // instead of failing on an empty neural result.
      return classification;
    }
    const neuralClassification = this.normalizeNeural(rawNeural);
    const [bestNeural] = neuralClassification;
    const [bestLrc] = classification;
    if (bestNeural.label === bestLrc.label) {
      return bestNeural.value > bestLrc.value
        ? neuralClassification
        : classification;
    }
    return neuralClassification;
  }

  /**
   * Given an utterance, get the label and score of the best classification.
   * Only the main classifier is used here; the neural classifier is not
   * consulted.
   * @param {String|String[]} utterance Utterance to be classified.
   * @returns {Object} Best classification of the observation.
   */
  getBestClassification(utterance) {
    return this.settings.classifier.getBestClassification(
      this.textToFeatures(utterance)
    );
  }
}
// Expose the NlpClassifier class as this module's CommonJS export.
module.exports = NlpClassifier;
</code></pre>
</article>
</section>
</div>
<nav>
<h2><a href="index.html">Home</a></h2><h3>Classes</h3><ul><li><a href="BinaryNeuralNetworkClassifier.html">BinaryNeuralNetworkClassifier</a></li><li><a href="Classifier.html">Classifier</a></li><li><a href="ConversationContext.html">ConversationContext</a></li><li><a href="DutchStemmer.html">DutchStemmer</a></li><li><a href="EnglishStemmer.html">EnglishStemmer</a></li><li><a href="EnumNamedEntity.html">EnumNamedEntity</a></li><li><a href="Evaluator.html">Evaluator</a></li><li><a href="HungarianStemmer.html">HungarianStemmer</a></li><li><a href="ItalianStemmer.html">ItalianStemmer</a></li><li><a href="Language.html">Language</a></li><li><a href="LogisticRegressionClassifier.html">LogisticRegressionClassifier</a></li><li><a href="Matrix.html">Matrix</a></li><li><a href="MemoryConversationContext.html">MemoryConversationContext</a></li><li><a href="NamedEntity.html">NamedEntity</a></li><li><a href="NerManager.html">NerManager</a></li><li><a href="NlgManager.html">NlgManager</a></li><li><a href="NlpClassifier.html">NlpClassifier</a></li><li><a href="NlpManager.html">NlpManager</a></li><li><a href="NorwegianStemmer.html">NorwegianStemmer</a></li><li><a href="PortugueseStemmer.html">PortugueseStemmer</a></li><li><a href="Recognizer.html">Recognizer</a></li><li><a href="RegexNamedEntity.html">RegexNamedEntity</a></li><li><a href="RomanianStemmer.html">RomanianStemmer</a></li><li><a href="RussianStemmer.html">RussianStemmer</a></li><li><a href="SentimentAnalyzer.html">SentimentAnalyzer</a></li><li><a href="SentimentManager.html">SentimentManager</a></li><li><a href="SimilarSearch.html">SimilarSearch</a></li><li><a href="SlotManager.html">SlotManager</a></li><li><a href="StemmerJa.html">StemmerJa</a></li><li><a href="SwedishStemmer.html">SwedishStemmer</a></li><li><a href="Tokenizer.html">Tokenizer</a></li><li><a href="TrimNamedEntity.html">TrimNamedEntity</a></li><li><a href="TurkishStemmer.html">TurkishStemmer</a></li><li><a href="Vector.html">Vector</a></li><li><a 
href="XTable.html">XTable</a></li></ul><h3>Global</h3><ul><li><a href="global.html#endsinArr">endsinArr</a></li><li><a href="global.html#prelude">prelude</a></li><li><a href="global.html#regions">regions</a></li><li><a href="global.html#stem">stem</a></li><li><a href="global.html#stopwords">stopwords</a></li></ul>
</nav>
<br class="clear">
<footer>
Documentation generated by <a href="https://github.com/jsdoc3/jsdoc">JSDoc 3.5.5</a> on Sat Oct 13 2018 19:14:51 GMT+0200 (CEST)
</footer>
<script> prettyPrint(); </script>
<script src="scripts/linenumber.js"> </script>
</body>
</html>