// node-red-contrib-chatbot
// REDBot: a full featured chat bot for Telegram, Facebook Messenger and Slack.
// Almost no coding skills required.
var nlp = require('compromise');
var _ = require('underscore');
var _s = require('underscore.string');
var chrono = require('chrono-node');
var regexps = require('./helpers/regexps');
var levenshtein = require('fast-levenshtein');
var clc = require('cli-color');
var prettyjson = require('prettyjson');
var green = clc.greenBright;
var white = clc.white;
var grey = clc.blackBright;
/**
 * Fuzzy-compare two strings with a Levenshtein distance whose tolerance scales
 * with the length of the expected term: up to 2 characters requires an exact
 * match (otherwise short tokens like "on" and "off" would be confused), up to
 * 4 characters tolerates one edit, anything longer tolerates two.
 * @param {String} term the expected text
 * @param {String} word the candidate text to compare against
 * @return {Boolean} true when word is close enough to term
 */
var matchLevenshtain = function(term, word) {
  var tolerance = term.length <= 2 ? 0 : (term.length <= 4 ? 1 : 2);
  return levenshtein.get(term, word) <= tolerance;
};
/**
 * @class MatchRule
 * A single matching rule. Accepts either the compact string syntax
 * "some text[type]->variable" (every part is optional, type defaults to
 * "word") or a plain object carrying any of the keys
 * text/type/variable/value/distance/raw.
 * @param {String|Object} obj the rule definition
 */
var MatchRule = function(obj) {
  if (Object.prototype.toString.call(obj) === '[object String]') {
    // parse the compact string syntax: free text, [type], ->variable
    var parsed = obj.match(/([a-zA-Z0-9%$£# ]*){0,1}(\[[a-zA-Z0-9]*\]){0,1}(->[a-zA-Z0-9_]*){0,1}/);
    if (parsed != null) {
      this.text = parsed[1] != null ? parsed[1] : null;
      this.type = parsed[2] != null ? parsed[2].replace('[', '').replace(']', '') : 'word';
      this.variable = parsed[3] != null ? parsed[3].replace('->', '') : null;
      this.value = null;
      this.distance = 0;
    }
  } else if (obj !== null && (typeof obj === 'object' || typeof obj === 'function')) {
    // plain object: overlay the given keys on top of the defaults
    Object.assign(this, {
      text: null,
      type: null,
      variable: null,
      value: null,
      distance: 0,
      raw: null
    }, obj);
  }
  return this;
};
/*
todo
- add city: String, // Toronto, Canada -> Toronto, region: String, // Toronto, Ontario -> Ontario, country: String,
- add float
- add monday
*/
_.extend(MatchRule.prototype, {

  // Ordered list of matching predicates. match() runs through them with `this`
  // bound to the rule and stops at the first one that returns true. On success
  // a predicate may store the captured value in this.value and the original
  // term text in this.raw.
  _matchingRules: [

    function(term) {
      // no type specified but a text is: fall back to a fuzzy (Levenshtein)
      // comparison of the rule text against the term text
      if (_.isEmpty(this.type) && !_.isEmpty(this.text)) {
        if (matchLevenshtain(this.text, term.text)) {
          this.raw = term.text;
          return true;
        }
      }
      return false;
    },

    function(term) {
      // match exactly a symbol (%, $, etc. - the Symbol tag is set upstream in parseSentence)
      if (this.type === 'symbol' && term.tags.Symbol) {
        if (!_.isEmpty(this.text)) {
          // a specific symbol was requested, compare verbatim
          if (this.text === term.text) {
            this.raw = term.text;
            return true;
          }
        } else {
          // catches all symbols, store the matched one as the value
          this.value = term.text;
          this.raw = term.text;
          return true;
        }
      }
      return false;
    },

    function(term) {
      // this rule matches a numeric value (only the catch-all form, no text);
      // handles numbers already parsed by nlp, decimal strings ("," or ".") and
      // plain integers. todo: improve it with keywords like float or integer
      if (this.type === 'number' && term.tags.Cardinal) {
        if (_.isEmpty(this.text)) {
          if (_.isNumber(term.value)) {
            // nlp-compromise already produced a number
            this.value = parseFloat(term.value);
            this.raw = term.text;
            return true;
          } else if (term.value.indexOf(',') !== -1 || term.value.indexOf('.') !== -1) {
            // looks like a decimal string
            if (!isNaN(parseFloat(term.value))) {
              this.value = parseFloat(term.value);
              this.raw = term.text;
              return true;
            }
          } else if (!isNaN(parseInt(term.value, 10))) {
            // plain integer string
            this.value = parseInt(term.value, 10);
            this.raw = term.text;
            return true;
          }
        }
      }
      return false;
    },

    function(term) {
      // detect a currency term; sometimes nlp chunks two nouns together, take only the first one
      if (this.type === 'currency' && term.tags.Currency) {
        this.value = term.text.split(' ')[0];
        this.raw = term.text;
        return true;
      }
      return false;
    },

    function(term) {
      // match a well formatted email (delegated to the shared regexp helper)
      if (this.type === 'email' && regexps.email(term.text) != null) {
        this.value = term.text;
        this.raw = term.text;
        return true;
      }
      return false;
    },

    function(term) {
      // match a well formatted url (delegated to the shared regexp helper)
      // todo check here url type
      if (this.type === 'url' && regexps.url(term.text) != null) {
        this.value = term.text;
        this.raw = term.text;
        return true;
      }
      return false;
    },

    function(term) {
      // match a verb, either a specific one (also comparing against the
      // infinitive computed in groupSiblings) or any verb as a catch-all
      if (this.type === 'verb' && term.tags.Verb) {
        if (!_.isEmpty(this.text)) {
          if (this.text === term.text || this.text === term.infinitive) {
            this.raw = term.text;
            return true;
          }
        } else {
          // catches all verbs
          this.value = term.text;
          this.raw = term.text;
          return true;
        }
      }
      return false;
    },

    function(term) {
      // match a person (a term tagged Person by nlp-compromise)
      if (this.type === 'person' && term.tags.Person) {
        if (!_.isEmpty(this.text)) {
          // NOTE(review): unlike the other rules, this branch does not set
          // this.raw on a successful match - verify whether that's intended
          return this.text === term.text;
        }
        // catches all
        this.value = term.text;
        return true;
      }
      return false;
    },

    function(term) {
      // match a date: the value is parsed with chrono into a Date object.
      // NOTE(review): this also overwrites this.text with the raw term text
      if (this.type === 'date' && term.tags.Date) {
        this.raw = term.text;
        this.value = chrono.parseDate(term.text);
        this.text = term.text;
        return true;
      }
      return false;
    },

    function(term) {
      // generic type-based rule. Do not try to match verbs here: Levenshtein could
      // confuse "off" with "on" in phrasal verbs, and it doesn't catch the
      // infinitive (verbs are handled by a previous rule); dates too
      if (this.type === 'verb' || this.type === 'date') {
        return false;
      }
      var capitalizedType = _s.capitalize(this.type);
      // if the type is ok, capture the text or verify it; nlp-compromise types are
      // always capitalized (like Noun), while custom lexicons can be in any case
      // (that explains the double or); 'word' matches anything
      if (term.tags[capitalizedType] || term.tags[this.type] || this.type === 'word') {
        if (!_.isEmpty(this.text)) {
          // a specific text was requested, compare fuzzily
          if (matchLevenshtain(this.text, term.text)) {
            this.raw = term.text;
            return true;
          }
        } else {
          // catches all terms of this type
          this.value = term.text;
          this.raw = term.text;
          return true;
        }
      }
      return false;
    }
  ],

  /**
   * @method debug
   * Print the serialized rule to the console.
   */
  debug: function() {
    // eslint-disable-next-line no-console
    console.log(this.toJSON());
  },

  /**
   * @method clone
   * Create a detached copy of the rule (via its serialized form).
   * @return {MatchRule}
   */
  clone: function() {
    return new MatchRule(this.toJSON());
  },

  /**
   * @method toJSON
   * Serialize the rule to a plain object.
   * @return {Object}
   */
  toJSON: function() {
    return {
      text: this.text,
      type: this.type,
      variable: this.variable,
      value: this.value,
      distance: this.distance,
      raw: this.raw
    };
  },

  /**
   * @method match
   * Check if the term matches a rule: returns true as soon as one of the
   * predicates in _matchingRules succeeds (which may also set value/raw).
   * @param {Term} term
   * @return {Boolean}
   */
  match: function(term) {
    var _this = this;
    return _(this._matchingRules).any(function(func) {
      return func.call(_this, term);
    });
  }
});
/**
 * @class MatchRules
 * An ordered collection of MatchRule instances.
 * @param {Array} objs rule definitions (strings or plain objects), optional
 */
var MatchRules = function(objs) {
  this._models = [];
  if (Array.isArray(objs)) {
    for (var idx = 0; idx < objs.length; idx++) {
      this._models.push(new MatchRule(objs[idx]));
    }
  }
};
Object.assign(MatchRules.prototype, {
  /**
   * @method prepend
   * Insert a rule at the front of the collection.
   * @param {MatchRule} rule
   * @return {MatchRules} this, for chaining
   */
  prepend: function(rule) {
    this._models.unshift(rule);
    return this;
  },
  /**
   * @method count
   * Number of rules in the collection.
   * @return {Number}
   */
  count: function() {
    return this._models.length;
  },
  /**
   * @method clone
   * Deep copy: serializes every rule and rebuilds the collection.
   * @return {MatchRules}
   */
  clone: function() {
    var serialized = this.map(function(rule) {
      return rule.toJSON();
    });
    return new MatchRules(serialized);
  },
  /**
   * @method map
   * Map over the underlying rules.
   * @return {Array}
   */
  map: function(iterator) {
    return this._models.map(iterator);
  },
  /**
   * @method forEach
   * Iterate over the rules.
   * @return {MatchRules} this, for chaining
   */
  forEach: function(iterator) {
    this._models.forEach(iterator);
    return this;
  },
  /**
   * @method head
   * Get the first rule of the set, or null when empty.
   * @return {MatchRule}
   */
  head: function() {
    return this._models.length >= 1 ? this._models[0] : null;
  },
  /**
   * @method at
   * Rule at the given index, or null past the end.
   * @return {MatchRule}
   */
  at: function(idx) {
    return idx < this._models.length ? this._models[idx] : null;
  },
  /**
   * @method empty
   * Tells if the rules collection is empty.
   * @return {Boolean}
   */
  empty: function() {
    return this._models.length === 0;
  },
  /**
   * @method toJSON
   * Serialize every rule to a plain object.
   * @return {Array}
   */
  toJSON: function() {
    return this.map(function(rule) {
      return rule.toJSON();
    });
  },
  /**
   * @method tail
   * Return a cloned copy of the rules, without the first one.
   * @return {MatchRules}
   */
  tail: function() {
    return new MatchRules(this.toJSON().slice(1));
  }
});
/**
 * @class Terms
 * A collection of parsed terms extracted from a sentence.
 * @param {Array} terms the nlp-compromise term objects
 */
function Terms(terms) {
  this._terms = terms;
  return this;
}
Object.assign(Terms.prototype, {
  /**
   * @method debug
   * Dump the raw terms to the console (debug aid).
   */
  debug: function() {
    // eslint-disable-next-line no-console
    console.log(this._terms);
  },
  /**
   * @method count
   * Number of terms in the collection.
   * @return {Number}
   */
  count: function() {
    return this._terms.length;
  },
  /**
   * @method head
   * First term, or null when the collection is empty.
   * @return {Object}
   */
  head: function() {
    return this._terms.length >= 1 ? this._terms[0] : null;
  },
  /**
   * @method at
   * Term at the given index, or null past the end.
   * @return {Object}
   */
  at: function(idx) {
    return idx < this._terms.length ? this._terms[idx] : null;
  },
  /**
   * @method empty
   * Tells if the terms collection is empty.
   * @return {Boolean}
   */
  empty: function() {
    return this._terms.length === 0;
  },
  /**
   * @method tail
   * Return a new Terms with every term except the first one.
   * @return {Terms}
   */
  tail: function() {
    return new Terms(this._terms.slice(1));
  }
});
/**
 * @method matchRules
 * Recursively match a sentence (Terms) against a set of rules (MatchRules).
 * Returns an array of MatchRules clones, one for every combination of terms
 * that satisfies all the rules in order; each matched rule carries in
 * .distance the number of terms skipped before it matched. Returns [] when
 * nothing matches. The input rules are never mutated (cloned before matching).
 * @param {Terms} sentence
 * @param {MatchRules} rules
 * @param {Number} distance (internal) terms skipped so far for the head rule
 * @return {Array}
 */
var matchRules = function(sentence, rules, distance) {
  distance = distance || 0;
  // there must be something left on both sides
  if (!sentence.empty() && !rules.empty()) {
    // always clone the rules before matching (the match stores data into the rule), no side effects here
    var clonedRules = rules.clone();
    // check if the top rule matches the top term
    if (clonedRules.head().match(sentence.head())) {
      // store the distance (number of skipped terms) at which the rule matched
      clonedRules.head().distance = distance;
      // if just one rule is left we're done here and the whole rule set is matched,
      // return the cloned rules
      if (clonedRules.count() == 1) {
        return [clonedRules];
      }
      // not done yet, more rules to check: tail both the sentence terms and the
      // rules and recurse (the distance restarts from 0 for the next rule)
      var matchedWithBothTailed = matchRules(sentence.tail(), clonedRules.tail());
      // every result is missing the head rule matched in this round, prepend it back
      _(matchedWithBothTailed).each(function(rule) {
        rule.prepend(clonedRules.head());
      });
      // there can also be a match without tailing the rules, for example searching for a
      // [noun] [color] could match "[car] is a bmw [blue]" but also "car is a [bmw] [blue]",
      // so tail the terms but keep the same rules
      var matchedWithSentenceTailed = matchRules(sentence.tail(), clonedRules);
      // join all the combinations together
      return _.compact(_.union(matchedWithBothTailed, matchedWithSentenceTailed));
    }
    // the first term doesn't match: discard it and try to match the rest of the
    // sentence; a token was skipped, so increase the distance
    return matchRules(sentence.tail(), rules, distance + 1);
  }
  return [];
};
/**
 * Reduce matching terms to a single tag. Every term whose text equals `word`
 * (or every term, when `word` is empty/null) and that carries the given tag
 * gets its tags replaced with only that tag. Used to turn words like "to"
 * into pure separators so they don't glue groups together.
 * @param {Array} terms parsed nlp-compromise terms (mutated in place)
 * @param {String} word the exact text to target, or null for any term
 * @param {String} type the tag to keep (e.g. "Preposition")
 */
function stopWord(terms, word, type) {
  terms.forEach(function(term) {
    var anyWord = word == null || word === '';
    if ((anyWord || term._text === word) && term.tags[type]) {
      var onlyTag = {};
      onlyTag[type] = true;
      term.tags = onlyTag;
    }
  });
}
/**
 * @method groupSiblings
 * Given a list of terms, group together consecutive terms with the same type
 */
/**
 * Join consecutive terms tagged with the same type into a single term (e.g.
 * "twenty" + "four" becomes one Cardinal term). The joined text is re-parsed
 * in isolation with nlp-compromise to extract the proper value (number,
 * infinitive, ...). A group is cut short when the next term carries one of
 * the excludedTypes tags.
 * @param {Array} terms parsed nlp-compromise terms
 * @param {String} type tag to group by (e.g. "Cardinal", "Verb", "Date" or a lexicon entity)
 * @param {Array} excludedTypes optional tags that break a group
 * @return {Array} the new, possibly shorter, list of terms
 */
function groupSiblings(terms, type, excludedTypes) {
  var accumulator = [];
  var temp = [];
  var k = 0;
  excludedTypes = excludedTypes || [];
  // do it old school
  for(k = 0; k < terms.length; k++) {
    var term = terms[k];
    var isLast = k === terms.length - 1;
    var nextTerm = !isLast ? terms[k + 1] : null;
    // collect into the temp registry if it's the right type, otherwise pass straight to the accumulator
    if (term.tags[type]) {
      temp.push(term);
      // if the next term has a type included in "excludedTypes" then the group must end here
      var excludesNext = nextTerm != null ? !_.isEmpty(_.intersection(_(nextTerm.tags).keys(), excludedTypes)) : false;
      // close the group when this is the last term, or the next one is a different
      // kind, or the next one is excluded: join the collected terms
      if (isLast || !nextTerm.tags[type] || excludesNext) {
        if (temp.length > 0) {
          // join them and evaluate in isolation
          var joined = _(temp).map(function (term) {
            return term.root;
          }).join(' ');
          var isolated = nlp(joined);
          // the first term of the group becomes the joined term, enriched based on type
          var parsed = null;
          switch (type) {
            case 'Date':
              // NOTE(review): `parsed` is not used for dates; the actual date value is
              // extracted later with chrono in the "date" matching rule - confirm
              parsed = isolated.dates(0).data();
              temp[0]._text = joined;
              accumulator.push(temp[0]);
              break;
            case 'Cardinal':
              // NOTE(review): assumes values(0).data() is non-empty for a Cardinal group - verify
              parsed = isolated.values(0).data();
              temp[0].value = parsed[0].number;
              temp[0]._text = joined;
              accumulator.push(temp[0]);
              break;
            case 'Verb':
              // store the infinitive form, used by the "verb" matching rule
              parsed = isolated.verbs(0).data();
              temp[0].infinitive = parsed.length !== 0 && parsed[0].conjugations != null ?
                parsed[0].conjugations.Infinitive : joined;
              temp[0]._text = joined;
              accumulator.push(temp[0]);
              break;
            default:
              // custom lexicon entity: just join the text verbatim
              temp[0]._text = joined;
              temp[0].normal = joined;
              temp[0].root = joined;
              accumulator.push(temp[0]);
          }
          // reset the temp register
          temp = [];
        }
      }
    } else {
      accumulator.push(term);
    }
    // else do nothing
  } // end for
  return accumulator;
}
module.exports = {
Terms: Terms,
MatchRules: MatchRules,
MatchRule: MatchRule,
matchRules: matchRules,
matchRule: function(terms, rules) {
var matches = matchRules(terms, rules);
// todo improve detect here with distance
return !_.isEmpty(matches) ? matches[0] : null;
},
matchLevenshtain: matchLevenshtain,
/**
* @method parseSentence
* Parse a string with nlp-compromise with some corrections (like the currency with $ symbols)
* @param {String} str
* @param {Object} lexicon
* @param {Boolean} debug
* @return {Array}
*/
parseSentence: function(str, lexicon, debug) {
// detach symbols from leading labels, for example convert "40$" into "40 $", otherwise NLP is not able to parse
// them correctly
str = str.replace(/([0-9a-zA-Z]*)([%|$|€|£|#])/g, '$1 $2');
//var phrase = nlp.text(str, lexicon.lexicon);
var phrase = nlp(str, lexicon);
if (debug) {
// eslint-disable-next-line no-console
console.log(green('Message:'), white(str));
phrase.debug();
}
// collect new entity from lexicon
var entities = _(lexicon).chain().values().unique().value();
//this._phrase = phrase;
var terms = _(phrase.list).chain()
.map(function(list) {
return list.terms;
})
.flatten()
.value();
// manually mark as Symbol
terms.forEach(function(term) {
if (term.normal.match(/[%|$|€|£|#]/)) {
term.tags.Symbol = true;
}
});
// clear stop word, some word must be a kind of separator, for example in the sentece "from 1st january to 10th
// january" all is marked "Date", even the word "to" which cause the grouping by the type "Date" of the whole
// sentence. Some word are just one type, "to" is a "Preposition" and nothing else
stopWord(terms, null, 'Preposition');
// need to group token to extract the right value "twenty four" are two separate terms, this will join them
// together with a parsed "value" key
terms = groupSiblings(terms, 'Cardinal');
terms = groupSiblings(terms, 'Verb');
// join Date elements but only if they are not "Duration", for example "in 8 minutes" could be grouped all in a single
// date element, but since "in" is a preposition and "minutes" is Duration, then don't join them
terms = groupSiblings(terms, 'Date', ['Duration']);
// also group together new entities from lexicon, i.e. "dining room" gets splitted in "dining" and "room"
_(entities).each(function(entity) {
terms = groupSiblings(terms, entity);
});
if (debug) {
// eslint-disable-next-line no-console
console.log('');
// eslint-disable-next-line no-console
console.log(grey('------ Sentence Analysis ----------------'));
try {
// eslint-disable-next-line no-console
console.log(prettyjson.render(
_(terms).map(function(term) {
return _(term).pick('tags', 'whitespace', 'silent_term', 'lumped', 'normal', 'root', 'dirty', 'uid', '_text');
})
));
} catch(e) {
// pretty json may breaks
}
}
return new Terms(terms);
}
};