@mrizki/natural
Version:
General natural language (tokenizing, stemming (English, Russian, Spanish), part-of-speech tagging, sentiment analysis, classification, inflection, phonetics, tfidf, WordNet, jaro-winkler, Levenshtein distance, Dice's Coefficient) facilities for node.
36 lines (29 loc) • 1.11 kB
JavaScript
var stopwords = require('../util/stopwords_it');
var Tokenizer = require('../tokenizers/aggressive_tokenizer_it');
module.exports = function() {
var stemmer = this;
stemmer.stem = function(token) {
return token;
};
stemmer.tokenizeAndStem = function(text, keepStops) {
var stemmedTokens = [];
new Tokenizer().tokenize(text).forEach(function(token) {
if (keepStops || stopwords.words.indexOf(token) == -1) {
var resultToken = token.toLowerCase();
if (resultToken.match(/[a-zàèìòù0-9]/gi)) {
resultToken = stemmer.stem(resultToken);
}
stemmedTokens.push(resultToken);
}
});
return stemmedTokens;
};
stemmer.attach = function() {
String.prototype.stem = function() {
return stemmer.stem(this);
};
String.prototype.tokenizeAndStem = function(keepStops) {
return stemmer.tokenizeAndStem(this, keepStops);
};
};
}