UNPKG

@mrizki/natural

Version:

General natural language (tokenizing, stemming (English, Russian, Spanish), part-of-speech tagging, sentiment analysis, classification, inflection, phonetics, tfidf, WordNet, jaro-winkler, Levenshtein distance, Dice's Coefficient) facilities for node.

36 lines (29 loc) 1.11 kB
var stopwords = require('../util/stopwords_it'); var Tokenizer = require('../tokenizers/aggressive_tokenizer_it'); module.exports = function() { var stemmer = this; stemmer.stem = function(token) { return token; }; stemmer.tokenizeAndStem = function(text, keepStops) { var stemmedTokens = []; new Tokenizer().tokenize(text).forEach(function(token) { if (keepStops || stopwords.words.indexOf(token) == -1) { var resultToken = token.toLowerCase(); if (resultToken.match(/[a-zàèìòù0-9]/gi)) { resultToken = stemmer.stem(resultToken); } stemmedTokens.push(resultToken); } }); return stemmedTokens; }; stemmer.attach = function() { String.prototype.stem = function() { return stemmer.stem(this); }; String.prototype.tokenizeAndStem = function(keepStops) { return stemmer.tokenizeAndStem(this, keepStops); }; }; }