nlp-indo
Version:
Project Nlp indonesia, Recode Dari Bang Binsarjr
41 lines (40 loc) • 1.25 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.Tokenizer = void 0;
const Nalapa = require('nalapa');
const NalapaTokenizer = Nalapa.tokenizer;
const NalapaWord = Nalapa.word;
const index_1 = require("./index");
/**
 * Static helpers for tokenizing text and post-processing token lists.
 *
 * Tokenizing and stemming are delegated to the `nalapa` package; stopword
 * detection is delegated to `Word` from `./index`. The list of ignored
 * punctuation (`ignore_words`) is attached to the class as a static property
 * outside of the class body.
 */
class Tokenizer {
    /**
     * Splits a text into tokens.
     *
     * @param {*} text - Value handed as-is to `NalapaTokenizer.tokenize`
     *   (presumably a string; confirm against the nalapa API).
     * @returns {*} Whatever `NalapaTokenizer.tokenize` returns.
     */
    static word_tokenizer(text) {
        const tokens = NalapaTokenizer.tokenize(text);
        return tokens;
    }

    /**
     * Splits a text into sentences.
     *
     * @param {*} sentence - Value handed as-is to
     *   `NalapaTokenizer.splitSentence` (presumably a string; confirm).
     * @returns {*} Whatever `NalapaTokenizer.splitSentence` returns.
     */
    static sentence_tokenizer(sentence) {
        const sentences = NalapaTokenizer.splitSentence(sentence);
        return sentences;
    }

    /**
     * Removes every token that is listed in `this.ignore_words`.
     *
     * A token is dropped only when it equals an entry of the list as a whole;
     * tokens that merely contain such an entry are kept.
     *
     * @param {Array} tokenized - Tokens to filter; not modified.
     * @returns {Array} New array with the remaining tokens in original order.
     */
    static filter_punctuation(tokenized) {
        // The list is looked up per token (not hoisted) so that it is only
        // touched when there is actually something to check.
        const isIgnored = (candidate) => this.ignore_words.includes(candidate);
        return tokenized.filter((candidate) => !isIgnored(candidate));
    }

    /**
     * Stems every token with `NalapaWord.stem`.
     *
     * @param {Array} tokenized - Tokens to stem; not modified.
     * @returns {Array} New array holding the stem result for each token, in
     *   the same order.
     */
    static stemmer(tokenized) {
        return tokenized.map((word) => NalapaWord.stem(word));
    }

    /**
     * Removes every token that `Word.isStopword` reports as a stopword.
     *
     * @param {Array} tokenized - Tokens to filter; not modified.
     * @returns {Array} New array with the non-stopword tokens in original
     *   order.
     */
    static stopwords(tokenized) {
        const isStopword = (word) => index_1.Word.isStopword(word);
        return tokenized.filter((word) => !isStopword(word));
    }
}
exports.Tokenizer = Tokenizer;
// Punctuation characters that Tokenizer.filter_punctuation drops.
// The literal is broken up character by character, so the commas that
// visually separate the entries end up in the list as entries too (the comma
// therefore appears many times, which is harmless for membership checks).
Tokenizer.ignore_words = Array.from("!,\",#,$,%,&,',(,),*,+,,,-,.,/,:,;,<,=,>,?,@,[,\\,],^,_,`,{,|,},~");