UNPKG

nlp-indo

Version:

Project Nlp indonesia, Recode Dari Bang Binsarjr

41 lines (40 loc) 1.25 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.Tokenizer = void 0;
const Nalapa = require('nalapa');
const NalapaTokenizer = Nalapa.tokenizer;
const NalapaWord = Nalapa.word;
// Keep the module object and read `index_1.Word` at call time rather than
// destructuring here. NOTE(review): "./index" presumably re-exports this file,
// so `Word` may not be populated yet while this module is loading - confirm.
const index_1 = require("./index");

/**
 * Static text-processing helpers built on the `nalapa` package and on the
 * stopword check exposed by `./index`.
 */
class Tokenizer {
    /**
     * Tokenizes text by delegating to nalapa's `tokenizer.tokenize`.
     *
     * @param {string} text - Text to tokenize.
     * @returns {*} Whatever `tokenize` returns (presumably an array of tokens).
     */
    static word_tokenizer(text) {
        return NalapaTokenizer.tokenize(text);
    }

    /**
     * Splits text into sentences by delegating to nalapa's
     * `tokenizer.splitSentence`.
     *
     * @param {string} sentence - Text to split.
     * @returns {*} Whatever `splitSentence` returns (presumably an array of sentences).
     */
    static sentence_tokenizer(sentence) {
        return NalapaTokenizer.splitSentence(sentence);
    }

    /**
     * Drops every token that is exactly equal to an entry of `ignore_words`.
     *
     * @param {string[]} tokenized - Tokens to filter.
     * @returns {string[]} New array without the ignored tokens; the input is not modified.
     */
    static filter_punctuation(tokenized) {
        // In strict mode `this` is undefined when the method is detached from
        // the class (destructured or passed as a bare callback). Fall back to
        // the class's own list in that case, while still honouring a subclass
        // that overrides `ignore_words`.
        const ignored = (this && this.ignore_words) || Tokenizer.ignore_words;
        return tokenized.filter((word) => !ignored.includes(word));
    }

    /**
     * Stems every token with nalapa's `word.stem`.
     *
     * @param {string[]} tokenized - Tokens to stem.
     * @returns {Array} New array holding the result of `stem` for each token, in order.
     */
    static stemmer(tokenized) {
        return tokenized.map((token) => NalapaWord.stem(token));
    }

    /**
     * Drops every token for which `Word.isStopword` (from `./index`) is truthy.
     *
     * @param {string[]} tokenized - Tokens to filter.
     * @returns {string[]} New array without the stopwords; the input is not modified.
     */
    static stopwords(tokenized) {
        return tokenized.filter((token) => !index_1.Word.isStopword(token));
    }
}
exports.Tokenizer = Tokenizer;

/**
 * Single-character punctuation tokens removed by `filter_punctuation`.
 *
 * The list used to be produced by calling `.split('')` on a comma-delimited
 * string, which yielded the same 32 characters plus 31 redundant "," entries.
 * Listing the characters directly keeps membership identical without the
 * duplicates.
 *
 * @type {string[]}
 */
Tokenizer.ignore_words = Array.from("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~");