wordlevel
Version:
simple tool for looking up word frequency with basic lemma stemming
154 lines (134 loc) • 4.43 kB
JavaScript
// functions for checking word frequency level
'use strict'
var nlp = require('compromise') // for parsing and NLP
// var _ = require('underscore')
var removePunctuation = require('remove-punctuation')
var Lemmatizer_en = require("javascript-lemmatizer")
var freq_en = require('./freq_list_en')
/**
 * Looks up where a word sits in a frequency list after reducing it to a lemma.
 *
 * Only English ('en') has a lemmatizer and a word list wired in. 'fa' and 'ar'
 * are accepted by the constructor but use an empty list and no lemmatizer, so
 * every lemma is '' and every frequency/level is 0.
 */
class Wordlevel {
  /**
   * @param {string} [lang='en'] - One of 'en', 'ar', 'fa'.
   * @throws {string} The string 'Language not supported' for any other value.
   */
  constructor(lang) {
    const language = lang || 'en';
    // A bare string (not an Error) is thrown on purpose: it is what existing
    // callers may already be comparing against.
    if (['en', 'ar', 'fa'].indexOf(language) === -1) throw ('Language not supported');
    this.lang = language;
    // Defaults used as-is for 'fa' and 'ar' until those languages are implemented.
    this.lemmatizer = {};
    this.list = [];
    if (language === 'en') {
      this.lemmatizer = new Lemmatizer_en();
      this.list = freq_en;
    }
  }

  /**
   * Strips punctuation, lower-cases and trims a word, then reduces it to a lemma.
   *
   * @param {string} word - Raw word.
   * @param {string} [pos] - Part of speech ('noun', 'verb', 'adj', 'adv'); used
   *   to choose between several candidate lemmas.
   * @returns {string} The lemma, or '' when none is found, the word is
   *   null/undefined, or the language is not English.
   */
  normalize_word(word, pos) {
    if (word == null) return '';
    const cleaned = removePunctuation(word).toLowerCase().trim();

    // TODO: Arabic
    // TODO: Farsi
    if (this.lang !== 'en') return '';

    // Each candidate is read as [lemma, partOfSpeech].
    const candidates = this.lemmatizer.lemmas(cleaned);
    if (candidates.length === 0) return '';
    if (candidates.length === 1) return candidates[0][0];

    // With several candidates the second one is the fallback; the original
    // author's note says this is the noun form (statistically most likely).
    // NOTE(review): that ordering is not verifiable from this file.
    const fallback = candidates[1][0];
    if (!pos) return fallback;

    const matches = candidates.filter((candidate) => candidate[1] === pos);
    // Previously the no-match branch evaluated the fallback without assigning
    // it, so '' was returned instead.
    return matches.length > 0 ? matches[0][0] : fallback;
  }

  /**
   * Splits a string into terms and describes each one.
   *
   * @param {string} str - Text to parse.
   * @returns {Array<{discard: string[], pos: string, lemma: string, word: string}>}
   *   One entry per term: `discard` holds the tags that are not a known part of
   *   speech, `pos` the first known part of speech ('' if none), `lemma` the
   *   normalized lemma and `word` the term's original text.
   */
  parse_str(str) {
    const pos_options = { Noun: 'noun', Verb: 'verb', Adjective: 'adj', Adverb: 'adv' };
    // Own-property check so a tag that happens to match an inherited Object
    // member is not mistaken for a part of speech.
    const isPos = (tag) => Object.prototype.hasOwnProperty.call(pos_options, tag);

    // Results go into a fresh array; the array handed back by nlp is left untouched.
    const words = [];
    nlp(str).out('terms').forEach((term) => {
      const pos = term.tags.filter(isPos).map((tag) => pos_options[tag])[0] || '';
      words.push({
        discard: term.tags.filter((tag) => !isPos(tag)),
        pos: pos,
        lemma: this.normalize_word(term.normal, pos),
        word: term.text,
      });
    });
    return words;
  }

  /**
   * Returns the frequency-list index stored for a word's lemma.
   *
   * The lemma index is built lazily on the first call and cached on the
   * instance as `lemmafreq` (with its key count in `lemmacount`).
   *
   * @param {string} word
   * @param {string} [pos]
   * @returns {number} The stored list index, or 0 when the lemma is empty or
   *   not in the index (index 0 and "not found" are indistinguishable).
   */
  frequency(word, pos) {
    if (!this.lemmafreq) {
      this.lemmafreq = this._prepare_lema_index();
      this.lemmacount = Object.keys(this.lemmafreq).length;
    }
    const lemma = this.normalize_word(word, pos);
    if (lemma.length === 0) return 0;
    // Own-property lookup: a lemma such as "constructor" must not resolve to
    // something inherited from Object.prototype.
    const known = Object.prototype.hasOwnProperty.call(this.lemmafreq, lemma);
    return known ? this.lemmafreq[lemma] : 0;
  }

  /**
   * Expresses a word's list index as a percentage of the number of indexed
   * lemmas, rounded to one decimal place.
   *
   * @param {string} word
   * @param {string} [pos]
   * @returns {number} The percentage, or 0 when the index is empty.
   */
  level(word, pos) {
    // frequency() must run first: it builds the index and sets lemmacount.
    const rank = this.frequency(word, pos);
    // An empty index (e.g. 'fa'/'ar') would otherwise produce 0 / 0 = NaN.
    if (!this.lemmacount) return 0;
    return Math.round(((rank / this.lemmacount) * 100) * 10) / 10;
  }

  /**
   * Returns the level at 98% of the distinct lemmas in a text.
   *
   * @param {string} str
   * @returns {number} Level of the entry at the 98% cutoff, or 0 when the text
   *   yields no words.
   */
  block_level(str) {
    const words = this.parse_frequency_list(str);
    if (words.length === 0) return 0;
    return words[this._cutoff_index(words.length)].level;
  }

  /**
   * Returns the entries from the 98% cutoff upwards (the highest-level ~2%).
   *
   * @param {string} str
   * @returns {Array<{word: string, count: number, level: number}>}
   */
  topwords(str) {
    const words = this.parse_frequency_list(str);
    return words.slice(this._cutoff_index(words.length));
  }

  /**
   * Builds a per-lemma summary of a text, sorted by ascending level.
   *
   * Terms are grouped by lemma (terms with no lemma share the key ''). The
   * level stored is the one computed for the first occurrence of the lemma.
   *
   * @param {string} str
   * @returns {Array<{word: string, count: number, level: number}>}
   */
  parse_frequency_list(str) {
    // Prototype-less map so lemmas like "constructor" or "__proto__" are
    // treated as ordinary keys.
    const byLemma = Object.create(null);
    this.parse_str(str).forEach((term) => {
      const entry = byLemma[term.lemma];
      if (entry) {
        entry.count++;
        return;
      }
      byLemma[term.lemma] = {
        word: term.lemma,
        count: 1,
        level: this.level(term.lemma, term.pos),
      };
    });
    return Object.keys(byLemma)
      .map((lemma) => byLemma[lemma])
      .sort((a, b) => a.level - b.level);
  }

  /** internal functionality */

  /**
   * Zero-based index of the entry sitting at the 98% mark of a list.
   *
   * @param {number} count - Number of entries in the list.
   * @returns {number} Index, never below 0.
   */
  _cutoff_index(count) {
    return Math.max(count - Math.round(count / 50) - 1, 0);
  }

  /**
   * Maps every lemma derivable from the word list to the index of its word.
   *
   * NOTE(review): when several list words share a lemma the LAST index wins.
   * If the list is ordered most-frequent-first, keeping the first index may be
   * what was intended — confirm against the list's ordering.
   *
   * @returns {Object<string, number>} lemma -> list index.
   */
  _prepare_lema_index() {
    const result = {};
    this.list.forEach((word, index) => {
      const lemma = this.normalize_word(word);
      if (lemma) result[lemma] = index;
    });
    return result;
  }
}
// export default Testwords;
module.exports = Wordlevel