// compromise
// Version:
// natural language processing in the browser
// 56 lines (52 loc) • 1.28 kB
// JavaScript
;
//ngrams are consecutive terms of a specific size
const ngram = function(options) {
options = options || {};
options.size = options.size || [1, 2, 3];
if (typeof options.size === 'number') {
options.size = [options.size];
}
//flatten terms
let terms = this.list.map((ts) => {
return ts.mapTerms((t) => t.normal);
});
//count freq
let obj = {};
//each gram-size
options.size.forEach((size) => {
obj[size] = {};
//each sentence/match
for(let s = 0; s < terms.length; s++) {
//start slice at each term
for(let o = 0; o < terms[s].length - size + 1; o++) {
let str = terms[s].slice(o, o + size).join(' ');
obj[size][str] = obj[size][str] || 0;
obj[size][str] += 1;
}
}
});
//flatten to an array
let arr = [];
Object.keys(obj).forEach((size) => {
Object.keys(obj[size]).forEach((k) => {
arr.push({
text: k,
count: obj[size][k],
size: parseInt(size, 10)
});
});
});
//sort the array
arr = arr.sort((a, b) => {
if (a.count > b.count) {
return -1;
}
//(the tie-braker)
if (a.count === b.count && a.size > b.size) {
return -1;
}
return 1;
});
return arr;
};
//expose ngram as this module's (CommonJS) export; it reads `this.list`,
//so it has to be called with `this` bound to the object holding that list
module.exports = ngram;