// nlpsum
// Version: (not specified)
// Powerful text summarization algorithms from research papers and dedicated research.
// 578 lines (571 loc) • 11.8 kB
// JavaScript
// Generated by CoffeeScript 1.6.3
var data, tagger, tags, tokenizer, words;
data = require("./helpers/data");
tokenizer = require("./helpers/tokenizer");
tagger = (function() {
var suggest_adjective_phrase, suggest_adverb_phrase, suggest_noun_phrase, suggest_verb_phrase;
/**
 * Nudges the tokens that follow index `o` towards a noun-phrase reading.
 *
 * Walks forward from `o + 1` and:
 *   - stops as soon as a token whose pos.parent is "noun" is found;
 *   - retags the first verb (whose tag is not "RB") as NN and stops;
 *   - marks intermediate "adj"/RB tokens as having parent "adjective";
 *   - retags the final token of the sentence as NN if the walk gets that far.
 *
 * @param {number} o        index of the token that triggered the suggestion
 * @param {string} rule     label stored on every token this function retags
 * @param {Array}  results  tagged tokens ({word, pos, rule, ...}); edited in place
 * @param {Object} options  accepted for signature parity with the sibling
 *                          suggest_* helpers; not read here
 * @returns {Array} the same `results` array
 */
suggest_noun_phrase = function(o, rule, results, options) {
  var i, top, with_parent;
  // `pos` objects are references into the shared data.parts_of_speech table,
  // so changing `parent` must be done on a per-token copy; writing to the
  // original would silently retag every other token (and every later call).
  with_parent = function(pos, parent) {
    var copy, key;
    copy = {};
    for (key in pos) {
      if (Object.prototype.hasOwnProperty.call(pos, key)) {
        copy[key] = pos[key];
      }
    }
    copy.parent = parent;
    return copy;
  };
  top = results.length;
  i = o + 1;
  while (i < top) {
    if (results[i].pos.parent === "noun") {
      return results;
    }
    if (results[i].pos.parent === "verb" && results[i].pos.tag !== "RB") {
      results[i].pos = data.parts_of_speech["NN"];
      results[i].rule = rule;
      return results;
    }
    if (results[i + 1]) {
      // NOTE(review): the rest of this file spells the parent "adjective";
      // "adj" may never match - confirm against the parts_of_speech table.
      if (results[i].pos.parent === "adj" || results[i].pos.tag === "RB") {
        results[i].pos = with_parent(results[i].pos, "adjective");
        results[i].rule = rule;
      }
    } else {
      // Reached the last token without finding a noun: force it to NN.
      results[i].pos = data.parts_of_speech["NN"];
      results[i].rule = rule;
    }
    i++;
  }
  return results;
};
/**
 * Nudges the tokens that follow index `o` towards a verb-phrase reading.
 *
 * Walks forward from `o + 1` and:
 *   - stops as soon as a token whose pos.parent is "verb" is found;
 *   - retags the first noun as VB and stops;
 *   - marks intermediate "adj"/RB tokens as having parent "verb";
 *   - retags the final token of the sentence as VB if the walk gets that far.
 *
 * @param {number} o        index of the token that triggered the suggestion
 * @param {string} rule     label stored on every token this function retags
 * @param {Array}  results  tagged tokens ({word, pos, rule, ...}); edited in place
 * @param {Object} options  accepted for signature parity with the sibling
 *                          suggest_* helpers; not read here
 * @returns {Array} the same `results` array
 */
suggest_verb_phrase = function(o, rule, results, options) {
  var i, top, with_parent;
  // `pos` objects are references into the shared data.parts_of_speech table,
  // so a changed `parent` has to live on a per-token copy rather than being
  // written into the table entry itself.
  with_parent = function(pos, parent) {
    var copy, key;
    copy = {};
    for (key in pos) {
      if (Object.prototype.hasOwnProperty.call(pos, key)) {
        copy[key] = pos[key];
      }
    }
    copy.parent = parent;
    return copy;
  };
  top = results.length;
  i = o + 1;
  while (i < top) {
    if (results[i].pos.parent === "verb") {
      return results;
    }
    if (results[i].pos.parent === "noun") {
      results[i].pos = data.parts_of_speech["VB"];
      results[i].rule = rule;
      return results;
    }
    if (results[i + 1]) {
      // NOTE(review): the rest of this file spells the parent "adjective";
      // "adj" may never match - confirm against the parts_of_speech table.
      if (results[i].pos.parent === "adj" || results[i].pos.tag === "RB") {
        results[i].pos = with_parent(results[i].pos, "verb");
        results[i].rule = rule;
      }
    } else {
      // Reached the last token without finding a verb: force it to VB.
      results[i].pos = data.parts_of_speech["VB"];
      results[i].rule = rule;
    }
    i++;
  }
  return results;
};
/**
 * Nudges the tokens that follow index `o` towards an adjective reading.
 *
 * Walks forward from `o + 1` and:
 *   - stops at the first token that is already an adjective, or whose tag
 *     is "DT" or "CP";
 *   - retags the first noun or verb as JJ and stops;
 *   - marks intermediate RB tokens as having parent "adjective";
 *   - retags the final token as JJ only when `options.strong` is truthy.
 *
 * @param {number} o         index of the token that triggered the suggestion
 * @param {string} rule      label stored on every token this function retags
 * @param {Array}  results   tagged tokens ({word, pos, rule, ...}); edited in place
 * @param {Object} [options] `strong`: also retag the sentence-final token;
 *                           a missing object is treated as `{strong: false}`
 * @returns {Array} the same `results` array
 */
suggest_adjective_phrase = function(o, rule, results, options) {
  var i, strong, top, with_parent;
  // `pos` objects are references into the shared data.parts_of_speech table,
  // so a changed `parent` has to live on a per-token copy rather than being
  // written into the table entry itself.
  with_parent = function(pos, parent) {
    var copy, key;
    copy = {};
    for (key in pos) {
      if (Object.prototype.hasOwnProperty.call(pos, key)) {
        copy[key] = pos[key];
      }
    }
    copy.parent = parent;
    return copy;
  };
  // Tolerate a missing options object instead of throwing on options.strong.
  strong = !!(options && options.strong);
  top = results.length;
  i = o + 1;
  while (i < top) {
    if (results[i].pos.parent === "adjective") {
      return results;
    }
    if (results[i].pos.tag === "DT" || results[i].pos.tag === "CP") {
      return results;
    }
    if (results[i].pos.parent === "noun" || results[i].pos.parent === "verb") {
      results[i].pos = data.parts_of_speech["JJ"];
      results[i].rule = rule;
      return results;
    }
    if (results[i + 1]) {
      if (results[i].pos.tag === "RB") {
        results[i].pos = with_parent(results[i].pos, "adjective");
        results[i].rule = rule;
      }
    } else if (strong) {
      results[i].pos = data.parts_of_speech["JJ"];
      results[i].rule = rule;
    }
    i++;
  }
  return results;
};
/**
 * Adjusts the tokens that follow an adverb at index `o`.
 *
 * Walks forward from `o + 1` and:
 *   - stops at the first token that is already an adjective or a verb;
 *   - retags the first noun as JJ and stops;
 *   - marks intermediate RB tokens as having parent "adjective";
 *   - retags the final token as JJ only when `options.strong` is truthy.
 *
 * @param {number} o         index of the token that triggered the suggestion
 * @param {string} rule      label stored on every token this function retags
 * @param {Array}  results   tagged tokens ({word, pos, rule, ...}); edited in place
 * @param {Object} [options] `strong`: also retag the sentence-final token;
 *                           a missing object is treated as `{strong: false}`
 * @returns {Array} the same `results` array
 */
suggest_adverb_phrase = function(o, rule, results, options) {
  var i, strong, top, with_parent;
  // `pos` objects are references into the shared data.parts_of_speech table,
  // so a changed `parent` has to live on a per-token copy rather than being
  // written into the table entry itself.
  with_parent = function(pos, parent) {
    var copy, key;
    copy = {};
    for (key in pos) {
      if (Object.prototype.hasOwnProperty.call(pos, key)) {
        copy[key] = pos[key];
      }
    }
    copy.parent = parent;
    return copy;
  };
  // Tolerate a missing options object instead of throwing on options.strong.
  strong = !!(options && options.strong);
  top = results.length;
  i = o + 1;
  while (i < top) {
    if (results[i].pos.parent === "adjective" || results[i].pos.parent === "verb") {
      return results;
    }
    if (results[i].pos.parent === "noun") {
      results[i].pos = data.parts_of_speech["JJ"];
      results[i].rule = rule;
      return results;
    }
    if (results[i + 1]) {
      if (results[i].pos.tag === "RB") {
        results[i].pos = with_parent(results[i].pos, "adjective");
        results[i].rule = rule;
      }
    } else if (strong) {
      results[i].pos = data.parts_of_speech["JJ"];
      results[i].rule = rule;
    }
    i++;
  }
  return results;
};
// Word-shape heuristics used by the first tagging pass. Built once instead of
// once per word. ORDER MATTERS: every pattern is tried against the raw token
// and the LAST one that matches decides the tag, so later entries override
// earlier ones.
var tagger_patterns = [
  { reg: /[a-z]\-[a-z]/, pos: "JJ" },
  { reg: /^de\-[a-z]../, pos: "VB" },
  { reg: /^un\-[a-z]../, pos: "VB" },
  { reg: /^re\-[a-z]../, pos: "VB" },
  { reg: /.*ould$/, pos: "MD" },
  { reg: /..*ing$/, pos: "VBG" },
  { reg: /..*ates$/, pos: "VBZ" },
  { reg: /..*ses$/, pos: "VBZ" },
  { reg: /..*ify$/, pos: "VB" },
  { reg: /..*ize$/, pos: "VB" },
  { reg: /..*ated$/, pos: "VBN" },
  { reg: /..*'n$/, pos: "VBG" },
  { reg: /...*ed$/, pos: "VBD" },
  { reg: /.*ness$/, pos: "NN" },
  { reg: /.*ment$/, pos: "NN" },
  { reg: /.*full?$/, pos: "JJ" },
  { reg: /.*ous$/, pos: "JJ" },
  { reg: /.*ble$/, pos: "JJ" },
  { reg: /.*ic$/, pos: "JJ" },
  { reg: /..*ive$/, pos: "JJ" },
  { reg: /..*ic$/, pos: "JJ" },
  { reg: /..*est$/, pos: "JJS" },
  { reg: /.*ical$/, pos: "JJ" },
  { reg: /.*ial$/, pos: "JJ" },
  { reg: /...*ish$/, pos: "JJ" },
  { reg: /...*nal$/, pos: "JJ" },
  { reg: /.*less$/, pos: "JJ" },
  { reg: /.*ier$/, pos: "JJR" },
  { reg: /.*ened$/, pos: "JJ" },
  { reg: /.*some$/, pos: "JJ" },
  { reg: /..*ant$/, pos: "JJ" },
  { reg: /..*like$/, pos: "JJ" },
  { reg: /..*ky$/, pos: "JJ" },
  { reg: /..*ly$/, pos: "RB" },
  { reg: /\./, pos: "NN" },
  { reg: /^((?![aeiouy]).)*$/, pos: "NN" },
  { reg: /^-?[0-9]+(.[0-9]+)?$/, pos: "CD" },
  { reg: /'s$/, pos: "NNO" },
  { reg: /.*ized$/, pos: "JJ" },
  { reg: /.*ates$/, pos: "VBZ" },
  { reg: /.*ting$/, pos: "JJ" },
  { reg: /.*rate$/, pos: "VB" },
  { reg: /.*ling$/, pos: "JJ" },
  { reg: /.*ring$/, pos: "JJ" },
  { reg: /.*fied$/, pos: "JJ" },
  { reg: /.*shed$/, pos: "JJ" },
  { reg: /.*ched$/, pos: "JJ" },
  { reg: /.*tory$/, pos: "JJ" },
  { reg: /.*ding$/, pos: "JJ" },
  { reg: /.*ning$/, pos: "JJ" },
  { reg: /.*ular$/, pos: "JJ" },
  { reg: /.*late$/, pos: "VB" },
  { reg: /.*tted$/, pos: "VBN" },
  { reg: /.*ying$/, pos: "JJ" },
  { reg: /.*king$/, pos: "JJ" },
  { reg: /.*izes$/, pos: "VBZ" },
  { reg: /.*sing$/, pos: "JJ" },
  { reg: /.*nary$/, pos: "JJ" },
  { reg: /.*ntal$/, pos: "JJ" },
  { reg: /.*rian$/, pos: "JJ" },
  { reg: /.*ound$/, pos: "JJ" },
  { reg: /.*iate$/, pos: "VB" },
  { reg: /.*cate$/, pos: "VB" },
  { reg: /.*hing$/, pos: "JJ" },
  { reg: /.*ming$/, pos: "JJ" },
  { reg: /.*ient$/, pos: "JJ" },
  { reg: /.*fies$/, pos: "VBZ" },
  { reg: /.*tary$/, pos: "JJ" },
  { reg: /.*ards$/, pos: "RB" },
  { reg: /.*ural$/, pos: "JJ" },
  { reg: /.*ight$/, pos: "JJ" },
  { reg: /.*lent$/, pos: "JJ" },
  { reg: /.*ging$/, pos: "JJ" },
  { reg: /.*cent$/, pos: "JJ" },
  { reg: /.*shes$/, pos: "VBZ" },
  { reg: /.*nian$/, pos: "JJ" },
  { reg: /.*ects$/, pos: "VBZ" },
  { reg: /.*ving$/, pos: "JJ" },
  { reg: /.*dent$/, pos: "JJ" },
  { reg: /.*ends$/, pos: "VBZ" },
  { reg: /.*tent$/, pos: "JJ" },
  { reg: /.*tual$/, pos: "JJ" },
  { reg: /.*rent$/, pos: "JJ" },
  { reg: /.*eral$/, pos: "JJ" },
  { reg: /.*uate$/, pos: "VB" },
  { reg: /.*sian$/, pos: "JJ" },
  { reg: /.*ives$/, pos: "VBZ" },
  { reg: /.*gent$/, pos: "JJ" },
  { reg: /.*bles$/, pos: "VBZ" },
  { reg: /.*tens$/, pos: "VBZ" },
  { reg: /.*lian$/, pos: "JJ" },
  { reg: /.*tian$/, pos: "JJ" },
  { reg: /.*ains$/, pos: "VBZ" },
  { reg: /.*nist$/, pos: "JJ" },
  { reg: /.*oral$/, pos: "JJ" },
  { reg: /.*ines$/, pos: "VBZ" },
  { reg: /.*erly$/, pos: "JJ" },
  { reg: /.*duce$/, pos: "VB" },
  { reg: /.*ures$/, pos: "VBZ" },
  { reg: /.*wide$/, pos: "JJ" },
  { reg: /.*udes$/, pos: "VBZ" },
  { reg: /.*ters$/, pos: "VBZ" },
  { reg: /.*ents$/, pos: "VBZ" }
];

/**
 * Part-of-speech tagger.
 *
 * Runs three passes over the tokens:
 *   1. per-word tagging. Later steps override earlier ones: suffix regexes,
 *      then the lexicon, then capitalisation (not applied to the first
 *      token), then numbers, with NN as the fallback for anything untagged;
 *   2. phrase suggestions triggered by RB / PP / VBZ / DT / MD tokens;
 *   3. neighbour-based corrections, followed by two end-of-sentence fixes.
 *
 * @param {string|Array} [words]   a sentence (split with `tokenizer`) or a
 *                                 list of token strings; null/undefined
 *                                 means no tokens
 * @param {Object}       [options] forwarded to `tokenizer`; a truthy `big`
 *                                 disables the "noun_adjective_noun" rule
 * @returns {Array} one `{word, pos, rule, clues}` object per token, where
 *                  `pos` is an entry of `data.parts_of_speech` and `rule`
 *                  names the heuristic that chose it
 */
tagger = function(words, options) {
  var i, last, lex, o, results, total, word;
  if (words == null) {
    words = [];
  }
  if (options == null) {
    options = {};
  }
  if (typeof words === "string") {
    words = tokenizer(words, options);
  }
  results = [];
  total = words.length;

  // Pass 1: tag each word on its own. A numeric index is used (not for-in,
  // whose keys are strings) so the first-token check below really works.
  for (i = 0; i < total; i++) {
    word = words[i];
    results[i] = {
      word: word,
      pos: null,
      clues: []
    };
    // No early exit: the last matching pattern wins.
    for (o = 0; o < tagger_patterns.length; o++) {
      if (tagger_patterns[o].reg.test(word)) {
        results[i].pos = data.parts_of_speech[tagger_patterns[o].pos];
        results[i].rule = "regex";
      }
    }
    // Trailing punctuation is stripped for the lookups only; results[i].word
    // keeps the token as given.
    word = word.replace(/[\.,!:;]*$/, "");
    lex = data.lexicon[word.toLowerCase()];
    if (lex) {
      results[i].pos = data.parts_of_speech[lex];
      results[i].rule = "lexicon";
    }
    // A capital letter anywhere but in the sentence-initial token suggests
    // a (proper) noun.
    if (i !== 0 && word.match(/[A-Z]/)) {
      results[i].pos = data.parts_of_speech["NN"];
      results[i].rule = "capital";
    }
    // Compare against NaN instead of relying on truthiness, so that "0"
    // is treated like every other number.
    if (!isNaN(parseFloat(word))) {
      results[i].pos = data.parts_of_speech["NN"];
      results[i].rule = "number";
    }
    if (!results[i].pos) {
      results[i].pos = data.parts_of_speech["NN"];
      results[i].rule = "unknown";
    }
  }

  // Pass 2: let trigger words reshape the tokens that follow them. The last
  // token has nothing after it, so it is skipped.
  for (i = 0; i < results.length; i++) {
    if (!results[i + 1]) {
      continue;
    }
    if (results[i].pos.tag === "RB" && (!results[i - 1] || results[i - 1].pos.parent !== "verb")) {
      results = suggest_adverb_phrase(i, "from_adverb", results, {
        strong: false
      });
    }
    if (results[i].pos.tag === "PP") {
      results = suggest_noun_phrase(i, "from_posessive", results, {
        strong: true
      });
    }
    if (results[i].pos.tag === "VBZ" && results[i + 1].pos.parent !== "verb") {
      results = suggest_adjective_phrase(i, "vbz-adjective", results, {
        strong: false
      });
    }
    if (results[i].pos.tag === "DT") {
      results = suggest_noun_phrase(i, "from_determiner", results, {
        strong: false
      });
    }
    if (results[i].pos.tag === "MD") {
      results = suggest_verb_phrase(i, "from_would", results, {
        strong: false
      });
    }
  }

  // Pass 3: corrections based on immediate neighbours.
  for (i = 0; i < results.length; i++) {
    if (!results[i + 1]) {
      continue;
    }
    if (results[i].pos.parent === "noun" && results[i + 2] && results[i + 1].pos.tag === "JJ" && results[i + 2].pos.parent === "noun") {
      if (!options.big) {
        results[i + 1].pos = data.parts_of_speech["NN"];
        results[i + 1].rule = "noun_adjective_noun";
      }
    }
    if (results[i].pos.tag === "JJ" && results[i + 1].pos.parent === "verb") {
      results[i].pos = data.parts_of_speech["RB"];
      results[i].rule = "adjective_verb";
    }
    if (results[i].pos.tag === "JJ" && results[i + 1].pos.tag === "JJ") {
      // A comma between the two adjectives means they are a list, not
      // modifier + adjective.
      if (!results[i].word.match(",")) {
        results[i].pos = data.parts_of_speech["RB"];
        results[i].rule = "twoadjectives";
      }
    }
    if (results[i].pos.tag === "PRP") {
      if (results[i - 1] && results[i - 1].pos.parent === "adjective") {
        results[i - 1].pos = data.parts_of_speech["VB"];
        results[i - 1].rule = "verb_myself";
      } else if (!results[i - 1] || results[i - 1].pos.parent !== "verb") {
        // `!x === "verb"` (the previous spelling) is always false because
        // `!` binds tighter than `===`; `!==` is what was meant.
        results = suggest_verb_phrase(i, "from_pronoun", results, {
          strong: false
        });
      }
    }
    if (results[i].pos.tag === "CP" && results[i + 1].pos.tag === "IN") {
      results[i + 1].pos = data.parts_of_speech["VB"];
      results[i + 1].rule = "preposition_verb";
    }
    if (results[i].pos.parent === "adjective" && results[i + 1].pos.tag === "CC" && results[i + 2] && results[i + 2].pos.parent === "noun") {
      results[i + 2].pos = data.parts_of_speech["JJ"];
      results[i + 2].rule = "and_adjective";
    }
  }

  // End-of-sentence fixes; they need at least two tokens.
  last = results.length - 1;
  if (results[last - 1]) {
    if (results[last - 1].pos.tag === "CP" && results[last].pos.parent === "noun") {
      results[last].pos = data.parts_of_speech["JJ"];
      results[last].rule = "end_copula";
    }
    if (results[last - 1].pos.parent === "noun" && (results[last].pos.parent === "adjective" || results[last].pos.tag === "RB")) {
      results[last].pos = data.parts_of_speech["NN"];
      results[last].rule = "ending_noun";
    }
  }
  return results;
};
// Publish the tagger: hand it to an AMD loader when one is present,
// otherwise attach it to module.exports when running under CommonJS.
if (typeof define !== "undefined" && define.amd) {
  define([], function() {
    return tagger;
  });
} else if (typeof module !== "undefined" && module.exports) {
  module.exports = tagger;
}
return tagger;
})();
// Demo run: tag a sample sentence and print the result.
// tagger() takes a sentence or a list of token strings; feeding it its own
// output (an array of result objects) fails on word.match, so the sentence
// is tagged exactly once here.
words = "sally walked to the store";
tags = tagger(words);
console.log(tags);