UNPKG

nlpsum

Version:

Powerful text summarization algorithms from research papers and dedicated research.

645 lines (633 loc) 15.1 kB
// Generated by CoffeeScript 1.6.3
var data, tagger, tokenizer;

data = require("./helpers/data");

tokenizer = require("./helpers/tokenizer");

/**
 * Rule-based part-of-speech tagger.
 *
 * The module value is a function `tagger(words, options)` that returns one
 * result object per token: `{ word, pos, clues, rule }`, where `pos` is an
 * entry from `data.parts_of_speech` (or a per-token copy of one with an
 * overridden `parent`) and `rule` names the rule that last set `pos`.
 */
tagger = (function() {
  var patterns, suggest_adjective_phrase, suggest_adverb_phrase, suggest_noun_phrase, suggest_verb_phrase, with_parent;

  // Suffix / shape heuristics, applied in order. Every pattern is tested and
  // the LAST matching entry wins, so the order of this table is significant.
  // Hoisted out of tagger() so the table is built once, not once per word.
  // NOTE(review): the "CD" pattern uses an unescaped "." and therefore also
  // accepts e.g. "1,000" or "1/2" - left untouched; confirm whether intended.
  patterns = [
    { reg: /[a-z]\-[a-z]/, pos: "JJ" },
    { reg: /^de\-[a-z]../, pos: "VB" },
    { reg: /^un\-[a-z]../, pos: "VB" },
    { reg: /^re\-[a-z]../, pos: "VB" },
    { reg: /.*ould$/, pos: "MD" },
    { reg: /..*ing$/, pos: "VBG" },
    { reg: /..*ates$/, pos: "VBZ" },
    { reg: /..*ses$/, pos: "VBZ" },
    { reg: /..*ify$/, pos: "VB" },
    { reg: /..*ize$/, pos: "VB" },
    { reg: /..*ated$/, pos: "VBN" },
    { reg: /..*'n$/, pos: "VBG" },
    { reg: /...*ed$/, pos: "VBD" },
    { reg: /.*ness$/, pos: "NN" },
    { reg: /.*ment$/, pos: "NN" },
    { reg: /.*full?$/, pos: "JJ" },
    { reg: /.*ous$/, pos: "JJ" },
    { reg: /.*ble$/, pos: "JJ" },
    { reg: /.*ic$/, pos: "JJ" },
    { reg: /..*ive$/, pos: "JJ" },
    { reg: /..*ic$/, pos: "JJ" },
    { reg: /..*est$/, pos: "JJS" },
    { reg: /.*ical$/, pos: "JJ" },
    { reg: /.*ial$/, pos: "JJ" },
    { reg: /...*ish$/, pos: "JJ" },
    { reg: /...*nal$/, pos: "JJ" },
    { reg: /.*less$/, pos: "JJ" },
    { reg: /.*ier$/, pos: "JJR" },
    { reg: /.*ened$/, pos: "JJ" },
    { reg: /.*some$/, pos: "JJ" },
    { reg: /..*ant$/, pos: "JJ" },
    { reg: /..*like$/, pos: "JJ" },
    { reg: /..*ky$/, pos: "JJ" },
    { reg: /..*ly$/, pos: "RB" },
    { reg: /\./, pos: "NN" },
    { reg: /^((?![aeiouy]).)*$/, pos: "NN" },
    { reg: /^-?[0-9]+(.[0-9]+)?$/, pos: "CD" },
    { reg: /'s$/, pos: "NNO" },
    { reg: /.*ized$/, pos: "JJ" },
    { reg: /.*ates$/, pos: "VBZ" },
    { reg: /.*ting$/, pos: "JJ" },
    { reg: /.*rate$/, pos: "VB" },
    { reg: /.*ling$/, pos: "JJ" },
    { reg: /.*ring$/, pos: "JJ" },
    { reg: /.*fied$/, pos: "JJ" },
    { reg: /.*shed$/, pos: "JJ" },
    { reg: /.*ched$/, pos: "JJ" },
    { reg: /.*tory$/, pos: "JJ" },
    { reg: /.*ding$/, pos: "JJ" },
    { reg: /.*ning$/, pos: "JJ" },
    { reg: /.*ular$/, pos: "JJ" },
    { reg: /.*late$/, pos: "VB" },
    { reg: /.*tted$/, pos: "VBN" },
    { reg: /.*ying$/, pos: "JJ" },
    { reg: /.*king$/, pos: "JJ" },
    { reg: /.*izes$/, pos: "VBZ" },
    { reg: /.*sing$/, pos: "JJ" },
    { reg: /.*nary$/, pos: "JJ" },
    { reg: /.*ntal$/, pos: "JJ" },
    { reg: /.*rian$/, pos: "JJ" },
    { reg: /.*ound$/, pos: "JJ" },
    { reg: /.*iate$/, pos: "VB" },
    { reg: /.*cate$/, pos: "VB" },
    { reg: /.*hing$/, pos: "JJ" },
    { reg: /.*ming$/, pos: "JJ" },
    { reg: /.*ient$/, pos: "JJ" },
    { reg: /.*fies$/, pos: "VBZ" },
    { reg: /.*tary$/, pos: "JJ" },
    { reg: /.*ards$/, pos: "RB" },
    { reg: /.*ural$/, pos: "JJ" },
    { reg: /.*ight$/, pos: "JJ" },
    { reg: /.*lent$/, pos: "JJ" },
    { reg: /.*ging$/, pos: "JJ" },
    { reg: /.*cent$/, pos: "JJ" },
    { reg: /.*shes$/, pos: "VBZ" },
    { reg: /.*nian$/, pos: "JJ" },
    { reg: /.*ects$/, pos: "VBZ" },
    { reg: /.*ving$/, pos: "JJ" },
    { reg: /.*dent$/, pos: "JJ" },
    { reg: /.*ends$/, pos: "VBZ" },
    { reg: /.*tent$/, pos: "JJ" },
    { reg: /.*tual$/, pos: "JJ" },
    { reg: /.*rent$/, pos: "JJ" },
    { reg: /.*eral$/, pos: "JJ" },
    { reg: /.*uate$/, pos: "VB" },
    { reg: /.*sian$/, pos: "JJ" },
    { reg: /.*ives$/, pos: "VBZ" },
    { reg: /.*gent$/, pos: "JJ" },
    { reg: /.*bles$/, pos: "VBZ" },
    { reg: /.*tens$/, pos: "VBZ" },
    { reg: /.*lian$/, pos: "JJ" },
    { reg: /.*tian$/, pos: "JJ" },
    { reg: /.*ains$/, pos: "VBZ" },
    { reg: /.*nist$/, pos: "JJ" },
    { reg: /.*oral$/, pos: "JJ" },
    { reg: /.*ines$/, pos: "VBZ" },
    { reg: /.*erly$/, pos: "JJ" },
    { reg: /.*duce$/, pos: "VB" },
    { reg: /.*ures$/, pos: "VBZ" },
    { reg: /.*wide$/, pos: "JJ" },
    { reg: /.*udes$/, pos: "VBZ" },
    { reg: /.*ters$/, pos: "VBZ" },
    { reg: /.*ents$/, pos: "VBZ" }
  ];

  /**
   * Returns a shallow copy of `pos` with its `parent` replaced.
   *
   * Tokens are assigned `data.parts_of_speech[...]` entries by reference, so
   * writing `token.pos.parent = ...` directly would change the parent for
   * every token sharing that tag, in this call and in every later one.
   */
  with_parent = function(pos, parent) {
    var copy, key;
    copy = {};
    for (key in pos) {
      if (Object.prototype.hasOwnProperty.call(pos, key)) {
        copy[key] = pos[key];
      }
    }
    copy.parent = parent;
    return copy;
  };

  /**
   * Walks forward from the token after index `o`, looking for the noun that
   * should close the phrase. Stops at the first noun; turns the first
   * non-"RB" verb into "NN"; turns the final token into "NN" if reached.
   * Intermediate "adj"/"RB" tokens get parent "adjective".
   *
   * @param {number} o        index of the token that triggered the rule
   * @param {string} rule     rule name recorded on every token that is changed
   * @param {Array}  results  token results (modified in place)
   * @param {Object} options  unused here; kept for a uniform signature
   * @returns {Array} the same `results` array
   */
  suggest_noun_phrase = function(o, rule, results, options) {
    var i, token;
    for (i = o + 1; i < results.length; i++) {
      token = results[i];
      if (token.pos.parent === "noun") {
        return results;
      }
      if (token.pos.parent === "verb" && token.pos.tag !== "RB") {
        token.pos = data.parts_of_speech["NN"];
        token.rule = rule;
        return results;
      }
      if (results[i + 1]) {
        // NOTE(review): every other check in this file spells the parent
        // "adjective"; "adj" may never match - confirm against helpers/data.
        if (token.pos.parent === "adj" || token.pos.tag === "RB") {
          token.pos = with_parent(token.pos, "adjective");
          token.rule = rule;
        }
      } else {
        // Ran off the end without finding a noun: the last token becomes one.
        token.pos = data.parts_of_speech["NN"];
        token.rule = rule;
      }
    }
    return results;
  };

  /**
   * Walks forward from the token after index `o`, looking for the verb that
   * should close the phrase. Stops at the first verb; turns the first noun
   * into "VB"; turns the final token into "VB" if reached. Intermediate
   * "adj"/"RB" tokens get parent "verb".
   *
   * @param {number} o        index of the token that triggered the rule
   * @param {string} rule     rule name recorded on every token that is changed
   * @param {Array}  results  token results (modified in place)
   * @param {Object} options  unused here; kept for a uniform signature
   * @returns {Array} the same `results` array
   */
  suggest_verb_phrase = function(o, rule, results, options) {
    var i, token;
    for (i = o + 1; i < results.length; i++) {
      token = results[i];
      if (token.pos.parent === "verb") {
        return results;
      }
      if (token.pos.parent === "noun") {
        token.pos = data.parts_of_speech["VB"];
        token.rule = rule;
        return results;
      }
      if (results[i + 1]) {
        // NOTE(review): "adj" vs "adjective" - see suggest_noun_phrase.
        if (token.pos.parent === "adj" || token.pos.tag === "RB") {
          token.pos = with_parent(token.pos, "verb");
          token.rule = rule;
        }
      } else {
        token.pos = data.parts_of_speech["VB"];
        token.rule = rule;
      }
    }
    return results;
  };

  /**
   * Walks forward from the token after index `o`, looking for an adjective.
   * Stops at the first adjective, "DT" or "CP" token; turns the first noun or
   * verb into "JJ". Intermediate "RB" tokens get parent "adjective". The
   * final token is turned into "JJ" only when `options.strong` is truthy.
   *
   * @param {number} o        index of the token that triggered the rule
   * @param {string} rule     rule name recorded on every token that is changed
   * @param {Array}  results  token results (modified in place)
   * @param {Object} options  `{ strong: boolean }`
   * @returns {Array} the same `results` array
   */
  suggest_adjective_phrase = function(o, rule, results, options) {
    var i, token;
    for (i = o + 1; i < results.length; i++) {
      token = results[i];
      if (token.pos.parent === "adjective") {
        return results;
      }
      if (token.pos.tag === "DT" || token.pos.tag === "CP") {
        return results;
      }
      if (token.pos.parent === "noun" || token.pos.parent === "verb") {
        token.pos = data.parts_of_speech["JJ"];
        token.rule = rule;
        return results;
      }
      if (results[i + 1]) {
        if (token.pos.tag === "RB") {
          token.pos = with_parent(token.pos, "adjective");
          token.rule = rule;
        }
      } else if (options.strong) {
        token.pos = data.parts_of_speech["JJ"];
        token.rule = rule;
      }
    }
    return results;
  };

  /**
   * Walks forward from the token after index `o` (an adverb). Stops at the
   * first adjective or verb; turns the first noun into "JJ". Intermediate
   * "RB" tokens get parent "adjective". The final token is turned into "JJ"
   * only when `options.strong` is truthy.
   *
   * @param {number} o        index of the token that triggered the rule
   * @param {string} rule     rule name recorded on every token that is changed
   * @param {Array}  results  token results (modified in place)
   * @param {Object} options  `{ strong: boolean }`
   * @returns {Array} the same `results` array
   */
  suggest_adverb_phrase = function(o, rule, results, options) {
    var i, token;
    for (i = o + 1; i < results.length; i++) {
      token = results[i];
      if (token.pos.parent === "adjective" || token.pos.parent === "verb") {
        return results;
      }
      if (token.pos.parent === "noun") {
        token.pos = data.parts_of_speech["JJ"];
        token.rule = rule;
        return results;
      }
      if (results[i + 1]) {
        if (token.pos.tag === "RB") {
          token.pos = with_parent(token.pos, "adjective");
          token.rule = rule;
        }
      } else if (options.strong) {
        token.pos = data.parts_of_speech["JJ"];
        token.rule = rule;
      }
    }
    return results;
  };

  /**
   * Tags each word with a part of speech.
   *
   * Pass 1 tags words individually (regex table, then lexicon, then
   * capitalisation, then numeric check, then an "NN" fallback - each later
   * step overrides the earlier ones). Pass 2 and pass 3 adjust tags from
   * neighbouring tokens, and a final step patches the last token.
   *
   * @param {string|Array} [words=[]]   a string (run through `tokenizer`) or
   *                                    a list of word strings
   * @param {Object}       [options={}] forwarded to `tokenizer`; `options.big`
   *                                    disables the "noun_adjective_noun" rule
   * @returns {Array} one `{ word, pos, clues, rule }` object per word; `word`
   *                  is the token as given (trailing punctuation included)
   */
  tagger = function(words, options) {
    var i, last, lex, o, results, word;
    if (words == null) {
      words = [];
    }
    if (options == null) {
      options = {};
    }
    if (typeof words === "string") {
      words = tokenizer(words, options);
    }
    results = [];

    // Pass 1: per-word tagging.
    for (i = 0; i < words.length; i++) {
      word = words[i];
      results[i] = {
        word: word,
        pos: null,
        clues: []
      };
      for (o = 0; o < patterns.length; o++) {
        if (patterns[o].reg.test(word)) {
          results[i].pos = data.parts_of_speech[patterns[o].pos];
          results[i].rule = "regex";
        }
      }
      // Lexicon lookup ignores trailing punctuation and case.
      word = word.replace(/[\.,!:;]*$/, "");
      lex = data.lexicon[word.toLowerCase()];
      if (lex) {
        results[i].pos = data.parts_of_speech[lex];
        results[i].rule = "lexicon";
      }
      // `i` is numeric here, so the sentence-initial word is really exempt
      // (with for...in the index was a string and `i !== 0` was always true).
      if (i !== 0 && /[A-Z]/.test(word)) {
        results[i].pos = data.parts_of_speech["NN"];
        results[i].rule = "capital";
      }
      // isNaN check rather than truthiness so that "0" counts as a number.
      if (!isNaN(parseFloat(word))) {
        results[i].pos = data.parts_of_speech["NN"];
        results[i].rule = "number";
      }
      if (!results[i].pos) {
        results[i].pos = data.parts_of_speech["NN"];
        results[i].rule = "unknown";
      }
    }

    // Pass 2: phrase suggestions triggered by the current token. The last
    // token is skipped because there is nothing after it to adjust.
    for (i = 0; i < results.length - 1; i++) {
      if (results[i].pos.tag === "RB" && (!results[i - 1] || results[i - 1].pos.parent !== "verb")) {
        results = suggest_adverb_phrase(i, "from_adverb", results, {
          strong: false
        });
      }
      if (results[i].pos.tag === "PP") {
        results = suggest_noun_phrase(i, "from_posessive", results, {
          strong: true
        });
      }
      if (results[i].pos.tag === "VBZ" && results[i + 1].pos.parent !== "verb") {
        results = suggest_adjective_phrase(i, "vbz-adjective", results, {
          strong: false
        });
      }
      if (results[i].pos.tag === "DT") {
        results = suggest_noun_phrase(i, "from_determiner", results, {
          strong: false
        });
      }
      if (results[i].pos.tag === "MD") {
        results = suggest_verb_phrase(i, "from_would", results, {
          strong: false
        });
      }
    }

    // Pass 3: corrections based on neighbouring tags.
    for (i = 0; i < results.length - 1; i++) {
      if (results[i].pos.parent === "noun" && results[i + 2] && results[i + 1].pos.tag === "JJ" && results[i + 2].pos.parent === "noun") {
        if (!options.big) {
          results[i + 1].pos = data.parts_of_speech["NN"];
          results[i + 1].rule = "noun_adjective_noun";
        }
      }
      if (results[i].pos.tag === "JJ" && results[i + 1].pos.parent === "verb") {
        results[i].pos = data.parts_of_speech["RB"];
        results[i].rule = "adjective_verb";
      }
      if (results[i].pos.tag === "JJ" && results[i + 1].pos.tag === "JJ") {
        // A comma between the two adjectives means they are a list, not
        // adverb + adjective.
        if (results[i].word.indexOf(",") === -1) {
          results[i].pos = data.parts_of_speech["RB"];
          results[i].rule = "twoadjectives";
        }
      }
      if (results[i].pos.tag === "PRP") {
        if (results[i - 1] && results[i - 1].pos.parent === "adjective") {
          results[i - 1].pos = data.parts_of_speech["VB"];
          results[i - 1].rule = "verb_myself";
        } else if (!results[i - 1] || results[i - 1].pos.parent !== "verb") {
          // Was `!results[i - 1].pos.parent === "verb"`, which compares a
          // boolean to a string and is therefore always false.
          results = suggest_verb_phrase(i, "from_pronoun", results, {
            strong: false
          });
        }
      }
      if (results[i].pos.tag === "CP" && results[i + 1].pos.tag === "IN") {
        results[i + 1].pos = data.parts_of_speech["VB"];
        results[i + 1].rule = "preposition_verb";
      }
      if (results[i].pos.parent === "adjective" && results[i + 1].pos.tag === "CC" && results[i + 2] && results[i + 2].pos.parent === "noun") {
        results[i + 2].pos = data.parts_of_speech["JJ"];
        results[i + 2].rule = "and_adjective";
      }
    }

    // Final step: sentence-ending fix-ups (need at least two tokens).
    last = results.length - 1;
    if (results[last - 1]) {
      if (results[last - 1].pos.tag === "CP" && results[last].pos.parent === "noun") {
        results[last].pos = data.parts_of_speech["JJ"];
        results[last].rule = "end_copula";
      }
      if (results[last - 1].pos.parent === "noun" && (results[last].pos.parent === "adjective" || results[last].pos.tag === "RB")) {
        results[last].pos = data.parts_of_speech["NN"];
        results[last].rule = "ending_noun";
      }
    }
    return results;
  };

  // Export for AMD loaders when present, otherwise for CommonJS.
  if (typeof define !== "undefined" && define.amd) {
    define([], function() {
      return tagger;
    });
  } else {
    if (typeof module !== "undefined" && module.exports) {
      module.exports = tagger;
    }
  }
  return tagger;
})();