nlpsum

Version:

Powerful text summarization algorithms from research papers and dedicated research.

60 lines (49 loc) • 1.88 kB

text/coffeescript

data = require("./data")

# Word tokenizer.
#
# Turns a piece of text into an array of word tokens. Along the way it:
#   * undoes common English contractions ("they've" -> "they have"),
#   * removes parenthesised parts (unless options.keep_brackets is set),
#   * connects common multiple-word phrases into one token, using the phrases
#     found in data.multiples (presumably an object keyed by phrase -- verify
#     against ./data).
#
# Usage:
#   tokenizer('toronto and chicago! seem as usual, "well-disguised as hell" yeah',
#             {want_quotations: true})
tokenizer = (->

  # Connect common multiple-word phrases into one token.
  #
  # words - Array of word strings; it is modified in place while scanning.
  #
  # Returns a new array with merged phrases and without empty entries.
  spot_multiples = (words) ->
    for word, i in words
      following = words[i + 1]
      # Skip the last word, empty words, and slots that an earlier merge has
      # already consumed (they hold null and must not take part in a lookup).
      continue unless word and following
      pair = "#{word} #{following}"
      # Trailing punctuation is ignored for the lookup only; the merged token
      # keeps it.
      key = pair.replace(/[\.,!:;]*$/, "")
      if data.multiples[key]
        words[i] = pair
        words[i + 1] = null
    # remove empty words
    words.filter (w) -> w

  # Rejoin a quotation to one token.
  #
  # When exactly two words contain a double-quote character, the words from the
  # first to the second (inclusive) are joined with spaces, stripped of their
  # quote characters and appended to the end of the list as an extra token.
  # The individual words stay where they were.
  #
  # words - Array of word strings.
  #
  # Returns the same array, possibly with one token appended.
  rejoin = (words) ->
    quotes = (i for word, i in words when word.match("\""))
    if quotes.length is 2
      quote = words.slice(quotes[0], quotes[1] + 1).join(" ")
      words.push quote.replace(/"/g, "")
    words

  # Split text into word tokens.
  #
  # text    - String to tokenize. null/undefined yields an empty array.
  # options - Optional object:
  #           keep_brackets   - truthy to leave "(...)" parts in the text.
  #           want_quotations - truthy to also append a quoted passage as a
  #                             single token (see rejoin).
  #
  # Returns an Array of token strings.
  tokenize = (text, options) ->
    options = {} unless options
    return [] unless text?

    # Undo contractions. Every pattern requires a following space, so a
    # contraction at the very end of the text is left as it is.
    # "'s" is only expanded after he/she/it; elsewhere it is usually a
    # possessive ("John's car") and must not become "is".
    text = text.replace(/\b(he|she|it)['’]s /gi, "$1 is ")
    text = text.replace(/([^ ])['’]ve /g, "$1 have ")
    text = text.replace(/([^ ])['’]re /g, "$1 are ")
    text = text.replace(/([^ ])['’]d /g, "$1 would ")
    text = text.replace(/([^ ])['’]ll /g, "$1 will ")
    text = text.replace(/([^ ])n['’]t /g, "$1 not ")
    text = text.replace(/\bi['’]m /gi, "I am ")

    # Remove bracketed parts: an optional leading space plus "(...)" of up to
    # 200 characters. The space is written as [ ] because a CoffeeScript regex
    # literal that starts with a bare space can be lexed as a division.
    text = text.replace(/[ ]?\(.{0,200}?\)/g, "") unless options.keep_brackets

    words = text.split(" ")
    words = rejoin(words) if options.want_quotations and text.match("\"")
    spot_multiples(words)

  # export for AMD / RequireJS
  if typeof define isnt "undefined" and define.amd
    define [], -> tokenize
  # export for Node.js
  else if typeof module isnt "undefined" and module.exports
    module.exports = tokenize

  tokenize
)()