nlpsum
Version:
Powerful text summarization algorithms from research papers and dedicated research.
199 lines (187 loc) • 2.58 kB
JavaScript
var sentenceparser = (function() {
//by spencer kelly (@spencermountain)
var sentenceparser=function(text) {
var tmp = text.split(/(\S.+?[.])(?=\s+|$)/g);
var sentences = [];
//from package Lingua::EN::Sentence
var abbrevs=[
"jr",
"mr",
"mrs",
"ms",
"dr",
"prof",
"sr",
"sen",
"rep",
"gov",
"atty",
"supt",
"det",
"rev",
"col",
"gen",
"lt",
"cmdr",
"adm",
"capt",
"sgt",
"cpl",
"maj",
"dept",
"univ",
"assn",
"bros",
"inc",
"ltd",
"co",
"corp",
"arc",
"al",
"ave",
"blvd",
"cl",
"ct",
"cres",
"exp",
"rd",
"st",
"dist",
"mt",
"ft",
"fy",
"hwy",
"la",
"pd",
"pl",
"plz",
"tce",
"Ala",
"Ariz",
"Ark",
"Cal",
"Calif",
"Col",
"Colo",
"Conn",
"Del",
"Fed",
"Fla",
"Ga",
"Ida",
"Id",
"Ill",
"Ind",
"Ia",
"Kan",
"Kans",
"Ken",
"Ky",
"La",
"Me",
"Md",
"Mass",
"Mich",
"Minn",
"Miss",
"Mo",
"Mont",
"Neb",
"Nebr",
"Nev",
"Mex",
"Okla",
"Ok",
"Ore",
"Penna",
"Penn",
"Pa",
"Dak",
"Tenn",
"Tex",
"Ut",
"Vt",
"Va",
"Wash",
"Wis",
"Wisc",
"Wy",
"Wyo",
"USAFA",
"Alta",
"Ont",
"Qué",
"Sask",
"Yuk",
"jan",
"feb",
"mar",
"apr",
"jun",
"jul",
"aug",
"sep",
"oct",
"nov",
"dec",
"sept",
"vs",
"etc",
"esp",
"llb",
"md",
"bl",
"phd",
"ma",
"ba",
"miss",
"misses",
"mister",
"sir",
"esq",
"mstr",
"lit",
"fl",
"ex",
"eg",
"sep",
"sept"
]
var abbrev=new RegExp("(^| )("+abbrevs.join("|")+")\. ?$","i");
//join acronyms, titles
for (var i in tmp) {
if (tmp[i]) {
tmp[i] = tmp[i].replace(/^\s+|\s+$/g, ''); //trim extra whitespace
//join common abbreviations + acronyms
if (tmp[i].match(abbrev) || tmp[i].match(/[ |\.][a-z]\.?$/i)) {
tmp[parseInt(i) + 1] = tmp[i] + ' ' + tmp[parseInt(i) + 1];
}
else {
sentences.push(tmp[i]);
tmp[i] = '';
}
}
}
//cleanup afterwards
var clean = [];
for (var i in sentences) {
sentences[i] = sentences[i].replace(/^\s+|\s+$/g, ''); //trim extra whitespace
if (sentences[i]) {
clean.push(sentences[i]);
}
}
return clean;
}
//console.log(exports.sentenceparser('Dr. calm is me. lkj'))
// export for AMD / RequireJS
if (typeof define !== 'undefined' && define.amd) {
define([], function() {
return sentenceparser;
});
}
// export for Node.js
else if (typeof module !== 'undefined' && module.exports) {
module.exports = sentenceparser;
}
return sentenceparser;
})()