nlpsum
Version:
Powerful text summarization algorithms from research papers and dedicated research.
378 lines (326 loc) • 13.9 kB
JavaScript
var date_extractor = (function() {
// Public entry point: locate the first date expression in `text` and return
// the matched snippet together with its parsed start ("from") and end ("to").
// Both dates are produced by formatdate(); the snippet and the raw date
// strings come from finddate().
var date_extractor = function(text) {
    var located = finddate(text);
    var summary = {};
    summary["text"] = located[0];
    summary["from"] = formatdate(located[1]);
    summary["to"] = formatdate(located[2]);
    return summary;
}
//this function returns an array.
//foundarray[0] is the text that contains the date bit (to highlight),
//foundarray[1] is the start date
//foundarray[2] is the end date
// Scan `text` one sentence at a time and return the first date expression found.
//
// Returns an array:
//   [0] the matched snippet (taken from the normalised text, so an abbreviated
//       month such as "Feb." appears expanded),
//   [1] the start date as a string,
//   [2] the end date as a string (only present for ranges).
// An empty array is returned when `text` is null/undefined or holds no date.
//
// Patterns are tried from most to least specific; the first hit wins.
function finddate(text) {
    if (text == null) {
        return [];
    }
    text = String(text);

    // Shared regex fragments, so every pattern agrees on what a month, a day
    // (with optional ordinal suffix, including "nd") and a range separator are.
    var MONTH = '(july|august|september|october|november|december|january|february|march|april|may|june)';
    var DAY = '[0-9]{1,2}(th|rd|st|nd)?';
    var YEAR = '[0-9]{4}';
    var SEP = '(-| to | until | till )';
    var SEP_AND = '(-| to | until | till | and )';
    var SEP_SLASH = '(-| to | until | till |/)';
    var MONTH_YEAR = MONTH + '( |, |,)(of )?' + YEAR;
    // BC / B.C. / BCE / B.C.E. with literal dots; the lookahead stops the
    // marker from matching the start of an ordinary word or number.
    var BC = '(B\\.?C\\.?(E\\.?)?)(?![a-z0-9])';
    // Heuristic cut-off kept from the original: bare 4-digit numbers at or
    // above this value are not treated as years.
    // NOTE(review): this now rejects real years from 2020 onwards - confirm the intended bound.
    var MAX_YEAR = 2020;

    function rx(source) {
        return new RegExp(source, 'i');
    }

    // Expand abbreviated month names before splitting into sentences, so the
    // period in "Feb. 3" is not mistaken for a sentence end.
    var abbreviations = [
        [/ Jan\.? /g, ' January '],
        [/ Feb\.? /g, ' February '],
        [/ Mar\.? /g, ' March '],
        [/ Apr\.? /g, ' April '],
        [/ Jun\.? /g, ' June '],
        [/ Jul\.? /g, ' July '],
        [/ Aug\.? /g, ' August '],
        [/ Sept?\.? /g, ' September '],
        [/ Oct\.? /g, ' October '],
        [/ Nov\.? /g, ' November '],
        [/ Dec\.? /g, ' December ']
    ];
    for (var a = 0; a < abbreviations.length; a++) {
        text = text.replace(abbreviations[a][0], abbreviations[a][1]);
    }
    // Pad parentheses so a date directly inside them is still space-delimited.
    text = text.replace(/\(/g, '( ');
    text = text.replace(/\)/g, ' )');

    var sentences = text.split(/\. /); //one sentence at a time
    for (var i = 0; i < sentences.length; i++) {
        var m;
        var found;
        // Leading space lets the year patterns (which start with ' ') match at
        // the very beginning of a sentence.
        var sentence = ' ' + sentences[i];
        sentence = sentence.replace(/\bfirst\b/g, '1st');
        sentence = sentence.replace(/second of /g, '2nd of ');

        //eg March 7th-11th 1987
        m = rx(MONTH + ',? (the )?' + DAY + ' ?' + SEP + ' ?' + DAY + ',? ' + YEAR).exec(sentence);
        if (m !== null) {
            found = m[0];
            return [
                found,
                found.replace(rx(' ?' + SEP + ' ?' + DAY + ',?'), ''), //drop the "until" day
                found.replace(rx(DAY + ',? ?' + SEP), '') //drop the "from" day
            ];
        }

        //eg '28th of September to 5th of October 2008'
        m = rx(DAY + ' (of )?' + MONTH + ',? ?' + SEP + ' ?' + DAY + ' (of )?' + MONTH + ',? ' + YEAR).exec(sentence);
        if (m !== null) {
            found = m[0];
            var joiner = /( of | to | until | till |-)/;
            return [
                found,
                found.replace(rx(SEP + ' ?' + DAY + ' (of )?' + MONTH + ',?'), '').replace(joiner, ' '),
                found.replace(rx(DAY + ' (of )?' + MONTH + ',? ?' + SEP), '').replace(joiner, ' ')
            ];
        }

        //eg March 7th to june 11th 1987
        m = rx(MONTH + ',? (the )?' + DAY + ' ?' + SEP + ' ?' + MONTH + ',? (the )?' + DAY + '( |, )' + YEAR).exec(sentence);
        if (m !== null) {
            found = m[0];
            return [
                found,
                found.replace(rx(' ?' + SEP + ' ?' + MONTH + ',? (the )?' + DAY + ',?'), ''),
                found.replace(rx(MONTH + ',? (the )?' + DAY + ' ?' + SEP + ' ?'), '')
            ];
        }

        //eg between 13 February and 15 February 1945
        m = rx('(through|throughout|during|within|between) ' + DAY + ' ' + MONTH + ',? ?' + SEP_AND + ' ?' + DAY + ' ' + MONTH + ',? ' + YEAR).exec(sentence);
        if (m !== null) {
            found = m[0];
            // The snippet ends in "<day> <month> <year>", so this always matches.
            var endMatch = rx(DAY + ' ' + MONTH + ',? ' + YEAR).exec(found);
            var start = found
                .replace(/(through|throughout|during|within|between) /i, '')
                .replace(rx(',? ?' + SEP_AND + ' ?' + DAY + ' ' + MONTH + ',?'), '');
            return [found, start, endMatch[0]];
        }

        //eg between March 7th and june 11th 1987
        m = rx('between ' + MONTH + ',? (the )?' + DAY + ' ?(-| and ) ?' + MONTH + ',? (the )?' + DAY + '( |, )' + YEAR).exec(sentence);
        if (m !== null) {
            found = m[0];
            return [
                found,
                found.replace(rx(' ?(-| and ) ?' + MONTH + ',? (the )?' + DAY + ',?'), '').replace(/between /i, ''),
                found.replace(rx('between ' + MONTH + ',? (the )?' + DAY + ' ?(-| and ) ?'), '')
            ];
        }

        //eg March 1st 1987
        m = rx(MONTH + ',? (the )?' + DAY + '( |, |,)' + YEAR).exec(sentence);
        if (m !== null) {
            return [m[0], m[0]];
        }

        //eg 3rd - 5th of March 1969
        m = rx(DAY + ' ?' + SEP_SLASH + ' ?' + DAY + ' (of )?' + MONTH + '( |, |,)' + YEAR).exec(sentence);
        if (m !== null) {
            found = m[0];
            return [
                found,
                found.replace(rx('(th|rd|st|nd)? ?' + SEP_SLASH + ' ?' + DAY + '( of)?'), ''),
                found.replace(rx(DAY + ' ?' + SEP_SLASH + ' ?'), '').replace(/\bof /, '')
            ];
        }

        //eg 3rd of March 1969
        m = rx(DAY + ' ?(of )?' + MONTH + '( |, |,)' + YEAR).exec(sentence);
        if (m !== null) {
            return [m[0], m[0]];
        }

        //eg September 1939 to April 1945
        m = rx(MONTH_YEAR + ' ?' + SEP + ' ?' + MONTH_YEAR).exec(sentence);
        if (m !== null) {
            found = m[0];
            return [
                found,
                found.replace(rx(' ?' + SEP + ' ?' + MONTH_YEAR), ''),
                found.replace(rx(MONTH_YEAR + ' ?' + SEP + ' ?'), '')
            ];
        }

        //eg March 1969
        m = rx(MONTH_YEAR).exec(sentence);
        if (m !== null) {
            return [m[0], m[0]];
        }

        //eg 400 - 600 BC (only the first year of the range is reported)
        m = rx(' ([0-9]{3,4} ?- ?[0-9]{2,4} ?' + BC + ')').exec(sentence);
        if (m !== null) {
            found = m[1].replace(/ ?- ?[0-9]{2,4}/, '');
            return [found, found.replace(rx(' ?' + BC), ' B.C.')];
        }

        //eg 1997-1998
        m = / ([0-9]{4}) ?(-|\u2013| to | until | till ) ?'?([0-9]{4})(?![0-9])/i.exec(sentence);
        if (m !== null && Number(m[1]) < MAX_YEAR) {
            return [m[0], m[1], m[3]];
        }

        //eg 1997-98
        m = / ([0-9]{2})([0-9]{2}) ?(-|\u2013| to | until | till ) ?'?([0-9]{2})(?![0-9])/i.exec(sentence);
        if (m !== null && Number(m[1] + m[2]) < MAX_YEAR) {
            var startYear = m[1] + m[2];
            var endYear = m[1] + m[4];
            // "1999-02" means 1999 to 2002: roll over into the next century
            // when the short end year would otherwise precede the start.
            if (Number(endYear) < Number(startYear)) {
                endYear = String(Number(endYear) + 100);
                while (endYear.length < 4) {
                    endYear = '0' + endYear;
                }
            }
            return [m[0], startYear, endYear];
        }

        //eg 400 BC
        m = rx(' ([0-9]{3,4} ?' + BC + ')').exec(sentence);
        if (m !== null) {
            found = m[1];
            return [found, found.replace(rx(' ?' + BC), ' B.C.')];
        }

        //matches a bare year; skips 4-digit numbers that are implausible as years
        var yearPattern = / ([0-9]{4})(?![0-9])/g;
        while ((m = yearPattern.exec(sentence)) !== null) {
            if (Number(m[1]) < MAX_YEAR) {
                return [m[0], m[0]];
            }
        }
    } //for sentences
    return [];
}
//begin processing date to be mql-friendly
//var str=formatdate('July 2, 1934'); acre.write(str);
// Turn a human-readable date fragment (e.g. "9th of april, 2005") into its
// components.
//
// Returns:
//   - {} when `found` is empty/undefined, or a B.C. date carries no year;
//   - a string such as "-0400" for a B.C. date (the fragment contains "B.C.");
//   - otherwise { year, month, day } where `year` is a 4-digit string or null,
//     `month` is a 2-digit string ("01".."12") or the number 0 when no month
//     name is present, and `day` is a 2-digit string or null.
function formatdate(found) {
    if (!found) {
        return {};
    }
    found = String(found);

    var MONTH_NAMES = ['january', 'february', 'march', 'april', 'may', 'june',
        'july', 'august', 'september', 'october', 'november', 'december'];

    var monthMatch = /july|august|september|october|november|december|january|february|march|april|may|june/i.exec(found);
    var monthnum = 0;
    if (monthMatch !== null) {
        var monthIndex = MONTH_NAMES.indexOf(monthMatch[0].toLowerCase()) + 1;
        monthnum = (monthIndex < 10 ? '0' : '') + monthIndex;
    }

    // Literal substring test: String.prototype.match('B.C.') would compile the
    // argument to a regex in which each dot matches any character.
    if (found.indexOf('B.C.') !== -1) {
        var bcMatch = /[0-9]{3,4}/.exec(found);
        if (bcMatch === null) {
            return {};
        }
        var bcYear = bcMatch[0];
        if (bcYear.length === 3) {
            bcYear = '0' + bcYear; //pad to four digits
        }
        return '-' + bcYear;
    }

    // Ordinal suffixes ("1st", "7th", ...) and filler words ("of", "the") need
    // no stripping: the digit patterns below simply skip over them. Removing
    // the year first keeps its digits from being mistaken for the day.
    var yearMatch = /[0-9]{4}/.exec(found);
    var year = null;
    var remainder = found;
    if (yearMatch !== null) {
        year = yearMatch[0];
        remainder = found.replace(year, '');
    }

    var dayMatch = /[0-9]{1,2}/.exec(remainder);
    var day = null;
    if (dayMatch !== null) {
        day = dayMatch[0];
        if (day.length === 1) {
            day = '0' + day; //turn 1 into 01
        }
    }

    return {
        "year": year,
        "month": monthnum,
        "day": day
    };
}
//console.log(exports.date_extractor('my wife left me on the 9th of april, 2005. now i just programz the computerz.'));
// export for AMD / RequireJS
if (typeof define !== 'undefined' && define.amd) {
define([], function() {
return date_extractor;
});
}
// export for Node.js
else if (typeof module !== 'undefined' && module.exports) {
module.exports = date_extractor;
}
return date_extractor;
})()