node-nlp
Version:
Library for NLU (Natural Language Understanding) done in Node.js
137 lines (113 loc) • 4.52 kB
JavaScript
/*
Parser for US-style (middle-endian) dates that begin with the month name.
Examples:
- January 13
- January 13, 2012
- January 13 - 15, 2012
- Tuesday, January 13, 2012
Tricky inputs to watch out for:
- January 12:00
- January 12.44
- January 1222344
*/
var moment = require('moment');
var Parser = require('../parser').Parser;
var ParsedResult = require('../../result').ParsedResult;
var util = require('../../utils/EN');
/*
 * Case-insensitive regular expression for dates led by a month name.
 * Pieces, in order:
 *   1. a leading non-word character or start of input (captured so that
 *      extract() can strip it from the reported text and index);
 *   2. an optional "on" followed by a weekday name or abbreviation;
 *   3. the month name or abbreviation (optionally dotted);
 *   4. the day of month: 1-2 digits with an optional st/nd/rd/th suffix, or
 *      an ordinal word matched by util.ORDINAL_WORDS_PATTERN;
 *   5. an optional range end: "to" or "-" followed by another day;
 *   6. an optional year: four digits with optional BE/AD/BC, or 1-4 digits
 *      that must be followed by AD/BC;
 *   7. a lookahead requiring a non-word character or end of input that is
 *      not ":" followed by a digit.
 */
var PATTERN = new RegExp('(\\W|^)' +
    '(?:' +
        '(?:on\\s*?)?' +
        '(Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sun\\.?|Mon\\.?|Tue\\.?|Wed\\.?|Thu\\.?|Fri\\.?|Sat\\.?)' +
    '\\s*,?\\s*)?' +
    '(Jan\\.?|January|Feb\\.?|February|Mar\\.?|March|Apr\\.?|April|May\\.?|Jun\\.?|June|Jul\\.?|July|Aug\\.?|August|Sep\\.?|Sept\\.?|September|Oct\\.?|October|Nov\\.?|November|Dec\\.?|December)' +
    '(?:-|\/|\\s*,?\\s*)' +
    '(([0-9]{1,2})(?:st|nd|rd|th)?|' + util.ORDINAL_WORDS_PATTERN + ')\\s*' +
    '(?:' +
        '(?:to|\\-)\\s*' +
        // No literal space before the ordinal-word alternative: the \s* above
        // already consumes any whitespace, and requiring one more space would
        // reject a range end written directly after the dash ("first-third").
        '(([0-9]{1,2})(?:st|nd|rd|th)?|' + util.ORDINAL_WORDS_PATTERN + ')\\s*' +
    ')?' +
    '(?:' +
        '(?:-|\/|\\s*,?\\s*)' +
        '(?:([0-9]{4})\\s*(BE|AD|BC)?|([0-9]{1,4})\\s*(AD|BC))\\s*' +
    ')?' +
    '(?=\\W|$)(?!\\:\\d)', 'i');

// Capture-group indexes into a PATTERN match. Group 1 is the leading
// boundary character. These assume util.ORDINAL_WORDS_PATTERN adds no
// capturing groups of its own (TODO confirm against utils/EN).
var WEEKDAY_GROUP = 2;       // weekday name or abbreviation, when present
var MONTH_NAME_GROUP = 3;    // month name or abbreviation
var DATE_GROUP = 4;          // day of month as written (digits+suffix or ordinal word)
var DATE_NUM_GROUP = 5;      // day of month digits only, when written numerically
var DATE_TO_GROUP = 6;       // range-end day as written, when present
var DATE_TO_NUM_GROUP = 7;   // range-end day digits only, when written numerically
var YEAR_GROUP = 8;          // four-digit year
var YEAR_BE_GROUP = 9;       // optional era (BE/AD/BC) after the four-digit year
var YEAR_GROUP2 = 10;        // 1-4 digit year that is followed by an era
var YEAR_BE_GROUP2 = 11;     // era (AD/BC) after the 1-4 digit year
exports.Parser = function ENMonthNameMiddleEndianParser(){
Parser.apply(this, arguments);
this.pattern = function() { return PATTERN; }
this.extract = function(text, ref, match, opt){
var result = new ParsedResult({
text: match[0].substr(match[1].length, match[0].length - match[1].length),
index: match.index + match[1].length,
ref: ref,
});
var month = match[MONTH_NAME_GROUP];
month = util.MONTH_OFFSET[month.toLowerCase()];
var day = match[DATE_NUM_GROUP] ?
parseInt(match[DATE_NUM_GROUP]) :
util.ORDINAL_WORDS[match[DATE_GROUP].trim().replace('-', ' ').toLowerCase()];
var year = null;
if (match[YEAR_GROUP] || match[YEAR_GROUP2]) {
year = match[YEAR_GROUP] || match[YEAR_GROUP2];
year = parseInt(year);
var yearBE = match[YEAR_BE_GROUP] || match[YEAR_BE_GROUP2];
if (yearBE) {
if (/BE/i.test(yearBE)) {
// Buddhist Era
year = year - 543;
} else if (/BC/i.test(yearBE)) {
// Before Christ
year = -year;
}
} else if (year < 100){
year = year + 2000;
}
}
if(year){
result.start.assign('day', day);
result.start.assign('month', month);
result.start.assign('year', year);
} else {
//Find the most appropriated year
var refMoment = moment(ref);
refMoment.month(month - 1);
refMoment.date(day);
var nextYear = refMoment.clone().add(1, 'y');
var lastYear = refMoment.clone().add(-1, 'y');
if( Math.abs(nextYear.diff(moment(ref))) < Math.abs(refMoment.diff(moment(ref))) ){
refMoment = nextYear;
}
else if( Math.abs(lastYear.diff(moment(ref))) < Math.abs(refMoment.diff(moment(ref))) ){
refMoment = lastYear;
}
result.start.assign('day', day);
result.start.assign('month', month);
result.start.imply('year', refMoment.year());
}
// Weekday component
if (match[WEEKDAY_GROUP]) {
var weekday = match[WEEKDAY_GROUP];
weekday = util.WEEKDAY_OFFSET[weekday.toLowerCase()]
result.start.assign('weekday', weekday);
}
// Text can be 'range' value. Such as 'January 12 - 13, 2012'
if (match[DATE_TO_GROUP]) {
var endDate = match[DATE_TO_NUM_GROUP] ?
endDate = parseInt(match[DATE_TO_NUM_GROUP]) :
util.ORDINAL_WORDS[match[DATE_TO_GROUP].replace('-', ' ').trim().toLowerCase()];
result.end = result.start.clone();
result.end.assign('day', endDate);
}
result.tags['ENMonthNameMiddleEndianParser'] = true;
return result;
}
};