sbd-fork
Version:
Split text into sentences with Sentence Boundary Detection (SBD).
168 lines (167 loc) • 4.97 kB
JavaScript
// NOTE(review): the lone `;` below looks like the remainder of a stripped
// directive (possibly "use strict") — confirm against the published file.
;
// Define `__esModule = true` on the CommonJS exports object.
Object.defineProperty(exports, "__esModule", { value: true });
// Abbreviation list currently in effect. It is declared without a value and
// is only assigned by Match.setAbbreviations().
var abbreviations;
/**
 * Built-in English abbreviation list; Match.setAbbreviations() installs it
 * when no custom list is supplied.
 *
 * Entries are stored without the trailing dot. The order of the elements and
 * the repeated entries ("est", "jr", "Rep") are kept exactly as they were, so
 * every element keeps its position in the array.
 */
var englishAbbreviations = [
    // a - c
    "al", "adj", "assn", "Ave", "BSc", "MSc", "Cell", "Ch", "Co", "cc", "Corp",
    // d - e
    "Dem", "Dept", "ed", "eg", "Eq", "Eqs", "est", "est", "etc", "Ex", "ext",
    // f - i
    "Fig", "fig", "Figs", "figs", "i.e", "ie", "Inc", "inc",
    // months
    "Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aug", "Sep", "Sept", "Oct", "Nov", "Dec",
    // j - m
    "jr", "mi",
    "Miss", "Mrs", "Mr", "Ms",
    "Mol", "mt", "mts",
    // n - p
    "no", "Nos",
    "PhD", "MD", "BA", "MA", "MM",
    "pl", "pop", "pp",
    "Prof", "Dr",
    "pt",
    // r - s
    "Ref", "Refs", "Rep", "repr", "rev", "Sec", "Secs",
    // ranks, offices and name suffixes
    "Sgt", "Col", "Gen", "Rep", "Sen", "Gov", "Lt", "Maj", "Capt", "St",
    "Sr", "sr", "Jr", "jr", "Rev",
    // weekdays
    "Sun", "Mon", "Tu", "Tue", "Tues", "Wed", "Th", "Thu", "Thur", "Thurs", "Fri", "Sat",
    // t - v
    "trans", "Univ", "Viz", "Vol", "vs", "v"
];
/**
 * Static predicates used to decide whether a token marks a sentence boundary.
 *
 * Every member is attached to the constructor itself; several of them call
 * siblings through `this`, so they must be invoked as `Match.method(...)`
 * rather than as detached function references.
 */
var Match = /** @class */ (function () {
    function Match() {
    }
    /**
     * Selects the abbreviation list consulted by isCommonAbbreviation.
     *
     * @param {string[]} [abbr] - Custom list; any falsy value selects the
     *     built-in English list instead.
     */
    Match.setAbbreviations = function (abbr) {
        abbreviations = abbr ? abbr : englishAbbreviations;
    };
    /**
     * @param {string} str - Token to inspect.
     * @returns {boolean} True when `str` begins with an uppercase ASCII letter
     *     followed by a lowercase ASCII letter, or when isNumber(str) is true.
     */
    Match.isCapitalized = function (str) {
        return /^[A-Z][a-z].*/.test(str) || this.isNumber(str);
    };
    /**
     * A sentence starter is a capitalized token (see isCapitalized) or a token
     * whose first two characters contain a double quote, a single quote, or
     * two backticks.
     *
     * @param {string} str - Token to inspect.
     * @returns {boolean}
     */
    Match.isSentenceStarter = function (str) {
        return this.isCapitalized(str) || /``|"|'/.test(str.substring(0, 2));
    };
    /**
     * Looks the token up in the active abbreviation list after removing
     * punctuation and symbol characters from it.
     *
     * @param {string} str - Token to inspect (e.g. "Dr.").
     * @returns {number} 0 when the token is not listed, a non-zero (negative)
     *     number when it is; intended to be used as a truthy/falsy flag.
     */
    Match.isCommonAbbreviation = function (str) {
        var noSymbols = str.replace(/[-'`~!@#$%^&*()_|+=?;:'",.<>\{\}\[\]\\\/]/gi, "");
        // Use the built-in list when setAbbreviations() has not been called yet,
        // instead of failing on an undefined list.
        var activeList = abbreviations || englishAbbreviations;
        return ~activeList.indexOf(noSymbols);
    };
    /**
     * Detects "a.m." / "p.m." followed by a word ending in "day"
     * (e.g. "a.m. Monday"). This is going towards too much rule based.
     *
     * @param {string} word - Current token.
     * @param {string} [next] - Following token, if any.
     * @returns {boolean}
     */
    Match.isTimeAbbreviation = function (word, next) {
        if (word !== "a.m." && word !== "p.m.") {
            return false;
        }
        // No following token (e.g. end of input): nothing to compare against.
        if (typeof next !== "string") {
            return false;
        }
        // Strip non-word characters so that "Monday," still ends in "day".
        var tail = next.replace(/\W+/g, '').slice(-3).toLowerCase();
        return tail === "day";
    };
    /**
     * Checks whether the token, with brackets removed, starts with one or more
     * "<any single character>." or "<uppercase letters>." groups, such as
     * "U.S.A." or "USA.".
     *
     * @param {string} word - Token to inspect.
     * @returns {boolean}
     */
    Match.isDottedAbbreviation = function (word) {
        // The pattern can match the empty string, so the match is tried at the
        // start of the token only; an empty result means "no dotted prefix".
        var matches = word.replace(/[\(\)\[\]\{\}]/g, '').match(/((.|[A-Z]+)\.)*/);
        return matches && matches[0].length > 0;
    };
    // TODO look for next words, if multiple are capitalized,
    // then it's probably not a sentence ending
    /**
     * @param {string} str - Token to inspect.
     * @returns {boolean} True for tokens of at most three characters, or for
     *     tokens that satisfy isCapitalized.
     */
    Match.isCustomAbbreviation = function (str) {
        if (str.length <= 3) {
            return true;
        }
        return this.isCapitalized(str);
    };
    /**
     * Uses the current word count in the sentence and the next few words to
     * check whether an abbreviation followed by a name is more likely than a
     * new sentence.
     *
     * @param {number} wordCount - Number of words in the sentence so far.
     * @param {string[]} words - Upcoming tokens.
     * @returns {boolean} True when the sentence is short (< 5 words) and the
     *     first token is a short (< 6 chars) capitalized word, or when at
     *     least three of the tokens start with an uppercase ASCII letter.
     */
    Match.isNameAbbreviation = function (wordCount, words) {
        if (words.length === 0) {
            return false;
        }
        if (wordCount < 5 && words[0].length < 6 && this.isCapitalized(words[0])) {
            return true;
        }
        var capitalized = words.filter(function (str) {
            return /[A-Z]/.test(str.charAt(0));
        });
        return capitalized.length >= 3;
    };
    /**
     * Tests whether a token (or the three-character window around a dot in it)
     * converts to a number.
     *
     * @param {string} str - Token to inspect.
     * @param {number} [dotPos] - Index of a dot inside `str`; when truthy, only
     *     the characters from `dotPos - 1` up to and including `dotPos + 1`
     *     are examined. An index of 0 is treated like an omitted argument.
     * @returns {boolean} False for empty or whitespace-only text.
     */
    Match.isNumber = function (str, dotPos) {
        var candidate = dotPos ? str.slice(dotPos - 1, dotPos + 2) : str;
        // Blank strings coerce to 0 and would otherwise be reported as numeric.
        if (typeof candidate === "string" && candidate.trim() === "") {
            return false;
        }
        return !isNaN(candidate);
    };
    /**
     * Phone number matching.
     * http://stackoverflow.com/a/123666/951517
     *
     * @param {string} str - Token to inspect.
     * @returns {?Array} The regex match array, or null when there is no match.
     */
    Match.isPhoneNr = function (str) {
        return str.match(/^(?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?$/);
    };
    /**
     * Match urls / emails.
     * http://stackoverflow.com/a/3809435/951517
     *
     * @param {string} str - Token to inspect.
     * @returns {?Array} The regex match array, or null when there is no match.
     */
    Match.isURL = function (str) {
        return str.match(/[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)/);
    };
    /**
     * Splits a token in which a boundary character is directly followed by an
     * ASCII letter, e.g. "end.Next" -> ["end", "Next"].
     *
     * Only one boundary character is examined: the first ".", else the first
     * "!", else the first "?". A token such as "what?No." therefore yields
     * false, because its "." is not followed by a letter.
     *
     * @param {string} word - Token to inspect.
     * @returns {(string[]|false)} The two halves around the boundary character,
     *     or false when the token is not concatenated.
     */
    Match.isConcatenated = function (word) {
        var boundary = word.indexOf(".");
        if (boundary === -1) {
            boundary = word.indexOf("!");
        }
        if (boundary === -1) {
            boundary = word.indexOf("?");
        }
        // charAt() yields "" past the end of the token, which never matches.
        if (boundary > -1 && /[a-zA-Z]/.test(word.charAt(boundary + 1))) {
            return [word.slice(0, boundary), word.slice(boundary + 1)];
        }
        return false;
    };
    /**
     * @param {string} word - Token to inspect.
     * @returns {boolean} True when the token is exactly ".", "!" or "?".
     */
    Match.isBoundaryChar = function (word) {
        return word === "." ||
            word === "!" ||
            word === "?";
    };
    return Match;
}());
// Expose the Match helper class as this module's default export.
exports.default = Match;