sbd-ts
Version:
Split text into sentences with Sentence Boundary Detection (SBD).
184 lines (156 loc) • 4.38 kB
text/typescript
// Built-in list of English abbreviations. Entries are stored without their
// trailing period and are case-sensitive (both "Fig" and "fig" are listed).
// Every entry appears exactly once.
const englishAbbreviations = [
    "al",
    "adj",
    "assn",
    "Ave",
    // Academic degrees
    "BSc", "MSc",
    "Cell",
    "Ch",
    "Co",
    "cc",
    "Corp",
    "Dem",
    "Dept",
    "ed",
    "eg",
    "Eq",
    "Eqs",
    "est",
    "etc",
    "Ex",
    "ext", // + number?
    "Fig",
    "fig",
    "Figs",
    "figs",
    "i.e", // contains a dot; a lookup that strips punctuation first hits "ie" instead
    "ie",
    "Inc",
    "inc",
    // Month names (there is no entry for May)
    "Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aug", "Sep", "Sept", "Oct", "Nov", "Dec",
    "jr",
    "mi",
    // Courtesy titles
    "Miss", "Mrs", "Mr", "Ms",
    "Mol",
    "mt",
    "mts",
    "no",
    "Nos",
    "PhD", "MD", "BA", "MA", "MM",
    "pl",
    "pop",
    "pp",
    "Prof", "Dr",
    "pt",
    "Ref",
    "Refs",
    "Rep",
    "repr",
    "rev",
    "Sec",
    "Secs",
    // Ranks and offices
    "Sgt", "Col", "Gen", "Sen", "Gov", "Lt", "Maj", "Capt", "St",
    "Sr", "sr", "Jr", "Rev",
    // Weekday names
    "Sun", "Mon", "Tu", "Tue", "Tues", "Wed", "Th", "Thu", "Thur", "Thurs", "Fri", "Sat",
    "trans",
    "Univ",
    "Viz",
    "Vol",
    "vs",
    "v",
];

// The list currently in effect. It starts out as the English list so that a
// lookup performed before any explicit configuration does not hit `undefined`.
let abbreviations = englishAbbreviations;
/**
 * Static predicates used for sentence boundary detection. Each method looks
 * at one token (or a token plus a few following tokens) and reports whether
 * it has a particular shape: abbreviation, number, URL, phone number, ...
 */
export default class Match {
    /**
     * Chooses the abbreviation list consulted by `isCommonAbbreviation`.
     *
     * @param abbr List of abbreviations to use. Any falsy value selects the
     *             built-in English list.
     */
    public static setAbbreviations(abbr) {
        abbreviations = abbr || englishAbbreviations;
    }

    /**
     * Tests whether `str` starts with an uppercase ASCII letter immediately
     * followed by a lowercase ASCII letter, or is numeric per `isNumber`.
     * Because `isNumber` coerces its input, the empty string also yields true.
     *
     * @param str Token to test.
     * @returns true when the token counts as capitalized.
     */
    public static isCapitalized(str) {
        if (/^[A-Z][a-z].*/.test(str)) {
            return true;
        }
        return this.isNumber(str);
    }

    /**
     * Tests whether `str` could open a sentence: it is capitalized, or one of
     * its first two characters is a double quote, a single quote, or part of
     * a pair of backticks.
     *
     * @param str Token to test.
     * @returns true when the token looks like a sentence start.
     */
    public static isSentenceStarter(str) {
        if (this.isCapitalized(str)) {
            return true;
        }
        const head = str.substring(0, 2);
        return /``|"|'/.test(head);
    }

    /**
     * Looks `str` up in the active abbreviation list after removing
     * punctuation and symbol characters from it.
     *
     * @param str Token to look up.
     * @returns 0 when the stripped token is not in the list; otherwise a
     *          non-zero number (the bitwise NOT of its index). Intended to be
     *          used in a boolean context.
     */
    public static isCommonAbbreviation(str) {
        // Fall back to the built-in list if no list has been configured yet.
        const known = abbreviations || englishAbbreviations;
        const noSymbols = str.replace(/[-'`~!@#$%^&*()_|+=?;:'",.<>\{\}\[\]\\\/]/gi, "");
        return ~known.indexOf(noSymbols);
    }

    /**
     * Tests for "a.m." / "p.m." followed by a word that ends in "day"
     * (case-insensitive, ignoring non-word characters), e.g. "p.m." then
     * "Monday,".
     *
     * @param word Current token; must be exactly "a.m." or "p.m." to match.
     * @param next Following token. A missing or non-string value yields false.
     * @returns true only when both conditions hold.
     */
    public static isTimeAbbreviation(word, next) {
        if (word !== "a.m." && word !== "p.m.") {
            return false;
        }
        if (typeof next !== "string") {
            return false;
        }
        const tail = next.replace(/\W+/g, '').slice(-3).toLowerCase();
        return tail === "day";
    }

    /**
     * Tests whether `word`, with round, square and curly brackets removed,
     * begins with one or more dotted segments, where a segment is a single
     * character or a run of uppercase letters followed by a period
     * (e.g. "U.S." or "e.g.").
     *
     * @param word Token to test.
     * @returns true when such a leading run exists.
     */
    public static isDottedAbbreviation(word) {
        const bare = word.replace(/[\(\)\[\]\{\}]/g, '');
        // The pattern may match the empty string, so a hit alone proves
        // nothing; only a non-empty match counts.
        const matches = bare.match(/((.|[A-Z]+)\.)*/);
        return matches && matches[0].length > 0;
    }

    // TODO look for next words, if multiple are capitalized,
    // then it's probably not a sentence ending
    /**
     * Heuristic for abbreviations that are not in the list: any token of at
     * most three characters, or any capitalized token.
     *
     * @param str Token to test.
     * @returns true when the token is short or capitalized.
     */
    public static isCustomAbbreviation(str) {
        return str.length <= 3 ? true : this.isCapitalized(str);
    }

    /**
     * Uses the current word count of the sentence and the next few words to
     * judge whether an abbreviation followed by a name is more likely than a
     * new sentence.
     *
     * @param wordCount Number of words collected in the current sentence.
     * @param words     Upcoming words.
     * @returns true when the sentence has fewer than five words and the first
     *          upcoming word is capitalized and shorter than six characters,
     *          or when at least three upcoming words start with an uppercase
     *          letter; false otherwise, including for an empty `words`.
     */
    public static isNameAbbreviation(wordCount, words) {
        if (words.length === 0) {
            return false;
        }
        const first = words[0];
        if (wordCount < 5 && first.length < 6 && this.isCapitalized(first)) {
            return true;
        }
        const capitalized = words.filter((w) => /[A-Z]/.test(w.charAt(0)));
        return capitalized.length >= 3;
    }

    /**
     * Tests whether `str` converts to a number. The conversion follows
     * JavaScript's numeric coercion, so "" and whitespace-only strings are
     * treated as numbers (they coerce to 0).
     *
     * @param str    Value to test.
     * @param dotPos Optional index into `str`. When truthy, only the three
     *               characters centred on that index are tested.
     * @returns true when the tested text is not NaN after conversion.
     */
    public static isNumber(str, dotPos?) {
        const candidate = dotPos ? str.slice(dotPos - 1, dotPos + 2) : str;
        return !isNaN(Number(candidate));
    }

    // Phone number matching
    // http://stackoverflow.com/a/123666/951517
    /**
     * Matches `str` against a phone-number pattern.
     *
     * @param str Token to test.
     * @returns The match array on success, null otherwise.
     */
    public static isPhoneNr(str) {
        return str.match(/^(?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?$/);
    }

    // Match urls / emails
    // http://stackoverflow.com/a/3809435/951517
    /**
     * Searches `str` for something shaped like a URL or e-mail address.
     *
     * @param str Token to test.
     * @returns The match array on success, null otherwise.
     */
    public static isURL(str) {
        return str.match(/[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)/);
    }

    /**
     * Detects two words glued together by a sentence-ending character with
     * no space after it, e.g. "end.Next".
     *
     * The first ".", "!" or "?" in `word` is located, whichever comes
     * earliest. If the character right after it is an ASCII letter, the word
     * is split there and the boundary character is dropped.
     *
     * @param word Token to inspect.
     * @returns `[before, after]` when the token is concatenated, otherwise
     *          false.
     */
    public static isConcatenated(word) {
        const boundary = word.search(/[.!?]/);
        if (boundary === -1) {
            return false;
        }
        const following = word.charAt(boundary + 1);
        // Anything but a letter (another ".", a digit, end of word) means
        // this is not a glued-together pair.
        if (!/[a-zA-Z]/.test(following)) {
            return false;
        }
        return [word.slice(0, boundary), word.slice(boundary + 1)];
    }

    /**
     * Tests whether `word` is exactly one sentence-ending character.
     *
     * @param word Token to test.
     * @returns true for ".", "!" or "?"; false for anything else.
     */
    public static isBoundaryChar(word) {
        return word === "." ||
            word === "!" ||
            word === "?";
    }
}