UNPKG

sbd-fork

Version:

Split text into sentences with Sentence Boundary Detection (SBD).

168 lines (167 loc) 4.97 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); var abbreviations; var englishAbbreviations = [ "al", "adj", "assn", "Ave", "BSc", "MSc", "Cell", "Ch", "Co", "cc", "Corp", "Dem", "Dept", "ed", "eg", "Eq", "Eqs", "est", "est", "etc", "Ex", "ext", "Fig", "fig", "Figs", "figs", "i.e", "ie", "Inc", "inc", "Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aug", "Sep", "Sept", "Oct", "Nov", "Dec", "jr", "mi", "Miss", "Mrs", "Mr", "Ms", "Mol", "mt", "mts", "no", "Nos", "PhD", "MD", "BA", "MA", "MM", "pl", "pop", "pp", "Prof", "Dr", "pt", "Ref", "Refs", "Rep", "repr", "rev", "Sec", "Secs", "Sgt", "Col", "Gen", "Rep", "Sen", 'Gov', "Lt", "Maj", "Capt", "St", "Sr", "sr", "Jr", "jr", "Rev", "Sun", "Mon", "Tu", "Tue", "Tues", "Wed", "Th", "Thu", "Thur", "Thurs", "Fri", "Sat", "trans", "Univ", "Viz", "Vol", "vs", "v", ]; var Match = /** @class */ (function () { function Match() { } Match.setAbbreviations = function (abbr) { if (abbr) { abbreviations = abbr; } else { abbreviations = englishAbbreviations; } }; Match.isCapitalized = function (str) { return /^[A-Z][a-z].*/.test(str) || this.isNumber(str); }; // Start with opening quotes or capitalized letter Match.isSentenceStarter = function (str) { return this.isCapitalized(str) || /``|"|'/.test(str.substring(0, 2)); }; Match.isCommonAbbreviation = function (str) { var noSymbols = str.replace(/[-'`~!@#$%^&*()_|+=?;:'",.<>\{\}\[\]\\\/]/gi, ""); return ~abbreviations.indexOf(noSymbols); }; // This is going towards too much rule based Match.isTimeAbbreviation = function (word, next) { if (word === "a.m." || word === "p.m.") { var tmp = next.replace(/\W+/g, '').slice(-3).toLowerCase(); if (tmp === "day") { return true; } } return false; }; Match.isDottedAbbreviation = function (word) { var matches = word.replace(/[\(\)\[\]\{\}]/g, '').match(/((.|[A-Z]+)\.)*/); return matches && matches[0].length > 0; }; // TODO look for next words, if multiple are capitalized, // then it's probably not a sentence ending Match.isCustomAbbreviation = function (str) { if (str.length <= 3) { return true; } return this.isCapitalized(str); }; // Uses current word count in sentence and next few words to check if it is // more likely an abbreviation + name or new sentence. Match.isNameAbbreviation = function (wordCount, words) { if (words.length > 0) { if (wordCount < 5 && words[0].length < 6 && this.isCapitalized(words[0])) { return true; } var capitalized = words.filter(function (str) { return /[A-Z]/.test(str.charAt(0)); }); return capitalized.length >= 3; } return false; }; Match.isNumber = function (str, dotPos) { if (dotPos) { str = str.slice(dotPos - 1, dotPos + 2); } return !isNaN(str); }; ; // Phone number matching // http://stackoverflow.com/a/123666/951517 Match.isPhoneNr = function (str) { return str.match(/^(?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?$/); }; ; // Match urls / emails // http://stackoverflow.com/a/3809435/951517 Match.isURL = function (str) { return str.match(/[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)/); }; ; // Starting a new sentence if beginning with capital letter // Exception: The word is enclosed in brackets Match.isConcatenated = function (word) { var i = 0; if ((i = word.indexOf(".")) > -1 || (i = word.indexOf("!")) > -1 || (i = word.indexOf("?")) > -1) { var c = word.charAt(i + 1); // Check if the next word starts with a letter if (c.match(/[a-zA-Z].*/)) { return [word.slice(0, i), word.slice(i + 1)]; } } return false; }; ; Match.isBoundaryChar = function (word) { return word === "." || word === "!" || word === "?"; }; ; return Match; }()); exports.default = Match;