UNPKG

sbd-ts

Version:

Split text into sentences with Sentence Boundary Detection (SBD).

481 lines (391 loc) 14.5 kB
(function(f){if(typeof exports==="object"&&typeof module!=="undefined"){module.exports=f()}else if(typeof define==="function"&&define.amd){define([],f)}else{var g;if(typeof window!=="undefined"){g=window}else if(typeof global!=="undefined"){g=global}else if(typeof self!=="undefined"){g=self}else{g=this}g.tokenizer = f()}})(function(){var define,module,exports;return (function(){function r(e,n,t){function o(i,f){if(!n[i]){if(!e[i]){var c="function"==typeof require&&require;if(!f&&c)return c(i,!0);if(u)return u(i,!0);var a=new Error("Cannot find module '"+i+"'");throw a.code="MODULE_NOT_FOUND",a}var p=n[i]={exports:{}};e[i][0].call(p.exports,function(r){var n=e[i][1][r];return o(n||r)},p,p.exports,r,e,n,t)}return n[i].exports}for(var u="function"==typeof require&&require,i=0;i<t.length;i++)o(t[i]);return o}return r})()({1:[function(require,module,exports){ var abbreviations; var englishAbbreviations = [ "al", "adj", "assn", "Ave", "BSc", "MSc", "Cell", "Ch", "Co", "cc", "Corp", "Dem", "Dept", "ed", "eg", "Eq", "Eqs", "est", "est", "etc", "Ex", "ext", // + number? "Fig", "fig", "Figs", "figs", "i.e", "ie", "Inc", "inc", "Jan","Feb","Mar","Apr","Jun","Jul","Aug","Sep","Sept","Oct","Nov","Dec", "jr", "mi", "Miss", "Mrs", "Mr", "Ms", "Mol", "mt", "mts", "no", "Nos", "PhD", "MD", "BA", "MA", "MM", "pl", "pop", "pp", "Prof", "Dr", "pt", "Ref", "Refs", "Rep", "repr", "rev", "Sec", "Secs", "Sgt", "Col", "Gen", "Rep", "Sen",'Gov', "Lt", "Maj", "Capt","St", "Sr", "sr", "Jr", "jr", "Rev", "Sun","Mon","Tu","Tue","Tues","Wed","Th","Thu","Thur","Thurs","Fri","Sat", "trans", "Univ", "Viz", "Vol", "vs", "v", ]; exports.setAbbreviations = function(abbr) { if (abbr) { abbreviations = abbr; } else { abbreviations = englishAbbreviations; } } var isCapitalized = exports.isCapitalized = function(str) { return /^[A-Z][a-z].*/.test(str) || isNumber(str); } // Start with opening quotes or capitalized letter exports.isSentenceStarter = function(str) { return isCapitalized(str) || /``|"|'/.test(str.substring(0,2)); } exports.isCommonAbbreviation = function(str) { var noSymbols = str.replace(/[-'`~!@#$%^&*()_|+=?;:'",.<>\{\}\[\]\\\/]/gi, ""); return ~abbreviations.indexOf(noSymbols); } // This is going towards too much rule based exports.isTimeAbbreviation = function(word, next) { if (word === "a.m." || word === "p.m.") { var tmp = next.replace(/\W+/g, '').slice(-3).toLowerCase(); if (tmp === "day") { return true; } } return false; } exports.isDottedAbbreviation = function(word) { var matches = word.replace(/[\(\)\[\]\{\}]/g, '').match(/(.\.)*/); return matches && matches[0].length > 0; } // TODO look for next words, if multiple are capitalized, // then it's probably not a sentence ending exports.isCustomAbbreviation = function(str) { if (str.length <= 3) { return true; } return isCapitalized(str); } // Uses current word count in sentence and next few words to check if it is // more likely an abbreviation + name or new sentence. exports.isNameAbbreviation = function(wordCount, words) { if (words.length > 0) { if (wordCount < 5 && words[0].length < 6 && isCapitalized(words[0])) { return true; } var capitalized = words.filter(function(str) { return /[A-Z]/.test(str.charAt(0)); }); return capitalized.length >= 3; } return false; } var isNumber = exports.isNumber = function(str, dotPos) { if (dotPos) { str = str.slice(dotPos-1, dotPos+2); } return !isNaN(str); }; // Phone number matching // http://stackoverflow.com/a/123666/951517 exports.isPhoneNr = function(str) { return str.match(/^(?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?$/); }; // Match urls / emails // http://stackoverflow.com/a/3809435/951517 exports.isURL = function(str) { return str.match(/[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)/); }; // Starting a new sentence if beginning with capital letter // Exception: The word is enclosed in brackets exports.isConcatenated = function(word) { var i = 0; if ((i = word.indexOf(".")) > -1 || (i = word.indexOf("!")) > -1 || (i = word.indexOf("?")) > -1) { var c = word.charAt(i + 1); // Check if the next word starts with a letter if (c.match(/[a-zA-Z].*/)) { return [word.slice(0, i), word.slice(i+1)]; } } return false; }; exports.isBoundaryChar = function(word) { return word === "." || word === "!" || word === "?"; }; },{}],2:[function(require,module,exports){ module.exports = function sanitizeHtml(text, opts) { // Strip HTML from Text using browser HTML parser if ((typeof text == 'string' || text instanceof String) && typeof document !== "undefined") { var $div = document.createElement("DIV"); $div.innerHTML = text; text = ($div.textContent || '').trim(); } //DOM Object else if (typeof text === 'object' && text.textContent) { text = (text.textContent || '').trim(); } return text; }; },{}],3:[function(require,module,exports){ exports.endsWithChar = function ends_with_char(word, c) { if (c.length > 1) { return c.indexOf(word.slice(-1)) > -1; } return word.slice(-1) === c; }; exports.endsWith = function ends_with(word, end) { return word.slice(word.length - end.length) === end; }; },{}],4:[function(require,module,exports){ /*jshint node:true, laxcomma:true */ var sanitizeHtml = require("sanitize-html"); var stringHelper = require("./stringHelper"); var Match = require("./Match"); var newline_placeholder = " @~@ "; var newline_placeholder_t = newline_placeholder.trim(); var whiteSpaceCheck = new RegExp("\\S", ""); var addNewLineBoundaries = new RegExp("\\n+|[-#=_+*]{4,}", "g"); var splitIntoWords = new RegExp("\\S+|\\n", "g"); // Split the entry into sentences. exports.sentences = function(text, user_options) { if (!text || typeof text !== "string" || !text.length) { return []; } if (!whiteSpaceCheck.test(text)) { // whitespace-only string has no sentences return []; } var options = { "newline_boundaries" : false, "html_boundaries" : false, "html_boundaries_tags": ["p","div","ul","ol"], "sanitize" : false, "allowed_tags" : false, "preserve_whitespace" : false, "abbreviations" : null }; if (typeof user_options === "boolean") { // Deprecated quick option options.newline_boundaries = true; } else { // Extend options for (var k in user_options) { options[k] = user_options[k]; } } Match.setAbbreviations(options.abbreviations); if (options.newline_boundaries) { text = text.replace(addNewLineBoundaries, newline_placeholder); } if (options.html_boundaries) { var html_boundaries_regexp = "(<br\\s*\\/?>|<\\/(" + options.html_boundaries_tags.join("|") + ")>)"; var re = new RegExp(html_boundaries_regexp, "g"); text = text.replace(re, "$1" + newline_placeholder); } if (options.sanitize || options.allowed_tags) { if (! options.allowed_tags) { options.allowed_tags = [""]; } text = sanitizeHtml(text, { "allowedTags" : options.allowed_tags }); } // Split the text into words var words; var tokens; // Split the text into words if (options.preserve_whitespace) { // <br> tags are the odd man out, as whitespace is allowed inside the tag tokens = text.split(/(<br\s*\/?>|\S+|\n+)/); // every other token is a word words = tokens.filter(function (token, ii) { return ii % 2; }); } else { // - see http://blog.tompawlak.org/split-string-into-tokens-javascript words = text.trim().match(splitIntoWords); } var wordCount = 0; var index = 0; var temp = []; var sentences = []; var current = []; // If given text is only whitespace (or nothing of \S+) if (!words || !words.length) { return []; } for (var i=0, L=words.length; i < L; i++) { wordCount++; // Add the word to current sentence current.push(words[i]); // Sub-sentences, reset counter if (~words[i].indexOf(",")) { wordCount = 0; } if (Match.isBoundaryChar(words[i]) || stringHelper.endsWithChar(words[i], "?!") || words[i] === newline_placeholder_t) { if ((options.newline_boundaries || options.html_boundaries) && words[i] === newline_placeholder_t) { current.pop(); } sentences.push(current); wordCount = 0; current = []; continue; } if (stringHelper.endsWithChar(words[i], "\"") || stringHelper.endsWithChar(words[i], "”")) { words[i] = words[i].slice(0, -1); } // A dot might indicate the end sentences // Exception: The next sentence starts with a word (non abbreviation) // that has a capital letter. if (stringHelper.endsWithChar(words[i], ".")) { // Check if there is a next word // This probably needs to be improved with machine learning if (i+1 < L) { // Single character abbr. if (words[i].length === 2 && isNaN(words[i].charAt(0))) { continue; } // Common abbr. that often do not end sentences if (Match.isCommonAbbreviation(words[i])) { continue; } // Next word starts with capital word, but current sentence is // quite short if (Match.isSentenceStarter(words[i+1])) { if (Match.isTimeAbbreviation(words[i], words[i+1])) { continue; } // Dealing with names at the start of sentences if (Match.isNameAbbreviation(wordCount, words.slice(i, 6))) { continue; } if (Match.isNumber(words[i+1])) { if (Match.isCustomAbbreviation(words[i])) { continue; } } } else { // Skip ellipsis if (stringHelper.endsWith(words[i], "..")) { continue; } //// Skip abbreviations // Short words + dot or a dot after each letter if (Match.isDottedAbbreviation(words[i])) { continue; } if (Match.isNameAbbreviation(wordCount, words.slice(i, 5))) { continue; } } } sentences.push(current); current = []; wordCount = 0; continue; } // Check if the word has a dot in it if ((index = words[i].indexOf(".")) > -1) { if (Match.isNumber(words[i], index)) { continue; } // Custom dotted abbreviations (like K.L.M or I.C.T) if (Match.isDottedAbbreviation(words[i])) { continue; } // Skip urls / emails and the like if (Match.isURL(words[i]) || Match.isPhoneNr(words[i])) { continue; } } if (temp = Match.isConcatenated(words[i])) { current.pop(); current.push(temp[0]); sentences.push(current); current = []; wordCount = 0; current.push(temp[1]); } } if (current.length) { sentences.push(current); } // Clear "empty" sentences sentences = sentences.filter(function(s) { return s.length > 0; }); var result = sentences.slice(1).reduce(function (out, sentence) { var lastSentence = out[out.length - 1]; // Single words, could be "enumeration lists" if (lastSentence.length === 1 && /^.{1,2}[.]$/.test(lastSentence[0])) { // Check if there is a next sentence // It should not be another list item if (!/[.]/.test(sentence[0])) { out.pop() out.push(lastSentence.concat(sentence)); return out; } } out.push(sentence); return out; }, [ sentences[0] ]); // join tokens back together return result.map(function (sentence, ii) { if (options.preserve_whitespace && !options.newline_boundaries && !options.html_boundaries) { // tokens looks like so: [leading-space token, non-space token, space // token, non-space token, space token... ]. In other words, the first // item is the leading space (or the empty string), and the rest of // the tokens are [non-space, space] token pairs. var tokenCount = sentence.length * 2; if (ii === 0) { tokenCount += 1; } return tokens.splice(0, tokenCount).join(""); } return sentence.join(" "); }); }; },{"./Match":1,"./stringHelper":3,"sanitize-html":2}]},{},[4])(4) });