UNPKG

sbd-fork

Version:

Split text into sentences with Sentence Boundary Detection (SBD).

271 lines (212 loc) 9.15 kB
/*jshint node:true, laxcomma:true */
import sanitize from "sanitize-html";
import stringHelper from "./stringHelper";
import Match from "./Match";

// Marker word substituted into the text wherever a newline / HTML block
// boundary should force a sentence break. The trimmed form is what shows up
// as a standalone "word" after the text has been split on whitespace.
const newline_placeholder = " @~@ ";
const newline_placeholder_t = newline_placeholder.trim();

// Matches if the text contains at least one non-whitespace character.
const whiteSpaceCheck = new RegExp("\\S", "");
// Runs of newlines, or "rulers" made of 4+ of the characters - # = _ + *
const addNewLineBoundaries = new RegExp("\\n+|[-#=_+*]{4,}", "g");
// Every run of non-whitespace characters, or a single newline.
const splitIntoWords = new RegExp("\\S+|\\n", "g");

/**
 * Sentence Boundary Detection: splits a text into sentences.
 */
export default class tokenizer {
    /**
     * Split `text` into an array of sentence strings.
     *
     * @param text          The text to split. Anything that is not a
     *                      non-empty string containing at least one
     *                      non-whitespace character yields `[]`.
     * @param user_options  Optional settings; every key given is copied over
     *                      the defaults:
     *                      - `newline_boundaries` (false): treat newlines and
     *                        ruler lines as sentence boundaries.
     *                      - `html_boundaries` (false): treat `<br>` and the
     *                        closing tags listed in `html_boundaries_tags`
     *                        as sentence boundaries.
     *                      - `html_boundaries_tags` (["p","div","ul","ol"]).
     *                      - `sanitize` (false): run the text through
     *                        sanitize-html before splitting.
     *                      - `allowed_tags` (false): tag list handed to
     *                        sanitize-html as `allowedTags`; a truthy value
     *                        also turns sanitizing on.
     *                      - `preserve_whitespace` (false): rebuild sentences
     *                        from the original tokens so that the original
     *                        whitespace is kept (only honoured when neither
     *                        newline nor html boundaries are enabled).
     *                      - `abbreviations` (null): passed to
     *                        `Match.setAbbreviations`.
     *                      A plain boolean is accepted as a deprecated
     *                      shorthand for `newline_boundaries`.
     * @returns The detected sentences, in order of appearance.
     */
    public static sentences(text, user_options?) {
        if (!text || typeof text !== "string" || !text.length) {
            return [];
        }

        if (!whiteSpaceCheck.test(text)) {
            // whitespace-only string has no sentences
            return [];
        }

        const options = {
            "newline_boundaries": false,
            "html_boundaries": false,
            "html_boundaries_tags": ["p", "div", "ul", "ol"],
            "sanitize": false,
            "allowed_tags": false,
            "preserve_whitespace": false,
            "abbreviations": null
        };

        if (typeof user_options === "boolean") {
            // Deprecated quick option: the boolean *is* the newline_boundaries
            // flag. (Previously any boolean, including `false`, enabled it.)
            options.newline_boundaries = user_options;
        } else {
            // Extend options
            for (const k in user_options) {
                options[k] = user_options[k];
            }
        }

        Match.setAbbreviations(options.abbreviations);

        if (options.newline_boundaries) {
            text = text.replace(addNewLineBoundaries, newline_placeholder);
        }

        if (options.html_boundaries) {
            const html_boundaries_regexp = "(<br\\s*\\/?>|<\\/(" + options.html_boundaries_tags.join("|") + ")>)";
            const re = new RegExp(html_boundaries_regexp, "g");
            // Keep the tag itself and append the boundary marker after it.
            text = text.replace(re, "$1" + newline_placeholder);
        }

        if (options.sanitize || options.allowed_tags) {
            let allowedTags: any = options.allowed_tags;
            if (!options.allowed_tags) {
                allowedTags = [""];
            }
            text = sanitize(text, { "allowedTags": allowedTags });
        }

        // Split the text into words
        let words;
        let tokens;

        if (options.preserve_whitespace) {
            // <br> tags are the odd man out, as whitespace is allowed inside the tag
            tokens = text.split(/(<br\s*\/?>|\S+|\n+)/);

            // The split pattern has a capture group, so every other token
            // (the odd indices) is a word; the even ones are the separators.
            words = tokens.filter(function (token, ii) {
                return ii % 2;
            });
        } else {
            // - see http://blog.tompawlak.org/split-string-into-tokens-javascript
            words = text.trim().match(splitIntoWords);
        }

        // If given text is only whitespace (or nothing of \S+)
        if (!words || !words.length) {
            return [];
        }

        let wordCount = 0;   // words seen since the last comma / sentence break
        let index = 0;
        let temp: false | any[] = [];
        let sentences = [];  // array of sentences, each an array of words
        let current = [];    // the sentence currently being collected

        for (let i = 0, L = words.length; i < L; i++) {
            let word = words[i];
            wordCount++;

            // Add the word to current sentence
            current.push(word);

            // Sub-sentences, reset counter
            if (word.indexOf(",") !== -1) {
                wordCount = 0;
            }

            if (Match.isBoundaryChar(word) || stringHelper.endsWithChar(word, "?!") || word === newline_placeholder_t) {
                if ((options.newline_boundaries || options.html_boundaries) && word === newline_placeholder_t) {
                    // The marker is not part of the text: drop it again.
                    current.pop();
                }

                sentences.push(current);

                wordCount = 0;
                current = [];

                continue;
            }

            if (stringHelper.endsWithChar(word, "\"") || stringHelper.endsWithChar(word, "”")) {
                // Strip a closing quote for the checks below. The word that
                // was already pushed onto `current` keeps its quote.
                word = word.slice(0, -1);
                words[i] = word;
            }

            // A dot might indicate the end sentences
            // Exception: The next sentence starts with a word (non abbreviation)
            // that has a capital letter.
            if (stringHelper.endsWithChar(word, ".")) {
                // Check if there is a next word
                // This probably needs to be improved with machine learning
                if (i + 1 < L) {
                    // Single character abbr. (one non-numeric character + dot)
                    if (word.length === 2 && isNaN(word.charAt(0))) {
                        continue;
                    }

                    // Common abbr. that often do not end sentences
                    if (Match.isCommonAbbreviation(word)) {
                        continue;
                    }

                    // Next word starts with capital word, but current sentence is
                    // quite short
                    if (Match.isSentenceStarter(words[i + 1])) {
                        if (Match.isTimeAbbreviation(word, words[i + 1])) {
                            continue;
                        }

                        // Dealing with names at the start of sentences
                        // NOTE(review): slice(i, 6) uses 6 as an absolute end
                        // index, so this is an empty array once i >= 6. A
                        // window such as slice(i, i + 6) may have been
                        // intended - confirm against Match.isNameAbbreviation
                        // before changing.
                        if (Match.isNameAbbreviation(wordCount, words.slice(i, 6))) {
                            continue;
                        }

                        if (Match.isNumber(words[i + 1])) {
                            if (Match.isCustomAbbreviation(word)) {
                                continue;
                            }
                        }
                    } else {
                        // Skip ellipsis
                        if (stringHelper.endsWith(word, "..")) {
                            continue;
                        }

                        //// Skip abbreviations
                        // Short words + dot or a dot after each letter
                        if (Match.isDottedAbbreviation(word)) {
                            continue;
                        }

                        // NOTE(review): same absolute-end-index caveat as
                        // slice(i, 6) above.
                        if (Match.isNameAbbreviation(wordCount, words.slice(i, 5))) {
                            continue;
                        }
                    }
                }

                sentences.push(current);
                current = [];
                wordCount = 0;

                continue;
            }

            // Check if the word has a dot in it
            if ((index = word.indexOf(".")) > -1) {
                if (Match.isNumber(word, index)) {
                    continue;
                }

                // Custom dotted abbreviations (like K.L.M or I.C.T)
                if (Match.isDottedAbbreviation(word)) {
                    continue;
                }

                // Skip urls / emails and the like
                if (Match.isURL(word) || Match.isPhoneNr(word)) {
                    continue;
                }
            }

            // A truthy result is used as a two-element split of the word:
            // the first part closes the current sentence, the second part
            // opens the next one.
            if (temp = Match.isConcatenated(word)) {
                current.pop();
                current.push(temp[0]);
                sentences.push(current);

                current = [];
                wordCount = 0;
                current.push(temp[1]);
            }
        }

        if (current.length) {
            sentences.push(current);
        }

        // Clear "empty" sentences
        sentences = sentences.filter(function (s) {
            return s.length > 0;
        });

        // Input that consisted solely of boundary markers leaves nothing
        // behind. Without this guard the reduce below would be seeded with
        // [undefined] and the final join would throw.
        if (!sentences.length) {
            return [];
        }

        const result = sentences.slice(1).reduce(function (out, sentence) {
            const lastSentence = out[out.length - 1];

            // Single words, could be "enumeration lists"
            if (lastSentence.length === 1 && /^.{1,2}[.]$/.test(lastSentence[0])) {
                // Check if there is a next sentence
                // It should not be another list item
                if (!/[.]/.test(sentence[0])) {
                    out.pop();
                    out.push(lastSentence.concat(sentence));
                    return out;
                }
            }

            out.push(sentence);

            return out;
        }, [sentences[0]]);

        // join tokens back together
        return result.map(function (sentence, ii) {
            if (options.preserve_whitespace && !options.newline_boundaries && !options.html_boundaries) {
                // tokens looks like so: [leading-space token, non-space token, space
                // token, non-space token, space token... ]. In other words, the first
                // item is the leading space (or the empty string), and the rest of
                // the tokens are [non-space, space] token pairs.
                let tokenCount = sentence.length * 2;

                if (ii === 0) {
                    tokenCount += 1;
                }

                // splice consumes the tokens, so each sentence picks up where
                // the previous one stopped.
                return tokens.splice(0, tokenCount).join("");
            }

            return sentence.join(" ");
        });
    }
}