UNPKG

@diplodoc/sentenizer

Version:

text segmentation into sentences

544 lines (530 loc) 21.3 kB
// Bundler-generated CommonJS interop helpers.
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;

// Defines every key of `all2` on `target` as an enumerable getter.
var __export = (target, all2) => {
  for (var name in all2)
    __defProp(target, name, { get: all2[name], enumerable: true });
};

// Copies the own properties of `from` onto `to` as getters, skipping keys that
// `to` already owns and the single key named by `except`.
var __copyProps = (to, from, except, desc) => {
  if (from && typeof from === "object" || typeof from === "function") {
    for (let key of __getOwnPropNames(from))
      if (!__hasOwnProp.call(to, key) && key !== except)
        __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
  }
  return to;
};

// Wraps a module namespace in a fresh object flagged with `__esModule: true`.
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);

// src/index.ts
var src_exports = {};
__export(src_exports, {
  // Lazy getter: `sentenize` is a hoisted function declaration further down.
  sentenize: () => sentenize
});
module.exports = __toCommonJS(src_exports);
var import_ramda7 = require("ramda");

// src/parsers/index.ts
var import_ramda2 = require("ramda");

// src/constants/markers.ts
// Character sets that are interpolated into RegExp character classes below.
var SENTENCE_END_MARKERS = ".?!\u2026";
var QUOTATION_GENERIC_MARKERS = `"\u201E'`;
var QUOTATION_CLOSE_MARKERS = "\xBB\u201D\u2019";
var BRACKETS_CLOSE_MARKERS = "\\)\\]\\}>";

// src/constants/parameters.ts
// Default width used by fstChars / lstChars when no width is passed.
var WINDOW_WIDTH = 10;

// src/constants/abbreviations.ts
// Lower-cased abbreviation dictionaries (key -> true). Keys are stored without
// the trailing dot; the trailing comments quote sample usages.
var INITIALS = {
  "\u0434\u0436": true,
  "ed": true,
  "\u044D\u0434": true,
  "\u0432\u0441": true,
  "md": true,
  "\u043C\u0434": true
};
var HEAD = {
  "\u0431\u0443\u043A\u0432": true, // e.g. "яп. 18禁, букв. «запрещено"
  "\u0441\u0442": true, // e.g. "ст.-слав."
  "\u0442\u0440\u0430\u0434": true, // e.g. "кит. трад"
  "\u043B\u0430\u0442": true,
  "\u0432\u0435\u043D\u0433": true,
  "\u0438\u0441\u043F": true,
  "\u043A\u0430\u0442": true,
  "\u0443\u043A\u0440": true,
  "\u043D\u0435\u043C": true,
  "\u0430\u043D\u0433\u043B": true,
  "\u0444\u0440": true,
  "\u0438\u0442\u0430\u043B": true,
  "\u0433\u0440\u0435\u0447": true,
  "\u0435\u0432\u0440": true,
  "\u0430\u0440\u0430\u0431": true,
  "\u044F\u043F": true,
  "\u0441\u043B\u0430\u0432": true,
  "\u043A\u0438\u0442": true,
  "\u0440\u0443\u0441": true,
  "\u0440\u0443\u0441\u0441\u043A": true,
  "\u043B\u0430\u0442\u0432": true,
  "\u0441\u043B\u043E\u0432\u0430\u0446\u043A": true,
  "\u0445\u043E\u0440\u0432": true,
  "mr": true,
  "mrs": true,
  "ms": true,
  "dr": true,
  "vs": true,
  "\u0441\u0432": true, // e.g. "св.Иоанна"
  "\u0430\u0440\u0445": true,
  "\u0437\u0430\u0432": true,
  "\u0437\u0430\u043C": true,
  "\u043F\u0440\u043E\u0444": true,
  "\u0430\u043A\u0430\u0434": true,
  "\u043A\u043D": true, // "кандидат наук" (candidate of sciences)
  "\u043A\u043E\u0440\u0440": true, // e.g. "сообщил корр. ИТАР-ТАСС"
  "\u0440\u0435\u0434": true, // e.g. "Под ред. Линды Уильямс"
  "\u0433\u0440": true, // e.g. "гр. Валевской"
  "\u0441\u0440": true, // e.g. "Ср. L. Ross"
  "\u0447\u043B": true,
  "\u0438\u043C": true, // e.g. "им. Вс. Мейерхольда"
  "\u0442\u043E\u0432": true, // e.g. "тюремном подвале тов. Берия"
  "\u043D\u0430\u0447": true,
  "\u043F\u043E\u043B": true, // e.g. "нач. XX века"
  "chap": true,
  "\u043F": true,
  "\u043F\u043F": true,
  "\u0447": true,
  "\u0447\u0447": true,
  "\u0433\u043B": true,
  "\u0430\u0431\u0437": true,
  "\u043F\u0442": true, // e.g. "ст. 129 ч. 2 п. 8 Гл. VI"
  "no": true, // e.g. "No. 6"
  "\u043F\u0440\u043E\u0441\u043F": true,
  "\u043F\u0440": true,
  "\u0443\u043B": true,
  "\u0448": true,
  "\u0433": true,
  "\u0433\u043E\u0440": true,
  "\u0434": true,
  "\u0441\u0442\u0440": true,
  "\u043A": true,
  "\u043A\u043E\u0440\u043F": true,
  "\u043F\u0435\u0440": true,
  "\u043E\u0431\u043B": true,
  "\u044D\u0442": true,
  "\u043F\u043E\u043C": true,
  "\u0430\u0443\u0434": true,
  "\u043E\u0444": true,
  "\u043A\u043E\u043C": true,
  "\u043A\u043E\u043C\u043D": true,
  "\u043A\u0430\u0431": true,
  "\u0434\u043E\u043C\u043E\u0432\u043B\u0430\u0434": true,
  "\u043B\u0438\u0442": true,
  "\u0442": true, // e.g. "т. 1 л.д. 85-89"
  "\u0440\u043F": true,
  "\u043F\u043E\u0441": true,
  "\u0441": true,
  "\u0445": true, // e.g. "х. Ново-Максимовский, с. Кляшево рп.Раздолинск"
  "\u043F\u043B": true, // "площадь" (square)
  "bd": true, // e.g. "Bd. 16, Berlin"
  "\u043E": true,
  "\u043E\u0437": true, // e.g. "Вблизи оз. Селяха"
  "\u0440": true, // e.g. "р. Иордан"
  "\u0430": true, // e.g. "а. Адыге-Хабль"
  "\u043E\u0431\u0440": true, // e.g. "обр. 1936 г."
  "\u0443\u043C": true, // e.g. "ум. 1064"
  "\u043E\u043A": true, // e.g. "родилась ок. 1211", "работают ок. 150 специалистов"
  "\u043E\u0442\u043A\u0440": true, // e.g. "Откр. 20:40"
  "\u043F\u0441": true,
  "ps": true,
  "upd": true,
  "\u0441\u043C": true,
  "\u043D\u0430\u043F\u0440": true, // e.g. "UNIX-семейства, напр. Linux, FreeBSD"
  "\u0434\u043E\u043F": true,
  "\u044E\u0440": true,
  "\u0444\u0438\u0437": true, // e.g. "юр. адрес"
  "\u0442\u0435\u043B": true,
  "\u0441\u0431": true, // e.g. "Сб. «Киноварь»"
  "\u0432\u043D\u0443\u0442\u0440": true, // e.g. "к внутр. миру героев"
  "\u0434\u0438\u0444\u0444": true, // e.g. "мне по дифф. зачёту «5» поставил"
  "\u0433\u043E\u0441": true, // e.g. "гос. экзамены"
  "\u043E\u0442\u043C": true, // e.g. "от отм. 0.000"
  "\u0434\u043E\u0431": true // e.g. "доб. 1243" (phone extension)
};
var TAIL = {
  "\u0434\u0435\u0441": true,
  "\u0442\u044B\u0441": true,
  "\u043C\u043B\u043D": true,
  "\u043C\u043B\u0440\u0434": true,
  "\u0434\u043E\u043B": true,
  "\u0434\u043E\u043B\u043B": true,
  "\u043A\u043E\u043F": true,
  "\u0440\u0443\u0431": true,
  "\u0440": true,
  "\u043F\u0440\u043E\u0446": true, // e.g. "95 проц. акций"
  "\u0433\u0430": true,
  "\u0431\u0430\u0440\u0440": true, // e.g. "40 долларов за барр."
  "\u043A\u0443\u0431": true, // e.g. "1000 куб. метр."
  "\u043A\u0432": true,
  "\u043A\u043C": true, // e.g. "700 тыс. кв. км."
  "\u0441\u043C": true, // e.g. "30 см"
  "\u0447\u0430\u0441": true,
  "\u043C\u0438\u043D": true,
  "\u0441\u0435\u043A": true, // e.g. "в 15 час. 13 мин. 53 сек."
  "\u0432": true,
  "\u0432\u0432": true, // e.g. "XII в. XVIII—XIX вв."
  "\u0433": true,
  "\u0433\u0433": true, // e.g. "1996-1999гг"
  "\u0441": true,
  "\u0441\u0442\u0440": true, // e.g. "287 стр."
  "co": true,
  "corp": true,
  "inc": true,
  "\u0438\u0437\u0434": true,
  "ed": true, // e.g. "1-е изд. Arthur W. Hummel, ed. Eminent Chinese"
  "\u0434\u0440": true, // "и другие" (and others)
  "al": true // e.g. "North et al."
};
var OTHER = {
  "\u0441\u043E\u043A\u0440": true,
  "\u0440\u0438\u0441": true,
  "\u0438\u0441\u043A\u043B": true,
  "\u043F\u0440\u0438\u043C": true,
  "\u044F\u0437": true,
  "\u0443\u0441\u0442\u0430\u0440": true, // e.g. labelled "устар."
  "\u0448\u0443\u0442\u043B": true // e.g. "в стиле шутл.", "bones — шутл. человек"
};

// Two-part abbreviations, keyed as "<left>.<right>" without the final dot.
var HEAD_PAIR = {
  "\u0442.\u0435": true,
  "\u0442.\u043A": true,
  "\u0438.\u043E": true,
  "\u043A.\u043D": true,
  "\u043A.\u043F": true,
  "\u043F.\u043D": true, // e.g. "к.п.н"
  "\u043A.\u0442": true,
  "\u0442.\u043D": true, // e.g. "к.т.н"
  "\u043B.\u0434": true // e.g. "т. 1 л.д. 85-89"
};
var TAIL_PAIR = {
  "\u0442.\u043F": true,
  "\u0447.\u0442": true,
  "\u0442.\u0434": true, // e.g. "ч.т.д"
  "\u0443.\u0435": true,
  "\u043D.\u044D": true,
  "p.m": true,
  "a.m": true,
  "\u0441.\u0433": true, // e.g. "от 18 мая с. г."
  "\u0440.\u0445": true, // e.g. "250 года до Р. Х."
  "\u0441.\u0448": true, // e.g. "50°13′ с. ш."
  "\u0437.\u0434": true, // e.g. "12°48′ з. д."
  "\u043B.\u0441": true
};
var OTHER_PAIR = {
  "\u0435\u0434.\u0447": true,
  "\u043C\u043D.\u0447": true,
  "\u043F\u043E\u0432\u0435\u043B.\u043D\u0430\u043A\u043B": true, // e.g. "в 1 лице мн. ч. повел. накл."
  "\u0436\u0435\u043D.\u0440": true,
  "\u043C\u0443\u0436.\u0440": true
};

// src/lenses/index.ts
var import_ramda = require("ramda");
// Lens factories for the first, second and last element of a list.
var first = () => (0, import_ramda.lensIndex)(0);
var second = () => (0, import_ramda.lensIndex)(1);
var last = () => (0, import_ramda.lensIndex)(-1);

// src/parsers/index.ts
// fst / snd / lst read one list element through a lens and fall back to ""
// when that element is missing.
var firstString = first();
var fst = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), (0, import_ramda2.view)(firstString));
var secondString = second();
var snd = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), (0, import_ramda2.view)(secondString));
var lastString = last();
var lst = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), (0, import_ramda2.view)(lastString));

// Splits text into chunks that each end with a run of end markers. The pattern
// is wrapped in a capture group so `split` keeps the matched chunks; empty
// strings produced by the split are filtered out.
var sentencePattern = `([^${SENTENCE_END_MARKERS}]*?[${SENTENCE_END_MARKERS}]+)`;
var senteceFlags = "gmu";
var sentenceRegExp = new RegExp(sentencePattern, senteceFlags);
var sentences = (0, import_ramda2.compose)((0, import_ramda2.filter)(Boolean), (0, import_ramda2.split)(sentenceRegExp));

// `words` strips a trailing run of end markers; `delimiters` returns that run.
// Because of the `m` flag, `$` also matches at the end of each line.
var sentenceDelimitersPattern = `([${SENTENCE_END_MARKERS}]+)$`;
var sentenceDelimitersFlags = "gmu";
var sentenceDelimitersRegExp = new RegExp(sentenceDelimitersPattern, sentenceDelimitersFlags);
var words = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), (0, import_ramda2.replace)(sentenceDelimitersRegExp)(""));
var delimiters = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), fst, (0, import_ramda2.match)(sentenceDelimitersRegExp));

// First whitespace-delimited token (capture group 1), or "" when none matches.
var fstTokenPattern = /^\s*([^\s]+?)(?=\s|$)/;
var fstTokenFlags = "mu";
var fstTokenRegExp = new RegExp(fstTokenPattern, fstTokenFlags);
var fstToken = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), snd, (0, import_ramda2.match)(fstTokenRegExp));
// First token after trailing end markers have been stripped.
var fstWord = (0, import_ramda2.compose)(fstToken, words);
// Last whitespace-delimited token (capture group 1), or "" when none matches.
var lstTokenPattern = /([^\s]+)\s*$/;
var lstTokenFlags = "mu";
var lstTokenRegExp = new RegExp(lstTokenPattern, lstTokenFlags);
var lstToken = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), snd, (0, import_ramda2.match)(lstTokenRegExp));

// Removes a leading run of characters that are neither \w nor in а-я / А-Я.
var nonAlphaStartPattern = /^[^\wа-яА-Я]*/;
var nonAlphaStartFlags = "gmu";
var nonAlphaStartRegExp = new RegExp(nonAlphaStartPattern, nonAlphaStartFlags);
var omitNonAlphaStart = (0, import_ramda2.replace)(nonAlphaStartRegExp, "");
// Last token after trailing end markers have been stripped.
var lstWord = (0, import_ramda2.compose)(lstToken, words);

// Returns a function that yields up to `width` leading characters of a string
// (any character, including line breaks).
var fstChars = (width = WINDOW_WIDTH) => {
  const fstCharsPattern = `^[\\s\\S]{0,${width}}`;
  const fstCharsFlags = "gmu";
  const fstCharsRegExp = new RegExp(fstCharsPattern, fstCharsFlags);
  return (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), fst, (0, import_ramda2.match)(fstCharsRegExp));
};

// Returns a function that yields up to `width` characters preceding a line end.
// NOTE(review): `.` does not match line terminators and the `m` flag lets `$`
// match at every line end, so for multi-line input the first match (the one
// `fst` picks) comes from the first line, not from the end of the string.
// Unlike fstChars this window is therefore line-bound — confirm this is intended.
var lstChars = (width = WINDOW_WIDTH) => {
  const lstCharsPattern = `.{0,${width}}$`;
  const lstCharsFlags = "gmu";
  const lstCharsRegExp = new RegExp(lstCharsPattern, lstCharsFlags);
  return (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), fst, (0, import_ramda2.match)(lstCharsRegExp));
};

// Single leading / trailing whitespace character, or "".
var spacePrefix = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), fst, (0, import_ramda2.match)(/^\s/));
var spaceSuffix = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), fst, (0, import_ramda2.match)(/\s$/));

// The *Prefix parsers below return the leading run of characters from the
// corresponding marker set (capture group 1), or "" when there is none.
var quotationGenericPrefixPattern = `^([${QUOTATION_GENERIC_MARKERS}]+)`;
var quotationGenericPrefixFlags = "mu";
var quotationGenericPrefixRegExp = new RegExp(
  quotationGenericPrefixPattern,
  quotationGenericPrefixFlags
);
var quotationGenericPrefix = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), snd, (0, import_ramda2.match)(quotationGenericPrefixRegExp));
var quotationClosePrefixPattern = `^([${QUOTATION_CLOSE_MARKERS}]+)`;
var quotationClosePrefixFlags = "mu";
var quotationClosePrefixRegExp = new RegExp(
  quotationClosePrefixPattern,
  quotationClosePrefixFlags
);
var quotationClosePrefix = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), snd, (0, import_ramda2.match)(quotationClosePrefixRegExp));
var delimiterPrefixPattern = `^([${SENTENCE_END_MARKERS}]+)`;
var delimiterPrefixFlags = "mu";
var delimiterPrefixRegExp = new RegExp(delimiterPrefixPattern, delimiterPrefixFlags);
var delimiterPrefix = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), snd, (0, import_ramda2.match)(delimiterPrefixRegExp));
var bracketsClosePrefixPattern = `^([${BRACKETS_CLOSE_MARKERS}]+)`;
var bracketsClosePrefixFlags = "mu";
var bracketsClosePrefixRegExp = new RegExp(bracketsClosePrefixPattern, bracketsClosePrefixFlags);
var bracketsClosePrefix = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), snd, (0, import_ramda2.match)(bracketsClosePrefixRegExp));

// First whitespace-only line of the input, or "".
var spacesPattern = /^(\s+)$/;
var spacesFlags = "gmu";
var spacesRegExp = new RegExp(spacesPattern, spacesFlags);
var spaces = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), fst, (0, import_ramda2.match)(spacesRegExp));

// A single trailing dot that is preceded by a non-dot character, or "".
var dotSuffixPattern = /[^.](\.)$/;
var dotSuffixFlags = "mu";
var dotSuffixRegExp = new RegExp(dotSuffixPattern, dotSuffixFlags);
var dotSuffix = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), snd, (0, import_ramda2.match)(dotSuffixRegExp));

// src/rules/base.ts
var import_ramda5 = require("ramda");

// src/utilities/list.ts
var import_ramda3 = require("ramda");
// lenLte(n)(list): true when the list has at most n elements.
var lenLte = (len) => (0, import_ramda3.compose)((0, import_ramda3.curry)((0, import_ramda3.flip)(import_ramda3.lte))(len), import_ramda3.length);
// True when the list holds at most one distinct value.
var allEqual = (0, import_ramda3.compose)(lenLte(1), import_ramda3.uniq);
var lengthNonZero = (0, import_ramda3.compose)(Boolean, import_ramda3.length);

// src/utilities/string.ts
var import_ramda4 = require("ramda");
var charAt = (0, import_ramda4.invoker)(1, "charAt");
// A string "has alpha" when lower-casing and upper-casing give different results.
var notAlpha = (0, import_ramda4.compose)(allEqual, (0, import_ramda4.juxt)([import_ramda4.toLower, import_ramda4.toUpper]));
var hasAlpha = (0, import_ramda4.compose)(import_ramda4.not, notAlpha);
// The first character is a cased letter that equals its own lower-case form.
var startsWithLower = (0, import_ramda4.allPass)([
  (0, import_ramda4.compose)(hasAlpha, charAt(0)),
  (0, import_ramda4.compose)(allEqual, (0, import_ramda4.juxt)([import_ramda4.identity, import_ramda4.toLower]), charAt(0))
]);
// The first character is a cased letter that equals its own upper-case form.
var startsWithUpper = (0, import_ramda4.allPass)([
  (0, import_ramda4.compose)(hasAlpha, charAt(0)),
  (0, import_ramda4.compose)(allEqual, (0, import_ramda4.juxt)([import_ramda4.identity, import_ramda4.toUpper]), charAt(0))
]);
var startsWithNewline = (0, import_ramda4.compose)(lengthNonZero, (0, import_ramda4.match)(/^\n/));
var startsWithHardbreak = (0, import_ramda4.compose)(lengthNonZero, (0, import_ramda4.match)(/^\n\n/));
var endsWithHardbreak = (0, import_ramda4.compose)(lengthNonZero, (0, import_ramda4.match)(/\n\n$/));
// True when upper-casing leaves the string unchanged.
var isUpper = (0, import_ramda4.compose)(allEqual, (0, import_ramda4.juxt)([import_ramda4.toUpper, import_ramda4.identity]));

// src/rules/base.ts
var isLeftDotDelimiter = (0, import_ramda5.compose)(lengthNonZero, dotSuffix);
var isLeftSingleLetter = (0, import_ramda5.compose)((0, import_ramda5.equals)(1), import_ramda5.length, lstWord);
var isLeftUpper = (0, import_ramda5.compose)(allEqual, (0, import_ramda5.juxt)([import_ramda5.toUpper, import_ramda5.identity]), lstWord);
var leftHasAlpha = (0, import_ramda5.compose)(hasAlpha, lstWord);
var isSpaceSuffix = (0, import_ramda5.compose)(lengthNonZero, spaceSuffix);
var isSpacePrefix = (0, import_ramda5.compose)(lengthNonZero, spacePrefix);

// Wraps `action` so that each call is printed (name, arguments, result) when
// the DEBUG environment variable is set. The `typeof process` guard keeps the
// wrapper from throwing in runtimes that have no `process` global.
var log = (name, action) => {
  return (...args) => {
    const result = action(...args);
    if (typeof process !== "undefined" && process.env && process.env.DEBUG) {
      console.log(name, args, result);
    }
    return result;
  };
};

// Always-true predicate for the side of the pair a rule does not inspect.
var _ = (0, import_ramda5.always)(true);

// Builds a named, logged predicate over a [left, right] pair: both sides are
// first passed through `remap`, then `left` is applied to the first element and
// `right` to the second; the rule holds when both results are truthy.
var rule = (name, [left, right], remap = import_ramda5.identity) => {
  return log(name, (0, import_ramda5.compose)(
    (0, import_ramda5.all)(Boolean),
    (0, import_ramda5.zipWith)(import_ramda5.call, [left, right]),
    (0, import_ramda5.map)(remap)
  ));
};
var spaceBothSides = rule("spaceBothSides", [isSpaceSuffix, isSpacePrefix], words);
var rightLacksSpacePrefix = rule("rightLacksSpacePrefix", [_, (0, import_ramda5.compose)(import_ramda5.not, isSpacePrefix)], words);
var rightStartsWithLowercase = rule("rightStartsWithLowercase", [_, (0, import_ramda5.compose)(startsWithLower, fstToken)]);
var rightDelimiterPrefix = rule("rightDelimiterPrefix", [_, (0, import_ramda5.compose)(lengthNonZero, delimiterPrefix, fstToken)]);
var rightQuotationGenericPrefix = rule("rightQuotationGenericPrefix", [_, (0, import_ramda5.compose)(lengthNonZero, quotationGenericPrefix)]);
var rightQuotationClosePrefix = rule("rightQuotationClosePrefix", [_, (0, import_ramda5.compose)(lengthNonZero, quotationClosePrefix, fstToken)]);
var rightBracketsClosePrefix = rule("rightBracketsClosePrefix", [_, (0, import_ramda5.compose)(lengthNonZero, bracketsClosePrefix, fstToken)]);
var rightOnlySpaces = rule("rightOnlySpaces", [_, (0, import_ramda5.compose)(lengthNonZero, spaces)]);
var leftEndsWithHardbreak = rule("leftEndsWithHardbreak", [endsWithHardbreak, _]);
var rightStartsWithHardbreak = rule("rightStartsWithHardbreak", [_, startsWithHardbreak]);
// NOTE(review): startsWithNewline requires character 0 to be "\n" while
// startsWithUpper requires character 0 to be a cased letter, so this predicate
// cannot hold for any input — confirm the intended check.
var rightStartsNewlineUppercased = rule("rightStartsNewlineUppercased", [_, (0, import_ramda5.allPass)([startsWithNewline, startsWithUpper])]);
var leftInitials = rule("leftInitials", [(0, import_ramda5.allPass)([isLeftDotDelimiter, isLeftSingleLetter, isLeftUpper, leftHasAlpha]), _]);

// src/rules/abbreviations.ts
var import_ramda6 = require("ramda");
var fst2 = (0, import_ramda6.compose)((0, import_ramda6.defaultTo)(""), (0, import_ramda6.view)(first()));
var snd2 = (0, import_ramda6.compose)((0, import_ramda6.defaultTo)(""), (0, import_ramda6.view)(second()));
var isDotDelimiter = (0, import_ramda6.compose)(lengthNonZero, dotSuffix);
// Joins the parts with "." and lower-cases the result to form a dictionary key.
var hash = (0, import_ramda6.compose)(import_ramda6.toLower, (0, import_ramda6.join)("."));

// Own-property membership test for the abbreviation dictionaries. A plain
// property read would also see members inherited from Object.prototype
// ("constructor", "__proto__"), so a sentence ending in such a word would be
// mistaken for an abbreviation and glued to the next sentence.
var hasOwnKey = (dictionary) => (key) => Object.prototype.hasOwnProperty.call(dictionary, key);

var insidePairAbbreviationMap = (0, import_ramda6.anyPass)([
  hasOwnKey(HEAD_PAIR),
  hasOwnKey(TAIL_PAIR),
  hasOwnKey(OTHER_PAIR)
]);
// Takes a [left, right] pair, reduces it to [last word of left, first word of
// right] and looks the "<left>.<right>" key up in the pair dictionaries.
var isPairAbbreviation = (0, import_ramda6.compose)(
  insidePairAbbreviationMap,
  hash,
  (0, import_ramda6.zipWith)(import_ramda6.call, [
    (0, import_ramda6.compose)(omitNonAlphaStart, lstWord, lstToken),
    (0, import_ramda6.compose)(fstWord, fstToken)
  ])
);
var pairAbbreviation = (0, import_ramda6.allPass)([
  (0, import_ramda6.compose)(isDotDelimiter, lstToken, fst2),
  isPairAbbreviation
]);
var insideAbbreviationMap = (0, import_ramda6.anyPass)([
  hasOwnKey(INITIALS),
  hasOwnKey(HEAD),
  hasOwnKey(TAIL),
  hasOwnKey(OTHER)
]);
// Lower-cased last word of the left side, looked up in the single-word dictionaries.
var isLeftAbbreviation = (0, import_ramda6.compose)(
  insideAbbreviationMap,
  omitNonAlphaStart,
  import_ramda6.toLower,
  lstWord,
  lstToken
);
var leftAbbreviation = (0, import_ramda6.compose)(
  (0, import_ramda6.allPass)([(0, import_ramda6.compose)(isDotDelimiter, lstToken), isLeftAbbreviation]),
  fst2
);
// Upper-case string longer than one character.
var isCaps = (0, import_ramda6.allPass)([isUpper, (0, import_ramda6.compose)((0, import_ramda6.lt)(1), import_ramda6.length)]);
var rightLowercaseOrCaps = (0, import_ramda6.compose)((0, import_ramda6.anyPass)([startsWithLower, isCaps]), fstWord, snd2);
// before(s)(t): the part of `s` preceding the first occurrence of `t`
// ("" when `t` does not occur in `s`).
var before = (s) => (t) => s.slice(0, Math.max(s.indexOf(t), 0));
// Checks whether the end of `left` is the tail of a pair abbreviation, either
// as two separate tokens or as a single "a.b" token.
var isLeftPairsTail = (left) => {
  const rest = before(left);
  const head = (0, import_ramda6.compose)(words, lstWord, rest, lstWord, lstToken);
  return (0, import_ramda6.or)(
    isPairAbbreviation([head(left), lstWord(left)]),
    isPairAbbreviation(lstWord(left).split("."))
  );
};
var leftPairsTailAbbreviation = (0, import_ramda6.allPass)([
  (0, import_ramda6.compose)(isDotDelimiter, lstToken, fst2),
  (0, import_ramda6.compose)(isLeftPairsTail, fst2),
  rightLowercaseOrCaps
]);

// src/index.ts
// Rules only look at a 20-character window on each side of a boundary.
var leftPreprocessor = lstChars(20);
var rightPreprocessor = fstChars(20);
var sidesPreprocessors = [leftPreprocessor, rightPreprocessor];
// Any of these rules holding means the two chunks belong to one sentence.
var joinCondition = (0, import_ramda7.anyPass)([
  spaceBothSides,
  rightLacksSpacePrefix,
  rightStartsWithLowercase,
  rightDelimiterPrefix,
  rightQuotationGenericPrefix,
  rightQuotationClosePrefix,
  rightBracketsClosePrefix,
  rightOnlySpaces,
  leftInitials,
  leftAbbreviation,
  pairAbbreviation,
  leftPairsTailAbbreviation
]);
// Any of these rules holding forces a break, overriding joinCondition.
var breakCondition = (0, import_ramda7.anyPass)([
  leftEndsWithHardbreak,
  rightStartsWithHardbreak,
  rightStartsNewlineUppercased
]);
var join2 = (0, import_ramda7.compose)(joinCondition, (0, import_ramda7.zipWith)(import_ramda7.call, sidesPreprocessors));
var breaks = (0, import_ramda7.compose)(breakCondition, (0, import_ramda7.zipWith)(import_ramda7.call, sidesPreprocessors));

/**
 * Splits text into sentences.
 *
 * The text is first cut on runs of two or more newline-led whitespace groups;
 * those separator runs are kept as parts of their own (the split pattern is a
 * capture group). Each part is then cut after every run of end markers, and
 * neighbouring chunks are merged back together while a join rule holds and no
 * break rule does.
 *
 * @param {string} text - Text to segment.
 * @returns {string[]} Segments in source order; concatenating them restores `text`.
 */
function sentenize(text) {
  const parts = text.split(/((?:\n\s*){2,})/);
  const parsed = [];
  for (const part of parts) {
    const chunks = sentences(part);
    // Sentence accumulated so far within the current part.
    let left = null;
    for (const right of chunks) {
      if (!left) {
        left = right;
        continue;
      }
      if (!breaks([left, right]) && join2([left, right])) {
        left += right;
      } else {
        parsed.push(left);
        left = right;
      }
    }
    // Flush whatever is still pending for this part.
    if (left) parsed.push(left);
  }
  return parsed;
}
// Annotate the CommonJS export names for ESM import in node:
0 && (module.exports = { sentenize });