// @diplodoc/sentenizer
// Version:
// Text segmentation into sentences.
// 544 lines (530 loc) • 21.3 kB
// JavaScript
// Module interop helpers: short aliases for the reflection primitives used below.
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;

/**
 * Defines one enumerable getter on `target` per enumerable key of `all2`.
 * Each value of `all2` is used directly as the getter function.
 */
var __export = (target, all2) => {
  for (const name in all2) {
    __defProp(target, name, { get: all2[name], enumerable: true });
  }
};

/**
 * Mirrors the own properties of `from` onto `to` as getters that read through
 * to `from`. Keys already owned by `to`, and the key equal to `except`, are
 * skipped. `desc` is only a scratch slot for the current property descriptor.
 * Returns `to` (unchanged when `from` is neither an object nor a function).
 */
var __copyProps = (to, from, except, desc) => {
  const copyable = (from && typeof from === "object") || typeof from === "function";
  if (!copyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    // Enumerability follows the source property; a missing descriptor counts as enumerable.
    __defProp(to, key, { get: () => from[key], enumerable: !desc || desc.enumerable });
  }
  return to;
};

/**
 * Builds a fresh object carrying a non-enumerable `__esModule: true` flag and
 * getter mirrors of every own property of `mod`.
 */
var __toCommonJS = (mod) => {
  const flagged = __defProp({}, "__esModule", { value: true });
  return __copyProps(flagged, mod);
};
// src/index.ts
// Public surface of the module. `sentenize` is registered as a getter on
// `src_exports`, so the function is looked up when the property is read.
var src_exports = {};
__export(src_exports, {
sentenize: () => sentenize
});
// Replace the CommonJS exports with a copy of `src_exports` that also carries
// the `__esModule` flag added by `__toCommonJS`.
module.exports = __toCommonJS(src_exports);
var import_ramda7 = require("ramda");
// src/parsers/index.ts
var import_ramda2 = require("ramda");
// src/constants/markers.ts
// Characters that can terminate a sentence: ".", "?", "!" and the
// horizontal ellipsis (U+2026).
var SENTENCE_END_MARKERS = [".", "?", "!", "\u2026"].join("");
// Quotation marks: straight double quote, low double quote (U+201E), apostrophe.
var QUOTATION_GENERIC_MARKERS = ['"', "\u201E", "'"].join("");
// Closing quotation marks: U+00BB, U+201D, U+2019.
var QUOTATION_CLOSE_MARKERS = ["\xBB", "\u201D", "\u2019"].join("");
// Closing brackets; the first three carry a backslash because the value is
// interpolated into a RegExp character class.
var BRACKETS_CLOSE_MARKERS = ["\\)", "\\]", "\\}", ">"].join("");
// src/constants/parameters.ts
// Default window width for fstChars / lstChars.
var WINDOW_WIDTH = 10;
// src/constants/abbreviations.ts
// Single-word abbreviation table. It is consulted by insideAbbreviationMap
// together with HEAD, TAIL and OTHER; the lookup does not distinguish between
// the four tables. Keys are lowercase because the word is lowercased before
// the lookup, and they carry no trailing dot.
// Presumably these are multi-letter name initials (e.g. "Дж.", "Вс.") — confirm.
var INITIALS = {
"\u0434\u0436": true,
"ed": true,
"\u044D\u0434": true,
"\u0432\u0441": true,
"md": true,
"\u043C\u0434": true
};
// Single-word abbreviation table (lowercase keys, no trailing dot), consulted
// by insideAbbreviationMap alongside INITIALS, TAIL and OTHER.
// Judging by the examples below, these abbreviations precede the word they
// qualify — the lookup itself does not use that distinction.
// A comment illustrates the key (or group of keys) listed directly above it.
var HEAD = {
"\u0431\u0443\u043A\u0432": true,
// e.g. "яп. 18禁, букв. «запрещено" ("букв." = literally)
"\u0441\u0442": true,
// e.g. "ст.-слав." (Old Church Slavonic)
"\u0442\u0440\u0430\u0434": true,
// e.g. "кит. трад" (Chinese, traditional)
"\u043B\u0430\u0442": true,
"\u0432\u0435\u043D\u0433": true,
"\u0438\u0441\u043F": true,
"\u043A\u0430\u0442": true,
"\u0443\u043A\u0440": true,
"\u043D\u0435\u043C": true,
"\u0430\u043D\u0433\u043B": true,
"\u0444\u0440": true,
"\u0438\u0442\u0430\u043B": true,
"\u0433\u0440\u0435\u0447": true,
"\u0435\u0432\u0440": true,
"\u0430\u0440\u0430\u0431": true,
"\u044F\u043F": true,
"\u0441\u043B\u0430\u0432": true,
"\u043A\u0438\u0442": true,
"\u0440\u0443\u0441": true,
"\u0440\u0443\u0441\u0441\u043A": true,
"\u043B\u0430\u0442\u0432": true,
"\u0441\u043B\u043E\u0432\u0430\u0446\u043A": true,
"\u0445\u043E\u0440\u0432": true,
"mr": true,
"mrs": true,
"ms": true,
"dr": true,
"vs": true,
"\u0441\u0432": true,
// e.g. "св.Иоанna" is written without a space in the source example: "св.Иоанна" (of St. John)
"\u0430\u0440\u0445": true,
"\u0437\u0430\u0432": true,
"\u0437\u0430\u043C": true,
"\u043F\u0440\u043E\u0444": true,
"\u0430\u043A\u0430\u0434": true,
"\u043A\u043D": true,
// "кандидат наук" (candidate of sciences)
"\u043A\u043E\u0440\u0440": true,
// e.g. "сообщил корр. ИТАР-ТАСС" (the ITAR-TASS correspondent reported)
"\u0440\u0435\u0434": true,
// e.g. "Под ред. Линды Уильямс" (edited by Linda Williams)
"\u0433\u0440": true,
// e.g. "гр. Валевской"
"\u0441\u0440": true,
// e.g. "Ср. L. Ross" (cf. L. Ross)
"\u0447\u043B": true,
"\u0438\u043C": true,
// e.g. "им. Вс. Мейерхольда" (named after Vs. Meyerhold)
"\u0442\u043E\u0432": true,
// e.g. "тюремном подвале тов. Берия" (the prison basement of Comrade Beria)
"\u043D\u0430\u0447": true,
"\u043F\u043E\u043B": true,
// e.g. "нач. XX века" (beginning of the 20th century)
"chap": true,
"\u043F": true,
"\u043F\u043F": true,
"\u0447": true,
"\u0447\u0447": true,
"\u0433\u043B": true,
"\u0430\u0431\u0437": true,
"\u043F\u0442": true,
// e.g. "ст. 129 ч. 2 п. 8 Гл. VI" (Art. 129, part 2, item 8, Ch. VI)
"no": true,
// e.g. "No. 6"
"\u043F\u0440\u043E\u0441\u043F": true,
"\u043F\u0440": true,
"\u0443\u043B": true,
"\u0448": true,
"\u0433": true,
"\u0433\u043E\u0440": true,
"\u0434": true,
"\u0441\u0442\u0440": true,
"\u043A": true,
"\u043A\u043E\u0440\u043F": true,
"\u043F\u0435\u0440": true,
"\u043E\u0431\u043B": true,
"\u044D\u0442": true,
"\u043F\u043E\u043C": true,
"\u0430\u0443\u0434": true,
"\u043E\u0444": true,
"\u043A\u043E\u043C": true,
"\u043A\u043E\u043C\u043D": true,
"\u043A\u0430\u0431": true,
"\u0434\u043E\u043C\u043E\u0432\u043B\u0430\u0434": true,
"\u043B\u0438\u0442": true,
"\u0442": true,
// e.g. "т. 1 л.д. 85-89" (vol. 1, case sheets 85-89)
"\u0440\u043F": true,
"\u043F\u043E\u0441": true,
"\u0441": true,
"\u0445": true,
// place-name prefixes, e.g. "х. Ново-Максимовский, с. Кляшево рп.Раздолинск"
"\u043F\u043B": true,
// "площадь" (square)
"bd": true,
// e.g. "Bd. 16, Berlin"
"\u043E": true,
"\u043E\u0437": true,
// e.g. "Вблизи оз. Селяха" (near Lake Selyakh)
"\u0440": true,
// e.g. "р. Иордан" (the Jordan River)
"\u0430": true,
// e.g. "а. Адыге-Хабль" (the aul of Adyge-Khabl)
"\u043E\u0431\u0440": true,
// e.g. "обр. 1936 г." (model of 1936)
"\u0443\u043C": true,
// e.g. "ум. 1064" (died 1064)
"\u043E\u043A": true,
// e.g. "родилась ок. 1211" (born c. 1211), "работают ок. 150 специалистов" (about 150 specialists work)
"\u043E\u0442\u043A\u0440": true,
// e.g. "Откр. 20:40" (Rev. 20:40)
"\u043F\u0441": true,
"ps": true,
"upd": true,
"\u0441\u043C": true,
"\u043D\u0430\u043F\u0440": true,
// e.g. "UNIX-семейства, напр. Linux, FreeBSD" (of the UNIX family, e.g. Linux, FreeBSD)
"\u0434\u043E\u043F": true,
"\u044E\u0440": true,
"\u0444\u0438\u0437": true,
// e.g. "юр. адрес" (legal address)
"\u0442\u0435\u043B": true,
"\u0441\u0431": true,
// e.g. "Сб. «Киноварь»" (the collection "Kinovar")
"\u0432\u043D\u0443\u0442\u0440": true,
// e.g. "к внутр. миру героев" (to the inner world of the characters)
"\u0434\u0438\u0444\u0444": true,
// e.g. "мне по дифф. зачёту «5» поставил" (gave me a "5" for the graded test)
"\u0433\u043E\u0441": true,
// e.g. "гос. экзамены" (state exams)
"\u043E\u0442\u043C": true,
// e.g. "от отм. 0.000" (from elevation mark 0.000)
"\u0434\u043E\u0431": true
// e.g. "доб. 1243" (telephone extension 1243)
};
// Single-word abbreviation table (lowercase keys, no trailing dot), consulted
// by insideAbbreviationMap alongside INITIALS, HEAD and OTHER.
// Judging by the examples below, these abbreviations follow the word or number
// they qualify — the lookup itself does not use that distinction.
// A comment illustrates the key (or group of keys) listed directly above it.
var TAIL = {
"\u0434\u0435\u0441": true,
"\u0442\u044B\u0441": true,
"\u043C\u043B\u043D": true,
"\u043C\u043B\u0440\u0434": true,
"\u0434\u043E\u043B": true,
"\u0434\u043E\u043B\u043B": true,
"\u043A\u043E\u043F": true,
"\u0440\u0443\u0431": true,
"\u0440": true,
"\u043F\u0440\u043E\u0446": true,
// e.g. "95 проц. акций," (95 percent of the shares)
"\u0433\u0430": true,
"\u0431\u0430\u0440\u0440": true,
// e.g. "40 долларов за барр." (40 dollars per barrel)
"\u043A\u0443\u0431": true,
// e.g. "1000 куб. метр." (1000 cubic meters)
"\u043A\u0432": true,
"\u043A\u043C": true,
// e.g. "700 тыс. кв. км." (700 thousand sq. km)
"\u0441\u043C": true,
// e.g. "30 см" (30 cm)
"\u0447\u0430\u0441": true,
"\u043C\u0438\u043D": true,
"\u0441\u0435\u043A": true,
// e.g. "в 15 час. 13 мин. 53 сек." (at 15 h 13 min 53 s)
"\u0432": true,
"\u0432\u0432": true,
// e.g. "XII в. XVIII—XIX вв." (12th century; 18th–19th centuries)
"\u0433": true,
"\u0433\u0433": true,
// e.g. "1996-1999гг" (the years 1996-1999)
"\u0441": true,
"\u0441\u0442\u0440": true,
// e.g. "287 стр." (287 pages)
"co": true,
"corp": true,
"inc": true,
"\u0438\u0437\u0434": true,
"ed": true,
// e.g. "1-е изд." (1st edition), "Arthur W. Hummel, ed. Eminent Chinese"
"\u0434\u0440": true,
// "и другие" (and others)
"al": true
// e.g. "North et al."
};
// Single-word abbreviation table (lowercase keys, no trailing dot), consulted
// by insideAbbreviationMap alongside INITIALS, HEAD and TAIL.
var OTHER = {
"\u0441\u043E\u043A\u0440": true,
"\u0440\u0438\u0441": true,
"\u0438\u0441\u043A\u043B": true,
"\u043F\u0440\u0438\u043C": true,
"\u044F\u0437": true,
"\u0443\u0441\u0442\u0430\u0440": true,
// e.g. labelled "устар." (obsolete)
"\u0448\u0443\u0442\u043B": true
// e.g. "в стиле шутл." (in a jocular style), "bones — шутл. человек" ("шутл." = jocular)
};
// Two-part abbreviation table consulted by insidePairAbbreviationMap together
// with TAIL_PAIR and OTHER_PAIR. A key is the two parts joined with "." and
// lowercased (see `hash`), without the final dot.
// A comment shows the longer abbreviation that the keys above it are parts of.
var HEAD_PAIR = {
"\u0442.\u0435": true,
"\u0442.\u043A": true,
"\u0438.\u043E": true,
"\u043A.\u043D": true,
"\u043A.\u043F": true,
"\u043F.\u043D": true,
// parts of "к.п.н"
"\u043A.\u0442": true,
"\u0442.\u043D": true,
// parts of "к.т.н"
"\u043B.\u0434": true
// e.g. "т. 1 л.д. 85-89" (vol. 1, case sheets 85-89)
};
// Two-part abbreviation table consulted by insidePairAbbreviationMap together
// with HEAD_PAIR and OTHER_PAIR. A key is the two parts joined with "." and
// lowercased (see `hash`), without the final dot.
// A comment illustrates the key (or group of keys) listed directly above it.
var TAIL_PAIR = {
"\u0442.\u043F": true,
"\u0447.\u0442": true,
"\u0442.\u0434": true,
// parts of "ч.т.д"
"\u0443.\u0435": true,
"\u043D.\u044D": true,
"p.m": true,
"a.m": true,
"\u0441.\u0433": true,
// e.g. "от 18 мая с. г." (of May 18 of this year)
"\u0440.\u0445": true,
// e.g. "250 года до Р. Х." (250 BC)
"\u0441.\u0448": true,
// e.g. "50°13′ с. ш." (50°13′ north latitude)
"\u0437.\u0434": true,
// e.g. "12°48′ з. д." (12°48′ west longitude)
"\u043B.\u0441": true
};
// Two-part abbreviation table consulted by insidePairAbbreviationMap together
// with HEAD_PAIR and TAIL_PAIR. A key is the two parts joined with "." and
// lowercased (see `hash`), without the final dot.
var OTHER_PAIR = {
"\u0435\u0434.\u0447": true,
"\u043C\u043D.\u0447": true,
"\u043F\u043E\u0432\u0435\u043B.\u043D\u0430\u043A\u043B": true,
// e.g. "в 1 лице мн. ч. повел. накл." (1st person plural, imperative mood)
"\u0436\u0435\u043D.\u0440": true,
"\u043C\u0443\u0436.\u0440": true
};
// src/lenses/index.ts
var import_ramda = require("ramda");
// Factories for Ramda index lenses; each call builds a new lens.
// Focus on the element at index 0.
var first = () => {
  return import_ramda.lensIndex(0);
};
// Focus on the element at index 1.
var second = () => {
  return import_ramda.lensIndex(1);
};
// Focus on the element at index -1.
var last = () => {
  return import_ramda.lensIndex(-1);
};
// src/parsers/index.ts
// Positional accessors: view the element through the lens and fall back to ""
// when `defaultTo` rejects the viewed value.
var firstString = first();
var secondString = second();
var lastString = last();
var fst = import_ramda2.compose(
  import_ramda2.defaultTo(""),
  import_ramda2.view(firstString)
);
var snd = import_ramda2.compose(
  import_ramda2.defaultTo(""),
  import_ramda2.view(secondString)
);
var lst = import_ramda2.compose(
  import_ramda2.defaultTo(""),
  import_ramda2.view(lastString)
);
// A candidate sentence: the shortest run of non-delimiter characters followed
// by one or more sentence-end markers. The capture group keeps the matched
// text in the result of `split`.
var sentencePattern = `([^${SENTENCE_END_MARKERS}]*?[${SENTENCE_END_MARKERS}]+)`;
var sentenceFlags = "gmu";
var sentenceRegExp = new RegExp(sentencePattern, sentenceFlags);
// Splits text into candidate chunks and drops the empty strings left by `split`.
var sentences = import_ramda2.compose(
  import_ramda2.filter(Boolean),
  import_ramda2.split(sentenceRegExp)
);
// Run of sentence-end markers at the end of a line.
var sentenceDelimitersPattern = `([${SENTENCE_END_MARKERS}]+)$`;
var sentenceDelimitersFlags = "gmu";
var sentenceDelimitersRegExp = new RegExp(sentenceDelimitersPattern, sentenceDelimitersFlags);
// The input with every line-final delimiter run removed.
var words = import_ramda2.compose(
  import_ramda2.defaultTo(""),
  import_ramda2.replace(sentenceDelimitersRegExp, "")
);
// The first line-final delimiter run found in the input, or "".
var delimiters = import_ramda2.compose(
  import_ramda2.defaultTo(""),
  fst,
  import_ramda2.match(sentenceDelimitersRegExp)
);
// First whitespace-delimited token, ignoring leading whitespace.
var fstTokenPattern = /^\s*([^\s]+?)(?=\s|$)/;
var fstTokenFlags = "mu";
var fstTokenRegExp = new RegExp(fstTokenPattern, fstTokenFlags);
// Capture group 1 of the match, or "" when nothing matches.
var fstToken = import_ramda2.compose(
  import_ramda2.defaultTo(""),
  snd,
  import_ramda2.match(fstTokenRegExp)
);
// First token after the line-final delimiters have been stripped.
var fstWord = import_ramda2.compose(fstToken, words);
// Last whitespace-delimited token of the input, ignoring trailing whitespace.
// The multiline flag is deliberately absent: with it, `$` also matches at the
// end of the first line, so multi-line input such as "a b\nc d" yielded "b"
// instead of "d".
var lstTokenPattern = /([^\s]+)\s*$/;
var lstTokenFlags = "u";
var lstTokenRegExp = new RegExp(lstTokenPattern, lstTokenFlags);
// Capture group 1 of the match, or "" when the input holds no token.
var lstToken = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), snd, (0, import_ramda2.match)(lstTokenRegExp));
// Leading run of characters that are neither `\w` (ASCII letters, digits,
// underscore) nor Cyrillic letters. "ё"/"Ё" are listed explicitly because they
// lie outside the а-я / А-Я ranges; without them a word starting with "ё" lost
// its first letter.
var nonAlphaStartPattern = /^[^\wа-яА-ЯёЁ]*/;
var nonAlphaStartFlags = "gmu";
var nonAlphaStartRegExp = new RegExp(nonAlphaStartPattern, nonAlphaStartFlags);
// Strips that leading run (quotes, brackets, dashes, ...) from a word.
var omitNonAlphaStart = (0, import_ramda2.replace)(nonAlphaStartRegExp, "");
// Last token after the line-final delimiters have been stripped.
var lstWord = (0, import_ramda2.compose)(lstToken, words);
/**
 * Builds a function returning the leading window of a string: at most `width`
 * characters (newlines included) from its start, or "" when nothing matches.
 */
var fstChars = (width = WINDOW_WIDTH) => {
  const headWindow = new RegExp(`^[\\s\\S]{0,${width}}`, "gmu");
  return import_ramda2.compose(
    import_ramda2.defaultTo(""),
    fst,
    import_ramda2.match(headWindow)
  );
};
/**
 * Builds a function returning the trailing window of a string: at most `width`
 * characters (newlines included) counted back from its end, or "" when
 * nothing matches.
 *
 * The pattern uses `[\s\S]` and is anchored without the `g`/`m` flags. The
 * previous `.{0,n}$` with "gmu" could not cross a newline, and its first match
 * ended at the first line break, so for multi-line input the window came from
 * the first line rather than from the end of the string.
 */
var lstChars = (width = WINDOW_WIDTH) => {
  const lstCharsPattern = `[\\s\\S]{0,${width}}$`;
  const lstCharsFlags = "u";
  const lstCharsRegExp = new RegExp(lstCharsPattern, lstCharsFlags);
  return (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), fst, (0, import_ramda2.match)(lstCharsRegExp));
};
// The single whitespace character the input starts with, or "".
var spacePrefix = import_ramda2.compose(
  import_ramda2.defaultTo(""),
  fst,
  import_ramda2.match(/^\s/)
);
// The single whitespace character the input ends with, or "".
var spaceSuffix = import_ramda2.compose(
  import_ramda2.defaultTo(""),
  fst,
  import_ramda2.match(/\s$/)
);
// Prefix extractors. Each returns the leading run (capture group 1) of one
// marker class, or "" when the input does not start with such a character.

// Leading run of generic quotation marks.
var quotationGenericPrefixPattern = `^([${QUOTATION_GENERIC_MARKERS}]+)`;
var quotationGenericPrefixFlags = "mu";
var quotationGenericPrefixRegExp = new RegExp(quotationGenericPrefixPattern, quotationGenericPrefixFlags);
var quotationGenericPrefix = import_ramda2.compose(
  import_ramda2.defaultTo(""),
  snd,
  import_ramda2.match(quotationGenericPrefixRegExp)
);

// Leading run of closing quotation marks.
var quotationClosePrefixPattern = `^([${QUOTATION_CLOSE_MARKERS}]+)`;
var quotationClosePrefixFlags = "mu";
var quotationClosePrefixRegExp = new RegExp(quotationClosePrefixPattern, quotationClosePrefixFlags);
var quotationClosePrefix = import_ramda2.compose(
  import_ramda2.defaultTo(""),
  snd,
  import_ramda2.match(quotationClosePrefixRegExp)
);

// Leading run of sentence-end markers.
var delimiterPrefixPattern = `^([${SENTENCE_END_MARKERS}]+)`;
var delimiterPrefixFlags = "mu";
var delimiterPrefixRegExp = new RegExp(delimiterPrefixPattern, delimiterPrefixFlags);
var delimiterPrefix = import_ramda2.compose(
  import_ramda2.defaultTo(""),
  snd,
  import_ramda2.match(delimiterPrefixRegExp)
);

// Leading run of closing brackets.
var bracketsClosePrefixPattern = `^([${BRACKETS_CLOSE_MARKERS}]+)`;
var bracketsClosePrefixFlags = "mu";
var bracketsClosePrefixRegExp = new RegExp(bracketsClosePrefixPattern, bracketsClosePrefixFlags);
var bracketsClosePrefix = import_ramda2.compose(
  import_ramda2.defaultTo(""),
  snd,
  import_ramda2.match(bracketsClosePrefixRegExp)
);
// A span consisting only of whitespace between a line start and a line end.
var spacesPattern = /^(\s+)$/;
var spacesFlags = "gmu";
var spacesRegExp = new RegExp(spacesPattern, spacesFlags);
// The first such whitespace-only span, or "" when there is none.
var spaces = import_ramda2.compose(
  import_ramda2.defaultTo(""),
  fst,
  import_ramda2.match(spacesRegExp)
);
// A single "." at the very end of the input, preceded by a non-dot character
// (so "..." does not qualify). The multiline flag is deliberately absent:
// with it, `$` also matched at the end of an earlier line, so "Hello.\nWorld!"
// was reported as ending with a dot.
var dotSuffixPattern = /[^.](\.)$/;
var dotSuffixFlags = "u";
var dotSuffixRegExp = new RegExp(dotSuffixPattern, dotSuffixFlags);
// Returns "." when the input ends with such a dot, otherwise "".
var dotSuffix = (0, import_ramda2.compose)((0, import_ramda2.defaultTo)(""), snd, (0, import_ramda2.match)(dotSuffixRegExp));
// src/rules/base.ts
var import_ramda5 = require("ramda");
// src/utilities/list.ts
var import_ramda3 = require("ramda");
// Predicate factory: true when the length of the argument is <= `len`.
var lenLte = (len) => {
  const atMost = import_ramda3.curry(import_ramda3.flip(import_ramda3.lte))(len);
  return import_ramda3.compose(atMost, import_ramda3.length);
};
// True when the list holds at most one distinct value.
var allEqual = import_ramda3.compose(lenLte(1), import_ramda3.uniq);
// True when the argument has a non-zero length.
var lengthNonZero = import_ramda3.compose(Boolean, import_ramda3.length);
// src/utilities/string.ts
var import_ramda4 = require("ramda");
// charAt(i)(s) calls s.charAt(i).
var charAt = import_ramda4.invoker(1, "charAt");
// A string is "not alpha" when lowercasing and uppercasing it give the same result.
var notAlpha = import_ramda4.compose(
  allEqual,
  import_ramda4.juxt([import_ramda4.toLower, import_ramda4.toUpper])
);
var hasAlpha = import_ramda4.compose(import_ramda4.not, notAlpha);
// The first character is a cased letter that equals its own lowercase form.
var startsWithLower = import_ramda4.allPass([
  import_ramda4.compose(hasAlpha, charAt(0)),
  import_ramda4.compose(
    allEqual,
    import_ramda4.juxt([import_ramda4.identity, import_ramda4.toLower]),
    charAt(0)
  )
]);
// The first character is a cased letter that equals its own uppercase form.
var startsWithUpper = import_ramda4.allPass([
  import_ramda4.compose(hasAlpha, charAt(0)),
  import_ramda4.compose(
    allEqual,
    import_ramda4.juxt([import_ramda4.identity, import_ramda4.toUpper]),
    charAt(0)
  )
]);
// Newline checks at the edges of the string.
var startsWithNewline = import_ramda4.compose(lengthNonZero, import_ramda4.match(/^\n/));
var startsWithHardbreak = import_ramda4.compose(lengthNonZero, import_ramda4.match(/^\n\n/));
var endsWithHardbreak = import_ramda4.compose(lengthNonZero, import_ramda4.match(/\n\n$/));
// True when uppercasing the string leaves it unchanged.
var isUpper = import_ramda4.compose(
  allEqual,
  import_ramda4.juxt([import_ramda4.toUpper, import_ramda4.identity])
);
// src/rules/base.ts
// Predicates over a single side of a sentence boundary.
// The side ends with a single dot.
var isLeftDotDelimiter = import_ramda5.compose(lengthNonZero, dotSuffix);
// The last word is exactly one character long.
var isLeftSingleLetter = import_ramda5.compose(
  import_ramda5.equals(1),
  import_ramda5.length,
  lstWord
);
// Uppercasing the last word leaves it unchanged.
var isLeftUpper = import_ramda5.compose(
  allEqual,
  import_ramda5.juxt([import_ramda5.toUpper, import_ramda5.identity]),
  lstWord
);
// The last word contains at least one cased letter.
var leftHasAlpha = import_ramda5.compose(hasAlpha, lstWord);
// The side ends / starts with a whitespace character.
var isSpaceSuffix = import_ramda5.compose(lengthNonZero, spaceSuffix);
var isSpacePrefix = import_ramda5.compose(lengthNonZero, spacePrefix);
/**
 * Wraps `action` so that every call is traced when the DEBUG environment
 * variable is set.
 *
 * @param name   Label printed in front of each trace line.
 * @param action Function to wrap; it is called with the original arguments.
 * @returns A function with the same result as `action`.
 */
var log = (name, action) => {
  return (...args) => {
    const result = action(...args);
    // `process` does not exist outside Node-like runtimes (e.g. in a browser
    // bundle); check for it first so tracing never throws a ReferenceError.
    const debugEnabled = typeof process !== "undefined" && Boolean(process.env) && Boolean(process.env.DEBUG);
    if (debugEnabled) {
      console.log(name, args, result);
    }
    return result;
  };
};
// Wildcard predicate: accepts any side.
var _ = import_ramda5.always(true);
/**
 * Builds a named rule over a [left, right] pair.
 *
 * Both sides are first passed through `remap`; `left` is then applied to the
 * first element and `right` to the second. The rule holds when both results
 * are truthy. The resulting function is wrapped by `log` under `name`.
 */
var rule = (name, [left, right], remap = import_ramda5.identity) => {
  const check = import_ramda5.compose(
    import_ramda5.all(Boolean),
    import_ramda5.zipWith(import_ramda5.call, [left, right]),
    import_ramda5.map(remap)
  );
  return log(name, check);
};
// Rule instances. Each takes a [left, right] pair of neighbouring fragments.

// Whitespace on both sides of the boundary once line-final delimiters are stripped.
var spaceBothSides = rule(
  "spaceBothSides",
  [isSpaceSuffix, isSpacePrefix],
  words
);
// The right side (delimiters stripped) does not start with whitespace.
var rightLacksSpacePrefix = rule(
  "rightLacksSpacePrefix",
  [_, import_ramda5.compose(import_ramda5.not, isSpacePrefix)],
  words
);
// The first token on the right starts with a lowercase letter.
var rightStartsWithLowercase = rule(
  "rightStartsWithLowercase",
  [_, import_ramda5.compose(startsWithLower, fstToken)]
);
// The first token on the right starts with a sentence-end marker.
var rightDelimiterPrefix = rule(
  "rightDelimiterPrefix",
  [_, import_ramda5.compose(lengthNonZero, delimiterPrefix, fstToken)]
);
// The right side itself starts with a generic quotation mark.
var rightQuotationGenericPrefix = rule(
  "rightQuotationGenericPrefix",
  [_, import_ramda5.compose(lengthNonZero, quotationGenericPrefix)]
);
// The first token on the right starts with a closing quotation mark.
var rightQuotationClosePrefix = rule(
  "rightQuotationClosePrefix",
  [_, import_ramda5.compose(lengthNonZero, quotationClosePrefix, fstToken)]
);
// The first token on the right starts with a closing bracket.
var rightBracketsClosePrefix = rule(
  "rightBracketsClosePrefix",
  [_, import_ramda5.compose(lengthNonZero, bracketsClosePrefix, fstToken)]
);
// The right side contains a whitespace-only line.
var rightOnlySpaces = rule(
  "rightOnlySpaces",
  [_, import_ramda5.compose(lengthNonZero, spaces)]
);
// The left side ends with two newlines.
var leftEndsWithHardbreak = rule(
  "leftEndsWithHardbreak",
  [endsWithHardbreak, _]
);
// The right side starts with two newlines.
var rightStartsWithHardbreak = rule(
  "rightStartsWithHardbreak",
  [_, startsWithHardbreak]
);
// NOTE(review): both predicates receive the same string and startsWithUpper
// inspects charAt(0). When that character is "\n", hasAlpha is false, so this
// combination looks unsatisfiable as written — confirm whether the first token
// was meant to be tested instead.
var rightStartsNewlineUppercased = rule(
  "rightStartsNewlineUppercased",
  [_, import_ramda5.allPass([startsWithNewline, startsWithUpper])]
);
// The left side ends with a dot after a single uppercase letter, e.g. an initial.
var leftInitials = rule(
  "leftInitials",
  [
    import_ramda5.allPass([isLeftDotDelimiter, isLeftSingleLetter, isLeftUpper, leftHasAlpha]),
    _
  ]
);
// src/rules/abbreviations.ts
var import_ramda6 = require("ramda");
// First / second element of a pair, with "" as the fallback.
var fst2 = import_ramda6.compose(
  import_ramda6.defaultTo(""),
  import_ramda6.view(first())
);
var snd2 = import_ramda6.compose(
  import_ramda6.defaultTo(""),
  import_ramda6.view(second())
);
// The string ends with a single dot.
var isDotDelimiter = import_ramda6.compose(lengthNonZero, dotSuffix);
// Lookup key for the *_PAIR tables: the parts joined with "." and lowercased.
var hash = import_ramda6.compose(
  import_ramda6.toLower,
  import_ramda6.join(".")
);
// True when the key is an own entry of one of the pair-abbreviation tables.
// `has` checks own properties only. A plain property read also sees keys
// inherited from Object.prototype, so a word such as "constructor" or
// "toString" produced a truthy value and was treated as an abbreviation.
var insidePairAbbreviationMap = (0, import_ramda6.anyPass)([
  (0, import_ramda6.has)(import_ramda6.__, HEAD_PAIR),
  (0, import_ramda6.has)(import_ramda6.__, TAIL_PAIR),
  (0, import_ramda6.has)(import_ramda6.__, OTHER_PAIR)
]);
// Takes a [left, right] pair, reduces the left element to its last word
// (leading non-letters removed) and the right element to its first word, then
// looks the joined key up in the pair tables.
var isPairAbbreviation = import_ramda6.compose(
  insidePairAbbreviationMap,
  hash,
  import_ramda6.zipWith(import_ramda6.call, [
    import_ramda6.compose(omitNonAlphaStart, lstWord, lstToken),
    import_ramda6.compose(fstWord, fstToken)
  ])
);
// Join rule: the last token on the left ends with a single dot and the words
// around the boundary form a known two-part abbreviation.
var pairAbbreviation = import_ramda6.allPass([
  import_ramda6.compose(isDotDelimiter, lstToken, fst2),
  isPairAbbreviation
]);
// True when the word is an own entry of one of the single-word abbreviation
// tables. `has` checks own properties only. A plain property read also sees
// keys inherited from Object.prototype, so a sentence ending in "constructor."
// or "toString." was treated as ending in an abbreviation and glued to the
// next sentence.
var insideAbbreviationMap = (0, import_ramda6.anyPass)([
  (0, import_ramda6.has)(import_ramda6.__, INITIALS),
  (0, import_ramda6.has)(import_ramda6.__, HEAD),
  (0, import_ramda6.has)(import_ramda6.__, TAIL),
  (0, import_ramda6.has)(import_ramda6.__, OTHER)
]);
// Takes the last token, strips its delimiters, lowercases it, removes leading
// non-letters and looks the result up in the single-word tables.
var isLeftAbbreviation = import_ramda6.compose(
  insideAbbreviationMap,
  omitNonAlphaStart,
  import_ramda6.toLower,
  lstWord,
  lstToken
);
// Join rule over a [left, right] pair: the last token on the left ends with a
// single dot and is a known abbreviation.
var leftAbbreviation = import_ramda6.compose(
  import_ramda6.allPass([
    import_ramda6.compose(isDotDelimiter, lstToken),
    isLeftAbbreviation
  ]),
  fst2
);
// Unchanged by uppercasing and longer than one character.
var isCaps = import_ramda6.allPass([
  isUpper,
  import_ramda6.compose(import_ramda6.lt(1), import_ramda6.length)
]);
// The first word on the right starts with a lowercase letter or is all caps.
var rightLowercaseOrCaps = import_ramda6.compose(
  import_ramda6.anyPass([startsWithLower, isCaps]),
  fstWord,
  snd2
);
/**
 * before(s)(t) returns the part of `s` that precedes the LAST occurrence of
 * `t`, or "" when `t` is empty or does not occur in `s`.
 *
 * Callers pass the trailing word of `s`. Searching from the start (indexOf)
 * cut the string at an earlier occurrence of that word, e.g. at the first "m"
 * in "m 7 a.m. to 9 p. m.", which lost the text in front of the final word.
 */
var before = (s) => (t) => {
  // An empty needle keeps the previous result ("").
  const at = t === "" ? -1 : s.lastIndexOf(t);
  return s.slice(0, Math.max(at, 0));
};
/**
 * Checks whether `left` ends with the second half of a two-part abbreviation.
 *
 * Two shapes are tried: the word preceding the last word combined with the
 * last word (parts separated by whitespace), and the last word split on "."
 * (parts written together).
 */
var isLeftPairsTail = (left) => {
  const textBefore = before(left);
  // Word that precedes the last word of `left`, with its delimiters stripped.
  const previousWord = import_ramda6.compose(words, lstWord, textBefore, lstWord, lstToken);
  const spacedPair = isPairAbbreviation([previousWord(left), lstWord(left)]);
  const dottedPair = isPairAbbreviation(lstWord(left).split("."));
  return spacedPair || dottedPair;
};
// Join rule over a [left, right] pair: the last token on the left ends with a
// single dot, the left side ends with a two-part abbreviation, and the first
// word on the right is lowercase or all caps.
var leftPairsTailAbbreviation = import_ramda6.allPass([
  import_ramda6.compose(isDotDelimiter, lstToken, fst2),
  import_ramda6.compose(isLeftPairsTail, fst2),
  rightLowercaseOrCaps
]);
// src/index.ts
// Only a 20-character window on each side of a boundary is shown to the rules.
var leftPreprocessor = lstChars(20);
var rightPreprocessor = fstChars(20);
var sidesPreprocessors = [leftPreprocessor, rightPreprocessor];
// Any of these rules is enough to glue two neighbouring fragments together.
var joinCondition = import_ramda7.anyPass([
  spaceBothSides,
  rightLacksSpacePrefix,
  rightStartsWithLowercase,
  rightDelimiterPrefix,
  rightQuotationGenericPrefix,
  rightQuotationClosePrefix,
  rightBracketsClosePrefix,
  rightOnlySpaces,
  leftInitials,
  leftAbbreviation,
  pairAbbreviation,
  leftPairsTailAbbreviation
]);
// Any of these rules forces a split; sentenize checks them before the join rules.
var breakCondition = import_ramda7.anyPass([
  leftEndsWithHardbreak,
  rightStartsWithHardbreak,
  rightStartsNewlineUppercased
]);
// Both entry points take a [left, right] pair and trim each side to its window first.
var join2 = import_ramda7.compose(
  joinCondition,
  import_ramda7.zipWith(import_ramda7.call, sidesPreprocessors)
);
var breaks = import_ramda7.compose(
  breakCondition,
  import_ramda7.zipWith(import_ramda7.call, sidesPreprocessors)
);
/**
 * Splits `text` into sentences.
 *
 * The text is first cut at runs of two or more newlines (optionally mixed
 * with other whitespace). The separators are kept as parts of their own
 * because the split pattern captures them. Each part is broken into candidate
 * chunks, and neighbouring chunks are merged while no break rule fires and at
 * least one join rule does.
 *
 * @param text Input string.
 * @returns Array of sentence strings, in input order.
 */
function sentenize(text) {
  const result = [];
  const paragraphs = text.split(/((?:\n\s*){2,})/);
  paragraphs.forEach((paragraph) => {
    let pending = null;
    sentences(paragraph).forEach((chunk) => {
      if (!pending) {
        pending = chunk;
        return;
      }
      const sides = [pending, chunk];
      // Break rules win: the join rules are not consulted when one fires.
      const glue = !breaks(sides) && join2(sides);
      if (glue) {
        pending += chunk;
        return;
      }
      result.push(pending);
      pending = chunk;
    });
    if (pending) {
      result.push(pending);
    }
  });
  return result;
}
// Annotate the CommonJS export names for ESM import in node:
// The `0 &&` guard short-circuits, so the assignment below never runs; the
// statement only spells out the export names as a literal object.
0 && (module.exports = {
sentenize
});