punkt
A port of NLTK's Punkt sentence tokenizer to JS.
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.PunktTokenizer = exports.PunktParameters = exports.PunktLanguageVars = void 0;
const utils_js_1 = require("./utils.js");
const languageVars_js_1 = require("./languageVars.js");
exports.PunktLanguageVars = languageVars_js_1.default;
const parameters_js_1 = require("./parameters.js");
exports.PunktParameters = parameters_js_1.default;
const token_js_1 = require("./token.js");
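// Orthographic context flags (compiled TypeScript enum). "Upper" and "Lower"
// are bitmask unions of their Begin/Middle/Unknown variants.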
var OrthoContext;
(function (OrthoContext) {
OrthoContext[OrthoContext["BeginUpper"] = 2] = "BeginUpper";
OrthoContext[OrthoContext["MiddleUpper"] = 4] = "MiddleUpper";
OrthoContext[OrthoContext["UnknownUpper"] = 8] = "UnknownUpper";
OrthoContext[OrthoContext["BeginLower"] = 16] = "BeginLower";
OrthoContext[OrthoContext["MiddleLower"] = 32] = "MiddleLower";
OrthoContext[OrthoContext["UnknownLower"] = 64] = "UnknownLower";
OrthoContext[OrthoContext["Upper"] = 14] = "Upper";
OrthoContext[OrthoContext["Lower"] = 112] = "Lower";
})(OrthoContext || (OrthoContext = {}));
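// Sentence boundary detector driven by trained Punkt parameters
// (abbreviations, collocations, sentence starters, orthographic context).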
class PunktTokenizer {
constructor(params, langVars) {
this.langVars = new languageVars_js_1.default();
this.params = params;
if (langVars)
this.langVars = langVars;
}
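    // Split plaintext into word tokens line by line; the first token of each
    // non-empty line is tagged linestart, and parastart after a blank line.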
*tokenizeWords(plaintext) {
let parastart = false;
for (const line of plaintext.split("\n")) {
if (line.trim()) {
const lineToks = (0, utils_js_1.matchAll)(line, this.langVars.reWordTokenizer);
const tok = lineToks.next();
if (tok.done)
continue;
yield new token_js_1.default(tok.value[0], { parastart, linestart: true });
parastart = false;
for (const tok of lineToks) {
yield new token_js_1.default(tok[0]);
}
}
else {
parastart = true;
}
}
}
*annotateFirstPass(tokens) {
for (const tok of tokens) {
this.firstPassAnnotation(tok);
yield tok;
}
}
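    // First-pass classification of a single token: sentence-end character,
    // ellipsis, known abbreviation (also checking the part after a final
    // hyphen), or otherwise a sentence break for period-final tokens.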
firstPassAnnotation(augTok) {
var _a;
const tok = augTok.tok;
if (this.langVars.sentEndChars.indexOf(tok) !== -1) {
augTok.sentbreak = true;
}
else if (augTok.isEllipsis) {
augTok.ellipsis = true;
}
else if (augTok.periodFinal && !tok.endsWith("..")) {
if (this.params.abbreviations.indexOf(tok.slice(0, -1).toLowerCase()) !==
-1 ||
this.params.abbreviations.indexOf((_a = tok.slice(0, -1).toLowerCase().split("-").pop()) !== null && _a !== void 0 ? _a : "") !== -1) {
augTok.abbr = true;
}
else {
augTok.sentbreak = true;
}
}
}
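    // Yield [start, end) character offsets for each detected sentence,
    // optionally realigning boundaries around trailing punctuation.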
*spanTokenize(text, realignBoundaries = true) {
let slices = this.slicesFromText(text);
if (realignBoundaries)
slices = this.realignBoundaries(text, slices);
for (const sentence of slices) {
yield sentence;
}
}
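    // Yield the sentence substrings for the slices produced by spanTokenize.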
*sentencesFromText(text, realignBoundaries = true) {
for (const slice of this.spanTokenize(text, realignBoundaries)) {
yield text.slice(slice[0], slice[1]);
}
}
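    // Index of the last whitespace character in text, or 0 if there is none.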
getLastWhitespaceIndex(text) {
for (let i = text.length - 1; i >= 0; i--) {
            if ([" ", "\n", "\t", "\r", "\x0b", "\x0c"].indexOf(text[i]) !== -1)
return i;
}
return 0;
}
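    // For each match of the period-context regex, yield the match paired with
    // a context string: the word preceding it, plus the matched text and its
    // first capture group.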
*matchPotentialEndContexts(text) {
let previousSlice = [0, 0];
let previousMatch = null;
for (const match of (0, utils_js_1.matchAll)(text, this.langVars.rePeriodContext)) {
const before = text.slice(previousSlice[1], match.index);
let indexAfterLastSpace = this.getLastWhitespaceIndex(before);
if (indexAfterLastSpace) {
indexAfterLastSpace += previousSlice[1] + 1;
}
else {
indexAfterLastSpace = previousSlice[0];
}
const prevWordSlice = [indexAfterLastSpace, match.index];
if (previousMatch && previousSlice[1] <= prevWordSlice[0]) {
yield [
previousMatch,
text.slice(previousSlice[0], previousSlice[1]) +
previousMatch[0] +
previousMatch[1],
];
}
previousMatch = match;
previousSlice = prevWordSlice;
}
if (previousMatch) {
yield [
previousMatch,
text.slice(previousSlice[0], previousSlice[1]) +
previousMatch[0] +
previousMatch[1],
];
}
}
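    // Convert potential end contexts into sentence slices: emit a slice
    // whenever annotating the context finds a sentence break, starting the
    // next sentence at the following captured token when one is present.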
*slicesFromText(text) {
let lastBreak = 0;
for (const [match, context] of this.matchPotentialEndContexts(text)) {
if (this.textContainsSentbreak(context)) {
yield [lastBreak, match.index + match[0].length];
if (match[2]) {
lastBreak = match.index + text.slice(match.index).indexOf(match[2]);
}
else {
lastBreak = match.index + match[0].length;
}
}
}
yield [lastBreak, text.replace(/\s+$/, "").length];
}
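    // Move boundary material matched by reBoundaryRealignment at the start of
    // a sentence (e.g. closing quotes) onto the end of the preceding sentence.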
*realignBoundaries(text, slices) {
let realign = 0;
for (const [sentence1, sentence2] of (0, utils_js_1.pairIter)(slices)) {
const [start1, end1] = sentence1;
const start1R = start1 + realign;
if (!sentence2) {
if (text.slice(start1R, end1)) {
yield [start1R, end1];
}
continue;
}
const [start2, end2] = sentence2;
const match = text
.slice(start2, end2)
.match(this.langVars.reBoundaryRealignment);
if (match && match.index === 0) {
const realignLen = match[0].replace(/\s+$/, "").length;
yield [start1R, start2 + realignLen];
realign = realignLen;
}
else {
realign = 0;
if (text.slice(start1R, end1)) {
yield [start1R, end1];
}
}
}
}
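    // True if annotating the text marks a sentence break on any token other
    // than the last one.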
textContainsSentbreak(text) {
let found = false;
for (const tok of this.annotateTokens(this.tokenizeWords(text))) {
if (found) {
return true;
}
if (tok.sentbreak) {
found = true;
}
}
return false;
}
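    // Chain the first- and second-pass annotators over a token stream.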
annotateTokens(tokens) {
const firstPass = this.annotateFirstPass(tokens);
const secondPass = this.annotateSecondPass(firstPass);
return secondPass;
}
*annotateSecondPass(tokens) {
for (const [prev, tok] of (0, utils_js_1.pairIter)(tokens)) {
this.secondPassAnnotation(prev, tok);
yield prev;
}
}
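    // Second-pass heuristics for a period-final token: known collocations,
    // abbreviations followed by likely sentence starters, and orthographic
    // evidence for initials and numbers.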
secondPassAnnotation(augTok1, augTok2) {
if (!augTok2) {
return;
}
if (!augTok1.periodFinal) {
return;
}
const typ = augTok1.typeNoPeriod;
const nextTyp = augTok2.typeNoSentPeriod;
const tokIsInitial = augTok1.isInitial;
if (this.params.collocations.has(typ) &&
this.params.collocations.get(typ) === nextTyp) {
augTok1.sentbreak = false;
augTok1.abbr = true;
return;
}
const isSentStarter = this.orthoHeuristic(augTok2);
if ((augTok1.abbr || augTok1.isEllipsis) && !tokIsInitial) {
if (isSentStarter === true) {
augTok1.sentbreak = true;
return;
}
if (augTok2.firstUpper &&
this.params.sentStarters.indexOf(nextTyp) !== -1) {
augTok1.sentbreak = true;
return;
}
}
if (tokIsInitial || typ === "##number##") {
if (!isSentStarter) {
augTok1.sentbreak = false;
augTok1.abbr = true;
return;
}
const orthoContext = this.params.orthoContext.get(nextTyp) || 0;
if (isSentStarter === "unknown" &&
tokIsInitial &&
augTok2.firstUpper &&
!(orthoContext & OrthoContext.Lower)) {
augTok1.sentbreak = false;
augTok1.abbr = true;
return;
}
}
}
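    // Use the orthographic context of a token to decide whether it starts a
    // sentence; returns true, false, or "unknown".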
orthoHeuristic(augTok) {
if (";:,.!?".split("").indexOf(augTok.tok) !== -1) {
return false;
}
const orthoContext = this.params.orthoContext.get(augTok.typeNoSentPeriod);
if (!orthoContext) {
return "unknown";
}
if (augTok.firstUpper &&
orthoContext & OrthoContext.Lower &&
!(orthoContext & OrthoContext.MiddleUpper)) {
return true;
}
if (augTok.firstLower &&
(orthoContext & OrthoContext.Upper ||
!(orthoContext & OrthoContext.BeginLower))) {
return false;
}
return "unknown";
}
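    // Tokenize text into an array of sentence strings.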
tokenize(text, realignBoundaries = true) {
return Array.from(this.sentencesFromText(text, realignBoundaries));
}
}
exports.PunktTokenizer = PunktTokenizer;
//# sourceMappingURL=index.js.map
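A minimal usage sketch for the exports above. The require path and the way the parameters are populated are assumptions: this file only defines the tokenizer, and real use needs a PunktParameters instance trained on, or shipped for, the target language.

const { PunktTokenizer, PunktParameters } = require("punkt"); // assumed package entry point

// Assumption: a fresh PunktParameters exposes the fields the tokenizer reads
// (an abbreviations array, collocations/orthoContext maps, a sentStarters array).
const params = new PunktParameters();
params.abbreviations.push("dr", "mr"); // hypothetical hand-seeded abbreviations

const tokenizer = new PunktTokenizer(params);
const sentences = tokenizer.tokenize("Dr. Watson arrived late. He apologized.");
// Expected shape: an array of sentence strings, e.g.
// ["Dr. Watson arrived late.", "He apologized."]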