UNPKG

punkt

Version:

A port of NLTK's Punkt sentence tokenizer to JS.

github.com/noahcoolboy/punkt

noahcoolboy/punkt

262 lines • 9.55 kB

JavaScript

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.PunktTokenizer = exports.PunktParameters = exports.PunktLanguageVars = void 0; const utils_js_1 = require("./utils.js"); const languageVars_js_1 = require("./languageVars.js"); exports.PunktLanguageVars = languageVars_js_1.default; const parameters_js_1 = require("./parameters.js"); exports.PunktParameters = parameters_js_1.default; const token_js_1 = require("./token.js"); var OrthoContext; (function (OrthoContext) { OrthoContext[OrthoContext["BeginUpper"] = 2] = "BeginUpper"; OrthoContext[OrthoContext["MiddleUpper"] = 4] = "MiddleUpper"; OrthoContext[OrthoContext["UnknownUpper"] = 8] = "UnknownUpper"; OrthoContext[OrthoContext["BeginLower"] = 16] = "BeginLower"; OrthoContext[OrthoContext["MiddleLower"] = 32] = "MiddleLower"; OrthoContext[OrthoContext["UnknownLower"] = 64] = "UnknownLower"; OrthoContext[OrthoContext["Upper"] = 14] = "Upper"; OrthoContext[OrthoContext["Lower"] = 112] = "Lower"; })(OrthoContext || (OrthoContext = {})); class PunktTokenizer { constructor(params, langVars) { this.langVars = new languageVars_js_1.default(); this.params = params; if (langVars) this.langVars = langVars; } *tokenizeWords(plaintext) { let parastart = false; for (const line of plaintext.split("\n")) { if (line.trim()) { const lineToks = (0, utils_js_1.matchAll)(line, this.langVars.reWordTokenizer); const tok = lineToks.next(); if (tok.done) continue; yield new token_js_1.default(tok.value[0], { parastart, linestart: true }); parastart = false; for (const tok of lineToks) { yield new token_js_1.default(tok[0]); } } else { parastart = true; } } } *annotateFirstPass(tokens) { for (const tok of tokens) { this.firstPassAnnotation(tok); yield tok; } } firstPassAnnotation(augTok) { var _a; const tok = augTok.tok; if (this.langVars.sentEndChars.indexOf(tok) !== -1) { augTok.sentbreak = true; } else if (augTok.isEllipsis) { augTok.ellipsis = true; } else if (augTok.periodFinal && !tok.endsWith("..")) { if (this.params.abbreviations.indexOf(tok.slice(0, -1).toLowerCase()) !== -1 || this.params.abbreviations.indexOf((_a = tok.slice(0, -1).toLowerCase().split("-").pop()) !== null && _a !== void 0 ? _a : "") !== -1) { augTok.abbr = true; } else { augTok.sentbreak = true; } } } *spanTokenize(text, realignBoundaries = true) { let slices = this.slicesFromText(text); if (realignBoundaries) slices = this.realignBoundaries(text, slices); for (const sentence of slices) { yield sentence; } } *sentencesFromText(text, realignBoundaries = true) { for (const slice of this.spanTokenize(text, realignBoundaries)) { yield text.slice(slice[0], slice[1]); } } getLastWhitespaceIndex(text) { for (let i = text.length - 1; i >= 0; i--) { if ([" ", "\n", "\t", "\r", "\0x0b", "\0x0c"].indexOf(text[i]) !== -1) return i; } return 0; } *matchPotentialEndContexts(text) { let previousSlice = [0, 0]; let previousMatch = null; for (const match of (0, utils_js_1.matchAll)(text, this.langVars.rePeriodContext)) { const before = text.slice(previousSlice[1], match.index); let indexAfterLastSpace = this.getLastWhitespaceIndex(before); if (indexAfterLastSpace) { indexAfterLastSpace += previousSlice[1] + 1; } else { indexAfterLastSpace = previousSlice[0]; } const prevWordSlice = [indexAfterLastSpace, match.index]; if (previousMatch && previousSlice[1] <= prevWordSlice[0]) { yield [ previousMatch, text.slice(previousSlice[0], previousSlice[1]) + previousMatch[0] + previousMatch[1], ]; } previousMatch = match; previousSlice = prevWordSlice; } if (previousMatch) { yield [ previousMatch, text.slice(previousSlice[0], previousSlice[1]) + previousMatch[0] + previousMatch[1], ]; } } *slicesFromText(text) { let lastBreak = 0; for (const [match, context] of this.matchPotentialEndContexts(text)) { if (this.textContainsSentbreak(context)) { yield [lastBreak, match.index + match[0].length]; if (match[2]) { lastBreak = match.index + text.slice(match.index).indexOf(match[2]); } else { lastBreak = match.index + match[0].length; } } } yield [lastBreak, text.replace(/\s+$/, "").length]; } *realignBoundaries(text, slices) { let realign = 0; for (const [sentence1, sentence2] of (0, utils_js_1.pairIter)(slices)) { const [start1, end1] = sentence1; const start1R = start1 + realign; if (!sentence2) { if (text.slice(start1R, end1)) { yield [start1R, end1]; } continue; } const [start2, end2] = sentence2; const match = text .slice(start2, end2) .match(this.langVars.reBoundaryRealignment); if (match && match.index === 0) { const realignLen = match[0].replace(/\s+$/, "").length; yield [start1R, start2 + realignLen]; realign = realignLen; } else { realign = 0; if (text.slice(start1R, end1)) { yield [start1R, end1]; } } } } textContainsSentbreak(text) { let found = false; for (const tok of this.annotateTokens(this.tokenizeWords(text))) { if (found) { return true; } if (tok.sentbreak) { found = true; } } return false; } annotateTokens(tokens) { const firstPass = this.annotateFirstPass(tokens); const secondPass = this.annotateSecondPass(firstPass); return secondPass; } *annotateSecondPass(tokens) { for (const [prev, tok] of (0, utils_js_1.pairIter)(tokens)) { this.secondPassAnnotation(prev, tok); yield prev; } } secondPassAnnotation(augTok1, augTok2) { if (!augTok2) { return; } if (!augTok1.periodFinal) { return; } const typ = augTok1.typeNoPeriod; const nextTyp = augTok2.typeNoSentPeriod; const tokIsInitial = augTok1.isInitial; if (this.params.collocations.has(typ) && this.params.collocations.get(typ) === nextTyp) { augTok1.sentbreak = false; augTok1.abbr = true; return; } const isSentStarter = this.orthoHeuristic(augTok2); if ((augTok1.abbr || augTok1.isEllipsis) && !tokIsInitial) { if (isSentStarter === true) { augTok1.sentbreak = true; return; } if (augTok2.firstUpper && this.params.sentStarters.indexOf(nextTyp) !== -1) { augTok1.sentbreak = true; return; } } if (tokIsInitial || typ === "##number##") { if (!isSentStarter) { augTok1.sentbreak = false; augTok1.abbr = true; return; } const orthoContext = this.params.orthoContext.get(nextTyp) || 0; if (isSentStarter === "unknown" && tokIsInitial && augTok2.firstUpper && !(orthoContext & OrthoContext.Lower)) { augTok1.sentbreak = false; augTok1.abbr = true; return; } } } orthoHeuristic(augTok) { if (";:,.!?".split("").indexOf(augTok.tok) !== -1) { return false; } const orthoContext = this.params.orthoContext.get(augTok.typeNoSentPeriod); if (!orthoContext) { return "unknown"; } if (augTok.firstUpper && orthoContext & OrthoContext.Lower && !(orthoContext & OrthoContext.MiddleUpper)) { return true; } if (augTok.firstLower && (orthoContext & OrthoContext.Upper || !(orthoContext & OrthoContext.BeginLower))) { return false; } return "unknown"; } tokenize(text, realignBoundaries = true) { return Array.from(this.sentencesFromText(text, realignBoundaries)); } } exports.PunktTokenizer = PunktTokenizer; //# sourceMappingURL=index.js.map