UNPKG

punkt

Version:

A port of NLTK's Punkt sentence tokenizer to JS.

39 lines 1.93 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); const utils_js_1 = require("./utils.js"); /** * Stores variables, mostly regular expressions, which may be * language-dependent for correct application of the algorithm. * An extension of this class may modify its properties to suit * a language other than English; an instance can then be passed * as an argument to PunktSentenceTokenizer and PunktTrainer * constructors. */ class PunktLanguageVars { constructor(sentEndChars) { /** * Characters which are candidates for sentence boundaries */ this.sentEndChars = [".", "?", "!"]; /** * Regular expression for boundary realignment */ this.reBoundaryRealignment = /["'\])}]+(?:\s+|(?=--)|$)/m; /** * Excludes some characters from starting word tokens */ this.reWordStart = /[^("`{[:;&#*@)}\]\-,]/; /** * Hyphen and ellipsis are multi-character punctuation */ this.reMultiCharPunct = /(?:-{2,}|\.{2,}|(?:\.\s){2,}\.)/; if (sentEndChars) this.sentEndChars = sentEndChars; this.reSentEndChars = new RegExp(`[${(0, utils_js_1.escapeRegExp)(this.sentEndChars.join(""))}]`); this.reNonWordChars = new RegExp(`(?:[)\\";\\]\\*:@\\'\\({\\[${(0, utils_js_1.escapeRegExp)(this.sentEndChars.filter(c => c !== ".").join(""))}])`); this.reWordTokenizer = new RegExp(`(${this.reMultiCharPunct.source}|(?=${this.reWordStart.source})\\S+?(?=\\s|$|${this.reNonWordChars.source}|${this.reMultiCharPunct.source}|,(?=$|\\s|${this.reNonWordChars.source}|${this.reMultiCharPunct.source}))|\\S)`, "g"); this.rePeriodContext = new RegExp(`${this.reSentEndChars.source}(?=(?<after_tok>${this.reNonWordChars.source}|\\s+(?<next_tok>\\S+)))`, "g"); } } exports.default = PunktLanguageVars; //# sourceMappingURL=languageVars.js.map