UNPKG

punkt

Version:

A port of NLTK's Punkt sentence tokenizer to JS.

36 lines 1.8 kB
import { escapeRegExp } from "./utils.js";

/**
 * Stores variables, mostly regular expressions, which may be
 * language-dependent for correct application of the algorithm.
 * An extension of this class may modify its properties to suit
 * a language other than English; an instance can then be passed
 * as an argument to PunktSentenceTokenizer and PunktTrainer
 * constructors.
 */
export default class PunktLanguageVars {
  /**
   * Builds the set of patterns derived from the sentence-ending characters.
   *
   * @param {string[]} [sentEndChars] - Candidate sentence-boundary
   *   characters. Any falsy value selects the default `[".", "?", "!"]`.
   *   The entries are joined into a single regex character class, so each
   *   entry is effectively treated as a set of individual characters.
   */
  constructor(sentEndChars) {
    /**
     * Characters which are candidates for sentence boundaries
     */
    this.sentEndChars = sentEndChars || [".", "?", "!"];

    /**
     * Regular expression for boundary realignment
     */
    this.reBoundaryRealignment = /["'\])}]+(?:\s+|(?=--)|$)/m;

    /**
     * Excludes some characters from starting word tokens
     */
    this.reWordStart = /[^("`{[:;&#*@)}\]\-,]/;

    /**
     * Hyphen and ellipsis are multi-character punctuation
     */
    this.reMultiCharPunct = /(?:-{2,}|\.{2,}|(?:\.\s){2,}\.)/;

    /**
     * Matches any single sentence-ending character
     */
    this.reSentEndChars = new RegExp(
      `[${escapeRegExp(this.sentEndChars.join(""))}]`,
    );

    // The period is left out of the non-word set; every other sentence-ending
    // character is included alongside the fixed punctuation below.
    const nonPeriodEndChars = escapeRegExp(
      this.sentEndChars.filter((c) => c !== ".").join(""),
    );

    /**
     * Characters that cannot appear inside a word token. The closing brace
     * is part of this set so that it is handled like the other closing
     * brackets, in line with reBoundaryRealignment and reWordStart.
     */
    this.reNonWordChars = new RegExp(
      `(?:[)\\";}\\]\\*:@\\'\\({\\[${nonPeriodEndChars}])`,
    );

    const multiChar = this.reMultiCharPunct.source;
    const wordStart = this.reWordStart.source;
    const nonWord = this.reNonWordChars.source;
    const sentEnd = this.reSentEndChars.source;

    /**
     * Splits text into word tokens. A token is, in order of preference:
     * a multi-character punctuation run; a run of non-whitespace that begins
     * with an allowed word-start character and stops before whitespace,
     * end of input, a non-word character, multi-character punctuation, or a
     * word-final comma; or any single non-whitespace character.
     *
     * Carries the `g` flag, so `lastIndex` is stateful across `exec`/`test`.
     */
    this.reWordTokenizer = new RegExp(
      `(${multiChar}|(?=${wordStart})\\S+?(?=\\s|$|${nonWord}|${multiChar}|,(?=$|\\s|${nonWord}|${multiChar}))|\\S)`,
      "g",
    );

    /**
     * Finds a sentence-ending character together with its right-hand context.
     * The context sits inside a lookahead, so the match itself is only the
     * sentence-ending character. The named group `after_tok` captures either
     * a following non-word character or the whitespace plus next token, and
     * `next_tok` captures that next token when present.
     *
     * Carries the `g` flag, so `lastIndex` is stateful across `exec`/`test`.
     */
    this.rePeriodContext = new RegExp(
      `${sentEnd}(?=(?<after_tok>${nonWord}|\\s+(?<next_tok>\\S+)))`,
      "g",
    );
  }
}
//# sourceMappingURL=languageVars.js.map