punkt
Version:
A port of NLTK's Punkt sentence tokenizer to JS.
36 lines • 1.8 kB
JavaScript
import { escapeRegExp } from "./utils.js";
/**
 * Stores variables, mostly regular expressions, which may be
 * language-dependent for correct application of the algorithm.
 * An extension of this class may modify its properties to suit
 * a language other than English; an instance can then be passed
 * as an argument to PunktSentenceTokenizer and PunktTrainer
 * constructors.
 *
 * Note: `reWordTokenizer` and `rePeriodContext` are compiled with the
 * global flag, so `exec`/`test` on them advance `lastIndex`; reset it or
 * use `matchAll`/`match` when reusing an instance.
 */
export default class PunktLanguageVars {
  /**
   * @param {Iterable<string>} [sentEndChars] - Characters that are candidates
   *   for sentence boundaries. Any falsy value selects the default
   *   `[".", "?", "!"]`. The value is copied, so an array, a string, or any
   *   other iterable of single characters is accepted.
   */
  constructor(sentEndChars) {
    /**
     * Characters which are candidates for sentence boundaries.
     * Stored as a private copy so that later mutation of the caller's
     * array cannot drift out of sync with the regexes compiled below.
     */
    this.sentEndChars = sentEndChars ? [...sentEndChars] : [".", "?", "!"];
    /**
     * Regular expression for boundary realignment
     */
    this.reBoundaryRealignment = /["'\])}]+(?:\s+|(?=--)|$)/m;
    /**
     * Excludes some characters from starting word tokens
     */
    this.reWordStart = /[^("`{[:;&#*@)}\]\-,]/;
    /**
     * Hyphen and ellipsis are multi-character punctuation
     */
    this.reMultiCharPunct = /(?:-{2,}|\.{2,}|(?:\.\s){2,}\.)/;

    // Character-class bodies derived from the sentence-ending characters.
    const sentEndClass = escapeRegExp(this.sentEndChars.join(""));
    const nonPeriodSentEndClass = escapeRegExp(
      this.sentEndChars.filter((c) => c !== ".").join(""),
    );

    /**
     * Matches a single sentence-ending character
     */
    this.reSentEndChars = new RegExp(`[${sentEndClass}]`);
    /**
     * Characters that cannot appear within words: brackets, quotes and
     * similar punctuation plus every sentence-ending character except the
     * period. The closing brace is listed alongside `)` and `]` so that it
     * terminates a word just like the other closing brackets do.
     */
    this.reNonWordChars = new RegExp(
      `(?:[)\\";}\\]\\*:@\\'\\({\\[${nonPeriodSentEndClass}])`,
    );

    const multiChar = this.reMultiCharPunct.source;
    const wordStart = this.reWordStart.source;
    const nonWord = this.reNonWordChars.source;

    /**
     * Word tokenizer: multi-character punctuation, or a word running up to
     * whitespace / end of input / punctuation (a comma only counts when it
     * is itself followed by one of those), or any single non-space char.
     */
    this.reWordTokenizer = new RegExp(
      `(${multiChar}|(?=${wordStart})\\S+?(?=\\s|$|${nonWord}|${multiChar}|,(?=$|\\s|${nonWord}|${multiChar}))|\\S)`,
      "g",
    );
    /**
     * A potential sentence end together with its right-hand context:
     * `after_tok` is either a non-word character or whitespace followed by
     * the next token, which is also captured as `next_tok`.
     */
    this.rePeriodContext = new RegExp(
      `${this.reSentEndChars.source}(?=(?<after_tok>${nonWord}|\\s+(?<next_tok>\\S+)))`,
      "g",
    );
  }
}
//# sourceMappingURL=languageVars.js.map