punkt

Version:

A port of NLTK's Punkt sentence tokenizer to JS.

44 lines • 1.28 kB

TypeScript

/**
 * Stores variables, mostly regular expressions, which may be
 * language-dependent for correct application of the algorithm.
 *
 * An extension of this class may modify its properties to suit
 * a language other than English; an instance can then be passed
 * as an argument to the PunktSentenceTokenizer and PunktTrainer
 * constructors.
 *
 * This is a type declaration only: no property values or constructor
 * body are visible here, so the concrete defaults (including the actual
 * regular-expression patterns) must be looked up in the implementation.
 */
export default class PunktLanguageVars {
    /**
     * Characters which are candidates for sentence boundaries.
     */
    sentEndChars: string[];
    /**
     * Regular expression for boundary realignment.
     */
    reBoundaryRealignment: RegExp;
    /**
     * Excludes some characters from starting word tokens.
     */
    reWordStart: RegExp;
    /**
     * Matches multi-character punctuation; hyphen and ellipsis are
     * treated as multi-character punctuation.
     */
    reMultiCharPunct: RegExp;
    /**
     * Regular-expression counterpart of the sentence-boundary candidate
     * characters.
     * NOTE(review): presumably derived from `sentEndChars`, so a subclass
     * that changes one likely needs to change the other -- confirm against
     * the implementation.
     */
    reSentEndChars: RegExp;
    /**
     * Regular expression for non-word characters.
     */
    reNonWordChars: RegExp;
    /**
     * Regular expression for word tokenization.
     * NOTE(review): may be composed from the other patterns declared on
     * this class -- not verifiable from the declaration alone.
     */
    reWordTokenizer: RegExp;
    /**
     * Regular expression to find contexts including possible sentence
     * boundaries.
     */
    rePeriodContext: RegExp;
    /**
     * Takes no arguments. Any initialisation of the properties above
     * happens in the implementation, which is not part of this declaration.
     */
    constructor();
}
//# sourceMappingURL=languageVars.d.ts.map