punkt
Version:
A port of NLTK's Punkt sentence tokenizer to JS.
44 lines • 1.28 kB
TypeScript
/**
* Stores variables, mostly regular expressions, which may be
* language-dependent for correct application of the algorithm.
*
* An extension of this class may modify its properties to suit
* a language other than English; an instance can then be passed
* as an argument to the PunktSentenceTokenizer and PunktTrainer
* constructors.
*
* This is a type declaration only: it lists the shape of the class
* (one string array and seven regular expressions) but not the values
* assigned to them, which live in the implementation.
*/
export default class PunktLanguageVars {
/**
* Characters which are candidates for sentence boundaries,
* held as an array of strings.
*/
sentEndChars: string[];
/**
* Regular expression used for boundary realignment.
*/
reBoundaryRealignment: RegExp;
/**
* Regular expression that excludes some characters from
* starting word tokens.
*/
reWordStart: RegExp;
/**
* Regular expression for multi-character punctuation
* (hyphen and ellipsis are multi-character punctuation).
*/
reMultiCharPunct: RegExp;
/**
* Regular-expression counterpart of the sentence-boundary
* candidate characters.
* NOTE(review): presumably built from `sentEndChars` — confirm
* against the implementation, which is not visible here.
*/
reSentEndChars: RegExp;
/**
* Regular expression for non-word characters.
*/
reNonWordChars: RegExp;
/**
* Regular expression for word tokenization.
*/
reWordTokenizer: RegExp;
/**
* Regular expression to find contexts including possible
* sentence boundaries.
*/
rePeriodContext: RegExp;
/**
* Creates a new set of language variables; takes no arguments.
* NOTE(review): presumably initializes every property above with
* defaults suited to English (see the class comment) — confirm
* against the implementation.
*/
constructor();
}
//# sourceMappingURL=languageVars.d.ts.map