UNPKG

punkt

Version:

A port of NLTK's Punkt sentence tokenizer to JS.

85 lines 2.4 kB
/**
 * Returns the first full code point of `tok` as a string, or "" when `tok`
 * is empty. Reading by code point (not `tok[0]`) keeps astral-plane
 * characters intact instead of yielding a lone surrogate.
 *
 * @param {string} tok
 * @returns {string}
 */
function leadingChar(tok) {
  const codePoint = tok.codePointAt(0);
  return codePoint === undefined ? "" : String.fromCodePoint(codePoint);
}

/**
 * A single token of text annotated with the flags the Punkt sentence
 * tokenizer attaches to it (paragraph/line start, sentence break,
 * abbreviation, ellipsis).
 */
class PunktToken {
  /**
   * @param {string} tok - Raw token text.
   * @param {object} [params] - Optional initial values for the mutable flags
   *   `parastart`, `linestart`, `sentbreak`, `abbr` and `ellipsis`. A key that
   *   is present with a null/undefined value is stored as `false`. `null` is
   *   accepted and treated like an empty object.
   */
  constructor(tok, params = {}) {
    this.parastart = false;
    this.linestart = false;
    this.sentbreak = false;
    this.abbr = false;
    this.ellipsis = false;

    this.tok = tok;
    this.type = this.getType(tok);
    this.periodFinal = tok.endsWith(".");

    // The default parameter only covers `undefined`; guard `null` as well so
    // the `in` check below cannot throw.
    const overrides = params == null ? {} : params;
    const mutableProps = [
      "parastart",
      "linestart",
      "sentbreak",
      "abbr",
      "ellipsis",
    ];
    for (const prop of mutableProps) {
      if (prop in overrides) {
        const value = overrides[prop];
        this[prop] = value == null ? false : value;
      }
    }
  }

  /**
   * Computes the normalized "type" of a token: lower-cased, with a token
   * that is entirely numeric replaced by the placeholder `##number##`.
   *
   * @param {string} tok
   * @returns {string}
   */
  getType(tok) {
    return tok.toLowerCase().replace(PunktToken.RE_NUMERIC, "##number##");
  }

  /**
   * The type with one trailing period removed (types of length 1 are left
   * untouched so that "." stays ".").
   *
   * @returns {string}
   */
  get typeNoPeriod() {
    if (this.type.length > 1 && this.type.endsWith(".")) {
      return this.type.slice(0, -1);
    }
    return this.type;
  }

  /**
   * The type with its trailing period removed only when this token is
   * flagged as a sentence break; otherwise the type unchanged.
   *
   * @returns {string}
   */
  get typeNoSentPeriod() {
    return this.sentbreak ? this.typeNoPeriod : this.type;
  }

  /**
   * True when the token starts with an upper-case character. Uncased
   * characters (digits, punctuation) and the empty token yield false.
   *
   * @returns {boolean}
   */
  get firstUpper() {
    const ch = leadingChar(this.tok);
    // Requiring the lower-case form to differ excludes uncased characters,
    // which are equal to both of their case conversions.
    return ch !== "" && ch === ch.toUpperCase() && ch !== ch.toLowerCase();
  }

  /**
   * True when the token starts with a lower-case character. Uncased
   * characters (digits, punctuation) and the empty token yield false.
   *
   * @returns {boolean}
   */
  get firstLower() {
    const ch = leadingChar(this.tok);
    return ch !== "" && ch === ch.toLowerCase() && ch !== ch.toUpperCase();
  }

  /**
   * Case of the first character.
   *
   * @returns {"upper" | "lower" | "none"}
   */
  get firstCase() {
    if (this.firstUpper) {
      return "upper";
    }
    if (this.firstLower) {
      return "lower";
    }
    return "none";
  }

  /** @returns {boolean} True when the token consists of two or more periods. */
  get isEllipsis() {
    return PunktToken.RE_ELLIPSIS.test(this.tok);
  }

  /** @returns {boolean} True when the type starts with `##number##`. */
  get isNumber() {
    return this.type.startsWith("##number##");
  }

  /** @returns {boolean} True when the token is one letter followed by a period. */
  get isInitial() {
    return PunktToken.RE_INITIAL.test(this.tok);
  }

  /** @returns {boolean} True when the token consists only of letters. */
  get isAlpha() {
    return PunktToken.RE_ALPHA.test(this.tok);
  }

  /** @returns {boolean} True when the type contains at least one letter. */
  get isNonPunct() {
    return PunktToken.RE_NON_PUNCT.test(this.type);
  }

  /**
   * Renders the token text followed by `<A>`, `<E>` and `<S>` markers for the
   * abbreviation, ellipsis and sentence-break flags, in that order.
   *
   * @returns {string}
   */
  toString() {
    let res = this.tok;
    if (this.abbr) {
      res += "<A>";
    }
    if (this.ellipsis) {
      res += "<E>";
    }
    if (this.sentbreak) {
      res += "<S>";
    }
    return res;
  }
}

// NOTE: in JavaScript `\W` is ASCII-based even under the `u` flag, so the
// `[^\W\d]` classes below match ASCII letters and underscore only.

// Anchored at both ends so only tokens that are numeric as a whole are
// replaced; a numeric suffix inside a word (e.g. "covid-19.") is kept.
PunktToken.RE_NUMERIC = /^-?[.,]?\d[\d,.-]*\.?$/;
PunktToken.RE_ELLIPSIS = /^\.\.+$/;
PunktToken.RE_INITIAL = /^[^\W\d]\.$/u;
PunktToken.RE_ALPHA = /^[^\W\d]+$/u;
PunktToken.RE_NON_PUNCT = /[^\W\d]/;

export default PunktToken;
//# sourceMappingURL=token.js.map