punkt
Version:
A port of NLTK's Punkt sentence tokenizer to JS.
85 lines • 2.4 kB
JavaScript
/**
 * A token of text together with the annotation flags attached to it by the
 * Punkt sentence tokenizer (paragraph/line start, sentence break,
 * abbreviation, ellipsis).
 */
class PunktToken {
  /**
   * @param {string} tok - Raw token text.
   * @param {object} [params] - Optional initial values for the annotation
   *   flags `parastart`, `linestart`, `sentbreak`, `abbr` and `ellipsis`.
   *   Any other key is ignored; a `null`/`undefined` value becomes `false`.
   */
  constructor(tok, params = {}) {
    this.parastart = false;
    this.linestart = false;
    this.sentbreak = false;
    this.abbr = false;
    this.ellipsis = false;
    this.tok = tok;
    this.type = this.getType(tok);
    this.periodFinal = tok.endsWith(".");
    const mutableProps = [
      "parastart",
      "linestart",
      "sentbreak",
      "abbr",
      "ellipsis",
    ];
    for (const prop of mutableProps) {
      if (prop in params) {
        const value = params[prop];
        this[prop] = value != null ? value : false;
      }
    }
  }

  /**
   * Builds the normalized "type" of a token: the lower-cased text with the
   * match of `PunktToken.RE_NUMERIC` replaced by the `##number##` placeholder.
   *
   * NOTE(review): RE_NUMERIC is anchored only at the end of the token, so a
   * trailing numeric run inside a longer token ("abc123") is replaced too.
   * Confirm that this is intended.
   *
   * @param {string} tok - Raw token text.
   * @returns {string} The normalized type.
   */
  getType(tok) {
    return tok.toLowerCase().replace(PunktToken.RE_NUMERIC, "##number##");
  }

  /**
   * The type with one trailing period removed. A type that is only a single
   * character (e.g. ".") is returned unchanged.
   *
   * @returns {string}
   */
  get typeNoPeriod() {
    if (this.type.length > 1 && this.type.endsWith(".")) {
      return this.type.slice(0, -1);
    }
    return this.type;
  }

  /**
   * The type with its trailing period removed only when the token is flagged
   * as a sentence break; otherwise the type as-is.
   *
   * @returns {string}
   */
  get typeNoSentPeriod() {
    if (this.sentbreak) {
      return this.typeNoPeriod;
    }
    return this.type;
  }

  /**
   * First character of the raw token as a full code point, or `undefined`
   * when the token is empty.
   *
   * @returns {string|undefined}
   */
  get firstChar() {
    // String iteration yields code points, so an astral character is not
    // split into its surrogate halves.
    const [first] = this.tok;
    return first;
  }

  /**
   * True when the token starts with an upper-case character.
   *
   * Caseless characters (digits, punctuation) and empty tokens are NOT
   * upper-case: a character only counts if it also has a different
   * lower-case form.
   *
   * @returns {boolean}
   */
  get firstUpper() {
    const first = this.firstChar;
    if (first === undefined) {
      return false;
    }
    return first === first.toUpperCase() && first !== first.toLowerCase();
  }

  /**
   * True when the token starts with a lower-case character.
   *
   * Caseless characters (digits, punctuation) and empty tokens are NOT
   * lower-case: a character only counts if it also has a different
   * upper-case form.
   *
   * @returns {boolean}
   */
  get firstLower() {
    const first = this.firstChar;
    if (first === undefined) {
      return false;
    }
    return first === first.toLowerCase() && first !== first.toUpperCase();
  }

  /**
   * Case of the first character: "upper", "lower", or "none" when the token
   * is empty or starts with a caseless character.
   *
   * @returns {"upper"|"lower"|"none"}
   */
  get firstCase() {
    if (this.firstUpper) {
      return "upper";
    }
    if (this.firstLower) {
      return "lower";
    }
    return "none";
  }

  /**
   * True when the raw token consists solely of two or more periods.
   *
   * @returns {boolean}
   */
  get isEllipsis() {
    return PunktToken.RE_ELLIPSIS.test(this.tok);
  }

  /**
   * True when the type starts with the `##number##` placeholder.
   *
   * @returns {boolean}
   */
  get isNumber() {
    return this.type.startsWith("##number##");
  }

  /**
   * True when the raw token is exactly one `[^\W\d]` character followed by a
   * period (e.g. "J.").
   *
   * @returns {boolean}
   */
  get isInitial() {
    return PunktToken.RE_INITIAL.test(this.tok);
  }

  /**
   * True when the raw token is non-empty and made up only of `[^\W\d]`
   * characters.
   *
   * @returns {boolean}
   */
  get isAlpha() {
    return PunktToken.RE_ALPHA.test(this.tok);
  }

  /**
   * True when the type contains at least one `[^\W\d]` character.
   *
   * @returns {boolean}
   */
  get isNonPunct() {
    return /[^\W\d]/.test(this.type);
  }

  /**
   * Debug rendering: the raw token followed by `<A>`, `<E>` and `<S>` markers
   * for the abbreviation, ellipsis and sentence-break flags, in that order.
   *
   * @returns {string}
   */
  toString() {
    let res = this.tok;
    if (this.abbr) {
      res += "<A>";
    }
    if (this.ellipsis) {
      res += "<E>";
    }
    if (this.sentbreak) {
      res += "<S>";
    }
    return res;
  }
}

// Class-level patterns. None of them carries the `g`/`y` flag, so `.test()`
// is stateless and the shared instances are safe to reuse.
//
// In JavaScript `\W` and `\d` stay ASCII-only even with the `u` flag, so the
// `[^\W\d]` class used below is effectively `[A-Za-z_]`.
PunktToken.RE_NUMERIC = /-?[.,]?\d[\d,.-]*\.?$/;
PunktToken.RE_ELLIPSIS = /^\.\.+$/;
PunktToken.RE_INITIAL = /^[^\W\d]\.$/u;
PunktToken.RE_ALPHA = /^[^\W\d]+$/u;
export default PunktToken;
//# sourceMappingURL=token.js.map