UNPKG

rita

Version:

tools for generative natural language

1,695 lines (1,692 loc) 1.5 MB
var __defProp = Object.defineProperty; var __defNormalProp = (obj, key, value) => key in obj ? __defProp(obj, key, { enumerable: true, configurable: true, writable: true, value }) : obj[key] = value; var __publicField = (obj, key, value) => { __defNormalProp(obj, typeof key !== "symbol" ? key + "" : key, value); return value; }; // src/stemmer.js var SnowballStemmer = class { constructor() { this.bra = 0; this.ket = 0; this.limit = 0; this.cursor = 0; this.limit_backward = 0; this.current; } setCurrent(word) { this.current = word; this.cursor = 0; this.limit = word.length; this.limit_backward = 0; this.bra = this.cursor; this.ket = this.limit; } getCurrent() { var result = this.current; this.current = null; return result; } //////////////////////////////////////////////////////// in_grouping(s, min, max) { if (this.cursor < this.limit) { var ch = this.current.charCodeAt(this.cursor); if (ch <= max && ch >= min) { ch -= min; if (s[ch >> 3] & 1 << (ch & 7)) { this.cursor++; return true; } } } return false; } in_grouping_b(s, min, max) { if (this.cursor > this.limit_backward) { var ch = this.current.charCodeAt(this.cursor - 1); if (ch <= max && ch >= min) { ch -= min; if (s[ch >> 3] & 1 << (ch & 7)) { this.cursor--; return true; } } } return false; } out_grouping(s, min, max) { if (this.cursor < this.limit) { var ch = this.current.charCodeAt(this.cursor); if (ch > max || ch < min) { this.cursor++; return true; } ch -= min; if (!(s[ch >> 3] & 1 << (ch & 7))) { this.cursor++; return true; } } return false; } out_grouping_b(s, min, max) { if (this.cursor > this.limit_backward) { var ch = this.current.charCodeAt(this.cursor - 1); if (ch > max || ch < min) { this.cursor--; return true; } ch -= min; if (!(s[ch >> 3] & 1 << (ch & 7))) { this.cursor--; return true; } } return false; } eq_s(s_size, s) { if (this.limit - this.cursor < s_size) return false; for (var i = 0; i < s_size; i++) if (this.current.charCodeAt(this.cursor + i) != s.charCodeAt(i)) return false; this.cursor += s_size; return true; } eq_s_b(s_size, s) { if (this.cursor - this.limit_backward < s_size) return false; for (var i = 0; i < s_size; i++) if (this.current.charCodeAt(this.cursor - s_size + i) != s.charCodeAt(i)) return false; this.cursor -= s_size; return true; } find_among(v, v_size) { var i = 0, j = v_size, c = this.cursor, l = this.limit, common_i = 0, common_j = 0, first_key_inspected = false; while (true) { var k = i + (j - i >> 1), diff = 0, common = common_i < common_j ? common_i : common_j, w = v[k]; for (var i2 = common; i2 < w.s_size; i2++) { if (c + common == l) { diff = -1; break; } diff = this.current.charCodeAt(c + common) - w.s[i2]; if (diff) break; common++; } if (diff < 0) { j = k; common_j = common; } else { i = k; common_i = common; } if (j - i <= 1) { if (i > 0 || j == i || first_key_inspected) break; first_key_inspected = true; } } while (true) { var w = v[i]; if (common_i >= w.s_size) { this.cursor = c + w.s_size; if (!w.method) return w.result; var res = w.method(); this.cursor = c + w.s_size; if (res) return w.result; } i = w.substring_i; if (i < 0) return 0; } } find_among_b(v, v_size) { var i = 0, j = v_size, c = this.cursor, lb = this.limit_backward, common_i = 0, common_j = 0, first_key_inspected = false; while (true) { var k = i + (j - i >> 1), diff = 0, common = common_i < common_j ? common_i : common_j, w = v[k]; for (var i2 = w.s_size - 1 - common; i2 >= 0; i2--) { if (c - common == lb) { diff = -1; break; } diff = this.current.charCodeAt(c - 1 - common) - w.s[i2]; if (diff) break; common++; } if (diff < 0) { j = k; common_j = common; } else { i = k; common_i = common; } if (j - i <= 1) { if (i > 0 || j == i || first_key_inspected) break; first_key_inspected = true; } } while (true) { var w = v[i]; if (common_i >= w.s_size) { this.cursor = c - w.s_size; if (!w.method) return w.result; var res = w.method(); this.cursor = c - w.s_size; if (res) return w.result; } i = w.substring_i; if (i < 0) return 0; } } replace_s(c_bra, c_ket, s) { var adjustment = s.length - (c_ket - c_bra), left = this.current.substring(0, c_bra), right = this.current.substring(c_ket); this.current = left + s + right; this.limit += adjustment; if (this.cursor >= c_ket) this.cursor += adjustment; else if (this.cursor > c_bra) this.cursor = c_bra; return adjustment; } slice_check() { if (this.bra < 0 || this.bra > this.ket || this.ket > this.limit || this.limit > this.current.length) throw "faulty slice operation"; } slice_from(s) { this.slice_check(); this.replace_s(this.bra, this.ket, s); } slice_del() { this.slice_from(""); } insert(c_bra, c_ket, s) { var adjustment = this.replace_s(c_bra, c_ket, s); if (c_bra <= this.bra) this.bra += adjustment; if (c_bra <= this.ket) this.ket += adjustment; } slice_to() { this.slice_check(); return this.current.substring(this.bra, this.ket); } eq_v_b(s) { return this.eq_s_b(s.length, s); } }; var Among = class { constructor(s, substring_i, result) { if (!s && s != "" || !substring_i && substring_i != 0 || !result) throw "Bad Among initialisation: s:" + s + ", substring_i: " + substring_i + ", result: " + result; this.s_size = s.length; this.s = this.toCharArray(s); this.substring_i = substring_i; this.result = result; } toCharArray(s) { var sLength = s.length, charArr = new Array(sLength); for (var i = 0; i < sLength; i++) charArr[i] = s.charCodeAt(i); return charArr; } }; var a_0 = [ new Among("arsen", -1, -1), new Among("commun", -1, -1), new Among("gener", -1, -1) ]; var a_1 = [ new Among("'", -1, 1), new Among("'s'", 0, 1), new Among("'s", -1, 1) ]; var a_2 = [ new Among("ied", -1, 2), new Among("s", -1, 3), new Among("ies", 1, 2), new Among("sses", 1, 1), new Among("ss", 1, -1), new Among("us", 1, -1) ]; var a_3 = [ new Among("", -1, 3), new Among("bb", 0, 2), new Among("dd", 0, 2), new Among("ff", 0, 2), new Among("gg", 0, 2), new Among("bl", 0, 1), new Among("mm", 0, 2), new Among("nn", 0, 2), new Among("pp", 0, 2), new Among("rr", 0, 2), new Among("at", 0, 1), new Among("tt", 0, 2), new Among("iz", 0, 1) ]; var a_4 = [ new Among("ed", -1, 2), new Among("eed", 0, 1), new Among("ing", -1, 2), new Among("edly", -1, 2), new Among("eedly", 3, 1), new Among("ingly", -1, 2) ]; var a_5 = [ new Among("anci", -1, 3), new Among("enci", -1, 2), new Among("ogi", -1, 13), new Among("li", -1, 16), new Among("bli", 3, 12), new Among("abli", 4, 4), new Among("alli", 3, 8), new Among("fulli", 3, 14), new Among("lessli", 3, 15), new Among("ousli", 3, 10), new Among("entli", 3, 5), new Among("aliti", -1, 8), new Among("biliti", -1, 12), new Among("iviti", -1, 11), new Among("tional", -1, 1), new Among("ational", 14, 7), new Among("alism", -1, 8), new Among("ation", -1, 7), new Among("ization", 17, 6), new Among("izer", -1, 6), new Among("ator", -1, 7), new Among("iveness", -1, 11), new Among("fulness", -1, 9), new Among("ousness", -1, 10) ]; var a_6 = [ new Among("icate", -1, 4), new Among("ative", -1, 6), new Among("alize", -1, 3), new Among("iciti", -1, 4), new Among("ical", -1, 4), new Among("tional", -1, 1), new Among("ational", 5, 2), new Among("ful", -1, 5), new Among("ness", -1, 5) ]; var a_7 = [ new Among("ic", -1, 1), new Among("ance", -1, 1), new Among("ence", -1, 1), new Among("able", -1, 1), new Among("ible", -1, 1), new Among("ate", -1, 1), new Among("ive", -1, 1), new Among("ize", -1, 1), new Among("iti", -1, 1), new Among("al", -1, 1), new Among("ism", -1, 1), new Among("ion", -1, 2), new Among("er", -1, 1), new Among("ous", -1, 1), new Among("ant", -1, 1), new Among("ent", -1, 1), new Among("ment", 15, 1), new Among("ement", 16, 1) ]; var a_8 = [ new Among("e", -1, 1), new Among("l", -1, 2) ]; var a_9 = [ new Among("succeed", -1, -1), new Among("proceed", -1, -1), new Among("exceed", -1, -1), new Among("canning", -1, -1), new Among("inning", -1, -1), new Among("earring", -1, -1), new Among("herring", -1, -1), new Among("outing", -1, -1) ]; var a_10 = [ new Among("andes", -1, -1), new Among("atlas", -1, -1), new Among("bias", -1, -1), new Among("cosmos", -1, -1), new Among("dying", -1, 3), new Among("early", -1, 9), new Among("gently", -1, 7), new Among("howe", -1, -1), new Among("idly", -1, 6), new Among("lying", -1, 4), new Among("news", -1, -1), new Among("only", -1, 10), new Among("singly", -1, 11), new Among("skies", -1, 2), new Among("skis", -1, 1), new Among("sky", -1, -1), new Among("tying", -1, 5), new Among("ugly", -1, 8) ]; var g_v = [17, 65, 16, 1]; var g_v_WXY = [ 1, 17, 65, 208, 1 ]; var g_valid_LI = [55, 141, 2]; var habr = [r_Step_1b, r_Step_1c, r_Step_2, r_Step_3, r_Step_4, r_Step_5]; var B_Y_found; var I_p2; var I_p1; function r_prelude() { var v_1 = Stemmer.impl.cursor, v_2; B_Y_found = false; Stemmer.impl.bra = Stemmer.impl.cursor; if (Stemmer.impl.eq_s(1, "'")) { Stemmer.impl.ket = Stemmer.impl.cursor; Stemmer.impl.slice_del(); } Stemmer.impl.cursor = v_1; Stemmer.impl.bra = v_1; if (Stemmer.impl.eq_s(1, "y")) { Stemmer.impl.ket = Stemmer.impl.cursor; Stemmer.impl.slice_from("Y"); B_Y_found = true; } Stemmer.impl.cursor = v_1; while (true) { v_2 = Stemmer.impl.cursor; if (Stemmer.impl.in_grouping(g_v, 97, 121)) { Stemmer.impl.bra = Stemmer.impl.cursor; if (Stemmer.impl.eq_s(1, "y")) { Stemmer.impl.ket = Stemmer.impl.cursor; Stemmer.impl.cursor = v_2; Stemmer.impl.slice_from("Y"); B_Y_found = true; continue; } } if (v_2 >= Stemmer.impl.limit) { Stemmer.impl.cursor = v_1; return; } Stemmer.impl.cursor = v_2 + 1; } } function r_mark_regions() { var v_1 = Stemmer.impl.cursor; I_p1 = Stemmer.impl.limit; I_p2 = I_p1; if (!Stemmer.impl.find_among(a_0, 3)) { Stemmer.impl.cursor = v_1; if (habr1()) { Stemmer.impl.cursor = v_1; return; } } I_p1 = Stemmer.impl.cursor; if (!habr1()) I_p2 = Stemmer.impl.cursor; } function habr1() { while (!Stemmer.impl.in_grouping(g_v, 97, 121)) { if (Stemmer.impl.cursor >= Stemmer.impl.limit) return true; Stemmer.impl.cursor++; } while (!Stemmer.impl.out_grouping(g_v, 97, 121)) { if (Stemmer.impl.cursor >= Stemmer.impl.limit) return true; Stemmer.impl.cursor++; } return false; } function r_shortv() { var v_1 = Stemmer.impl.limit - Stemmer.impl.cursor; if (!(Stemmer.impl.out_grouping_b(g_v_WXY, 89, 121) && Stemmer.impl.in_grouping_b(g_v, 97, 121) && Stemmer.impl.out_grouping_b( g_v, 97, 121 ))) { Stemmer.impl.cursor = Stemmer.impl.limit - v_1; if (!Stemmer.impl.out_grouping_b(g_v, 97, 121) || !Stemmer.impl.in_grouping_b(g_v, 97, 121) || Stemmer.impl.cursor > Stemmer.impl.limit_backward) return false; } return true; } function r_R1() { return I_p1 <= Stemmer.impl.cursor; } function r_R2() { return I_p2 <= Stemmer.impl.cursor; } function r_Step_1a() { var among_var, v_1 = Stemmer.impl.limit - Stemmer.impl.cursor; Stemmer.impl.ket = Stemmer.impl.cursor; among_var = Stemmer.impl.find_among_b(a_1, 3); if (among_var) { Stemmer.impl.bra = Stemmer.impl.cursor; if (among_var == 1) Stemmer.impl.slice_del(); } else Stemmer.impl.cursor = Stemmer.impl.limit - v_1; Stemmer.impl.ket = Stemmer.impl.cursor; among_var = Stemmer.impl.find_among_b(a_2, 6); if (among_var) { Stemmer.impl.bra = Stemmer.impl.cursor; switch (among_var) { case 1: Stemmer.impl.slice_from("ss"); break; case 2: var c = Stemmer.impl.cursor - 2; if (Stemmer.impl.limit_backward > c || c > Stemmer.impl.limit) { Stemmer.impl.slice_from("ie"); break; } Stemmer.impl.cursor = c; Stemmer.impl.slice_from("i"); break; case 3: do { if (Stemmer.impl.cursor <= Stemmer.impl.limit_backward) return; Stemmer.impl.cursor--; } while (!Stemmer.impl.in_grouping_b(g_v, 97, 121)); Stemmer.impl.slice_del(); break; } } } function r_Step_1b() { var among_var, v_1, v_3, v_4; Stemmer.impl.ket = Stemmer.impl.cursor; among_var = Stemmer.impl.find_among_b(a_4, 6); if (among_var) { Stemmer.impl.bra = Stemmer.impl.cursor; switch (among_var) { case 1: if (r_R1()) Stemmer.impl.slice_from("ee"); break; case 2: v_1 = Stemmer.impl.limit - Stemmer.impl.cursor; while (!Stemmer.impl.in_grouping_b(g_v, 97, 121)) { if (Stemmer.impl.cursor <= Stemmer.impl.limit_backward) return; Stemmer.impl.cursor--; } Stemmer.impl.cursor = Stemmer.impl.limit - v_1; Stemmer.impl.slice_del(); v_3 = Stemmer.impl.limit - Stemmer.impl.cursor; among_var = Stemmer.impl.find_among_b(a_3, 13); if (among_var) { Stemmer.impl.cursor = Stemmer.impl.limit - v_3; switch (among_var) { case 1: var c = Stemmer.impl.cursor; Stemmer.impl.insert(Stemmer.impl.cursor, Stemmer.impl.cursor, "e"); Stemmer.impl.cursor = c; break; case 2: Stemmer.impl.ket = Stemmer.impl.cursor; if (Stemmer.impl.cursor > Stemmer.impl.limit_backward) { Stemmer.impl.cursor--; Stemmer.impl.bra = Stemmer.impl.cursor; Stemmer.impl.slice_del(); } break; case 3: if (Stemmer.impl.cursor == I_p1) { v_4 = Stemmer.impl.limit - Stemmer.impl.cursor; if (r_shortv()) { Stemmer.impl.cursor = Stemmer.impl.limit - v_4; var c = Stemmer.impl.cursor; Stemmer.impl.insert(Stemmer.impl.cursor, Stemmer.impl.cursor, "e"); Stemmer.impl.cursor = c; } } break; } } break; } } } function r_Step_1c() { var v_1 = Stemmer.impl.limit - Stemmer.impl.cursor; Stemmer.impl.ket = Stemmer.impl.cursor; if (!Stemmer.impl.eq_s_b(1, "y")) { Stemmer.impl.cursor = Stemmer.impl.limit - v_1; if (!Stemmer.impl.eq_s_b(1, "Y")) return; } Stemmer.impl.bra = Stemmer.impl.cursor; if (Stemmer.impl.out_grouping_b(g_v, 97, 121) && Stemmer.impl.cursor > Stemmer.impl.limit_backward) Stemmer.impl.slice_from("i"); } function r_Step_2() { var among_var; Stemmer.impl.ket = Stemmer.impl.cursor; among_var = Stemmer.impl.find_among_b(a_5, 24); if (among_var) { Stemmer.impl.bra = Stemmer.impl.cursor; if (r_R1()) { switch (among_var) { case 1: Stemmer.impl.slice_from("tion"); break; case 2: Stemmer.impl.slice_from("ence"); break; case 3: Stemmer.impl.slice_from("ance"); break; case 4: Stemmer.impl.slice_from("able"); break; case 5: Stemmer.impl.slice_from("ent"); break; case 6: Stemmer.impl.slice_from("ize"); break; case 7: Stemmer.impl.slice_from("ate"); break; case 8: Stemmer.impl.slice_from("al"); break; case 9: Stemmer.impl.slice_from("ful"); break; case 10: Stemmer.impl.slice_from("ous"); break; case 11: Stemmer.impl.slice_from("ive"); break; case 12: Stemmer.impl.slice_from("ble"); break; case 13: if (Stemmer.impl.eq_s_b(1, "l")) Stemmer.impl.slice_from("og"); break; case 14: Stemmer.impl.slice_from("ful"); break; case 15: Stemmer.impl.slice_from("less"); break; case 16: if (Stemmer.impl.in_grouping_b(g_valid_LI, 99, 116)) Stemmer.impl.slice_del(); break; } } } } function r_Step_3() { var among_var; Stemmer.impl.ket = Stemmer.impl.cursor; among_var = Stemmer.impl.find_among_b(a_6, 9); if (among_var) { Stemmer.impl.bra = Stemmer.impl.cursor; if (r_R1()) { switch (among_var) { case 1: Stemmer.impl.slice_from("tion"); break; case 2: Stemmer.impl.slice_from("ate"); break; case 3: Stemmer.impl.slice_from("al"); break; case 4: Stemmer.impl.slice_from("ic"); break; case 5: Stemmer.impl.slice_del(); break; case 6: if (r_R2()) Stemmer.impl.slice_del(); break; } } } } function r_Step_4() { var among_var, v_1; Stemmer.impl.ket = Stemmer.impl.cursor; among_var = Stemmer.impl.find_among_b(a_7, 18); if (among_var) { Stemmer.impl.bra = Stemmer.impl.cursor; if (r_R2()) { switch (among_var) { case 1: Stemmer.impl.slice_del(); break; case 2: v_1 = Stemmer.impl.limit - Stemmer.impl.cursor; if (!Stemmer.impl.eq_s_b(1, "s")) { Stemmer.impl.cursor = Stemmer.impl.limit - v_1; if (!Stemmer.impl.eq_s_b(1, "t")) return; } Stemmer.impl.slice_del(); break; } } } } function r_Step_5() { var among_var, v_1; Stemmer.impl.ket = Stemmer.impl.cursor; among_var = Stemmer.impl.find_among_b(a_8, 2); if (among_var) { Stemmer.impl.bra = Stemmer.impl.cursor; switch (among_var) { case 1: v_1 = Stemmer.impl.limit - Stemmer.impl.cursor; if (!r_R2()) { Stemmer.impl.cursor = Stemmer.impl.limit - v_1; if (!r_R1() || r_shortv()) return; Stemmer.impl.cursor = Stemmer.impl.limit - v_1; } Stemmer.impl.slice_del(); break; case 2: if (!r_R2() || !Stemmer.impl.eq_s_b(1, "l")) return; Stemmer.impl.slice_del(); break; } } } function r_exception2() { Stemmer.impl.ket = Stemmer.impl.cursor; if (Stemmer.impl.find_among_b(a_9, 8)) { Stemmer.impl.bra = Stemmer.impl.cursor; return Stemmer.impl.cursor <= Stemmer.impl.limit_backward; } return false; } function r_exception1() { var among_var; Stemmer.impl.bra = Stemmer.impl.cursor; among_var = Stemmer.impl.find_among(a_10, 18); if (among_var) { Stemmer.impl.ket = Stemmer.impl.cursor; if (Stemmer.impl.cursor >= Stemmer.impl.limit) { switch (among_var) { case 1: Stemmer.impl.slice_from("ski"); break; case 2: Stemmer.impl.slice_from("sky"); break; case 3: Stemmer.impl.slice_from("die"); break; case 4: Stemmer.impl.slice_from("lie"); break; case 5: Stemmer.impl.slice_from("tie"); break; case 6: Stemmer.impl.slice_from("idl"); break; case 7: Stemmer.impl.slice_from("gentl"); break; case 8: Stemmer.impl.slice_from("ugli"); break; case 9: Stemmer.impl.slice_from("earli"); break; case 10: Stemmer.impl.slice_from("onli"); break; case 11: Stemmer.impl.slice_from("singl"); break; } return true; } } return false; } function r_postlude() { var v_1; if (B_Y_found) { while (true) { v_1 = Stemmer.impl.cursor; Stemmer.impl.bra = v_1; if (Stemmer.impl.eq_s(1, "Y")) { Stemmer.impl.ket = Stemmer.impl.cursor; Stemmer.impl.cursor = v_1; Stemmer.impl.slice_from("y"); continue; } Stemmer.impl.cursor = v_1; if (Stemmer.impl.cursor >= Stemmer.impl.limit) return; Stemmer.impl.cursor++; } } } var _Stemmer = class _Stemmer { static stem(input) { if (typeof input !== "string") throw Error("Expects string"); if (!input.includes(" ")) { return _Stemmer.stemEnglish(input); } const words = _Stemmer.tokenizer.tokenize(input); const stems = _Stemmer.stemAll(words); return _Stemmer.tokenizer.untokenize(stems); } static stemAll(input) { return input.map((i) => _Stemmer.stemEnglish(i)); } static stemEnglish(word) { _Stemmer.impl.setCurrent(word); var v_1 = _Stemmer.impl.cursor; if (!r_exception1()) { _Stemmer.impl.cursor = v_1; var c = _Stemmer.impl.cursor + 3; if (0 <= c && c <= _Stemmer.impl.limit) { _Stemmer.impl.cursor = v_1; r_prelude(); _Stemmer.impl.cursor = v_1; r_mark_regions(); _Stemmer.impl.limit_backward = v_1; _Stemmer.impl.cursor = _Stemmer.impl.limit; r_Step_1a(); _Stemmer.impl.cursor = _Stemmer.impl.limit; if (!r_exception2()) for (var i = 0; i < habr.length; i++) { _Stemmer.impl.cursor = _Stemmer.impl.limit; habr[i](); } _Stemmer.impl.cursor = _Stemmer.impl.limit_backward; r_postlude(); } } return _Stemmer.impl.getCurrent(); } }; __publicField(_Stemmer, "tokenizer"); __publicField(_Stemmer, "impl", new SnowballStemmer()); var Stemmer = _Stemmer; var stemmer_default = Stemmer; // src/tokenizer.js var Tokenizer = class { constructor(parent) { this.RiTa = parent; this.splitter = /(\S.+?[.!?]["\u201D]?)(?=\s+|$)/g; } /** * Returns an array containing all unique alphabetical words (tokens) in the text. * Punctuation and case are ignored unless specified otherwise. * @param {string} text - The text from which to extract the tokens * @param {object} [options] - The options * @param {boolean} [options.caseSensitive=false] - Whether to pay attention to case * @param {boolean} [options.ignoreStopWords=false] - Whether to ignore words like 'the', 'and', 'a', 'of', etc, as specified in RiTa.STOP_WORDS * @param {boolean} [options.splitContractions=false] - Whether to convert contractions (e.g., "I'd" or "she'll") into multiple individual tokens * @param {boolean} [options.includePunct=false] - Whether to include punctuation in the results * @param {boolean} [options.sort=false] - Whether to sort the tokens before returning them * @returns {string[]} Array of tokens */ tokens(text, options = { caseSensitive: false, ignoreStopWords: false, splitContractions: false, includePunct: false, sort: false }) { let words = this.tokenize(text, options), map = {}; words.forEach((w) => { if (!options.caseSensitive) w = w.toLowerCase(); if (options.includePunct || ALPHA_RE.test(w)) map[w] = 1; }); let tokens = Object.keys(map); if (options.ignoreStopWords) tokens = tokens.filter((t) => !this.RiTa.isStopWord(t)); return options.sort ? tokens.sort() : tokens; } tokenize(input, opts = { // regex: null, // debug: false, // splitHyphens: false, // splitContractions: false }) { if (typeof input !== "string") return []; if (opts.regex) return input.split(opts.regex); let { tags, text } = this.pushTags(input.trim()); for (let i = 0; i < TOKENIZE_RE.length; i += 2) { if (opts.debug) var pre = text; text = text.replace(TOKENIZE_RE[i], TOKENIZE_RE[i + 1]); if (opts.debug && text !== pre) console.log("HIT" + i, pre + " -> " + text, TOKENIZE_RE[i], TOKENIZE_RE[i + 1]); } if (opts.splitHyphens) { text = text.replace(/([a-zA-Z]+)-([a-zA-Z]+)/g, "$1 - $2"); } if (this.RiTa.SPLIT_CONTRACTIONS || opts.splitContractions) { for (let i = 0; i < CONTRACTS_RE.length; i += 2) { text = text.replace(CONTRACTS_RE[i], CONTRACTS_RE[i + 1]); } } let result = this.popTags(text.trim().split(WS_RE), tags); return result; } untokenize(arr, delim = " ") { if (!arr || !Array.isArray(arr)) return ""; arr = this.preProcessTags(arr); let nextNoSpace = false, afterQuote = false, midSentence = false; let withinQuote = arr.length && QUOTE_RE.test(arr[0]); let result = arr[0] || ""; for (let i = 1; i < arr.length; i++) { if (!arr[i]) continue; let thisToken = arr[i]; let lastToken = arr[i - 1]; let thisComma = thisToken === ",", lastComma = lastToken === ","; let thisNBPunct = NOSP_BF_PUNCT_RE.test(thisToken) || UNTAG_RE[2].test(thisToken) || LINEBREAK_RE.test(thisToken); let thisLBracket = LB_RE.test(thisToken); let thisRBracket = RB_RE.test(thisToken); let lastNBPunct = NOSP_BF_PUNCT_RE.test(lastToken) || LINEBREAK_RE.test(lastToken); let lastNAPunct = NOSP_AF_PUNCT_RE.test(lastToken) || UNTAG_RE[1].test(lastToken) || LINEBREAK_RE.test(lastToken); let lastLB = LB_RE.test(lastToken), lastRB = RB_RE.test(lastToken); let lastEndWithS = lastToken[lastToken.length - 1] === "s" && lastToken != "is" && lastToken != "Is" && lastToken != "IS"; let lastIsWWW = WWW_RE.test(lastToken), isDomain = DOMAIN_RE.test(thisToken); let nextIsS = i == arr.length - 1 ? false : arr[i + 1] === "s" || arr[i + 1] === "S"; let lastQuote = QUOTE_RE.test(lastToken), isLast = i == arr.length - 1; let thisQuote = QUOTE_RE.test(thisToken); let thisLineBreak = LINEBREAK_RE.test(thisToken); if (lastToken === "." && isDomain || nextNoSpace) { nextNoSpace = false; result += thisToken; continue; } else if (thisToken === "." && lastIsWWW) { nextNoSpace = true; } else if (thisLBracket) { result += delim; } else if (lastRB) { if (!thisNBPunct && !thisLBracket) { result += delim; } } else if (thisQuote) { if (withinQuote) { afterQuote = true; withinQuote = false; } else if (!(APOS_RE.test(thisToken) && lastEndWithS || APOS_RE.test(thisToken) && nextIsS)) { withinQuote = true; afterQuote = false; result += delim; } } else if (afterQuote && !thisNBPunct) { result += delim; afterQuote = false; } else if (lastQuote && thisComma) { midSentence = true; } else if (midSentence && lastComma) { result += delim; midSentence = false; } else if (!thisNBPunct && !lastQuote && !lastNAPunct && !lastLB && !thisRBracket || !isLast && thisNBPunct && lastNBPunct && !lastNAPunct && !lastQuote && !lastLB && !thisRBracket && !thisLineBreak) { result += delim; } result += thisToken; if (thisNBPunct && !lastNBPunct && !withinQuote && SQUOTE_RE.test(thisToken) && lastEndWithS) { result += delim; } } return result.trim(); } /** * Split the input text into sentences according to the options * @param {string} text - The text to split * @param {(string|RegExp)} [regex] - An optional custom regex to split on * @returns {string[]} An array of sentences */ sentences(text, regex) { if (!text || !text.length) return [text]; let clean = text.replace(NL_RE, " "); let delim = "___"; let re = new RegExp(delim, "g"); let pattern = regex || this.splitter; let unescapeAbbrevs = (arr2) => { for (let i = 0; i < arr2.length; i++) { arr2[i] = arr2[i].replace(re, "."); } return arr2; }; let escapeAbbrevs = (text2) => { let abbrevs = this.RiTa.ABRV; for (let i = 0; i < abbrevs.length; i++) { let abv = abbrevs[i]; let idx = text2.indexOf(abv); while (idx > -1) { text2 = text2.replace(abv, abv.replace(".", delim)); idx = text2.indexOf(abv); } } return text2; }; let arr = escapeAbbrevs(clean).match(pattern); return arr?.length ? unescapeAbbrevs(arr) : [text]; } pushTags(text) { let tags = [], tagIdx = 0; while (TAG_RE.test(text)) { tags.push(text.match(TAG_RE)[0]); text = text.replace(TAG_RE, " _" + TAG + tagIdx++ + "_ "); } return { tags, text }; } popTags(result, tags) { for (let i = 0; i < result.length; i++) { if (POPTAG_RE.test(result[i])) { result[i] = tags.shift(); } if (result[i].includes("_") && !EMAIL_RE.test(result[i]) && !URL_RE.test(result[i])) { result[i] = result[i].replace(UNDER_RE, "$1 $2"); } } return result; } preProcessTags(array) { let result = [], currentIdx = 0; while (currentIdx < array.length) { let currentToken = array[currentIdx]; if (!LT_RE.test(currentToken)) { result.push(currentToken); currentIdx++; continue; } let subArray = [array[currentIdx]]; let inspectIdx = currentIdx + 1; while (inspectIdx < array.length) { subArray.push(array[inspectIdx]); if (LT_RE.test(array[inspectIdx])) break; if (GT_RE.test(array[inspectIdx])) break; inspectIdx++; } if (LT_RE.test(subArray[subArray.length - 1])) { result = result.concat(subArray.slice(0, subArray.length - 1)); currentIdx = inspectIdx; continue; } if (!GT_RE.test(subArray[subArray.length - 1])) { result = result.concat(subArray); currentIdx = inspectIdx + 1; continue; } if (!TAG_RE.test(subArray.join(""))) { result = result.concat(subArray); currentIdx = inspectIdx + 1; continue; } let tag = this.tagSubarrayToString(subArray); result.push(tag); currentIdx = inspectIdx + 1; } return result; } tagSubarrayToString(array) { if (!LT_RE.test(array[0]) || !GT_RE.test(array[array.length - 1])) { throw Error(array + "is not a tag"); } let start = array[0].trim(); let end = array[array.length - 1].trim(); let inspectIdx = 1; while (inspectIdx < array.length - 1 && TAGSTART_RE.test(array[inspectIdx])) { start += array[inspectIdx].trim(); inspectIdx++; } let contentStartIdx = inspectIdx; inspectIdx = array.length - 2; while (inspectIdx > contentStartIdx && TAGEND_RE.test(array[inspectIdx])) { end = array[inspectIdx].trim() + end; inspectIdx--; } let contentEndIdx = inspectIdx; let result = start + this.untokenize(array.slice(contentStartIdx, contentEndIdx + 1)).trim() + end; return result; } }; var UNTAG_RE = [ /^ *<[a-z][a-z0-9='"#;:&\s\-\+\/\.\?]*\/> *$/i, // empty tags <br/> <img /> etc. -> like a normal word /^ *<([a-z][a-z0-9='"#;:&\s\-\+\/\.\?]*[a-z0-9='"#;:&\s\-\+\.\?]|[a-z])> *$/i, // opening tags <a>, <p> etc. -> no space after /^ *<\/[a-z][a-z0-9='"#;:&\s\-\+\/\.\?]*> *$/i, // closing tags </a> </p> etc. -> no space before /^ *<!DOCTYPE[^>]*> *$/i, // <!DOCTYPE> -> like a normal word /^ *<!--[^->]*--> *$/i // <!-- --> -> like a normal word ]; var LT_RE = /^ *< *$/; var GT_RE = /^ *> *$/; var TAGSTART_RE = /^ *[!\-\/] *$/; var TAGEND_RE = /^ *[\-\/] *$/; var NOSP_AF_PUNCT_RE = /^[\^\*\$\/\u2044#\-@\u00b0\u2012\u2013\u2014]+$/; var TAG = "TAG"; var UNDER_RE = /([0-9a-zA-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF]|[\.\,])_([0-9a-zA-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF])/g; var LB_RE = /^[\[\(\{\u27e8]+$/; var RB_RE = /^[\)\]\}\u27e9]+$/; var QUOTE_RE = /^[""\u201c\u201d\u2019\u2018`''\u00ab\u00bb]+$/; var DOMAIN_RE = /^(com|org|edu|net|xyz|gov|int|eu|hk|tw|cn|de|ch|fr)$/; var SQUOTE_RE = /^[\u2019\u2018`']+$/; var ALPHA_RE = /^[A-Za-z’']+$/; var WS_RE = / +/; var APOS_RE = /^[\u2019']+$/; var NL_RE = /(\r?\n)+/g; var WWW_RE = /^(www[0-9]?|WWW[0-9]?)$/; var NOSP_BF_PUNCT_RE = /^[,\.\;\:\?\!\)""\u201c\u201d\u2019\u2018`'%\u2026\u2103\^\*\u00b0\/\u2044\u2012\u2013\u2014\-@]+$/; var LINEBREAK_RE = /\r?\n/; var URL_RE = /((http[s]?):(\/\/))?([-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b)([-a-zA-Z0-9()@:%_\+.~#?&\/\/=]*)/; var EMAIL_RE = /^\w+([\.-]?\w+)*@\w+([\.-]?\w+)*(\.\w{2,3})+$/; var TOKENIZE_RE = [ // save -------- /\b([Ee])[.]([Gg])[.]/g, "_$1$2_", //E.g /\b([Ii])[.]([Ee])[.]/g, "_$1$2_", //i.e /\b([Aa])[.]([Mm])[.]/g, "_$1$2_", //a.m. /\b([Pp])[.]([Mm])[.]/g, "_$1$2_", //p.m. /\b(Cap)[\.]/g, "_Cap_", //Cap. /\b([Cc])[\.]/g, "_$1_", //c. /\b([Ee][Tt])[\s]([Aa][Ll])[\.]/, "_$1zzz$2_", // et al. /\b(etc|ETC)[\.]/g, "_$1_", //etc. /\b([Pp])[\.]([Ss])[\.]/g, "_$1$2dot_", // p.s. /\b([Pp])[\.]([Ss])/g, "_$1$2_", // p.s /\b([Pp])([Hh])[\.]([Dd])/g, "_$1$2$3_", // Ph.D /\b([Rr])[\.]([Ii])[\.]([Pp])/g, "_$1$2$3_", // R.I.P /\b([Vv])([Ss]?)[\.]/g, "_$1$2_", // vs. and v. /\b([Mm])([Rr]|[Ss]|[Xx])\./g, "_$1$2_", // Mr. Ms. and Mx. /\b([Dd])([Rr])[\.]/g, "_$1$2_", // Dr. /\b([Pp])([Ff])[\.]/g, "_$1$2_", // Pf. /\b([Ii])([Nn])([Dd]|[Cc])[\.]/g, "_$1$2$3_", // Ind. and Inc. /\b([Cc])([Oo])[\.][\,][\s]([Ll])([Tt])([Dd])[\.]/g, "_$1$2dcs$3$4$5_", // co., ltd. /\b([Cc])([Oo])[\.][\s]([Ll])([Tt])([Dd])[\.]/g, "_$1$2ds$3$4$5_", // co. ltd. /\b([Cc])([Oo])[\.][\,]([Ll])([Tt])([Dd])[\.]/g, "_$1$2dc$3$4$5_", // co.,ltd. /\b([Cc])([Oo])([Rr]?)([Pp]?)[\.]/g, "_$1$2$3$4_", // Corp. and Co. /\b([Ll])([Tt])([Dd])[\.]/g, "_$1$2$3_", // ltd. /\b(prof|Prof|PROF)\./g, "_$1_", //Prof. // /(\w+([\.-_]?\w+)*)@(\w+([\.-_]?\w+)*)\.(\w{2,3})/g, "$1__AT__$3.$5", //email addresses // /^\w+([\.-]?\w+)+@\w+([\.:]?\w+)+(\.[a-zA-Z0-9]{2,3})+$/g, "$1__AT__$2", //email addresses /\b([\w.]+)@(\w+\.\w+)/g, "$1__AT__$2", /\b((http[s]?):(\/\/))([-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b)([-a-zA-Z0-9()@:%_\+.~#?&\/\/=]*)/g, "$2COLON$3$4$5", //urls with http(s) //decimal # /([\-]?[0-9]+)\.([0-9]+)/g, "$1DECIMALDOT$2_", //(-)27.3 /([\-]?[0-9]+)\.([0-9]+)e([\-]?[0-9]+)/g, "_$1DECIMALDOT$2POWERE$3_", //(-)1.2e10 /([0-9]{1,3}),([0-9]{3})/g, "$1_DECIMALCOMMA_$2", // large numbers like 19,700 or 200,000,000.13 /([A-Za-z0-9])\.([A-Za-z0-9])/g, "$1_DECIMALDOT_$2", //www.example.com //escape sequences of line breaks in ASCII /\r\n/g, " _CARRIAGERETURNLINEFEED_ ", // CR LF /\n\r/g, " _LINEFEEDCARRIAGERETURN_ ", // LF CR /\n/g, " _LINEFEED_ ", // LF /\r/g, " _CARRIAGERETURN_ ", // CR ///\036/g, " _RECORDSEPARATOR_ ", // RS //-------------------------- /\.\.\.\s/g, "_elipsis_ ", /([\?!\"\u201C\.,;:@#$%&])/g, " $1 ", /\u2026/g, " \u2026 ", /\s+/g, " ", /,([^0-9])/g, " , $1", /([^.])([.])([\])}>\"'\u2019]*)\s*$/g, "$1 $2$3 ", /([\[\](){}<>\u27e8\u27e9])/g, " $1 ", /--/g, " -- ", /\u2012/g, " \u2012 ", //" ‒ " /\u2013/g, " \u2013 ", // " — " /\u2014/g, " \u2014 ", //" – " /$/g, " ", /^/g, " ", /([^'])' | '/g, "$1 ' ", / \u2018/g, " \u2018 ", /'([SMD]) /g, " '$1 ", / ([A-Z]) \./g, " $1. ", /^\s+/g, "", /\^/g, " ^ ", /\u00b0/g, " \xB0 ", /_elipsis_/g, " ... ", //pop ------------------ /_([Ee])([Gg])_/g, "$1.$2.", //Eg /_([Ii])([Ee])_/g, "$1.$2.", //ie /_([Aa])([Mm])_/g, "$1.$2.", //a.m. /_([Pp])([Mm])_/g, "$1.$2.", //p.m. /_Cap_/g, "Cap.", //Cap. /_([Cc])_/g, "$1.", //c. /_([Ee][Tt])zzz([Aa][Ll])_/, "$1_$2.", // et al. /_(etc|ETC)_/g, "$1.", //etc. /_([Pp])([Ss])dot_/g, "$1.$2.", // p.s. /_([Pp])([Ss])_/g, "$1.$2", /_([Pp])([Hh])([Dd])_/g, "$1$2.$3", // Ph.D /_([Rr])([Ii])([Pp])_/g, "$1.$2.$3", // R.I.P /_([Vv])([Ss]?)_/g, "$1$2.", // vs. and v. /_([Mm])([Rr]|[Ss]|[Xx])_/g, "$1$2.", // Mr. Ms. and Mx. /_([Dd])([Rr])_/g, "$1$2.", // Dr. /_([Pp])([Ff])_/g, "$1$2.", // Pf. /_([Ii])([Nn])([Dd]|[Cc])_/g, "$1$2$3.", // Ind. and Inc. /_([Cc])([Oo])([Rr]?)([Pp]?)_/g, "$1$2$3$4.", // Corp. and Co. /_([Cc])([Oo])dc([Ll])([Tt])([Dd])_/g, "$1$2.,$3$4$5.", // co.,ltd. /_([Ll])([Tt])([Dd])_/g, "$1$2$3.", // ltd. /_([Cc])([Oo])dcs([Ll])([Tt])([Dd])_/g, "$1$2.,_$3$4$5.", // co., ltd. /_([Cc])([Oo])ds([Ll])([Tt])([Dd])_/g, "$1$2._$3$4$5.", // co. ltd. /_(prof|PROF|Prof)_/g, "$1.", //Prof. /([\-]?[0-9]+)DECIMALDOT([0-9]+)_/g, "$1.$2", //(-)27.3 /_([\-]?[0-9]+)\DECIMALDOT([0-9]+)POWERE([\-]?[0-9]+)_/g, "$1.$2e$3", //(-)1.2e(-)9 /_DECIMALCOMMA_/g, ",", // large numbers like 200,000,000.13 /_DECIMALDOT_/g, ".", // /(\w+([\.-]?\w+)*)__AT__(\w+([\.-]?\w+)*)\.(\w{2,3})/g, "$1@$3.$5", /__AT__/g, "@", /((http[s]?)COLON(\/\/))([-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b)([-a-zA-Z0-9()@:%_\+.~#?&\/\/=]*)/g, "$2:$3$4$5", /_LINEFEED_/g, "\n", // LF /_CARRIAGERETURN_/g, "\r", // CR /_CARRIAGERETURNLINEFEED_/g, "\r\n", // CR LF /_LINEFEEDCARRIAGERETURN_/g, "\n\r" // LF CR ///_RECORDSEPARATOR_/g, "\\036", // RS ]; var CONTRACTS_RE = [ // TODO: 'She'd have wanted' -> 'She would have wanted' // WORKING HERE: add word boundaries \b to these /\b([Cc])an['\u2019]t/g, "$1an not", /\b([Dd])idn['\u2019]t/g, "$1id not", /\b([CcWw])ouldn['\u2019]t/g, "$1ould not", /\b([Ss])houldn['\u2019]t/g, "$1hould not", /\b([Ii])t['\u2019]s/g, "$1t is", /\b([tT]hat)['\u2019]s/g, "$1 is", /\b(she|he|you|they|i)['\u2019]d/gi, "$1 had", // changed from would, 12/8/23 /\b(she|he|you|they|i)['\u2019]ll/gi, "$1 will", /n['\u2019]t /g, " not ", /['\u2019]ve /g, " have ", /['\u2019]re /g, " are " ]; var TAG_RE = /(<\/?[a-z][a-z0-9='"#;:&\s\-\+\/\.\?]*\/?>|<!DOCTYPE[^>]*>|<!--[^>-]*-->)/i; var POPTAG_RE = new RegExp(`_${TAG}[0-9]+_`); var tokenizer_default = Tokenizer; // src/conjugator.js var RegularExpression = class { constructor(regex, offset, suffix) { this.raw = regex; this.regex = new RegExp(regex); this.offset = offset; this.suffix = suffix || ""; } applies(word) { return this.regex.test(word); } fire(word) { return this.truncate(word) + this.suffix; } truncate(word) { return this.offset === 0 ? word : word.substr(0, word.length - this.offset); } toString() { return "/" + this.raw + "/"; } }; var RE = (a, b, c, _) => new RegularExpression(a, b, c); var Conjugator = class { constructor(parent) { __publicField(this, "_handleStem", function(word) { if (this.RiTa.lexicon.data.hasOwnProperty(word) && this.RiTa.tagger.allTags(word).includes("vb")) { return word; } let w = word; let allVerb = this.allVerbs; while (w.length > 1) { let pattern = new RegExp("^" + w); let guess = allVerb.filter((item) => pattern.test(item)); if (!guess || guess.length < 1) { w = w.slice(0, -1); continue; } guess.sort((a, b) => a.length - b.length); for (let i = 0; i < guess.length; i++) { if (word === guess[i]) return word; if (this.RiTa.stem(guess[i]) === word) return guess[i]; if (this.unconjugate(this.RiTa.stem(guess[i])) === word) return guess[i]; } w = w.slice(0, -1); } return word; }); this.RiTa = parent; this._reset(); let data = this.RiTa.lexicon.data; this.allVerbs = Object.keys(data).filter((word) => data[word][1].split(" ").includes("vb")); this.verbsEndingInE = this.allVerbs.filter((v) => v.endsWith("e")); this.verbsEndingInDouble = this.allVerbs.filter((v) => /([^])\1$/.test(v)); } // TODO: add handling of past tense modals. conjugate(verb, args) { if (!verb || !verb.length) throw Error("No verb"); if (!args) return verb; verb = verb.toLowerCase(); if (!this.RiTa.tagger.allTags(verb).includes("vb")) { verb = this.unconjugate(verb) || verb; } args = this._parseArgs(args); let frontVG = TO_BE.includes(verb) ? "be" : this._handleStem(verb); let actualModal, verbForm, conjs = [], RiTa2 = this.RiTa; if (this.form === RiTa2.INFINITIVE) { actualModal = "to"; } if (this.tense === RiTa2.FUTURE) { actualModal = "will"; } if (this.passive) { conjs.push(this.pastPart(frontVG)); frontVG = "be"; } if (this.progressive) { conjs.push(this.presentPart(frontVG)); frontVG = "be"; } if (this.perfect) { conjs.push(this.pastPart(frontVG)); frontVG = "have"; } if (actualModal) { conjs.push(frontVG); frontVG = null; } if (frontVG) { if (this.form === RiTa2.GERUND) { conjs.push(this.presentPart(frontVG)); } else if (this.interrogative && frontVG != "be" && conjs.length < 1) { conjs.push(frontVG); } else { verbForm = this._verbForm(frontVG, this.tense, this.person, this.number); conjs.push(verbForm); } } actualModal && conjs.push(actualModal); return conjs.reduce((acc, cur) => cur + " " + acc).trim(); } unconjugate(word, opts = {}) { if (typeof word !== "string") return; let dbug = opts && opts.dbug; if (IRREG_VERBS_LEX.hasOwnProperty(word)) { dbug && console.log(word + " in exceptions1 (in lex)"); return IRREG_VERBS_LEX[word]; } else if (Object.values(IRREG_VERBS_LEX).includes(word)) { dbug && console.log(word + " is base form in exceptions1 (in lex)"); return word; } if (IRREG_VERBS_NOLEX.hasOwnProperty(word)) { dbug && console.log(word + " is in exceptions2"); return IRREG_VERBS_NOLEX[word]; } else if (Object.values(IRREG_VERBS_NOLEX).includes(word)) { dbug && console.log(word + " is base form in exceptions2 (not in lex)"); return word; } let tags = this.RiTa.tagger.allTags(word, { noGuessing: true }); if (tags.some((t) => t === "vb")) { dbug && console.log(word + " is a base form verb"); return word; } if (word.endsWith("s")) { if (word.endsWith("ies")) { dbug && console.log("'" + word + "' hit rule: ends with -ies"); return word.slice(0, -3) + "y"; } else if (/(ch|s|sh|x|z|o)es$/.test(word)) { dbug && console.log("'" + word + "' hit rule: ends with -(ch|s|sh|x|z|o)es"); return word.slice(0, -2); } dbug && console.log("'" + word + "' hit rule: ends with -s"); return word.slice(0, -1); } else if (word.endsWith("ed")) { if (word.endsWith("ied")) { dbug && console.log("'" + word + "' hit rule: ends with -ied"); return word.slice(0, -3) + "y"; } else if (/([a-z])\1ed$/.test(word)) { if (this.verbsEndingInDouble.includes(word.replace(/ed$/, ""))) { dbug && console.log("'" + word + "' hit rule: ends with -ed"); return word.slice(0, -2); } dbug && console.log("'" + word + "' hit rule: ends with -..ed"); return word.slice(0, -3); } else if (word.endsWith("ed")) { if (this.verbsEndingInE.includes(word.replace(/d$/, ""))) { dbug && console.log("'" + word + "' hit rule: ends with -(e)d"); return word.slice(0, -1); } else { dbug && console.log("'" + word + "' hit rule: ends with -ed"); return word.slice(0, -2); } } } else if (word.endsWith("ing")) { if (/([a-z])\1ing$/.test(word)) { if (this.verbsEndingInDouble.includes(word.slice(0, -3))) { dbug && console.log("'" + word + "' hit rule: ends with -(XX)ing [in-list]"); return word.slice(0, -3); } dbug && console.log("'" + word + "' hit rule: ends with -XXing [no-list]"); return word.slice(0, -4); } if (word.endsWith("ying")) { if (this.verbsEndingInE.includes(word.replace(/ying$/, "ie"))) { dbug && console.log("'" + word + "' hit rule: base ends with -ying"); return word.slice(0, -4) + "ie"; } } if (this.verbsEndingInE.includes(word.replace(/ing$/, "e"))) { dbug && console.log("'" + word + "' hit rule: base ends with -(e)ing"); return word.slice(0, -3) + "e"; } dbug && console.log("'" + word + "' hit rule: ends with -ing"); return word.slice(0, -3); } else { if (!tags.some((t) => t.startsWith("vb"))) { dbug && console.log(word + " is not a known verb"); return word; } } dbug && console.log("'" + word + "' hit no rules"); return word; } presentPart(theVerb) { return theVerb === "be" ? "being" : this._checkRules(PRESENT_PART_RULESET, theVerb); } pastPart(theVerb) { if (this._isPastParticiple(theVerb)) return theVerb; return this._checkRules(PAST_PART_RULESET, theVerb); } toString() { return " ---------------------\n Passive = " + this.passive + "\n Perfect = " + this.perfect + "\n Progressive = " + this.progressive + "\n ---------------------\n Number = " + this.number + "\n Person = " + this.person + "\n Tense = " + this.tense + "\n ---------------------\n"; } /////////////////////////////// End API /////////////////////////////////// _reset() { this.IRREG_VERBS_LEX_VB = IRREG_VERBS_LEX; this.IRREG_VERBS_NOLEX = IRREG_VERBS_NOLEX; this.IRREG_PAST_PART = IRREG_PAST_PART; this.perfect = this.progressive = this.passive = this.interrogative = false; this.tense = this.RiTa.PRESENT; this.person = this.RiTa.FIRST; this.number = this.RiTa.SINGULAR; this.form = this.RiTa.NORMAL; } _parseArgs(args) { this._reset(); const RiTa2 = this.RiTa; if (typeof args === "string") { if (/^[123][SP](Pr|Pa|Fu)$/.test(args)) { let opts = {}; opts.person = parseInt(args[0]); opts.number = args[1] === "S" ? RiTa2.SINGULAR : RiTa2.PLURAL; let tense = args.substr(2); if (tense === "Pr") opts.tense = RiTa2.PRESENT; if (tense === "Fu") opts.tense = RiTa2.FUTURE; if (tense === "Pa") opts.tense = RiTa2.PAST; args = opts; } else { throw Error("Invalid args: " + args); } } if (args.number) this.number = args.number; if (args.person) this.person = args.person; if (args.tense) this.tense = args.tense; if (args.form) this.form = args.form; if (args.passive) this.passive = args.passive; if (args.progressive) this.progressive = args.progressive; if (args.interrogative) this.interrogative = args.interrogative; if (args.perfect) this.perfect = args.perfect; } _checkRules(ruleSet, theVerb) { if (!theVerb || !theVerb.length) return ""; theVerb = theVerb.trim(); let dbug = 0, res, name = ruleSet.name; let rules = ruleSet.rules, defRule = ruleSet.defaultRule; if (!rules) console.error("no rule: " + ruleSet.name + " of " + theVerb); if (MODALS.includes(theVerb)) return theVerb; for (let i = 0; i < rules.length; i++) { dbug && console.log("checkRules(" + name + ").fire(" + i + ")=" + rules[i].regex); if (rules[i].applies(theVerb)) { let got = rules[i].fire(theVerb); dbug && console.log("HIT(" + name + ").fire(" + i + ")=" + rules[i].regex + "_returns: " + got); return got; } } dbug && console.log("NO HIT!"); if (ruleSet.doubling && VERB_CONS_DOUBLING.includes(theVerb)) { dbug && console.log("doDoubling!"); theVerb = this._doubleFinalConsonant(theVerb); } res = defRule.fire(theVerb); dbug && console.log("checkRules(" + name + ").returns: " + res); return res; } _doubleFinalConsonant(word) { return word + word.charAt(word.length - 1); } _isPastParticiple(word) { const w = word.toLowerCase(); const lex = this.RiTa.lexicon; const posArr = lex._posArr(w); if (posArr && posArr.includes("vbn")) return true; if (IRREG_PAST_PART.includes(w)) return true; if (w.endsWith("ed")) { let pos = lex._posArr(w.substring(0, w.length - 1)) || lex._posArr(w.substring(0, w.length - 2)); if (!pos && w.charAt(w.length - 3) === w.charAt(w.length - 4)) { pos = lex._posArr(w.substring(0, w.length - 3)); } if (!pos && w.endsWith("ied")) { pos = lex._posArr(w.substring(0, w.length - 3) + "y"); } if (pos && pos.includes("vb")) return true; } if (w.endsWith("en")) { let pos = lex._posArr(w.substring(0, w.length - 1)) || lex._posArr(w.substring(0, w.length - 2)); if (!pos &&