UNPKG

@thi.ng/text-analysis

Version:

Text tokenization, transformation & analysis transducers, utilities, stop words, Porter stemming, vector encodings, similarities

111 lines (110 loc) 3.2 kB
/** Replacement for each suffix that {@link RE_STEP2} can capture. */
const SUFFIXES_STEP2 = {
  alism: "al",
  aliti: "al",
  alli: "al",
  anci: "ance",
  ation: "ate",
  ational: "ate",
  ator: "ate",
  biliti: "ble",
  bli: "ble",
  eli: "e",
  enci: "ence",
  entli: "ent",
  fulness: "ful",
  iveness: "ive",
  iviti: "ive",
  ization: "ize",
  izer: "ize",
  logi: "log",
  ousli: "ous",
  ousness: "ous",
  tional: "tion",
};

/** Replacement for each suffix that {@link RE_STEP3} can capture. */
const SUFFIXES_STEP3 = {
  alize: "al",
  ative: "",
  ful: "",
  ical: "ic",
  icate: "ic",
  iciti: "ic",
  ness: "",
};

// Pattern fragments. Note that "y" opens a vowel run (`v`) but is also
// accepted as the first character of a consonant run (`c`).
const c = "[^aeiou]";
const v = "[aeiouy]";
const C = `${c}[^aeiouy]*`; // one consonant run
const V = `${v}[aeiou]*`; // one vowel run

// Stem shape tests, all anchored at the start of the stem:
// optional consonant run, then at least one vowel-run/consonant-run pair
const RE_MGR0 = new RegExp(`^(${C})?${V}${C}`);
// optional consonant run, exactly one pair, optional trailing vowel run
const RE_MEQ1 = new RegExp(`^(${C})?${V}${C}(${V})?$`);
// optional consonant run, then at least two pairs
const RE_MGR1 = new RegExp(`^(${C})?${V}${C}${V}${C}`);
// optional consonant run followed directly by a vowel
const RE_STEM_VOWEL = new RegExp(`^(${C})?${v}`);
// consonant run + vowel + a single final char that is not a vowel, w, x or y
const OTHER = new RegExp(`^${C}${v}[^aeiouwxy]$`);

// Per-step suffix patterns
const RE_STEP1A = /^(.+?)(ss|i)es$/;
const RE_STEP1A_ALT = /^(.+?)([^s])s$/;
const RE_STEP1B = /^(.+?)eed$/;
const RE_STEP1B_ALT = /^(.+?)(ed|ing)$/;
const RE_STEP1B_2 = /(at|bl|iz)$/;
const RE_STEP1B_3 = /([^aeiouylsz])\1$/;
const RE_STEP1B_4 = new RegExp(`^${C}${v}[^aeiouwxy]$`);
const RE_STEP1C = /^(.+?)y$/;
const RE_STEP2 =
  /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
const RE_STEP3 = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
const RE_STEP4 =
  /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
const RE_STEP4A = /^(.+?)(s|t)(ion)$/;
const RE_STEP5 = /^(.+?)e$/;

/** Step 1a: "sses" -> "ss", "ies" -> "i", otherwise drop a final "s" not preceded by "s". */
const stripPlural = (token) => {
  // The two patterns are mutually exclusive on the result: once the first
  // one has fired, the token ends in "ss" or "i" and the second cannot match.
  if (RE_STEP1A.test(token)) {
    return token.replace(RE_STEP1A, "$1$2");
  }
  return token.replace(RE_STEP1A_ALT, "$1$2");
};

/** Step 1b: "eed" -> "ee", or drop "ed"/"ing" and repair the remaining stem. */
const stripPastAndGerund = (token) => {
  const eedMatch = RE_STEP1B.exec(token);
  if (eedMatch) {
    // an "eed" ending never falls through to the "ed"/"ing" handling
    return RE_MGR0.test(eedMatch[1]) ? token.slice(0, -1) : token;
  }
  const edIngMatch = RE_STEP1B_ALT.exec(token);
  if (!edIngMatch) {
    return token;
  }
  const base = edIngMatch[1];
  if (!RE_STEM_VOWEL.test(base)) {
    return token;
  }
  if (RE_STEP1B_2.test(base)) {
    return base + "e";
  }
  if (RE_STEP1B_3.test(base)) {
    // doubled final letter: keep only one of the pair
    return base.slice(0, -1);
  }
  if (RE_STEP1B_4.test(base)) {
    return base + "e";
  }
  return base;
};

/** Step 1c: final "y" -> "i" when the part before it passes RE_STEM_VOWEL. */
const replaceFinalY = (token) => {
  const match = RE_STEP1C.exec(token);
  return match && RE_STEM_VOWEL.test(match[1]) ? match[1] + "i" : token;
};

/**
 * Shared body of steps 2 and 3: if `pattern` matches and the captured stem
 * passes RE_MGR0, swap the captured suffix for its entry in `table`.
 */
const swapSuffix = (token, pattern, table) => {
  const match = pattern.exec(token);
  if (!match) {
    return token;
  }
  const [, base, ending] = match;
  return RE_MGR0.test(base) ? base + table[ending] : token;
};

/** Step 4: drop a listed suffix (or "ion" after "s"/"t") when the rest passes RE_MGR1. */
const stripResidualSuffix = (token) => {
  const plain = RE_STEP4.exec(token);
  let base;
  if (plain) {
    base = plain[1];
  } else {
    // "-ion" is only considered when no suffix of RE_STEP4 matched
    const ion = RE_STEP4A.exec(token);
    if (!ion) {
      return token;
    }
    base = ion[1] + ion[2];
  }
  return RE_MGR1.test(base) ? base : token;
};

/** Step 5: conditionally drop a final "e", then reduce a final "ll" to "l". */
const tidyEnding = (token) => {
  let result = token;
  const match = RE_STEP5.exec(result);
  if (match) {
    const base = match[1];
    const removable =
      RE_MGR1.test(base) || (RE_MEQ1.test(base) && !OTHER.test(base));
    if (removable) {
      result = base;
    }
  }
  return result.endsWith("ll") && RE_MGR1.test(result)
    ? result.slice(0, -1)
    : result;
};

/** The suffix-stripping steps, in the order they are applied. */
const STEM_STEPS = [
  stripPlural,
  stripPastAndGerund,
  replaceFinalY,
  (token) => swapSuffix(token, RE_STEP2, SUFFIXES_STEP2),
  (token) => swapSuffix(token, RE_STEP3, SUFFIXES_STEP3),
  stripResidualSuffix,
  tidyEnding,
];

/**
 * Reduces a word to its stem by running it through a fixed sequence of
 * suffix-stripping steps.
 *
 * Words shorter than 3 characters are returned unchanged. The patterns only
 * list lowercase vowels, so any other character (uppercase letters included)
 * is treated as a consonant; presumably callers pass lowercased input.
 *
 * @param {string} word - word to stem
 * @returns {string} the stemmed word
 */
const stemWord = (word) => {
  if (word.length < 3) {
    return word;
  }
  // A leading "y" is swapped for "Y" while the steps run so that no pattern
  // can treat it as a vowel; it is restored on the way out.
  const startsWithY = word[0] === "y";
  const initial = startsWithY ? "Y" + word.substring(1) : word;
  const stemmed = STEM_STEPS.reduce((token, step) => step(token), initial);
  return startsWithY ? "y" + stemmed.substring(1) : stemmed;
};

export { stemWord };