UNPKG

snowball

Version:

snowball word stemming algorithm implementation

479 lines (478 loc) 13.3 kB
/*! * Snowball JavaScript Library v0.3 * http://code.google.com/p/urim/ * http://snowball.tartarus.org/ * * Copyright 2010, Oleg Mazko * http://www.mozilla.org/MPL/ */ function ItalianStemmer() { var a_0 = [new Among("", -1, 7), new Among("qu", 0, 6), new Among("\u00E1", 0, 1), new Among("\u00E9", 0, 2), new Among("\u00ED", 0, 3), new Among("\u00F3", 0, 4), new Among("\u00FA", 0, 5)], a_1 = [new Among("", -1, 3), new Among("I", 0, 1), new Among("U", 0, 2)], a_2 = [ new Among("la", -1, -1), new Among("cela", 0, -1), new Among("gliela", 0, -1), new Among("mela", 0, -1), new Among("tela", 0, -1), new Among("vela", 0, -1), new Among("le", -1, -1), new Among("cele", 6, -1), new Among("gliele", 6, -1), new Among("mele", 6, -1), new Among("tele", 6, -1), new Among("vele", 6, -1), new Among("ne", -1, -1), new Among("cene", 12, -1), new Among("gliene", 12, -1), new Among("mene", 12, -1), new Among("sene", 12, -1), new Among("tene", 12, -1), new Among("vene", 12, -1), new Among("ci", -1, -1), new Among("li", -1, -1), new Among("celi", 20, -1), new Among("glieli", 20, -1), new Among("meli", 20, -1), new Among("teli", 20, -1), new Among("veli", 20, -1), new Among("gli", 20, -1), new Among("mi", -1, -1), new Among("si", -1, -1), new Among("ti", -1, -1), new Among("vi", -1, -1), new Among("lo", -1, -1), new Among("celo", 31, -1), new Among("glielo", 31, -1), new Among("melo", 31, -1), new Among("telo", 31, -1), new Among("velo", 31, -1)], a_3 = [new Among("ando", -1, 1), new Among("endo", -1, 1), new Among("ar", -1, 2), new Among("er", -1, 2), new Among("ir", -1, 2)], a_4 = [ new Among("ic", -1, -1), new Among("abil", -1, -1), new Among("os", -1, -1), new Among("iv", -1, 1)], a_5 = [ new Among("ic", -1, 1), new Among("abil", -1, 1), new Among("iv", -1, 1)], a_6 = [new Among("ica", -1, 1), new Among("logia", -1, 3), new Among("osa", -1, 1), new Among("ista", -1, 1), new Among("iva", -1, 9), new Among("anza", -1, 1), new Among("enza", -1, 5), new Among("ice", -1, 1), new Among("atrice", 7, 1), new Among("iche", -1, 1), new Among("logie", -1, 3), new Among("abile", -1, 1), new Among("ibile", -1, 1), new Among("usione", -1, 4), new Among("azione", -1, 2), new Among("uzione", -1, 4), new Among("atore", -1, 2), new Among("ose", -1, 1), new Among("ante", -1, 1), new Among("mente", -1, 1), new Among("amente", 19, 7), new Among("iste", -1, 1), new Among("ive", -1, 9), new Among("anze", -1, 1), new Among("enze", -1, 5), new Among("ici", -1, 1), new Among("atrici", 25, 1), new Among("ichi", -1, 1), new Among("abili", -1, 1), new Among("ibili", -1, 1), new Among("ismi", -1, 1), new Among("usioni", -1, 4), new Among("azioni", -1, 2), new Among("uzioni", -1, 4), new Among("atori", -1, 2), new Among("osi", -1, 1), new Among("anti", -1, 1), new Among("amenti", -1, 6), new Among("imenti", -1, 6), new Among("isti", -1, 1), new Among("ivi", -1, 9), new Among("ico", -1, 1), new Among("ismo", -1, 1), new Among("oso", -1, 1), new Among("amento", -1, 6), new Among("imento", -1, 6), new Among("ivo", -1, 9), new Among("it\u00E0", -1, 8), new Among("ist\u00E0", -1, 1), new Among("ist\u00E8", -1, 1), new Among("ist\u00EC", -1, 1)], a_7 = [ new Among("isca", -1, 1), new Among("enda", -1, 1), new Among("ata", -1, 1), new Among("ita", -1, 1), new Among("uta", -1, 1), new Among("ava", -1, 1), new Among("eva", -1, 1), new Among("iva", -1, 1), new Among("erebbe", -1, 1), new Among("irebbe", -1, 1), new Among("isce", -1, 1), new Among("ende", -1, 1), new Among("are", -1, 1), new Among("ere", -1, 1), new Among("ire", -1, 1), new Among("asse", -1, 1), new Among("ate", -1, 1), new Among("avate", 16, 1), new Among("evate", 16, 1), new Among("ivate", 16, 1), new Among("ete", -1, 1), new Among("erete", 20, 1), new Among("irete", 20, 1), new Among("ite", -1, 1), new Among("ereste", -1, 1), new Among("ireste", -1, 1), new Among("ute", -1, 1), new Among("erai", -1, 1), new Among("irai", -1, 1), new Among("isci", -1, 1), new Among("endi", -1, 1), new Among("erei", -1, 1), new Among("irei", -1, 1), new Among("assi", -1, 1), new Among("ati", -1, 1), new Among("iti", -1, 1), new Among("eresti", -1, 1), new Among("iresti", -1, 1), new Among("uti", -1, 1), new Among("avi", -1, 1), new Among("evi", -1, 1), new Among("ivi", -1, 1), new Among("isco", -1, 1), new Among("ando", -1, 1), new Among("endo", -1, 1), new Among("Yamo", -1, 1), new Among("iamo", -1, 1), new Among("avamo", -1, 1), new Among("evamo", -1, 1), new Among("ivamo", -1, 1), new Among("eremo", -1, 1), new Among("iremo", -1, 1), new Among("assimo", -1, 1), new Among("ammo", -1, 1), new Among("emmo", -1, 1), new Among("eremmo", 54, 1), new Among("iremmo", 54, 1), new Among("immo", -1, 1), new Among("ano", -1, 1), new Among("iscano", 58, 1), new Among("avano", 58, 1), new Among("evano", 58, 1), new Among("ivano", 58, 1), new Among("eranno", -1, 1), new Among("iranno", -1, 1), new Among("ono", -1, 1), new Among("iscono", 65, 1), new Among("arono", 65, 1), new Among("erono", 65, 1), new Among("irono", 65, 1), new Among("erebbero", -1, 1), new Among("irebbero", -1, 1), new Among("assero", -1, 1), new Among("essero", -1, 1), new Among("issero", -1, 1), new Among("ato", -1, 1), new Among("ito", -1, 1), new Among("uto", -1, 1), new Among("avo", -1, 1), new Among("evo", -1, 1), new Among("ivo", -1, 1), new Among("ar", -1, 1), new Among("ir", -1, 1), new Among("er\u00E0", -1, 1), new Among("ir\u00E0", -1, 1), new Among("er\u00F2", -1, 1), new Among("ir\u00F2", -1, 1)], g_v = [17, 65, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 128, 8, 2, 1], g_AEIO = [17, 65, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 128, 8, 2], g_CG = [17], I_p2, I_p1, I_pV, sbp = new SnowballProgram(); this.setCurrent = function(word) { sbp.setCurrent(word); }; this.getCurrent = function() { return sbp.getCurrent(); }; function habr1(c1, c2, v_1) { if (sbp.eq_s(1, c1)) { sbp.ket = sbp.cursor; if (sbp.in_grouping(g_v, 97, 249)) { sbp.slice_from(c2); sbp.cursor = v_1; return true; } } return false; } function r_prelude() { var among_var, v_1 = sbp.cursor, v_2, v_3, v_4; while (true) { sbp.bra = sbp.cursor; among_var = sbp.find_among(a_0, 7); if (among_var) { sbp.ket = sbp.cursor; switch (among_var) { case 1 : sbp.slice_from("\u00E0"); continue; case 2 : sbp.slice_from("\u00E8"); continue; case 3 : sbp.slice_from("\u00EC"); continue; case 4 : sbp.slice_from("\u00F2"); continue; case 5 : sbp.slice_from("\u00F9"); continue; case 6 : sbp.slice_from("qU"); continue; case 7 : if (sbp.cursor >= sbp.limit) break; sbp.cursor++; continue; } } break; } sbp.cursor = v_1; while (true) { v_2 = sbp.cursor; while (true) { v_3 = sbp.cursor; if (sbp.in_grouping(g_v, 97, 249)) { sbp.bra = sbp.cursor; v_4 = sbp.cursor; if (habr1("u", "U", v_3)) break; sbp.cursor = v_4; if (habr1("i", "I", v_3)) break; } sbp.cursor = v_3; if (sbp.cursor >= sbp.limit) { sbp.cursor = v_2; return; } sbp.cursor++; } } } function habr2(v_1) { sbp.cursor = v_1; if (!sbp.in_grouping(g_v, 97, 249)) return false; while (!sbp.out_grouping(g_v, 97, 249)) { if (sbp.cursor >= sbp.limit) return false; sbp.cursor++; } return true; } function habr3() { if (sbp.in_grouping(g_v, 97, 249)) { var v_1 = sbp.cursor; if (sbp.out_grouping(g_v, 97, 249)) { while (!sbp.in_grouping(g_v, 97, 249)) { if (sbp.cursor >= sbp.limit) return habr2(v_1); sbp.cursor++; } return true; } return habr2(v_1); } return false; } function habr4() { var v_1 = sbp.cursor, v_2; if (!habr3()) { sbp.cursor = v_1; if (!sbp.out_grouping(g_v, 97, 249)) return; v_2 = sbp.cursor; if (sbp.out_grouping(g_v, 97, 249)) { while (!sbp.in_grouping(g_v, 97, 249)) { if (sbp.cursor >= sbp.limit) { sbp.cursor = v_2; if (sbp.in_grouping(g_v, 97, 249) && sbp.cursor < sbp.limit) sbp.cursor++; return; } sbp.cursor++; } I_pV = sbp.cursor; return; } sbp.cursor = v_2; if (!sbp.in_grouping(g_v, 97, 249) || sbp.cursor >= sbp.limit) return; sbp.cursor++; } I_pV = sbp.cursor; } function habr5() { while (!sbp.in_grouping(g_v, 97, 249)) { if (sbp.cursor >= sbp.limit) return false; sbp.cursor++; } while (!sbp.out_grouping(g_v, 97, 249)) { if (sbp.cursor >= sbp.limit) return false; sbp.cursor++; } return true; } function r_mark_regions() { var v_1 = sbp.cursor; I_pV = sbp.limit; I_p1 = I_pV; I_p2 = I_pV; habr4(); sbp.cursor = v_1; if (habr5()) { I_p1 = sbp.cursor; if (habr5()) I_p2 = sbp.cursor; } } function r_postlude() { var among_var; while (true) { sbp.bra = sbp.cursor; among_var = sbp.find_among(a_1, 3); if (!among_var) break; sbp.ket = sbp.cursor; switch (among_var) { case 1 : sbp.slice_from("i"); break; case 2 : sbp.slice_from("u"); break; case 3 : if (sbp.cursor >= sbp.limit) return; sbp.cursor++; break; } } } function r_RV() { return I_pV <= sbp.cursor; } function r_R1() { return I_p1 <= sbp.cursor; } function r_R2() { return I_p2 <= sbp.cursor; } function r_attached_pronoun() { var among_var; sbp.ket = sbp.cursor; if (sbp.find_among_b(a_2, 37)) { sbp.bra = sbp.cursor; among_var = sbp.find_among_b(a_3, 5); if (among_var && r_RV()) { switch (among_var) { case 1 : sbp.slice_del(); break; case 2 : sbp.slice_from("e"); break; } } } } function r_standard_suffix() { var among_var; sbp.ket = sbp.cursor; among_var = sbp.find_among_b(a_6, 51); if (!among_var) return false; sbp.bra = sbp.cursor; switch (among_var) { case 1 : if (!r_R2()) return false; sbp.slice_del(); break; case 2 : if (!r_R2()) return false; sbp.slice_del(); sbp.ket = sbp.cursor; if (sbp.eq_s_b(2, "ic")) { sbp.bra = sbp.cursor; if (r_R2()) sbp.slice_del(); } break; case 3 : if (!r_R2()) return false; sbp.slice_from("log"); break; case 4 : if (!r_R2()) return false; sbp.slice_from("u"); break; case 5 : if (!r_R2()) return false; sbp.slice_from("ente"); break; case 6 : if (!r_RV()) return false; sbp.slice_del(); break; case 7 : if (!r_R1()) return false; sbp.slice_del(); sbp.ket = sbp.cursor; among_var = sbp.find_among_b(a_4, 4); if (among_var) { sbp.bra = sbp.cursor; if (r_R2()) { sbp.slice_del(); if (among_var == 1) { sbp.ket = sbp.cursor; if (sbp.eq_s_b(2, "at")) { sbp.bra = sbp.cursor; if (r_R2()) sbp.slice_del(); } } } } break; case 8 : if (!r_R2()) return false; sbp.slice_del(); sbp.ket = sbp.cursor; among_var = sbp.find_among_b(a_5, 3); if (among_var) { sbp.bra = sbp.cursor; if (among_var == 1) if (r_R2()) sbp.slice_del(); } break; case 9 : if (!r_R2()) return false; sbp.slice_del(); sbp.ket = sbp.cursor; if (sbp.eq_s_b(2, "at")) { sbp.bra = sbp.cursor; if (r_R2()) { sbp.slice_del(); sbp.ket = sbp.cursor; if (sbp.eq_s_b(2, "ic")) { sbp.bra = sbp.cursor; if (r_R2()) sbp.slice_del(); } } } break; } return true; } function r_verb_suffix() { var among_var, v_1; if (sbp.cursor >= I_pV) { v_1 = sbp.limit_backward; sbp.limit_backward = I_pV; sbp.ket = sbp.cursor; among_var = sbp.find_among_b(a_7, 87); if (among_var) { sbp.bra = sbp.cursor; if (among_var == 1) sbp.slice_del(); } sbp.limit_backward = v_1; } } function habr6() { var v_1 = sbp.limit - sbp.cursor; sbp.ket = sbp.cursor; if (sbp.in_grouping_b(g_AEIO, 97, 242)) { sbp.bra = sbp.cursor; if (r_RV()) { sbp.slice_del(); sbp.ket = sbp.cursor; if (sbp.eq_s_b(1, "i")) { sbp.bra = sbp.cursor; if (r_RV()) { sbp.slice_del(); return; } } } } sbp.cursor = sbp.limit - v_1; } function r_vowel_suffix() { habr6(); sbp.ket = sbp.cursor; if (sbp.eq_s_b(1, "h")) { sbp.bra = sbp.cursor; if (sbp.in_grouping_b(g_CG, 99, 103)) if (r_RV()) sbp.slice_del(); } } this.stem = function() { var v_1 = sbp.cursor; r_prelude(); sbp.cursor = v_1; r_mark_regions(); sbp.limit_backward = v_1; sbp.cursor = sbp.limit; r_attached_pronoun(); sbp.cursor = sbp.limit; if (!r_standard_suffix()) { sbp.cursor = sbp.limit; r_verb_suffix(); } sbp.cursor = sbp.limit; r_vowel_suffix(); sbp.cursor = sbp.limit_backward; r_postlude(); return true; } }