UNPKG

mmir-lib

Version:

MMIR (Mobile Multimodal Interaction and Relay) library

167 lines (138 loc) 3.98 kB
define(
/**
 * Word stemming
 *
 * @class stemmer
 * @memberOf mmir.grammar
 * @hideconstructor
 */
function(){

	/**
	 * Simplified German snowball stemmer:
	 *
	 * <quote>
	 * Really simple JavaScript stemmer based on the Snowball stemmer
	 * http://snowball.tartarus.org/algorithms/german/stemmer.html
	 * Some simplifications were made, e.g. ignoring R2 and the special
	 * provision for words ending in -isse-
	 * </quote>
	 *
	 * Notes on the simplified behavior:
	 * <ul>
	 *  <li>the word is lower-cased and "ß" is replaced by "ss" before stemming</li>
	 *  <li>words with less than 4 characters (after the "ß" replacement) are
	 *      returned as is, i.e. without suffix stripping and without
	 *      normalizing umlauts</li>
	 *  <li>suffixes are only stripped, if the first 3 characters of the word
	 *      contain a vowel that is directly followed by a non-vowel
	 *      (e.g. "schreiben" is returned unchanged)</li>
	 * </ul>
	 *
	 * @prop {Boolean} allowUmlauts allow umlauts in stemmed words (DEFAULT: <code>false</code>)
	 *
	 * @function stem
	 * @memberOf mmir.grammar.stemmer
	 * @param {String} word the (German) word that should be stemmed
	 * @returns {String} the stemmed word
	 *
	 * @example
	 *
	 * mmir.require(['mmirf/stemmer'], function(stem){
	 *
	 * 	var strStemmed = stem('abspielen'); // -> "abspiel"
	 * 	var strStemmedUmlauts = stem('anhören'); // -> "anhor"
	 *
	 * 	// keep umlauts in the stemmed result
	 * 	// (by default, umlauts will be "normalized", i.e. ä -> a, ö -> o, ü -> u)
	 * 	stem.allowUmlauts = true;
	 * 	var strWithUmlauts = stem('anhören'); // -> "anhör"
	 *
	 * });
	 */
	var snowballSimpleStemFunc = (function(){

		/* Definitions */

		// letters that may precede a removable "s" / "st" ending
		var sEnding = "[bdfghklmnrt]";
		var stEnding = "[bdfghklmnt]";

		// the first 3 characters must contain a vowel directly followed by a non-vowel
		// (simplified stand-in for the region check of the snowball algorithm;
		//  upper-case U and Y are the temporary markers set in stem_word)
		var prefix = "^((.[aeiouyäöüUY][^aeiouyäöüUY])|([aeiouyäöüUY][^aeiouyäöüUY].))";

		/**
		 * Compile the pattern for a suffix that must occur after the prefix.
		 *
		 * @param {String} suffixPattern the (regular expression source for the) suffix, anchored with "$"
		 * @returns {RegExp} the compiled expression: prefix + "(.*)" + suffixPattern
		 * @private
		 */
		var afterPrefix = function(suffixPattern){
			return new RegExp(prefix + "(.*)" + suffixPattern);
		};

		// the expressions are invariant: compile them once instead of on every invocation

		/* Step 1 patterns */
		var reErn = afterPrefix("ern$");
		var reEmEnErEs = afterPrefix("(em$|en$|er$|es$)");
		var reE = afterPrefix("(e$)");
		var reValidS = new RegExp(sEnding + "s$");
		var reS = afterPrefix("(s$)");

		/* Step 2 patterns */
		var reEst = afterPrefix("est$");
		var reEnEr = afterPrefix("(en$|er$)");
		var reSt = afterPrefix(stEnding + "(st$)");

		/* Step 3 patterns */
		var reKeit = afterPrefix("keit$");
		var reLichHeit = afterPrefix("(lich$|heit$)");
		var reErEn = afterPrefix("(er$|en$)");
		var reIsch = afterPrefix("(isch$)");
		var reEisch = new RegExp("eisch$");
		var reIgIk = afterPrefix("(ig$|ik$)");
		var reEBeforeSuffix = new RegExp("e..$");
		var reEndUng = afterPrefix("(end$|ung$)");

		var stem_word = function simpleStemmer(word){

			word = word.toLowerCase();
			word = word.replace(/ß/g, "ss");

			// short words are returned without any further processing
			if (word.length < 4) {
				return word;
			}

			// mark y and u between vowels, so that they are treated as consonants
			word = word.replace(/([aeiouyäöü])y([aeiouyäöü])/g, "$1Y$2"); // replace y between vowels with Y
			word = word.replace(/([aeiouyäöü])u([aeiouyäöü])/g, "$1U$2"); // replace u between vowels with U

			/* Step 1 */
			if (reErn.test(word)) {
				word = word.slice(0, -3);
			} else if (reEmEnErEs.test(word)) {
				word = word.slice(0, -2);
			} else if (reE.test(word)) {
				word = word.slice(0, -1);
			} else if (reValidS.test(word) && reS.test(word)) {
				// final "s" is only removed after a valid s-ending letter
				word = word.slice(0, -1);
			}

			/* Step 2 */
			if (reEst.test(word)) {
				word = word.slice(0, -3);
			} else if (reEnEr.test(word)) {
				word = word.slice(0, -2);
			} else if (reSt.test(word)) {
				// final "st" is only removed after a valid st-ending letter
				word = word.slice(0, -2);
			}

			/* Step 3 */
			// simplified!! Really these should be in R2 not R1
			if (reKeit.test(word)) {
				word = word.slice(0, -4);
			}
			if (reLichHeit.test(word)) {
				word = word.slice(0, -4);
				if (reErEn.test(word)) {
					word = word.slice(0, -2);
				}
			} else if (reIsch.test(word)) {
				// "isch" is kept, if it is preceded by "e"
				if (!reEisch.test(word)) {
					word = word.slice(0, -4);
				}
			} else if (reIgIk.test(word)) {
				// "ig" / "ik" is kept, if it is preceded by "e"
				if (!reEBeforeSuffix.test(word)) {
					word = word.slice(0, -2);
				}
			} else if (reEndUng.test(word)) {
				word = word.slice(0, -3);
			}

			/* Clean up */
			word = word.replace(/([aeiouyäöü])Y/g, "$1y"); // replace Y with y
			word = word.replace(/([aeiouyäöü])U/g, "$1u"); // replace U with u

			// the setting is read on each invocation, i.e. it can be changed at any time
			if(!snowballSimpleStemFunc.allowUmlauts){
				word = word.replace(/ä/g, "a");
				word = word.replace(/ö/g, "o");
				word = word.replace(/ü/g, "u");
			}

			return word;
		};

		return stem_word;
	})();

	/**
	 * allow umlauts in stemmed words
	 *
	 * @type {Boolean}
	 * @default false
	 *
	 * @member allowUmlauts
	 * @memberOf mmir.grammar.stemmer.stem
	 * @ignore
	 */
	snowballSimpleStemFunc.allowUmlauts = false;

	//exported function:
	return snowballSimpleStemFunc;
});