@orama/stemmers

Version:

Stemmers for Orama

1 lines • 8.39 kB

Source Map (JSON)

{"version":3,"sources":["<anon>"],"sourcesContent":["/**\n * Light Stemmer for Bulgarian.\n *\n * Converted from the Java implementation of Apache {@link https://github.com/apache/lucene | Lucene}.\n *\n * Implements the algorithm described in: *Searching Strategies for the Bulgarian Language*\n * {@link http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf}\n */ \"use strict\";\nObject.defineProperty(exports, \"__esModule\", {\n value: true\n});\nfunction _export(target, all) {\n for(var name in all)Object.defineProperty(target, name, {\n enumerable: true,\n get: Object.getOwnPropertyDescriptor(all, name).get\n });\n}\n_export(exports, {\n get language () {\n return language;\n },\n get stemmer () {\n return stemmer;\n }\n});\nclass BulgarianStemmer {\n stem(s) {\n const calculatedLength = this.calculateStemLength(s, s.length);\n return s.substring(0, calculatedLength);\n }\n /**\n * Stem an input buffer of Bulgarian text.\n *\n * @param {string} s input buffer\n * @param {number} len length of input buffer\n * @return {number} length of input buffer after normalization\n */ calculateStemLength(s, len) {\n if (len < 4) {\n // do not stem\n return len;\n }\n if (len > 5 && this.endsWith(s, len, 'ища')) {\n return len - 3;\n }\n len = this.removeArticle(s, len);\n len = this.removePlural(s, len);\n if (len > 3) {\n if (this.endsWith(s, len, 'я')) {\n len--;\n }\n if (this.endsWith(s, len, 'а') || this.endsWith(s, len, 'о') || this.endsWith(s, len, 'е')) {\n len--;\n }\n }\n // the rule to rewrite ен -> н is duplicated in the paper.\n // in the perl implementation referenced by the paper, this is fixed.\n // (it is fixed here as well)\n if (len > 4 && this.endsWith(s, len, 'ен')) {\n s[len - 2] = 'н'; // replace with н\n len--;\n }\n if (len > 5 && s[len - 2] == 'ъ') {\n s[len - 2] = s[len - 1]; // replace ъN with N\n len--;\n }\n return len;\n }\n /**\n * Mainly remove the definite article\n *\n * @param {string} s input buffer\n * @param {number} len length of input buffer\n * @return {number} new stemmed length\n */ removeArticle(s, len) {\n if (len > 6 && this.endsWith(s, len, 'ият')) return len - 3;\n if (len > 5) {\n if (this.endsWith(s, len, 'ът') || this.endsWith(s, len, 'то') || this.endsWith(s, len, 'те') || this.endsWith(s, len, 'та') || this.endsWith(s, len, 'ия')) {\n return len - 2;\n }\n }\n if (len > 4 && this.endsWith(s, len, 'ят')) {\n return len - 2;\n }\n return len;\n }\n /**\n * Remove the plural from the input string\n *\n * @param {string} s input buffer\n * @param {number} len length of input buffer\n * @return {number} new stemmed length\n */ removePlural(s, len) {\n if (len > 6) {\n if (this.endsWith(s, len, 'овци')) return len - 3 // replace with о\n ;\n if (this.endsWith(s, len, 'ове')) return len - 3;\n if (this.endsWith(s, len, 'еве')) {\n s[len - 3] = 'й'; // replace with й\n return len - 2;\n }\n }\n if (len > 5) {\n if (this.endsWith(s, len, 'ища')) return len - 3;\n if (this.endsWith(s, len, 'та')) return len - 2;\n if (this.endsWith(s, len, 'ци')) {\n s[len - 2] = 'к'; // replace with к\n return len - 1;\n }\n if (this.endsWith(s, len, 'зи')) {\n s[len - 2] = 'г'; // replace with г\n return len - 1;\n }\n if (s[len - 3] == 'е' && s[len - 1] == 'и') {\n s[len - 3] = 'я'; // replace е with я, remove и\n return len - 1;\n }\n }\n if (len > 4) {\n if (this.endsWith(s, len, 'си')) {\n s[len - 2] = 'х'; // replace with х\n return len - 1;\n }\n if (this.endsWith(s, len, 'и')) return len - 1;\n }\n return len;\n }\n /**\n * Returns true if the character array ends with the suffix.\n *\n * This is a helper function for the stemmer from the original Java implementation.\n * {@link https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java#L68}\n *\n * @param {string} s Input Buffer\n * @param {number} len length of input buffer\n * @param {string} suffix Suffix string to test\n * @return true if `s` ends with `suffix`\n */ endsWith(s, len, suffix) {\n let suffixLen = suffix.length;\n if (suffixLen > len) return false;\n for(let i = suffixLen - 1; i >= 0; i--)if (s[len - (suffixLen - i)] != suffix[i]) return false;\n return true;\n }\n}\nconst stemmerInstance = new BulgarianStemmer();\nfunction stemmer(word) {\n return stemmerInstance.stem(word);\n}\nconst language = 'bulgarian';\n"],"names":["_export","target","all","name","Object","defineProperty","enumerable","get","getOwnPropertyDescriptor","exports","value","language","stemmer","BulgarianStemmer","stem","s","calculatedLength","calculateStemLength","length","substring","len","endsWith","removeArticle","removePlural","suffix","suffixLen","i","stemmerInstance","word"],"mappings":"AAOI,aAIJ,SAASA,QAAQC,CAAM,CAAEC,CAAG,EACxB,IAAI,IAAIC,KAAQD,EAAIE,OAAOC,cAAc,CAACJ,EAAQE,EAAM,CACpDG,WAAY,CAAA,EACZC,IAAKH,OAAOI,wBAAwB,CAACN,EAAKC,GAAMI,GAAG,AACvD,EACJ,CARAH,OAAOC,cAAc,CAACI,QAAS,aAAc,CACzCC,MAAO,CAAA,CACX,GAOAV,QAAQS,QAAS,CACb,IAAIE,UAAY,CACZ,OAAOA,QACX,EACA,IAAIC,SAAW,CACX,OAAOA,OACX,CACJ,EACA,OAAMC,iBACFC,KAAKC,CAAC,CAAE,CACJ,IAAMC,EAAmB,IAAI,CAACC,mBAAmB,CAACF,EAAGA,EAAEG,MAAM,EAC7D,OAAOH,EAAEI,SAAS,CAAC,EAAGH,EAC1B,CAOEC,oBAAoBF,CAAC,CAAEK,CAAG,CAAE,QAC1B,AAAIA,EAAM,EAECA,EAEPA,EAAM,GAAK,IAAI,CAACC,QAAQ,CAACN,EAAGK,EAAK,OAC1BA,EAAM,GAEjBA,EAAM,IAAI,CAACE,aAAa,CAACP,EAAGK,GAC5BA,CAAAA,EAAM,IAAI,CAACG,YAAY,CAACR,EAAGK,EAAG,EACpB,IACF,IAAI,CAACC,QAAQ,CAACN,EAAGK,EAAK,MACtBA,IAEA,CAAA,IAAI,CAACC,QAAQ,CAACN,EAAGK,EAAK,MAAQ,IAAI,CAACC,QAAQ,CAACN,EAAGK,EAAK,MAAQ,IAAI,CAACC,QAAQ,CAACN,EAAGK,EAAK,IAAG,GACrFA,KAMJA,EAAM,GAAK,IAAI,CAACC,QAAQ,CAACN,EAAGK,EAAK,QACjCL,CAAC,CAACK,EAAM,EAAE,CAAG,IACbA,KAEAA,EAAM,GAAKL,AAAc,KAAdA,CAAC,CAACK,EAAM,EAAE,GACrBL,CAAC,CAACK,EAAM,EAAE,CAAGL,CAAC,CAACK,EAAM,EAAE,CACvBA,KAEGA,EACX,CAOEE,cAAcP,CAAC,CAAEK,CAAG,CAAE,QACpB,AAAIA,EAAM,GAAK,IAAI,CAACC,QAAQ,CAACN,EAAGK,EAAK,OAAeA,EAAM,EACtDA,EAAM,GACF,CAAA,IAAI,CAACC,QAAQ,CAACN,EAAGK,EAAK,OAAS,IAAI,CAACC,QAAQ,CAACN,EAAGK,EAAK,OAAS,IAAI,CAACC,QAAQ,CAACN,EAAGK,EAAK,OAAS,IAAI,CAACC,QAAQ,CAACN,EAAGK,EAAK,OAAS,IAAI,CAACC,QAAQ,CAACN,EAAGK,EAAK,KAAI,GAI1JA,EAAM,GAAK,IAAI,CAACC,QAAQ,CAACN,EAAGK,EAAK,MAHtBA,EAAM,EAMdA,CACX,CAOEG,aAAaR,CAAC,CAAEK,CAAG,CAAE,CACnB,GAAIA,EAAM,EAAG,CACT,GAAI,IAAI,CAACC,QAAQ,CAACN,EAAGK,EAAK,SAEtB,IAAI,CAACC,QAAQ,CAACN,EAAGK,EAAK,OAFS,OAAOA,EAAM,EAGhD,GAAI,IAAI,CAACC,QAAQ,CAACN,EAAGK,EAAK,OAEtB,OADAL,CAAC,CAACK,EAAM,EAAE,CAAG,IACNA,EAAM,CAErB,CACA,GAAIA,EAAM,EAAG,CACT,GAAI,IAAI,CAACC,QAAQ,CAACN,EAAGK,EAAK,OAAQ,OAAOA,EAAM,EAC/C,GAAI,IAAI,CAACC,QAAQ,CAACN,EAAGK,EAAK,MAAO,OAAOA,EAAM,EAC9C,GAAI,IAAI,CAACC,QAAQ,CAACN,EAAGK,EAAK,MAEtB,OADAL,CAAC,CAACK,EAAM,EAAE,CAAG,IACNA,EAAM,EAEjB,GAAI,IAAI,CAACC,QAAQ,CAACN,EAAGK,EAAK,MAEtB,OADAL,CAAC,CAACK,EAAM,EAAE,CAAG,IACNA,EAAM,EAEjB,GAAIL,AAAc,KAAdA,CAAC,CAACK,EAAM,EAAE,EAAWL,AAAc,KAAdA,CAAC,CAACK,EAAM,EAAE,CAE/B,OADAL,CAAC,CAACK,EAAM,EAAE,CAAG,IACNA,EAAM,CAErB,CACA,GAAIA,EAAM,EAAG,CACT,GAAI,IAAI,CAACC,QAAQ,CAACN,EAAGK,EAAK,MAEtB,OADAL,CAAC,CAACK,EAAM,EAAE,CAAG,IACNA,EAAM,EAEjB,GAAI,IAAI,CAACC,QAAQ,CAACN,EAAGK,EAAK,KAAM,OAAOA,EAAM,CACjD,CACA,OAAOA,CACX,CAWEC,SAASN,CAAC,CAAEK,CAAG,CAAEI,CAAM,CAAE,CACvB,IAAIC,EAAYD,EAAON,MAAM,CAC7B,GAAIO,EAAYL,EAAK,MAAO,CAAA,EAC5B,IAAI,IAAIM,EAAID,EAAY,EAAGC,GAAK,EAAGA,IAAI,GAAIX,CAAC,CAACK,EAAOK,CAAAA,EAAYC,CAAAA,EAAG,EAAIF,CAAM,CAACE,EAAE,CAAE,MAAO,CAAA,EACzF,MAAO,CAAA,CACX,CACJ,CACA,IAAMC,gBAAkB,IAAId,iBAC5B,SAASD,QAAQgB,CAAI,EACjB,OAAOD,gBAAgBb,IAAI,CAACc,EAChC,CACA,IAAMjB,SAAW"}