@bntk/stemming
Version:
Build applications with Bengali natural language processing tools.
253 lines (251 loc) • 5.83 kB
JavaScript
// packages/core/stemming/assets/stemming.json
// Affix tables used by the stemmer in this file.
//
// Each of `prefixes` and `suffixes` maps an affix to the text that is put in
// its place when removePrefix / removeSuffix strips it; an empty string means
// the affix is simply dropped.
//
// NOTE: key order is significant. The affix lists are sorted longest-first
// before use, and (with a stable Array.prototype.sort) affixes of equal length
// are tried in the order they are listed here.
var stemming_default = {
// Language code of the tables; not read by the functions in this file.
language: "bn",
// Leading affixes, consulted by removePrefix.
prefixes: {
"অ": "",
"অন": "",
"অনু": "",
"অপ": "",
"অভি": "",
"অধি": "",
"অধো": "",
"অতি": "",
"আ": "",
"আন": "",
"আত্ম": "",
"আধি": "",
"আন্ত": "",
"আপ": "",
"আব": "",
"উ": "",
"উপ": "",
"উত্": "",
"উন": "",
"উদ": "",
"কু": "",
"কদ": "",
"দু": "",
"দুর": "",
"দুস": "",
"নি": "",
"নির": "",
"নিস": "",
"পরা": "",
"পরি": "",
"প্র": "",
"প্রতি": "",
"প্রধান": "ধান",
"বি": "",
"বে": "",
"ব্য": "",
"সু": "",
"সম": "",
"স": "",
"সহ": "",
"সৎ": "",
"স্ব": "",
"পুন": "",
"পুনঃ": ""
},
// Trailing affixes, consulted by removeSuffix.
suffixes: {
"অন": "",
"অনা": "",
"আই": "",
"আন": "",
"আনি": "",
"আনো": "া",
"আম": "",
"আল": "",
"আলি": "",
"আলু": "",
"ই": "",
"ইক": "",
"ইত": "",
"ইতা": "",
"ইনী": "ী",
"ইয়া": "",
"ইয়ান": "",
"ইয়ানি": "",
"ইয়ার": "",
"ইয়াল": "",
"ইয়ে": "",
"উক": "",
"উড়ি": "",
"উড়ে": "",
"উন": "",
"উনি": "",
"উয়া": "",
"উরে": "",
"এ": "",
"ওয়া": "",
"ওয়াল": "",
"ওয়ালা": "",
"ওয়ালি": "",
"ক": "",
"কা": "",
"কার": "",
"কি": "",
"কী": "",
"কে": "",
"গত": "",
"গুলি": "",
"গুলো": "",
"চ্ছ": "",
"ছিলাম": "",
"জন": "",
"জনক": "",
"টা": "",
"টি": "",
"টে": "",
"ড়ি": "",
"ত": "",
"তম": "",
"তর": "",
"তা": "",
"তি": "",
"তির": "ত",
"তে": "",
"ৎ": "",
"ত্ব": "",
"দার": "",
"দাস": "",
"দের": "",
"ধারী": "ার",
"ন": "",
"না": "",
"নি": "",
"নী": "",
"বাজ": "",
"বাদ": "",
"বান": "",
"বার": "",
"বি": "",
"বিদ": "",
"বিধ": "",
"বিহীন": "",
"বৃন্দ": "",
"ভাবে": "াব",
"ভিত্তিক": "",
"মন্ত": "",
"মন্ত্রী": "মন্ত্র",
"মন্ত্রীর": "মন্ত্র",
"ময়": "",
"মান": "",
"মানা": "",
"মূলক": "",
"য়": "",
"য়া": "া",
"য়ান": "",
"য়ানো": "া",
"য়িত": "",
"য়ে": "া",
"র": "",
"রা": "",
"রাজি": "",
"রে": "",
"ল": "",
"লা": "",
"লাম": "",
"লি": "",
"লী": "",
"লে": "",
"লেন": "",
"শালী": "ালী",
"শীল": "",
"সমূহ": "",
"স্থ": "",
"স্থিত": "",
"স্থিতি": "স্থিত",
"স্থিতির": "স্থিত",
"স্য": "",
"হীন": "",
"ি": "",
"িক": "",
"িত": "",
"িতে": "",
"িনী": "ী",
"ির": "ত",
"িল": "",
"িলাম": "",
"িলি": "",
"িলে": "",
"িলেন": "",
"িস": "",
"ী": "",
"ীন": "",
"ীয়": "",
"ীর": "",
"ু": "",
"ুক": "",
"ুন": "",
"ুম": "",
"ুয়া": "",
"ূ": "",
"ে": "া",
"েক": "া",
"েছ": "",
"েছি": "",
"েছিল": "",
"েছিলাম": "",
"েছিলি": "",
"েছিলেন": "",
"েছিস": "",
"েছে": "",
"েছেন": "",
"েতাম": "",
"েতি": "",
"েতিস": "",
"েতে": "",
"েতেন": "",
"েন": "",
"ের": "",
"েরা": ""
},
// Whole-word overrides: stemWord returns the mapped value directly for these
// words and skips prefix/suffix stripping.
special_cases: {
"প্রতিদিন": "দিন",
"অনুপস্থিতির": "পস্থিত"
}
};
// packages/core/stemming/src/stemming.ts
// Module-level lookup state shared by the stemming functions below.
var typedStemmingData = stemming_default;
// Affix lists ordered longest-first, so a longer affix is tried before any
// shorter affix it contains.
var sortedPrefixes = Object.keys(typedStemmingData.prefixes);
sortedPrefixes.sort((left, right) => right.length - left.length);
var sortedSuffixes = Object.keys(typedStemmingData.suffixes);
sortedSuffixes.sort((left, right) => right.length - left.length);
/**
 * Strips one prefix from the start of a word.
 *
 * Candidates are taken from `sortedPrefixes` in order; the first one the word
 * starts with, and whose removal leaves at least two UTF-16 code units, is
 * replaced by its mapped text from `typedStemmingData.prefixes`.
 *
 * @param {string} word - Word to process.
 * @returns {string} The word with the prefix replaced, or the word unchanged
 *   when it is two code units or shorter or no prefix applies.
 */
function removePrefix(word) {
  if (word.length <= 2) {
    return word;
  }
  const matched = sortedPrefixes.find(
    (candidate) => word.startsWith(candidate) && word.length > candidate.length + 1
  );
  if (matched === undefined) {
    return word;
  }
  // A missing or empty mapping means the prefix is simply dropped.
  const lead = typedStemmingData.prefixes[matched] || "";
  const remainder = word.substring(matched.length);
  return lead + remainder;
}
/**
 * Strips one suffix from the end of a word.
 *
 * Walks `sortedSuffixes` in order and applies the first suffix the word ends
 * with, provided that removing it leaves at least two UTF-16 code units. The
 * suffix is replaced by its mapped text from `typedStemmingData.suffixes`.
 *
 * @param {string} word - Word to process.
 * @returns {string} The word with the suffix replaced, or the word unchanged
 *   when it is two code units or shorter or no suffix applies.
 */
function removeSuffix(word) {
  if (word.length <= 2) {
    return word;
  }
  for (let index = 0; index < sortedSuffixes.length; index += 1) {
    const ending = sortedSuffixes[index];
    const applies = word.endsWith(ending) && word.length > ending.length + 1;
    if (!applies) {
      continue;
    }
    // A missing or empty mapping means the suffix is simply dropped.
    const tail = typedStemmingData.suffixes[ending] || "";
    const keptLength = word.length - ending.length;
    return word.substring(0, keptLength) + tail;
  }
  return word;
}
/**
 * Reduces a single word to its stem.
 *
 * A word listed in `typedStemmingData.special_cases` is mapped directly to
 * its listed stem. Any other word has one prefix removed (removePrefix) and
 * then one suffix removed (removeSuffix).
 *
 * @param {string} word - Word to stem.
 * @returns {string} The stemmed word.
 */
function stemWord(word) {
  const specialCases = typedStemmingData.special_cases;
  // Own-property check: a bare `specialCases[word]` lookup also sees members
  // inherited from Object.prototype, so inputs such as "constructor" or
  // "toString" would otherwise be returned as functions instead of strings.
  if (
    specialCases &&
    Object.prototype.hasOwnProperty.call(specialCases, word) &&
    specialCases[word]
  ) {
    return specialCases[word];
  }
  const withoutPrefix = removePrefix(word);
  return removeSuffix(withoutPrefix);
}
/**
 * Stems every word in a list.
 *
 * @param {string[]} words - Words to stem.
 * @returns {string[]} A new array holding the result of stemWord for each
 *   input word, in the same order.
 */
function stemWords(words) {
  // Wrap stemWord so it receives only the word, not map's index/array extras.
  const stemOne = (token) => stemWord(token);
  return words.map(stemOne);
}
export {
stemWords,
stemWord,
removeSuffix,
removePrefix
};