UNPKG

yoastseo-dep

Version:

Yoast clientside page analysis

246 lines (237 loc) 9.47 kB
import stem from "../../../../../../src/languageProcessing/languages/ar/helpers/internal/stem"; import getMorphologyData from "../../../../../specHelpers/getMorphologyData"; const morphologyDataAR = getMorphologyData( "ar" ).ar; const wordsToStem = [ // Words that have two letter root that is not on any exception list [ "جص", "جص" ], [ "جصكم", "جص" ], [ "للمظ", "مظ" ], // Words that are not stemmed [ "صوتيات", "صوتيات" ], [ "كمبيوتر", "كمبيوتر" ], // Words with a definite article and a suffix [ "الجدولين", "جدول" ], // Two letter word with a removed duplicate letter. [ "صف", "صفف" ], // Two letter word with the word-final letter (alif) removed. [ "عد", "عدا" ], // Two letter word with the word-initial letter (waw) removed. [ "بأ", "وبأ" ], // Two letter word with the middle letter (yah) removed. [ "غض", "غيض" ], // Two letter word with the first letter (yah) removed. [ "خت", "يخت" ], // Two letter word with the middle letter (waw) removed. [ "أي", "أوي" ], // Two letter words that are not on any list. [ "با", "با" ], [ "بم", "بم" ], // Three letter words which is in the list of threeLetterRoots [ "رحل", "رحل" ], [ "طمو", "طمو" ], // Three letter words which are not in the list of threeLetterRoots [ "ويس", "ويس" ], // Three letter word with initial letter ا/ ؤ/ ئ (yeh_hamza/waw_hamza/alef), change the initial letter to أ (alef_hamza_above) [ "ؤكد", "أكد" ], [ "ازب", "أزب" ], // Three letter words ending in weak letter ي/ا/ى/ء (yeh_hamza/hamza/yeh_maksorah/alef/yeh/waw) // And the root is in the list of words with the last weak letter or hamza removed. [ "غبي", "غبي" ], [ "دعا", "دعا" ], [ "بدء", "بدأ" ], [ "كرى", "كرى" ], // Three letter words ending in weak letter و/ي/ا/ى/ء/ئ (yeh_hamza/hamza/yeh_maksorah/alef/yeh/waw) [ "باء", "باء" ], [ "غذى", "غذى" ], [ "رشا", "رشا" ], // Three letter words with و/ي (yeh/waw) as their second letter and the root is in the exception list after و/ي (yeh/waw) removal [ "أيد", "أيد" ], [ "أوز", "أوز" ], [ "أوم", "أيم" ], // Three letter words with و/ي/ا/ئ (yeh_hamza/alef/yeh/waw) as their second letter // And the root is NOT in the exception list after the deletion of و/ي/ا/ئ (yeh_hamza/alef/yeh/waw) [ "موظ", "موظ" ], [ "جيم", "جيم" ], [ "توت", "توت" ], // Three letter words with ئ/ؤ (yeh_hamza/waw_hamza) as their second letter and end in ر/ز/ن (noon/zai/reh), // Change ئ/ؤ (yeh_hamza/waw_hamza) to ا (alef), otherwise ئ/ؤ is changed to أ (alef_hamza_above) [ "مؤن", "مان" ], [ "فئة", "فأة" ], [ "بؤس", "بأس" ], [ "رؤي", "رأي" ], // Three letter words that ends in a shadda, duplicate the second character and the root is in threeLetterRoots list [ "ودّ", "ودد" ], [ "ألّ", "ألل" ], // Four letter word that is in the list of four-letter roots. [ "أبزم", "أبزم" ], // Words that start with أ/إ/آ (alef_madda/alef_hamza_above/alef_hamza_below), the أ/إ/آ is changed to ا [ "آلهات", "لهت" ], [ "أفطر", "فطر" ], [ "إنطباع", "طبع" ], // Words that match one of the patterns [ "تبادل", "بدل" ], // Words that match pattern افعلال [ "ابيضاض", "بيض" ], // Words with a definite article [ "بالضبط", "ضبط" ], [ "كالقمر", "قمر" ], [ "والشمس", "شمس" ], // Words with a definite article whose root matched one of the patterns [ "الضارب", "ضرب" ], // Words with a definite article and a suffix [ "الجدولين", "جدول" ], [ "الزينة", "زين" ], // Three letter words plus prefix waw [ "وكرو", "كرو" ], [ "وغبي", "غبي" ], // Four letter words plus prefix waw [ "وضفدع", "ضفدع" ], [ "وبعبع", "بعبع" ], // Words which match a pattern and have prefix waw [ "وتمثّل", "مثل" ], [ "وتشاور", "شور" ], // Word with a prefix which matches a pattern [ "للزهور", "زهر" ], // Words with suffix and prefix waw [ "وفتيات", "فتي" ], [ "ومنزلنا", "نزل" ], // Words with prefix and prefix waw [ "وتتلقّى", "لقي" ], [ "وكرجل", "رجل" ], // Words with suffix and the word matches one of the patterns [ "ملابسك", "لبس" ], // Words with suffix [ "بؤسهم", "بأس" ], [ "أسبوعة", "سبع" ], // Three letter word with a suffix. [ "جمعكم", "جمع" ], // Words with prefix [ "ستنجب", "نجب" ], [ "فنجبت", "نجب" ], [ "لأغفلنا", "غفل" ], [ "الأسبوع", "سبع" ], [ "الصوتيات", "صوتيات" ], [ "والظيفتك", "ظيفتك" ], // Currently these forms are not stemmed at all, in which they should be [ "وظيفتك", "وظيفتك" ], // Specs for words with prefix ( with made up words ) [ "الننس", "ننس" ], [ "وإعدجص", "اعدجص" ], // Specs from the external stemmer [ "الرحمن", "رحم" ], [ "الرحيم", "رحم" ], [ "العالمين", "علم" ], [ "نعبد", "عبد" ], [ "صراط", "صرط" ], // This word أنعمت belongs to a group of verb that incorrectly matches انفعل even though it should not. // Somehow the external stemmer stems this word correctly but we are still not sure how. // This problem will be further investigated and later fixed. // [ "أنعمت", "نعم" ], [ "المؤمنين", "أمن" ], [ "الصالحات", "صلح" ], [ "الحديث", "حدث" ], // This word السماوات has a different output stem in this stemmer (سمو) as we check the longer suffix (ات) first before the shorter one (ت) // We think that our stem output makes more sense, thus we don't change any functionality to accommodate this difference. // [ "السماوات", "سمي" ], [ "بسلطان", "سلط" ], [ "تفلحوا", "فلح" ], [ "يتنازعون", "نزع" ], [ "الشراب", "شرب" ], [ "أغفلنا", "غفل" ], [ "خلالهما", "خلل" ], [ "منقلبا", "قلب" ], [ "خاوية", "خوي" ], [ "والباقيات", "بقي" ], [ "المجرمين", "جرم" ], [ "للظالمين", "ظلم" ], // This word لنتخذن is also commented out in the original external stemmer. // [ "لنتخذن", "أخذ" ], /* * Specs that do not pass because of overlap issues (they match multiple conditions, and they get stemmed in another * condition than where we would want it to). They are probably also a problem in the external stemmer; we may solve it * at some point with exception lists. */ // The current stem is قا as قوا is matched by regexRemoveMiddleWeakLetterOrHamza instead of this regex regexRemoveLastWeakLetterOrHamza // [ "قوا", "قوا" ], // The current stem is وفي, as وف is also detected in wordsWithLastYahRemoved list. For words found in the list, ي (yeh) is added to the root // [ "وفى", "وف" ], // The current stem is برا, as بر is also detected in wordsWithLastAlifRemoved list. For words found in the list, ا (alef) is added to the root // [ "برئ", "بر" ], // The current stem is بلي, as بل is also detected in wordsWithLastYahRemoved list. For words found in the list, ي (yeh) is added to the root // [ "بلا", "بل" ], // The current stem is بور, as بر is also detected in wordsWithMiddleWawRemoved list. For words found in the list, و (waw) is added to the root. // [ "بار", "بر" ], // The current stem is ين, it is because يئن is matched first by this regex regexRemoveMiddleWeakLetterOrHamza. // [ "يئن", "يان" ], // The current stem is بور, as بر is also detected in wordsWithMiddleWawRemoved list. For words found in the list, و (waw) is added to the root. // [ "بئر", "بار" ], ]; const paradigms = [ { // Complete paradigm of a verb. stem: "عود", forms: [ "عاودت", "عاود", "عاودتما", "عاودا", "عاودتا", "عاودنا", "عاودتم", "عاودتن", "عاودوا", "عاودن", "أعاود", "تعاود", "تعاودين", "يعاود", "تعاودان", "يعاودان", "نعاود", "تعاودون", "تعاودن", "يعاودون", "يعاودن", "تعاودي", "يعاود", "تعاودا", "يعاودا", "تعاودوا", "يعاودوا", "عوودت", "عوود", "عوودتما", "عوودا", "عوودتا", "عوودنا", "عوودتم", "عوودتن", "عوودوا", "عوودن", "عاودا", "عاودي", "عاود", "عاودوا", "عاودا", "معاود", "معاودة", ], }, ]; describe( "Test for stemming Arabic words", () => { for ( let i = 0; i < wordsToStem.length; i++ ) { const wordToCheck = wordsToStem[ i ]; it( "stems the word " + wordToCheck[ 0 ], () => { expect( stem( wordToCheck[ 0 ], morphologyDataAR ) ).toBe( wordToCheck[ 1 ] ); } ); } } ); describe( "Test to make sure all forms of a paradigm get stemmed to the same stem", () => { for ( const paradigm of paradigms ) { for ( const form of paradigm.forms ) { it( "correctly stems the word: " + form + " to " + paradigm.stem, () => { expect( stem( form, morphologyDataAR ) ).toBe( paradigm.stem ); } ); } } } );