UNPKG

yoastseo-dep

Version:

Yoast clientside page analysis

462 lines (456 loc) 20.1 kB
import determineStem from "../../../../../../src/languageProcessing/languages/nl/helpers/internal/determineStem";
import getMorphologyData from "../../../../../specHelpers/getMorphologyData";

// Dutch morphology data, loaded through the spec helper and passed to determineStem for every word below.
const morphologyDataNL = getMorphologyData( "nl" ).nl;

/**
 * The first word in each array is the word, the second one is the expected stem. The numbers/letters assigned to the specs
 * refer to the numbers assigned to the different paths in the flowcharts
 * (https://drive.google.com/drive/u/0/folders/1O5pOnRBCpZnTFAUlrNB3ViiqY3Voj9L3) that the spec covers.
 *
 * Every line comment sits on its own line: a `//` comment runs to the end of the physical line, so an entry
 * that shares a line with a preceding comment would silently be dropped from the list.
 */
const wordsToStem = [
	// A word that exists on the list of words that should not be stemmed and that is a strong verb (3-11 condition in flowchart).
	[ "smolt", "smelt" ],
	// Words that are on the verb sub-list of words that should not be stemmed (3-12)
	[ "hoest", "hoest" ],
	[ "aanplant", "aanplant" ],
	[ "verpest", "verpest" ],
	// Words that are on the ending match sub-list of words that should not be stemmed (3-12)
	[ "economist", "economist" ],
	[ "antifascist", "antifascist" ],
	[ "koffer", "koffer" ],
	// Words that are on the exact match sub-list of words that should not be stemmed (3-12)
	[ "loft", "loft" ],
	[ "rijst", "rijst" ],
	// Words that are on the list of words with full forms (1-12)
	// Words that are on the verb sub-list of full forms exception list (1-12)
	[ "aanbevelen", "aanbeveel" ],
	[ "krijsend", "krijs" ],
	// Words that are on the exact match sub-list of full forms exception list (1-13)
	[ "bijdehandst", "bijdehand" ],
	// Words that are on the exact match sub-list of full forms exception list (1-12)
	[ "blaséëre", "blasé" ],
	// Words that are on the ending match sub-list of full forms exception list (1-12)
	[ "hoekschoppen", "hoekschop" ],
	[ "raderen", "rad" ],
	[ "grofs", "grof" ],
	[ "cafeetje", "café" ],
	// Words that are on the list of nonParticiples (are matched with participle regex but are not actually participles) (2a-15).
	[ "gevelwand", "gevelwand" ],
	[ "beurt", "beurt" ],
	// A word on the list of inseparable compound verbs that should not be stemmed (2b)
	[ "onderricht", "onderricht" ],
	// A participle without separable or inseparable prefixes that should not have the ge- stemmed (2c)
	[ "gebeurd", "gebeur" ],
	// A word on the list of inseparable compound verbs (2d)
	[ "onderverhuurd", "onderverhuur" ],
	// A participle of a strong verb without separable or inseparable prefixes which should have the suffix removed (2e-11)
	[ "gevraagd", "vraag" ],
	[ "gejaagd", "jaag" ],
	// A participle of a regular verb without separable or inseparable prefixes which should have the suffix removed (2e-12)
	[ "gewandeld", "wandel" ],
	[ "gezwikt", "zwik" ],
	[ "geankerd", "anker" ],
	// A participle of a strong verb without separable or inseparable prefixes which should not have the suffix removed (2f-11)
	[ "gebracht", "breng" ],
	// A participle of a regular verb without separable or inseparable prefixes which should not have the suffix removed (2f-15)
	[ "gesport", "sport" ],
	[ "getocht", "tocht" ],
	[ "gekorst", "korst" ],
	// A participle of a strong verb with a separable prefix which should have the suffix removed. (2h-11)
	[ "afgescheerd", "afscheer" ],
	// A participle of a regular verb with a separable prefix which should have the suffix removed (2h-12).
	[ "aangegroeid", "aangroei" ],
	[ "opgesmukt", "opsmuk" ],
	[ "aangebakt", "aanbak" ],
	// A participle of a strong verb with a separable prefix which should not have the suffix removed. (2i-11)
	[ "aangebracht", "aanbreng" ],
	// A participle of a regular verb with a separable prefix which should not have the suffix removed. (2i-15)
	[ "afgerond", "afrond" ],
	[ "uitgerust", "uitrust" ],
	[ "aangelicht", "aanlicht" ],
	// A participle of a strong verb with an inseparable prefix which should have the suffix removed. (2k-11)
	[ "verzind", "verzin" ],
	// A participle of a regular verb with an inseparable prefix which should have the suffix removed. (2k-12)
	[ "verlakt", "verlak" ],
	[ "onderdrukt", "onderdruk" ],
	[ "herhaald", "herhaal" ],
	// A participle of a strong verb with an inseparable prefix which should not have the suffix removed. (2l-11)
	[ "onderbracht", "onderbreng" ],
	// A participle of a regular verb with an inseparable prefix which should not have the suffix removed. (2l-15)
	[ "vertroost", "vertroost" ],
	[ "beantwoord", "beantwoord" ],
	[ "verlicht", "verlicht" ],
	// A verb ending in -t that is on the list of verbs that should have the -t stemmed (4a-12)
	[ "scratcht", "scratch" ],
	[ "vrijt", "vrij" ],
	[ "groeit", "groei" ],
	// A noun matched with a regex for when -t should not be stemmed, that is on the list of nouns with multiple stems (4b-10)
	[ "vaat", "vat" ],
	[ "loot", "lot" ],
	// A strong verb matched with a regex for when -t should not be stemmed (4b-11)
	[ "kweet", "kwijt" ],
	// A word matched with a regex for when -t should not be stemmed (4b-13)
	[ "astronaut", "astronaut" ],
	[ "nacht", "nacht" ],
	[ "favoriet", "favoriet" ],
	// A strong verb ending in -tte/tten/dde/dden (4c-11)
	[ "zoutten", "zout" ],
	[ "barstten", "barst" ],
	[ "ziedden", "zied" ],
	// A word ending in -tte/tten/dde/dden (4c-13)
	[ "katten", "kat" ],
	[ "grondden", "grond" ],
	[ "splitten", "split" ],
	// Word that ends in -heid/-heden
	[ "snelheid", "snelheid" ],
	[ "snelheden", "snelheid" ],
	[ "gezonheid", "gezonheid" ],
	[ "gezonheden", "gezonheid" ],
	/*
	 * Word that ends in -den with -d being part of the stem which is in the verb sub-list of wordsStemOnlyEnEnding list
	 * and the stem is in verb exception list (4e-11).
	 */
	[ "belijden", "belijd" ],
	// Word that ends in -den with -d being part of the stem and the stem ends in t/d (4e-13)
	[ "onthoofden", "onthoofd" ],
	/*
	 * Word that ends in -den with -d being part of the stem which is in the ending match sub-list of wordsStemOnlyEnEnding list
	 * and undergoes vowel doubling after suffix deletion and the stem ends in t/d (4d-13)
	 */
	[ "potloden", "potlood" ],
	// Past-tense forms of verbs ending in fden/sden
	[ "aanblaasden", "aanblaas" ],
	[ "aandurfden", "aandurf" ],
	/*
	 * Word that ends in -de with -d being part of the stem and undergoes vowel doubling after suffix deletion
	 * and the stem ends in t/d (4f-13)
	 */
	[ "mode", "mood" ],
	/*
	 * Word that ends in -te/-ten with -t being part of the stem and undergoes vowel doubling after suffix deletion
	 * and the stem ends in t/d (4h-13)
	 */
	[ "blaten", "blaat" ],
	/*
	 * Word that ends in -te/-ten with -t being part of the stem and undergoes vowel doubling after suffix deletion
	 * and the stem is in verb exception list (4h-11)
	 */
	[ "stoten", "stoot" ],
	// Word that ends in -te/-ten with -t being part of the stem and the stem is in verb exception list (4i-11)
	[ "zouten", "zout" ],
	// Word that ends in -te/-ten with -t being part of the stem and the stem ends in t/d (4i-13)
	[ "taarten", "taart" ],
	// Word that is in adjective exception list and gets suffix -er/-ere and the stem ends in t/d (6-13)
	[ "harder", "hard" ],
	[ "absurdere", "absurd" ],
	// Word that gets suffix -je and is in doNotStemTOrD list (7-13)
	[ "ingrediëntje", "ingrediënt" ],
	// Word that gets suffix -tje/-etje and is in removeSuffixFromFullForms list and the stem does not end in -t/-d (7-12)
	[ "taxietje", "taxi" ],
	[ "watertaxietje", "watertaxi" ],
	// Word that gets suffix -etje and the stem is in the exception list with two stems.
	[ "garagetje", "garaag" ],
	[ "parkeergaragetje", "parkeergaraag" ],
	// Noun that is in the ending match stemJeAndOnePrecedingVowel list and the stem does not end in -t/-d (8-12)
	[ "dramaatje", "drama" ],
	[ "cameraatje", "camera" ],
	[ "filmcameraatje", "filmcamera" ],
	// Noun that is in the exact match stemJeAndOnePrecedingVowel list and the stem does not end in -t/-d (8-12)
	[ "omaatje", "oma" ],
	[ "lamaatje", "lama" ],
	// Verb that gets suffix -ten and is in verb exception list (9b-11)
	[ "lachten", "lach" ],
	[ "bakten", "bak" ],
	// Verb with a separable prefix that gets suffix -ten and is in verb exception list (9b-11)
	[ "afbakten", "afbak" ],
	// Verb with an inseparable prefix that gets suffix -ten and is in verb exception list (9b-11)
	[ "belachten", "belach" ],
	// Verb that gets suffix -ten and is neither in an exception list nor ends in t/d (9b-12)
	[ "etsten", "ets" ],
	// Word that gets suffix -en and undergoes stem modification after suffix deletion and is in noun exception list (9co-10)
	[ "bruggen", "brug" ],
	// Word that gets suffix -en and is in verb exception list (9c-11)
	[ "duiken", "duik" ],
	[ "ontduiken", "ontduik" ],
	[ "opduiken", "opduik" ],
	// Word that gets suffix -en and undergoes stem modification after suffix deletion and is in verb exception list (9co-11)
	[ "klimmen", "klim" ],
	[ "afklimmen", "afklim" ],
	[ "beklimmen", "beklim" ],
	// Word that gets suffix -en and is in getVowelDoubling list and does not end in t/d (9di-12)
	[ "nivelleren", "nivelleer" ],
	[ "verafgoden", "verafgood" ],
	// Word that gets suffix -en and is in noVowelOrConsonantDoubling list and is in noun exception list (9dii-10)
	[ "vaten", "vat" ],
	// Word that gets suffix -en and is in noVowelOrConsonantDoubling list and is in verb exception list (9dii-11)
	[ "traden", "treed" ],
	// Word that gets suffix -en and does not end in t/d (9c-12)
	[ "werken", "werk" ],
	// Word that gets suffix -en and does not end in t/d (9diii-12)
	[ "luttelen", "luttel" ],
	// Word that gets suffix -en and the stem ends in -t/-d (9c-15)
	[ "duiden", "duid" ],
	// Change -ied/-ïed to -id after suffix -st/-ste deletion and the stem ends in -t/-d (9e-15)
	[ "rigiedst", "rigid" ],
	[ "paranoïedste", "paranoïd" ],
	// Word that gets suffix -t and is in a verb exception list (9f-11)
	[ "sterft", "sterf" ],
	[ "afsterft", "afsterf" ],
	[ "versterft", "versterf" ],
	[ "bindt", "bind" ],
	/*
	 * Word that gets suffix -ën and also gets its last -e deleted after suffix deletion
	 * and is neither in an exception list nor ends in t/d (9fm-12)
	 */
	[ "melodieën", "melodie" ],
	// Word that gets suffix -der and is neither in an exception list nor ends in t/d (9f-12)
	[ "lekkerder", "lekker" ],
	// Word that gets suffix -je and is in a noun exception list (9g-10)
	[ "glaasje", "glas" ],
	[ "biggetje", "big" ],
	// Word that gets suffix -je and the stem ends in -t/-d (9g-15)
	[ "plaatje", "plaat" ],
	// Word that gets suffix -pje and is neither in an exception list nor ends in t/d (9g-12)
	[ "museumpje", "museum" ],
	// Word that gets suffix -kje and is neither in an exception list nor ends in t/d (9h-12)
	[ "kettinkje", "ketting" ],
	[ "woninkje", "woning" ],
	// Word that gets suffix -end/-ende and is neither in an exception list nor ends in t/d (9i-12)
	[ "werkend", "werk" ],
	/*
	 * Word that gets suffix -end/-ende and gets stem modification after suffix deletion
	 * and is neither in an exception list nor ends in t/d (9io-12)
	 */
	[ "rennende", "ren" ],
	// Word that gets suffix -end/-ende and is in a verb exception list (9i-11)
	[ "smeltend", "smelt" ],
	[ "afsmeltend", "afsmelt" ],
	[ "helpende", "help" ],
	// Word that gets suffix -end/-ende and gets stem modification after suffix deletion and is in a verb exception list (9j-11)
	[ "wegend", "weeg" ],
	[ "wegende", "weeg" ],
	// Past participle that starts with an inseparable prefix and ends in -end.
	[ "erkend", "erken" ],
	// Present participle that starts with an inseparable prefix and ends in -end(e).
	[ "bedelvend", "bedelf" ],
	[ "bedelvende", "bedelf" ],
	// Word that ends in -t and gets -end suffix (4h-13)
	[ "plantend", "plant" ],
	/*
	 * Word that gets suffix -end/-ende and gets stem modification after suffix deletion
	 * and is neither in an exception list nor ends in t/d (9j-12)
	 */
	[ "makende", "maak" ],
	[ "makend", "maak" ],
	// Word that gets suffix -de/-den and is in a verb exception list (9k-11)
	[ "beginde", "begin" ],
	[ "vouwden", "vouw" ],
	[ "omvouwden", "omvouw" ],
	// Word that gets suffix -de/-den and is neither in an exception list nor ends in t/d (9k-12)
	[ "waagde", "waag" ],
	[ "humde", "hum" ],
	// Word that gets suffix -e and is neither in an exception list nor ends in t/d (9m-12)
	[ "snelle", "snel" ],
	[ "mooie", "mooi" ],
	/*
	 * Word that gets suffix -e and gets stem modification after suffix deletion
	 * and is neither in an exception list nor ends in t/d (9n-12)
	 */
	[ "schone", "schoon" ],
	[ "hoge", "hoog" ],
	// Adjectives ending in -e in their base form
	[ "luxe", "luxe" ],
	[ "luxer", "luxe" ],
	/*
	 * Word that gets suffix -etje and gets stem modification after suffix deletion
	 * and is neither in an exception list nor ends in t/d (9go-12)
	 */
	[ "balletje", "bal" ],
	/*
	 * Word that gets suffix -e and gets stem modification after suffix deletion
	 * and is neither in an exception list nor ends in t/d (9mo-12)
	 */
	[ "zwakke", "zwak" ],
	// Change -iël to -ieel after suffix -er/-ers/-ere deletion and the stem is neither in an exception list nor ends in t/d (9cp-12)
	[ "partiëlere", "partieel" ],
	[ "initiëlere", "initieel" ],
	// Change -iël to -ieel after suffix -e deletion and the stem is neither in an exception list nor ends in t/d (9mp-12)
	[ "initiële", "initieel" ],
	[ "essentiële", "essentieel" ],
	/*
	 * Word that gets suffix -en and undergoes stem-ending devoicing after suffix deletion
	 * and is in a verb exception list (9cq-11)
	 */
	[ "bedelven", "bedelf" ],
	[ "vriezen", "vries" ],
	[ "ontvriezen", "ontvries" ],
	/*
	 * Word that gets suffix -en and gets stem modification after suffix deletion
	 * and is in verb exception list (9dq-11)
	 */
	[ "geblazen", "blaas" ],
	[ "afgeblazen", "afblaas" ],
	/*
	 * Word that gets suffix -en and undergoes stem-ending devoicing after suffix deletion
	 * and is neither in an exception list nor ends in t/d (9cq-12)
	 */
	[ "luizen", "luis" ],
	/*
	 * Word that gets suffix -en and gets stem modification after suffix deletion
	 * and is neither in an exception list nor ends in t/d (9dq-12)
	 */
	[ "reven", "reef" ],
	/*
	 * Word that gets suffix -end and undergoes stem-ending devoicing after suffix deletion
	 * and is in verb exception list (9iq-11)
	 */
	[ "drinkend", "drink" ],
	[ "afdrinkend", "afdrink" ],
	[ "delvend", "delf" ],
	/*
	 * Word that gets suffix -ende and gets stem modification after suffix deletion
	 * and is in a verb exception list (9jq-11)
	 */
	[ "blazende", "blaas" ],
	/*
	 * Word that gets suffix -end/-ende and undergoes stem-ending devoicing after suffix deletion
	 * and is neither in an exception list nor ends in t/d (9iq-12)
	 */
	[ "luizend", "luis" ],
	[ "grievende", "grief" ],
	/*
	 * Word that gets suffix -end/-ende and gets stem modification after suffix deletion
	 * and is neither in an exception list nor ends in t/d (9jq-12)
	 */
	[ "snevende", "sneef" ],
	[ "dovend", "doof" ],
	/*
	 * Word that gets suffix -e and undergoes stem-ending devoicing after suffix deletion
	 * and is neither in an exception list nor ends in t/d (9mq-12)
	 */
	[ "vieze", "vies" ],
	// Word that gets suffix -e and gets stem modification after suffix deletion and is neither in an exception list nor ends in t/d (9nq-12)
	[ "daze", "daas" ],
	// Word with plural diminutive suffix and the stem is neither in any exception list nor ends in t/d (9fg-12)
	[ "toetertjes", "toeter" ],
	// Word with plural diminutive suffix and the stem is neither in any exception list nor ends in t/d (9fh-12)
	[ "kettinkjes", "ketting" ],
	// Word with plural diminutive suffix and the stem is in noun exception list (9fg-10)
	[ "poppetjes", "pop" ],
	// Word with plural diminutive suffix and the stem ends in -t (9fg-15)
	[ "momentjes", "moment" ],
	[ "piraatjes", "piraat" ],
	[ "hondjes", "hond" ],
	// Noun that looks like a participle
	[ "gebied", "gebied" ],
	[ "gebieden", "gebied" ],
	[ "gezondheidsrecht", "gezondheidsrecht" ],
	[ "gezondheidsrechten", "gezondheidsrecht" ],
	/*
	 * Word with plural diminutive suffix -etjes which is preceded by -e-, the -e- will be further stemmed
	 * and the final stem does not end in t/d (9fgn-12).
	 */
	[ "chimpanseetjes", "chimpans" ],
	/*
	 * Word with diminutive suffix -etje which is preceded by -e-, the -e- will be further stemmed
	 * and the final stem does not end in t/d (9gn-12).
	 */
	[ "chimpanseetje", "chimpans" ],
	// Word with diminutive suffix that ends in -etje
	[ "typetje", "typ" ],
	// Words in -eau that get -s in plural
	[ "niveaus", "niveau" ],
	[ "niveau", "niveau" ],
	// Words in -é that get -s in plural and -tje/-tjes in diminutive forms
	[ "maté", "maté" ],
	[ "matés", "maté" ],
	[ "souffleetje", "soufflé" ],
	[ "plisseetje", "plissé" ],
	// Words in -ou that get -s in plural
	[ "bijou", "bijou" ],
	[ "bijous", "bijou" ],
	// Plurals in -es where only -s should be stemmed
	[ "ordonnanties", "ordonnantie" ],
	// Other specs:
	// Return the unique stem from noun exception list with multiple stems
	[ "loot", "lot" ],
	[ "bigget", "big" ],
	[ "krab", "krab" ],
	// Return the unique stem from verb exception list
	[ "doorliep", "doorloop" ],
	[ "begin", "begin" ],
	[ "berg", "berg" ],
	[ "zeek", "zeik" ],
	// Return the unique stem from word that end in -t/-d
	[ "roeit", "roei" ],
	[ "effect", "effect" ],
	[ "katten", "kat" ],
	[ "ontbieden", "ontbied" ],
	[ "potloden", "potlood" ],
	[ "beenharde", "beenhard" ],
	[ "mode", "mood" ],
	[ "compote", "compoot" ],
	[ "taarten", "taart" ],
	// Vowel doubling checks:
	// Is on ending match sub-list of no vowel doubling list
	[ "hagelslagen", "hagelslag" ],
	// Is on exact match sub-list of no vowel doubling list
	[ "flexibeler", "flexibel" ],
	// Is on verb match sub-list of no vowel doubling list
	[ "ademen", "adem" ],
	[ "uitademen", "uitadem" ],
	[ "verademen", "veradem" ],
	// Words that should not be stemmed (not matched in any stemming steps):
	[ "verantwoordelijk", "verantwoordelijk" ],
	[ "aangenaam", "aangenaam" ],
	[ "gelukkig", "gelukkig" ],
	[ "aardbei", "aardbei" ],
	// A noun ending in -e which gets diminutive suffix -tje/-tjes.
	[ "aspergetje", "asperg" ],
	[ "aspergetjes", "asperg" ],
	// A word ending in -e
	[ "asperge", "asperg" ],
	/*
	 * A noun with diminutive suffix -tje in which undergoes vowel doubling before adding the suffix -tje/-tjes
	 * and the stem is in the exception list with two stems (9-11)
	 */
	[ "dynamootje", "dynamo" ],
	[ "dynamootjes", "dynamo" ],
	[ "dramaatje", "drama" ],
	[ "cameraatje", "camera" ],
	[ "studentjes", "student" ],
	// Words that are in the list of doNotStemTOrD in which the t/d is part of the stem and should not be stemmed.
	[ "abonnement", "abonnement" ],
	[ "communicatiedienst", "communicatiedienst" ],
	[ "botermarkt", "botermarkt" ],
	[ "rijafstand", "rijafstand" ],
	// Nouns ending in -e and are in the list of exception with two stems
	[ "etiologieën", "etiologie" ],
	[ "etiologietjes", "etiologie" ],
	[ "verjaardagscadeautje", "verjaardagscadeau" ],
	// Nouns ending in -e that get suffix -s, -es will be stemmed and apply stem modification after suffix deletion
	[ "gazelles", "gazel" ],
	[ "hoeves", "hoef" ],
	[ "finales", "finaal" ],
	// Adjectives ending in -er which get adjective suffixes and is in exception list removeSuffixesFromFullForms
	[ "bittere", "bitter" ],
	[ "lekkers", "lekker" ],
	[ "lekkerdere", "lekker" ],
	[ "lekkere", "lekker" ],
	// Adjectives ending in -st which get adjective suffixes and is in exception list removeSuffixesFromFullForms
	[ "beheerstste", "beheerst" ],
	[ "geruste", "gerust" ],
	// Adjectives ending in -s which get superlative suffix -t/-te and is in exception list
	[ "precieste", "precies" ],
	[ "preciest", "precies" ],
	[ "trotste", "trots" ],
	// Plural nouns ending in -ors which should have -or stemmed
	[ "alligators", "alligator" ],
	[ "tenors", "tenor" ],
	// A word that can be both a noun and a verb, which gets stemmed as a noun (to 'bad', not 'bid')
	[ "baden", "bad" ],
];

// Runs determineStem on every [ word, expectedStem ] pair above and checks the result with a strict toBe comparison.
describe( "Test for determining unique stem for Dutch words", () => {
	it( "stems Dutch words", () => {
		wordsToStem.forEach( ( [ word, expectedStem ] ) => {
			expect( determineStem( word, morphologyDataNL ) ).toBe( expectedStem );
		} );
	} );
} );