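/*
 * Listing header captured together with the source (not part of the module itself):
 *
 * UNPKG
 * yoastseo-dep
 * Version:
 * Yoast clientside page analysis
 * 273 lines (240 loc) 11.9 kB
 */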
import { languageProcessing } from "yoastseo-dep";
const { regexHelpers: { doesWordMatchRegex } } = languageProcessing;

import nonParticiples from "../../config/internal/nonParticiples.js";
import { modifyStem } from "./stemModificationHelpers";

/**
 * Checks whether the word is on an exception list of participles that do not have a ge- prefix. If it is found on the list,
 * remove only the last letter (the suffix).
 *
 * @param {array}  dataExceptionListToCheck The list of the exception words.
 * @param {string} word                     The (unstemmed) word to check.
 *
 * @returns {null|string} The word without its last character, or null if the word was not found on the exception list.
 */
const checkAndStemIfExceptionWithoutGePrefix = function( dataExceptionListToCheck, word ) {
	if ( dataExceptionListToCheck.includes( word ) ) {
		return word.slice( 0, -1 );
	}

	return null;
};

/**
 * Checks whether a word that was detected as a participle should not have the suffix (-t or -d) removed because it is part
 * of the stem. For example, in the participle 'geantwoord', the -d belongs to the stem so it should not be removed.
 * The checks are conducted on the word without the prefix, so 'antwoord' in the case of 'geantwoord'.
 *
 * For words ending in -t, there are three checks, applied in this order:
 * 1) An exception list (exceptions to a rule) containing words where -t SHOULD be stemmed,
 * 2) The rule, defined using a regex with word endings where -t is part of the stem,
 * 3) A list of verbs with stem ending in -t, to cover cases that were not possible to find using a regex.
 *
 * Every word that does not end in -t is checked against the list of verbs with stem ending in -d.
 *
 * @param {string} wordWithoutPrefix The word without prefix(es).
 * @param {Object} morphologyDataNL  The Dutch morphology data.
 *
 * @returns {boolean} Whether the suffix should be stemmed.
 */
const shouldSuffixBeStemmed = function( wordWithoutPrefix, morphologyDataNL ) {
	if ( wordWithoutPrefix.endsWith( "t" ) ) {
		// Return true (suffix should be stemmed) if word was found on the exception list of verbs which should have the final -t stemmed.
		const exceptionsTShouldBeStemmed = morphologyDataNL.ambiguousTAndDEndings.wordsTShouldBeStemmed;
		if ( exceptionsTShouldBeStemmed.includes( wordWithoutPrefix ) ) {
			return true;
		}

		// Return false (suffix should not be stemmed) if word matches the regex for stems ending in -t.
		if ( doesWordMatchRegex( wordWithoutPrefix, morphologyDataNL.ambiguousTAndDEndings.tOrDArePartOfStem.tEnding ) ) {
			return false;
		}

		/*
		 * Return false (suffix should not be stemmed) if the word was found on the list of verbs with stem ending in -t (e.g. haast)
		 * Otherwise, return true (if no checks are matched, the default condition is for -t to be stemmed).
		 */
		const exceptionsTShouldNotBeStemmed = morphologyDataNL.stemExceptions.wordsNotToBeStemmedExceptions.verbs;
		return ! exceptionsTShouldNotBeStemmed.includes( wordWithoutPrefix );
	}

	// Not ending in -t: the suffix is stemmed unless the word is listed as having a stem that ends in -d.
	const exceptionsDShouldNotBeStemmed = morphologyDataNL.pastParticipleStemmer.doNotStemD;
	return ! exceptionsDShouldNotBeStemmed.includes( wordWithoutPrefix );
};

/**
 * Detects whether a word is a participle of a regular verb without prefixes other than ge-. If it is, checks whether
 * the word is an exception that should not have the prefix or the suffix stemmed. Then stems the word accordingly
 * (remove prefix, suffix, or both).
 *
 * @param {Object} morphologyDataNL The Dutch morphology data.
 * @param {string} word             The word (not stemmed) to check.
 *
 * @returns {string|null} The stem or null if no participle was matched.
 */
const detectAndStemParticiplesWithoutPrefixes = function( morphologyDataNL, word ) {
	// The first participle class holds the regex part used here; it is anchored to the start of the word.
	const geStemTParticipleRegex = new RegExp( "^" + morphologyDataNL.pastParticipleStemmer.participleStemmingClasses[ 0 ].regex );

	// Check if it's a ge + stem + t/d participle.
	if ( geStemTParticipleRegex.test( word ) ) {
		// Check if the ge- is actually part of the stem. If yes, stem only the suffix.
		const exception = checkAndStemIfExceptionWithoutGePrefix( morphologyDataNL.pastParticipleStemmer.doNotStemGe, word );
		if ( exception ) {
			return exception;
		}

		// Remove the prefix (the first two characters).
		let wordWithoutPrefix = word.slice( 2 );

		// Check if stem starts with ë. If yes, replace ë with e.
		if ( wordWithoutPrefix.startsWith( "ë" ) ) {
			wordWithoutPrefix = "e" + wordWithoutPrefix.slice( 1 );
		}

		// Check whether the suffix should be stemmed. If yes, remove it and return the stem.
		if ( shouldSuffixBeStemmed( wordWithoutPrefix, morphologyDataNL ) ) {
			return wordWithoutPrefix.slice( 0, -1 );
		}

		return wordWithoutPrefix;
	}

	return null;
};

/**
 * Determines whether a given participle pattern combined with prefixes from a given class (separable or inseparable)
 * applies to a given word and if so, returns the stem.
 *
 * @param {Object}   morphologyDataNL The Dutch morphology data.
 * @param {string}   word             The word (not stemmed) to check.
 * @param {boolean}  separable        Whether the prefix is separable or not.
 * @param {string[]} prefixes         The prefixes of a certain prefix class.
 * @param {string}   regexPart        The regex part for a given class (completed to a full regex within the function).
 *
 * @returns {string|null} The stem (with the matched prefix kept in front) or null if no prefixed participle was matched.
 */
const detectAndStemParticiplePerPrefixClass = function( morphologyDataNL, word, separable, prefixes, regexPart ) {
	for ( const currentPrefix of prefixes ) {
		const participleRegex = new RegExp( "^" + currentPrefix + regexPart );

		if ( participleRegex.test( word ) ) {
			// The negative start index keeps the last (word length - prefix length) characters, i.e. what follows the prefix.
			let wordWithoutPrefix = word.slice( currentPrefix.length - word.length );

			/*
			 * After removing a separable prefix, check whether the ge- belongs to the stem (e.g. the -ge- in opgebruikt).
			 * If it does, stem only the suffix.
			 */
			if ( separable ) {
				const exception = checkAndStemIfExceptionWithoutGePrefix(
					morphologyDataNL.pastParticipleStemmer.doNotStemGe,
					wordWithoutPrefix
				);
				if ( exception ) {
					return currentPrefix + exception;
				}

				// Otherwise drop the two characters that follow the separable prefix (the ge-).
				wordWithoutPrefix = wordWithoutPrefix.slice( 2 );
			}

			// Check whether stem starts with ë. If yes, replace ë with e.
			if ( wordWithoutPrefix.startsWith( "ë" ) ) {
				wordWithoutPrefix = "e" + wordWithoutPrefix.slice( 1 );
			}

			if ( shouldSuffixBeStemmed( wordWithoutPrefix, morphologyDataNL ) ) {
				return currentPrefix + wordWithoutPrefix.slice( 0, -1 );
			}

			return currentPrefix + wordWithoutPrefix;
		}
	}

	return null;
};

/**
 * Detects whether a word is a regular participle of a compound verb. A compound verb has a prefix in addition to, or instead of, ge-.
 * For example, afgemaakt has the separable prefix af-, and beantwoord has the inseparable prefix be-. If a participle
 * of a compound verb is detected, it is stemmed by removing the ge- (in case of a verb with a separable prefix) and the suffix -t or -d.
 *
 * @param {Object} morphologyDataNL The Dutch morphology data.
 * @param {string} word             The word (not stemmed) to check.
 *
 * @returns {string|null} The stem or null if no participle with prefix was matched.
 */
const detectAndStemParticiplesWithPrefixes = function( morphologyDataNL, word ) {
	/*
	 * It's important to preserve order here, since the ge + stem ending in -t regex is more specific than
	 * the stem + t regex, and therefore must be checked first.
	 */
	for ( const participleClass of morphologyDataNL.pastParticipleStemmer.participleStemmingClasses ) {
		const regex = participleClass.regex;
		const separable = participleClass.separable;

		// Pick the prefix list that belongs to this class.
		const prefixes = separable
			? morphologyDataNL.pastParticipleStemmer.compoundVerbsPrefixes.separable
			: morphologyDataNL.pastParticipleStemmer.compoundVerbsPrefixes.inseparable;

		const stem = detectAndStemParticiplePerPrefixClass( morphologyDataNL, word, separable, prefixes, regex );

		if ( stem ) {
			return stem;
		}
	}

	return null;
};

/**
 * Checks whether the word is on the list of participles that do not need to be stemmed, because the participle form
 * is the same as the stem.
 *
 * @param {string[]} dataParticiplesSameAsStem The list of exceptions whose stem is the same as the participle.
 * @param {string}   word                      The word to check.
 *
 * @returns {boolean} Whether the word is found on the exception list.
 */
const checkIfParticipleIsSameAsStem = function( dataParticiplesSameAsStem, word ) {
	return dataParticiplesSameAsStem.includes( word );
};

/**
 * Check whether the word is on an exception list of past participles with inseparable prefixes and ending in -end.
 * If not, stem the word that starts with an inseparable verb prefix and ends in -end as a present participle.
 *
 * @param {array}  inseparablePrefixes      The list of inseparable prefixes.
 * @param {array}  dataExceptionListToCheck The list of the exception words.
 * @param {array}  finalChangesRules        The array of regex-based rules to be applied to the stem.
 * @param {string} word                     The (unstemmed) word to check.
 *
 * @returns {null|string} The stemmed word, or null if the word does not start with an inseparable prefix,
 *                        does not end in -end, or was found on the exception list.
 */
const checkAndStemIfInseparablePrefixWithEndEnding = function( inseparablePrefixes, dataExceptionListToCheck, finalChangesRules, word ) {
	const startsWithInseparablePrefix = inseparablePrefixes.some( prefix => word.startsWith( prefix ) );

	if ( startsWithInseparablePrefix && word.endsWith( "end" ) && ! dataExceptionListToCheck.includes( word ) ) {
		// Remove -end and let the final-changes rules adjust the remaining stem.
		return modifyStem( word.slice( 0, -3 ), finalChangesRules );
	}

	return null;
};

/**
 * Detects whether a word is a regular participle and if so, returns the stem.
 *
 * @param {Object} morphologyDataNL The Dutch morphology data.
 * @param {string} word             The word (not stemmed) to check.
 *
 * @returns {string|null} The participle stem; an empty string if the word is excluded up front as a non-participle;
 *                        null if no regular participle was matched.
 */
export function detectAndStemRegularParticiple( morphologyDataNL, word ) {
	/*
	 * Check whether the word is not a participle. If it is not, return empty string.
	 * Note that this early exit yields "" rather than null; both are falsy.
	 */
	if ( word.endsWith( "heid" ) || word.endsWith( "teit" ) || word.endsWith( "tijd" ) || nonParticiples.includes( word ) ) {
		return "";
	}

	/*
	 * Check whether the word is on an exception list of verbs whose participle is the same as the stem. If the word is found
	 * on the list, return the stem.
	 */
	if ( checkIfParticipleIsSameAsStem( morphologyDataNL.pastParticipleStemmer.inseparableCompoundVerbsNotToBeStemmed, word ) ) {
		return word;
	}

	// Check and stem if the word is a participle without any separable or inseparable prefix
	let stem = detectAndStemParticiplesWithoutPrefixes( morphologyDataNL, word );

	if ( stem ) {
		return stem;
	}

	/*
	 * Check whether the word is on an exception list of inseparable compound verbs with a prefix that is usually separable.
	 * If it is, remove just the suffix and return the stem.
	 */
	stem = checkAndStemIfExceptionWithoutGePrefix( morphologyDataNL.pastParticipleStemmer.inseparableCompoundVerbs, word );

	if ( stem ) {
		return stem;
	}

	/*
	 * Check whether the word is on an exception list of past participles with inseparable prefixes and ending in -end.
	 * If not, stem the word that starts with an inseparable verb prefix and ends in -end as a present participle.
	 */
	stem = checkAndStemIfInseparablePrefixWithEndEnding(
		morphologyDataNL.pastParticipleStemmer.compoundVerbsPrefixes.inseparable,
		morphologyDataNL.pastParticipleStemmer.pastParticiplesEndingOnEnd,
		morphologyDataNL.regularStemmer.stemModifications.finalChanges,
		word
	);

	if ( stem ) {
		return stem;
	}

	// Check and stem if the word is a participle with a separable or inseparable prefix
	stem = detectAndStemParticiplesWithPrefixes( morphologyDataNL, word );

	if ( stem ) {
		return stem;
	}

	return null;
}