/*
 * yoastseo-dep
 * Version:
 * Yoast clientside page analysis
 * 160 lines (141 loc) • 6.22 kB
 * JavaScript
 */
import { languageProcessing } from "yoastseo-dep";
const {
regexHelpers: { searchAndReplaceWithRegex, doesWordMatchRegex },
exceptionListHelpers: {
checkIfWordEndingIsOnExceptionList,
checkIfWordIsOnListThatCanHavePrefix,
},
} = languageProcessing;
import { isVowelDoublingAllowed } from "./stemModificationHelpers";
/**
 * Removes an -e/-en suffix from a word whose preceding -t/-d belongs to the stem, and afterwards applies the
 * vowel-doubling stem modification when that is allowed for the resulting stem (e.g. doden -> dod -> dood).
 *
 * @param {Object}   morphologyDataNL    The Dutch morphology data file.
 * @param {string[]} regexAndReplacement A pair: [ 0 ] is the regex the word has to match, [ 1 ] is the replacement string.
 * @param {string}   word                The word to be checked.
 *
 * @returns {?string} The created stem, or null when the word does not match the regex.
 */
const stemWordsWithEOrEnSuffix = function( morphologyDataNL, regexAndReplacement, word ) {
	const suffixPattern = regexAndReplacement[ 0 ];

	// Bail out early: words without the expected ending are not handled here.
	if ( ! doesWordMatchRegex( word, suffixPattern ) ) {
		return null;
	}

	const withoutSuffix = word.replace( new RegExp( suffixPattern ), regexAndReplacement[ 1 ] );
	const stemModifications = morphologyDataNL.regularStemmer.stemModifications;

	const mayDoubleVowel = isVowelDoublingAllowed(
		withoutSuffix,
		stemModifications.exceptionsStemModifications,
		morphologyDataNL.pastParticipleStemmer.compoundVerbsPrefixes
	);
	if ( ! mayDoubleVowel ) {
		return withoutSuffix;
	}

	// Keep the plain stem when the double-vowel replacement yields nothing usable (a falsy result).
	return searchAndReplaceWithRegex( withoutSuffix, stemModifications.doubleVowel ) || withoutSuffix;
};
/**
 * Produces the stem for words in which the -t/-d is known to be the final letter of the stem, so that only the
 * suffix following it is removed. The decision is made by consulting word lists and regexes from the morphology data.
 *
 * @param {string} word             The word to check.
 * @param {Object} morphologyDataNL The Dutch morphology data.
 *
 * @returns {?string} The stemmed word when one of the checks matched, otherwise null.
 */
const checkWhetherTOrDIsPartOfStem = function( word, morphologyDataNL ) {
	const stemEndingData = morphologyDataNL.ambiguousTAndDEndings.tOrDArePartOfStem;

	/*
	 * Step 1:
	 * - Words ending in -tte, -tten, -dde or -dden: keep the first -t/-d and stem the rest of the ending.
	 * - Example: "katten" (-ten is stemmed, leaving "kat").
	 */
	const stemOfDoubledConsonantForm = searchAndReplaceWithRegex( word, stemEndingData.firstTOrDPartOfStem );
	if ( stemOfDoubledConsonantForm ) {
		return stemOfDoubledConsonantForm;
	}

	/*
	 * Step 2a:
	 * - Verbal forms ending in long vowel + -fden/-sden that are on the exception list: stem -den.
	 * - Example: "hoefden" (-den is stemmed, leaving "hoef").
	 */
	if ( stemEndingData.verbsDenShouldBeStemmed.includes( word ) ) {
		return word.slice( 0, -3 );
	}

	/*
	 * Step 2b:
	 * - Words with suffix -en preceded by a -d that is part of the stem: stem only -en.
	 * - Example: "eenden" (-en is stemmed, leaving "eend").
	 * The three checks are evaluated left to right and stop at the first one that succeeds.
	 */
	const onlyEnIsSuffix =
		checkIfWordEndingIsOnExceptionList( word, stemEndingData.wordsStemOnlyEnEnding.endingMatch ) ||
		checkIfWordIsOnListThatCanHavePrefix(
			word,
			stemEndingData.wordsStemOnlyEnEnding.verbs,
			morphologyDataNL.pastParticipleStemmer.compoundVerbsPrefixes
		) ||
		doesWordMatchRegex( word, stemEndingData.denEnding );

	if ( onlyEnIsSuffix ) {
		const stemWithoutEn = word.slice( 0, -2 );
		const stemModifications = morphologyDataNL.regularStemmer.stemModifications;

		// After deleting -en the vowel may have to be doubled, unless the stem is exempt from that modification.
		const doublingAllowed = isVowelDoublingAllowed(
			stemWithoutEn,
			stemModifications.exceptionsStemModifications,
			morphologyDataNL.pastParticipleStemmer.compoundVerbsPrefixes
		);
		return doublingAllowed
			? ( searchAndReplaceWithRegex( stemWithoutEn, stemModifications.doubleVowel ) || stemWithoutEn )
			: stemWithoutEn;
	}

	/*
	 * Step 3:
	 * - Words ending in -de where the -d is part of the stem: stem only the -e.
	 * - Example: "beenharde" (-e is stemmed, leaving "beenhard").
	 */
	const stemOfDeForm = stemWordsWithEOrEnSuffix( morphologyDataNL, stemEndingData.deEnding, word );
	if ( stemOfDeForm ) {
		return stemOfDeForm;
	}

	/*
	 * Step 4:
	 * - Words ending in -te or -ten where the -t is part of the stem: stem only the -e/-en.
	 * - Example: "castraten" (-en is stemmed, leaving "castraat").
	 * Any falsy outcome is reported as null.
	 */
	return stemWordsWithEOrEnSuffix( morphologyDataNL, stemEndingData.teAndTenEndings, word ) || null;
};
/**
 * Determines the correct stem for words with one of the ambiguous endings -t, -te, -ten, -de, or -den.
 *
 * @param {Object} morphologyDataNL The Dutch morphology data.
 * @param {string} word             The word to be checked.
 *
 * @returns {?string} The stemmed word, or null when none of the checks applies.
 */
export function generateCorrectStemWithTAndDEnding( morphologyDataNL, word ) {
	const tAndDEndingsData = morphologyDataNL.ambiguousTAndDEndings;

	/*
	 * Step 1:
	 * - Words on the exception list of words whose final -t must be stemmed: remove the -t.
	 * - Example: "squasht".
	 * - This is an exception to one of the rules in step 2, so it has to be checked first.
	 */
	if ( checkIfWordEndingIsOnExceptionList( word, tAndDEndingsData.wordsTShouldBeStemmed ) ) {
		return word.slice( 0, -1 );
	}

	/*
	 * Step 2:
	 * - Words matched by the regex for a final -t that must not be stemmed: return the word unchanged.
	 * - Example: "boot".
	 */
	if ( doesWordMatchRegex( word, tAndDEndingsData.tOrDArePartOfStem.tEnding ) ) {
		return word;
	}

	/*
	 * Step 3:
	 * - Words with another suffix (e.g. -en) preceded by a -t or -d that is part of the stem: remove only that
	 *   suffix, so the returned stem still ends in -t/-d.
	 * - Example: "tijden" (only -en is removed, not -den).
	 * Any falsy outcome of the check is reported as null.
	 */
	return checkWhetherTOrDIsPartOfStem( word, morphologyDataNL ) || null;
}