yoastseo-dep
Version:
Yoast clientside page analysis
165 lines (146 loc) • 6.62 kB
JavaScript
/**
* @file Dutch stemming algorithm. Adapted from:
* @author:
* @copyright
* All rights reserved.
* Implementation of the stemming algorithm from http://snowball.tartarus.org/algorithms/dutch/stemmer.html
* Copyright of the algorithm is: Copyright (c) 2001, Dr Martin Porter and can be found at http://snowball.tartarus.org/license.php
*
* Redistribution and use in source and binary forms, with or without modification, is covered by the standard BSD license.
*/
import { isVowelDoublingAllowed, modifyStem } from "./stemModificationHelpers";
/**
 * Determines the start index of the R1 region.
 *
 * R1 starts directly after the first vowel that is immediately followed by a non-vowel. When that position would be
 * lower than 3, it is moved to 3 so that at least three characters precede R1. Only lowercase vowels are part of the
 * vowel class, so any other character (including uppercase letters) counts as a non-vowel.
 *
 * @param {string} word The word for which to determine the R1 region.
 *
 * @returns {number} The start index of the R1 region, or -1 if the word has no vowel followed by a non-vowel.
 */
const determineR1 = function( word ) {
	const vowelThenNonVowel = /[aeiouyèäüëïöáéíóú][^aeiouyèäüëïöáéíóú]/;
	const clusterStart = word.search( vowelThenNonVowel );

	// No vowel + non-vowel cluster means there is no R1 region at all.
	if ( clusterStart === -1 ) {
		return -1;
	}

	// R1 begins after the two matched characters, but never before index 3.
	return Math.max( clusterStart + 2, 3 );
};
/**
 * Searches for a suffix in a word.
 *
 * The suffix classes of the step are checked in order. Within a class only the first regex that matches the word is
 * considered; the text of its last capture group is taken as the suffix, and the suffix position is the last
 * occurrence of that text in the word. A suffix is only accepted when it starts inside the R1 region; otherwise the
 * search continues with the next suffix class.
 *
 * @param {string} word       The word in which to look for suffixes.
 * @param {Object} suffixStep One of the three steps of finding suffixes. Every value is expected to have a
 *                            `suffixes` array of regexes and, optionally, a `stemModification`.
 * @param {number} r1Index    The start index of the R1 region, or -1 if the word has no R1 region.
 *
 * @returns {Object|undefined} The index of the suffix and information about whether, and how, the stem will need to
 *                             be modified. Undefined if no suffix was found inside the R1 region.
 */
const findSuffix = function( word, suffixStep, r1Index ) {
	// Without an R1 region no suffix can ever qualify, so skip all regex work.
	if ( r1Index === -1 ) {
		return;
	}

	for ( const suffixClass of Object.values( suffixStep ) ) {
		// Execute each regex once and stop at the first one that matches.
		let match = null;
		for ( const suffixRegex of suffixClass.suffixes ) {
			match = new RegExp( suffixRegex ).exec( word );
			if ( match ) {
				break;
			}
		}
		if ( ! match ) {
			continue;
		}

		const suffix = match[ match.length - 1 ];
		// A capture group that did not participate yields undefined; lastIndexOf would search for "undefined".
		if ( typeof suffix !== "string" ) {
			continue;
		}

		const suffixIndex = word.lastIndexOf( suffix );
		if ( suffixIndex >= r1Index ) {
			return {
				suffixIndex: suffixIndex,
				stemModification: suffixClass.stemModification,
			};
		}
	}
};
/**
 * Deletes the suffix and modifies the stem according to the required modification.
 *
 * For "hedenToHeid" the modification is applied to the complete word and the suffix is not cut off first. In every
 * other case the word is cut at the suffix index, after which at most one modification is applied to the stem:
 * "changeIedtoId", "changeInktoIng" (only if the stem ends in "ink"), or "vowelDoubling" (only if
 * isVowelDoublingAllowed approves). If no modification applies, the cut stem is returned unchanged.
 *
 * @param {string} word             The word from which to delete the suffix.
 * @param {Object} suffixStep       One of the three steps of deleting a suffix. Not used by this function.
 * @param {number} suffixIndex      The index of the found suffix.
 * @param {string} stemModification The type of stem modification that needs to be done.
 * @param {Object} morphologyDataNL The Dutch morphology data file.
 *
 * @returns {string} The stemmed and modified word.
 */
const deleteSuffixAndModifyStem = function( word, suffixStep, suffixIndex, stemModification, morphologyDataNL ) {
	// Looked up lazily: when no modification applies, the morphology data is never read.
	const getStemModifications = () => morphologyDataNL.regularStemmer.stemModifications;

	if ( stemModification === "hedenToHeid" ) {
		return modifyStem( word, getStemModifications().hedenToHeid );
	}

	const stem = word.substring( 0, suffixIndex );

	switch ( stemModification ) {
		case "changeIedtoId":
			return modifyStem( stem, getStemModifications().iedToId );

		case "changeInktoIng":
			if ( stem.endsWith( "ink" ) ) {
				return modifyStem( stem, getStemModifications().inkToIng );
			}
			break;

		case "vowelDoubling": {
			const doublingAllowed = isVowelDoublingAllowed(
				stem,
				getStemModifications().exceptionsStemModifications,
				morphologyDataNL.pastParticipleStemmer.compoundVerbsPrefixes
			);
			if ( doublingAllowed ) {
				return modifyStem( stem, getStemModifications().doubleVowel );
			}
			break;
		}

		default:
			break;
	}

	return stem;
};
/**
 * Finds the suffix of a particular step, deletes it, and modifies the stem.
 *
 * If findSuffix reports no suffix for this step, the word is returned as it was passed in.
 *
 * @param {string} word             The word for which to find and delete a suffix.
 * @param {Object} suffixStep       One of the three suffix steps.
 * @param {number} r1Index          The index of the R1 region.
 * @param {Object} morphologyDataNL The Dutch morphology data file.
 *
 * @returns {string} The word with the deleted suffix.
 */
const findAndDeleteSuffix = function( word, suffixStep, r1Index, morphologyDataNL ) {
	const suffixMatch = findSuffix( word, suffixStep, r1Index );

	// Nothing to strip in this step.
	if ( suffixMatch === undefined ) {
		return word;
	}

	const { suffixIndex, stemModification } = suffixMatch;
	return deleteSuffixAndModifyStem( word, suffixStep, suffixIndex, stemModification, morphologyDataNL );
};
/**
 * Runs the word through all suffix steps in order. Each step receives the result of the previous step, looks for a
 * valid suffix within the R1 region, deletes it, and applies the suffix-specific stem modification if needed.
 *
 * The same R1 index is used for every step; it is not recalculated after a suffix has been removed.
 *
 * @param {string} word             The word for which to find and delete suffixes.
 * @param {Object} suffixSteps      All of the suffix steps.
 * @param {number} r1Index          The index of the R1 region.
 * @param {Object} morphologyDataNL The Dutch morphology data file.
 *
 * @returns {string} The word with the deleted suffixes.
 */
const findAndDeleteSuffixes = function( word, suffixSteps, r1Index, morphologyDataNL ) {
	return Object.values( suffixSteps ).reduce(
		( stem, suffixStep ) => findAndDeleteSuffix( stem, suffixStep, r1Index, morphologyDataNL ),
		word
	);
};
/**
 * Searches for suffixes in a word, removes them if found, and modifies the stem if needed.
 *
 * The word first gets the "IAndYToUppercase" modification, then the R1 region is determined on that result, the
 * suffix steps are run, and finally the "finalChanges" modification is applied.
 *
 * @param {string} word             The word to stem.
 * @param {Object} morphologyDataNL The Dutch morphology data file.
 *
 * @returns {string} The stemmed word.
 */
export default function detectAndStemSuffixes( word, morphologyDataNL ) {
	const { stemModifications, suffixes } = morphologyDataNL.regularStemmer;

	/*
	 * Put i and y in between vowels, initial y, and y after a vowel into upper case. They should be treated as
	 * consonants, so they need to be distinguishable from other i's and y's when matching regexes.
	 */
	const preparedWord = modifyStem( word, stemModifications.IAndYToUppercase );

	// The R1 region is determined once, on the prepared word, and shared by all suffix steps.
	const r1Index = determineR1( preparedWord );

	// Run through the steps of possible de-suffixation.
	const stem = findAndDeleteSuffixes( preparedWord, suffixes, r1Index, morphologyDataNL );

	// Do final modifications to the stem.
	return modifyStem( stem, stemModifications.finalChanges );
}