UNPKG

yoastseo-dep

Version:

Yoast clientside page analysis

575 lines (525 loc) 22.9 kB
/* eslint-disable max-statements, complexity */
import { languageProcessing } from "yoastseo-dep";
const {
	exceptionListHelpers: { checkIfWordEndingIsOnExceptionList },
	regexHelpers: { applyAllReplacements },
} = languageProcessing;

/*
 * MIT License
 *
 * Copyright (c) 2017 Bastien Botella
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/*
 * Author: Kasun Gajasinghe, University of Moratuwa
 * E-Mail: kasunbg AT gmail DOT com
 * Date: 09.08.2010
 *
 * LICENSE:
 *
 * Copyright (c) 2010, Kasun Gajasinghe. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 *
 * THIS SOFTWARE IS PROVIDED BY KASUN GAJASINGHE ""AS IS"" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL KASUN GAJASINGHE BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/**
 * Determines R1, R2 and RV.
 *
 * @param {string} word           The word for which to determine the R regions.
 * @param {Object} rIntervalsData The regexes that cover French RV and R1 intervals
 *                                (read as `rvRegex1`, `rvRegex2`, `rvRegex3` and `r1Regex`).
 *
 * @returns {number[]} The start indices of R1, R2 and RV, in that order.
 */
const determineRs = function( word, rIntervalsData ) {
	let rvIndex;

	if ( word.search( new RegExp( rIntervalsData.rvRegex1 ) ) !== -1 || word.search( new RegExp( rIntervalsData.rvRegex2 ) ) !== -1 ) {
		rvIndex = 3;
	} else {
		rvIndex = word.substring( 1 ).search( new RegExp( rIntervalsData.rvRegex3 ) );
		if ( rvIndex === -1 ) {
			rvIndex = word.length;
		} else {
			// +2 is to supplement the substring(1) used to find rvIndex
			rvIndex += 2;
		}
	}

	/*
	 * R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel.
	 * R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel.
	 */
	const r1Regex = new RegExp( rIntervalsData.r1Regex );
	let r1Index = word.search( r1Regex );
	let r1 = "";
	if ( r1Index === -1 ) {
		r1Index = word.length;
	} else {
		r1Index += 2;
		r1 = word.substring( r1Index );
	}

	// R2 is located inside R1 using the same regex, then translated back to an index in the full word.
	let r2Index = r1.search( r1Regex );
	if ( r2Index === -1 ) {
		r2Index = word.length;
	} else {
		r2Index += 2;
		r2Index += r1Index;
	}

	// R1 never starts before index 3. This adjustment deliberately happens after R2 has been derived from the raw R1.
	if ( r1Index < 3 ) {
		r1Index = 3;
	}

	return [
		r1Index,
		r2Index,
		rvIndex,
	];
};

/**
 * Removes or normalizes standard suffixes.
 *
 * Only the first suffix group whose match lies in its required region is processed; the groups are tried in the
 * fixed order 1, 2, 3, 4, 5, 12, 6, 7, 8, 9, 10, 11, 13, 14, 15.
 *
 * @param {string} word               The word for which to remove suffixes.
 * @param {Object} standardSuffixData The French morphology data covering standard suffixes.
 * @param {number} r1Index            The start index of R1.
 * @param {number} r2Index            The start index of R2.
 * @param {number} rvIndex            The start index of RV.
 *
 * @returns {string} The word with standard suffixes removed or normalized.
 */
const processStandardSuffixes = function( word, standardSuffixData, r1Index, r2Index, rvIndex ) {
	// Position of each standard suffix group in the unmodified word (-1 when the group does not match).
	const a1Index = word.search( new RegExp( standardSuffixData.standardSuffixes1 ) );
	const a2Index = word.search( new RegExp( standardSuffixData.standardSuffixes2 ) );
	const a3Index = word.search( new RegExp( standardSuffixData.standardSuffixes3[ 0 ] ) );
	const a4Index = word.search( new RegExp( standardSuffixData.standardSuffixes4[ 0 ] ) );
	const a5Index = word.search( new RegExp( standardSuffixData.standardSuffixes5[ 0 ] ) );
	const a6Index = word.search( new RegExp( standardSuffixData.standardSuffixes6 ) );
	const a7Index = word.search( new RegExp( standardSuffixData.standardSuffixes7 ) );
	const a8Index = word.search( new RegExp( standardSuffixData.standardSuffixes8 ) );
	const a9Index = word.search( new RegExp( standardSuffixData.standardSuffixes9[ 0 ] ) );
	const a10Index = word.search( new RegExp( standardSuffixData.standardSuffixes10[ 0 ] ) );
	const a11Index = word.search( new RegExp( standardSuffixData.standardSuffixes11[ 0 ] ) );
	const a12Index = word.search( new RegExp( standardSuffixData.standardSuffixes12 ) );
	const a13Index = word.search( new RegExp( standardSuffixData.standardSuffixes13[ 0 ] ) );
	const a14Index = word.search( new RegExp( standardSuffixData.standardSuffixes14[ 0 ] ) );
	const a15Index = word.search( new RegExp( standardSuffixData.standardSuffixes15 ) );

	if ( a1Index !== -1 && a1Index >= r2Index ) {
		// Delete if in R2
		word = word.substring( 0, a1Index );
	} else if ( a2Index !== -1 && a2Index >= r2Index ) {
		word = word.substring( 0, a2Index );
		const a2Index2 = word.search( new RegExp( standardSuffixData.suffixesPrecedingChar1[ 0 ] ) );

		if ( a2Index2 !== -1 && a2Index2 >= r2Index ) {
			// If preceded by ic, delete if in R2
			word = word.substring( 0, a2Index2 );
		} else {
			// Else replace by iqU
			word = word.replace( new RegExp( standardSuffixData.suffixesPrecedingChar1[ 0 ] ), standardSuffixData.suffixesPrecedingChar1[ 1 ] );
		}
	} else if ( a3Index !== -1 && a3Index >= r2Index ) {
		// Replace with log if in R2
		word = word.slice( 0, a3Index ) + standardSuffixData.standardSuffixes3[ 1 ];
	} else if ( a4Index !== -1 && a4Index >= r2Index ) {
		// Replace with u if in R2
		word = word.slice( 0, a4Index ) + standardSuffixData.standardSuffixes4[ 1 ];
	} else if ( a5Index !== -1 && a5Index >= r2Index ) {
		// Replace with ent if in R2
		word = word.slice( 0, a5Index ) + standardSuffixData.standardSuffixes5[ 1 ];
	} else if ( a12Index !== -1 && a12Index >= r1Index ) {
		// +1- amendment to non-vowel
		word = word.substring( 0, a12Index + 1 );
	} else if ( a6Index !== -1 && a6Index >= rvIndex ) {
		word = word.substring( 0, a6Index );

		// All four look-ups below are done on the word right after the suffix was cut off.
		const precedingCharacter2 = word.search( new RegExp( standardSuffixData.suffixesPrecedingChar2[ 0 ] ) );
		const a6Index2 = word.search( new RegExp( standardSuffixData.suffixesPrecedingChar4[ 0 ] ) );
		const precedingCharacter5 = word.search( new RegExp( standardSuffixData.suffixesPrecedingChar5[ 0 ] ) );
		const precedingCharacter6 = word.search( new RegExp( standardSuffixData.suffixesPrecedingChar6[ 0 ] ) );

		if ( precedingCharacter2 >= r2Index ) {
			word = word.slice( 0, precedingCharacter2 ) + standardSuffixData.suffixesPrecedingChar2[ 1 ];

			const precedingCharacter3 = word.search( new RegExp( standardSuffixData.suffixesPrecedingChar3[ 0 ] ) );
			if ( precedingCharacter3 >= r2Index ) {
				word = word.slice( 0, precedingCharacter3 ) + standardSuffixData.suffixesPrecedingChar3[ 1 ];
			}
		} else if ( a6Index2 !== -1 ) {
			// The word is unchanged since a6Index2 was computed, so the stored index can be reused here.
			if ( a6Index2 >= r2Index ) {
				word = word.substring( 0, a6Index2 );
			} else if ( a6Index2 >= r1Index ) {
				word = word.substring( 0, a6Index2 ) + standardSuffixData.suffixesPrecedingChar4[ 1 ];
			}
		} else if ( precedingCharacter5 >= r2Index ) {
			// If preceded by abl or iqU, delete if in R2
			word = word.slice( 0, precedingCharacter5 ) + standardSuffixData.suffixesPrecedingChar5[ 1 ];
		} else if ( precedingCharacter6 >= rvIndex ) {
			// If preceded by ièr, replace with i if in RV
			word = word.slice( 0, precedingCharacter6 ) + standardSuffixData.suffixesPrecedingChar6[ 1 ];
		}
	} else if ( a7Index !== -1 && a7Index >= r2Index ) {
		// Delete if in R2
		word = word.substring( 0, a7Index );

		const a7Index2 = word.search( new RegExp( standardSuffixData.suffixesPrecedingChar7[ 0 ] ) );
		const a7Index3 = word.search( new RegExp( standardSuffixData.suffixesPrecedingChar1[ 0 ] ) );

		if ( a7Index2 !== -1 ) {
			// If preceded by abil, delete if in R2, else replace by abl
			if ( a7Index2 >= r2Index ) {
				word = word.substring( 0, a7Index2 );
			} else {
				word = word.substring( 0, a7Index2 ) + standardSuffixData.suffixesPrecedingChar7[ 1 ];
			}
		} else if ( a7Index3 !== -1 ) {
			if ( a7Index3 >= r2Index ) {
				// If preceded by ic, delete if in R2
				word = word.substring( 0, a7Index3 );
			} else {
				// Else replace by iqU
				word = word.substring( 0, a7Index3 ) + standardSuffixData.suffixesPrecedingChar1[ 1 ];
			}
		} else if ( word.search( new RegExp( standardSuffixData.suffixesPrecedingChar2[ 0 ] ) ) >= r2Index ) {
			word = word.replace( new RegExp( standardSuffixData.suffixesPrecedingChar2[ 0 ] ), standardSuffixData.suffixesPrecedingChar2[ 1 ] );
		}
	} else if ( a8Index !== -1 && a8Index >= r2Index ) {
		word = word.substring( 0, a8Index );

		if ( word.search( new RegExp( standardSuffixData.suffixesPrecedingChar3[ 0 ] ) ) >= r2Index ) {
			word = word.replace( new RegExp( standardSuffixData.suffixesPrecedingChar3[ 0 ] ), standardSuffixData.suffixesPrecedingChar3[ 1 ] );

			if ( word.search( new RegExp( standardSuffixData.suffixesPrecedingChar1[ 0 ] ) ) >= r2Index ) {
				word = word.replace( new RegExp( standardSuffixData.suffixesPrecedingChar1[ 0 ] ), "" );
			} else {
				word = word.replace( new RegExp( standardSuffixData.suffixesPrecedingChar1[ 0 ] ), standardSuffixData.suffixesPrecedingChar1[ 1 ] );
			}
		}
	} else if ( a9Index !== -1 ) {
		// Replaced regardless of the region in which the suffix is found.
		word = word.replace( new RegExp( standardSuffixData.standardSuffixes9[ 0 ] ), standardSuffixData.standardSuffixes9[ 1 ] );
	} else if ( a10Index >= r1Index ) {
		word = word.replace( new RegExp( standardSuffixData.standardSuffixes10[ 0 ] ), standardSuffixData.standardSuffixes10[ 1 ] );
	} else if ( a11Index !== -1 ) {
		// Delete if in R2, otherwise replace if in R1. The word is still unmodified here, so a11Index is reused.
		if ( a11Index >= r2Index ) {
			word = word.substring( 0, a11Index );
		} else if ( a11Index >= r1Index ) {
			word = word.substring( 0, a11Index ) + standardSuffixData.standardSuffixes11[ 1 ];
		}
	} else if ( a13Index !== -1 && a13Index >= rvIndex ) {
		word = word.replace( new RegExp( standardSuffixData.standardSuffixes13[ 0 ] ), standardSuffixData.standardSuffixes13[ 1 ] );
	} else if ( a14Index !== -1 && a14Index >= rvIndex ) {
		word = word.replace( new RegExp( standardSuffixData.standardSuffixes14[ 0 ] ), standardSuffixData.standardSuffixes14[ 1 ] );
	} else if ( a15Index !== -1 && a15Index >= rvIndex ) {
		// Keep the first character of the match and drop the rest.
		word = word.substring( 0, a15Index + 1 );
	}

	return word;
};

/**
 * Removes verb suffixes starting with i.
 *
 * The step is only performed when the word was not changed by the previous steps, or when the original word ends
 * in one of the listed exceptions.
 *
 * @param {string} word                       The word for which to remove suffixes.
 * @param {string} originalWord               The unprocessed word.
 * @param {number} rvIndex                    The start index of RV.
 * @param {Object} verbSuffixesWithIBeginning Data for checking French suffixes starting with I
 *                                            (read as `exceptions` and the `suffixes` regex/replacement pair).
 *
 * @returns {{step2aDone: boolean, word: string}} The word and information about whether the conditions for step 2a were met.
 */
const removeVerbSuffixesStartingWithI = function( word, originalWord, rvIndex, verbSuffixesWithIBeginning ) {
	let step2aDone = false;

	if ( originalWord === word.toLowerCase() || checkIfWordEndingIsOnExceptionList( originalWord, verbSuffixesWithIBeginning.exceptions ) ) {
		step2aDone = true;

		const b1Regex = new RegExp( verbSuffixesWithIBeginning.suffixes[ 0 ] );
		// Only replace the suffix when it is found in RV.
		if ( word.search( b1Regex ) >= rvIndex ) {
			word = word.replace( b1Regex, verbSuffixesWithIBeginning.suffixes[ 1 ] );
		}
	}

	return { word, step2aDone };
};

/**
 * Removes other verb suffixes.
 *
 * Nothing is removed unless step 2a was entered and left the word unchanged.
 *
 * @param {string}  word           The word for which to remove suffixes.
 * @param {boolean} step2aDone     Whether step 2a was done.
 * @param {string}  wordAfterStep1 The word after step 1 was done.
 * @param {number}  r2Index        The start index of R2.
 * @param {number}  rvIndex        The start index of RV.
 * @param {Object}  morphologyData The French morphology data.
 *
 * @returns {string} The word after other verb suffixes were removed.
 */
const removeOtherVerbSuffixes = function( word, step2aDone, wordAfterStep1, r2Index, rvIndex, morphologyData ) {
	const otherVerbSuffixes = morphologyData.regularStemmer.otherVerbSuffixes;

	if ( step2aDone && wordAfterStep1 === word ) {
		// The first entry is checked against R2; all remaining entries are checked against RV.
		const suffixIons = new RegExp( otherVerbSuffixes[ 0 ] );
		if ( word.search( suffixIons ) >= r2Index ) {
			return word.replace( suffixIons, "" );
		}

		for ( let i = 1; i < otherVerbSuffixes.length; i++ ) {
			const regex = new RegExp( otherVerbSuffixes[ i ] );
			if ( word.search( regex ) >= rvIndex ) {
				return word.replace( regex, "" );
			}
		}

		// Check if a word ends in "ons" preceded by "i", if it is "ons" is not stemmed.
		if ( word.endsWith( "ions" ) ) {
			return word;
		}

		// Check if a word ends in "ons" preceded by other than "i" and stem it if it is in RV.
		const verbSuffixOns = new RegExp( morphologyData.regularStemmer.verbSuffixOns );
		if ( word.search( verbSuffixOns ) >= rvIndex ) {
			word = word.replace( verbSuffixOns, "" );
		}
	}

	return word;
};

/**
 * Removes residual suffixes.
 *
 * @param {string} word                         The word for which to remove residual suffixes.
 * @param {number} rvIndex                      The start index of RV.
 * @param {number} r2Index                      The start index of R2.
 * @param {Object} morphologyDataRegularStemmer The French morphology data for the regular stemmer
 *                                              (only its `residualSuffixes` entry is read).
 *
 * @returns {string} The word after residual suffixes were removed.
 */
const removeResidualSuffixes = function( word, rvIndex, r2Index, morphologyDataRegularStemmer ) {
	const residualSuffixes = morphologyDataRegularStemmer.residualSuffixes;

	if ( word.search( new RegExp( residualSuffixes.residualSuffixes1[ 0 ] ) ) >= rvIndex ) {
		word = word.replace( new RegExp( residualSuffixes.residualSuffixes1[ 0 ] ), residualSuffixes.residualSuffixes1[ 1 ] );
	}

	const e1Index = word.search( new RegExp( residualSuffixes.residualSuffix2 ) );

	if ( e1Index >= r2Index && word.search( new RegExp( residualSuffixes.residualSuffix3 ) ) >= rvIndex ) {
		word = word.substring( 0, e1Index );
	} else {
		// The remaining suffixes are tried one after another; only the first one found in RV is processed.
		let e2Index = word.search( new RegExp( residualSuffixes.residualSuffixes4[ 0 ] ) );

		if ( e2Index >= rvIndex ) {
			word = word.substring( 0, e2Index ) + residualSuffixes.residualSuffixes4[ 1 ];
		} else {
			e2Index = word.search( new RegExp( residualSuffixes.residualSuffix5 ) );

			if ( e2Index >= rvIndex ) {
				// Delete last e.
				word = word.substring( 0, e2Index );
			} else {
				e2Index = word.search( new RegExp( residualSuffixes.residualSuffix6[ 0 ] ) );

				if ( e2Index >= rvIndex ) {
					word = word.substring( 0, e2Index ) + residualSuffixes.residualSuffix6[ 1 ];
				}
			}
		}
	}

	return word;
};

/**
 * Checks whether a word is in the full-form exception list and if so returns the canonical stem.
 *
 * @param {string} word       The word to be checked.
 * @param {Object} exceptions The list of full-form exceptions to be checked in. Each entry holds the stem at
 *                            index 0 and the collection of full forms at index 1.
 *
 * @returns {null|string} The canonical stem or null if nothing was found.
 */
const checkWordInFullFormExceptions = function( word, exceptions ) {
	for ( const paradigm of exceptions ) {
		if ( paradigm[ 1 ].includes( word ) ) {
			return paradigm[ 0 ];
		}
	}

	return null;
};

/**
 * Check whether the stem is on the exception list of stems that belong to one word. If it is, returns the canonical stem.
 *
 * @param {string} stemmedWord              The stemmed word.
 * @param {Object} stemsThatBelongToOneWord The list of stems that belong to one word
 *                                          (read as `adjectives` and `verbs`).
 *
 * @returns {null|string} The canonical stem if word was found on the list, null otherwise.
 */
const canonicalizeStem = function( stemmedWord, stemsThatBelongToOneWord ) {
	// Check the adjectives list.
	for ( const paradigm of stemsThatBelongToOneWord.adjectives ) {
		if ( paradigm.includes( stemmedWord ) ) {
			return paradigm[ 0 ];
		}
	}

	// Check the verbs list. The infinitive stem is always the canonical stem for verbs.
	for ( const paradigm of stemsThatBelongToOneWord.verbs ) {
		if ( paradigm.includes( stemmedWord ) ) {
			return paradigm[ 0 ];
		}
	}

	// Explicit null so the return value matches the documented contract.
	return null;
};

/**
 * Checks whether the word is on the list of words which should be stemmed, even though the suffix would not be found in the
 * required region. If the word is found on the list, the stem specified in that list is returned.
 *
 * @param {string} word               The word to check.
 * @param {Object} shortWordsAndStems The list to check
 *                                    (read as `cannotTakeExtraSuffixS` and `canTakeExtraSuffixS`).
 *
 * @returns {null|string} The stem or null if the word was not found on the list.
 */
const checkShortWordsExceptionList = function( word, shortWordsAndStems ) {
	// First check whether the word is on the sub-list of words that cannot take an extra -s suffix.
	for ( const wordStemPair of shortWordsAndStems.cannotTakeExtraSuffixS ) {
		if ( wordStemPair[ 0 ] === word ) {
			return wordStemPair[ 1 ];
		}
	}

	// If the word was not found on the first sub-list, check the second sub-list of words that can take an extra -s suffix.
	const wordWithoutS = word.endsWith( "s" ) ? word.slice( 0, -1 ) : word;

	for ( const wordStemPair of shortWordsAndStems.canTakeExtraSuffixS ) {
		if ( wordStemPair[ 0 ] === wordWithoutS ) {
			return wordStemPair[ 1 ];
		}
	}

	// Explicit null so the return value matches the documented contract.
	return null;
};

/**
 * Stems French words.
 *
 * The word is lowercased and checked against several exception lists first. Only when none of them applies, the
 * regular stemming steps are run and the result is looked up in the list of stems that belong to one word.
 *
 * @param {string} word           The word to stem.
 * @param {Object} morphologyData The French morphology data.
 *
 * @returns {string} The stemmed word.
 */
export default function stem( word, morphologyData ) {
	word = word.toLowerCase();
	const originalWord = word;

	// Check if the word is on an exception list of words that should be stemmed even though the suffix is not in the required region.
	const wordAfterShortWordsCheck = checkShortWordsExceptionList( word, morphologyData.shortWordsAndStems );
	if ( wordAfterShortWordsCheck ) {
		return wordAfterShortWordsCheck;
	}

	// Check if the word is on an exception list for which all forms of a word and its stem are listed.
	const ifException = checkWordInFullFormExceptions( word, morphologyData.exceptionStemsWithFullForms );
	if ( ifException ) {
		return ifException;
	}

	// Check the exception list of words with plural suffix -x.
	if ( word.endsWith( "x" ) ) {
		const pluralsWithXSuffix = morphologyData.pluralsWithXSuffix;
		if ( pluralsWithXSuffix.includes( word ) ) {
			return word.slice( 0, -1 );
		}
	}

	// Check if the word is on the exception list of words for which -s should not be stemmed.
	if ( word.endsWith( "s" ) ) {
		const sShouldNotBeStemmed = morphologyData.sShouldNotBeStemmed;
		if ( sShouldNotBeStemmed.includes( word ) ) {
			return word;
		}
	}

	// Check if the word is on the exception list of words for which -ent should not be stemmed.
	const nonVerbsOnEnt = morphologyData.nonVerbsOnEnt;
	if ( word.endsWith( "ent" ) ) {
		if ( nonVerbsOnEnt.includes( word ) ) {
			return word;
		}
	}
	// For the form with an extra -s, only that -s is removed.
	if ( word.endsWith( "ents" ) ) {
		if ( nonVerbsOnEnt.includes( word.slice( 0, -1 ) ) ) {
			return word.slice( 0, -1 );
		}
	}

	// Check if word is on the exception list of nouns and adjectives for which the verb suffix -ons should not be stemmed.
	const nonVerbsOnOns = morphologyData.nonVerbsOnOns;
	if ( word.endsWith( "ons" ) ) {
		if ( nonVerbsOnOns.includes( word ) ) {
			return word.slice( 0, -1 );
		}
	}

	// Pre-processing steps
	word = applyAllReplacements( word, morphologyData.regularStemmer.preProcessingStepsRegexes );

	// Determine R1, R2 & RV regions.
	const [
		r1Index,
		r2Index,
		rvIndex,
	] = determineRs( word, morphologyData.regularStemmer.rIntervals );

	/*
	 * Step 1:
	 * Remove standard suffixes
	 */
	word = processStandardSuffixes( word, morphologyData.regularStemmer.standardSuffixes, r1Index, r2Index, rvIndex );
	const wordAfterStep1 = word;

	/*
	 * Step 2a:
	 * Stem verb suffixes beginning with "i"
	 */
	const verbSuffixesStartingWithIRemoved = removeVerbSuffixesStartingWithI(
		word,
		originalWord,
		rvIndex,
		morphologyData.regularStemmer.verbSuffixesWithIBeginning
	);
	word = verbSuffixesStartingWithIRemoved.word;
	const step2aDone = verbSuffixesStartingWithIRemoved.step2aDone;

	/*
	 * Step 2b:
	 * Stem other verb suffixes
	 */
	if ( ! nonVerbsOnEnt.includes( word ) ) {
		word = removeOtherVerbSuffixes( word, step2aDone, wordAfterStep1, r2Index, rvIndex, morphologyData );
	}

	if ( originalWord === word.toLowerCase() ) {
		/*
		 * Step 4:
		 * Stem residual suffixes.
		 */
		word = removeResidualSuffixes( word, rvIndex, r2Index, morphologyData.regularStemmer );
	} else {
		/*
		 * Step 3 (only needs to be executed if step 4 isn't executed)
		 * Replace final Y with i or final ç with c.
		 */
		const yEnding = morphologyData.regularStemmer.yAndSoftCEndingAndReplacement.yEndingAndReplacement;
		const softCEnding = morphologyData.regularStemmer.yAndSoftCEndingAndReplacement.softCEndingAndReplacement;
		if ( word.endsWith( yEnding[ 0 ] ) ) {
			word = word.slice( 0, -1 ) + yEnding[ 1 ];
		} else if ( word.endsWith( softCEnding[ 0 ] ) ) {
			word = word.slice( 0, -1 ) + softCEnding[ 1 ];
		}
	}

	/*
	 * Step 5:
	 * Undouble final consonants
	 */
	word = applyAllReplacements( word, morphologyData.regularStemmer.finalConsonantUndoubling );

	/*
	 * Step 6:
	 * Un-accent
	 */
	const unaccentE = morphologyData.regularStemmer.unaccentERegex;
	word = word.replace( new RegExp( unaccentE[ 0 ] ), unaccentE[ 1 ] );
	word = word.toLowerCase();

	// Map the stem onto its canonical form when it belongs to a word with several stems.
	const canonicalStem = canonicalizeStem( word, morphologyData.stemsThatBelongToOneWord );
	if ( canonicalStem ) {
		return canonicalStem;
	}

	return word;
}