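/*
 * Page header captured from the UNPKG file viewer (not part of the module itself):
 *
 * Package: yoastseo-dep
 * Version:
 * Description: Yoast clientside page analysis
 * Size: 410 lines (366 loc) 18.8 kB
 */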
import { languageProcessing } from "yoastseo-dep";
import { calculateTotalNumberOfSyllables, removeEnding, checkBeginningsList } from "./stemHelpers";

const { flattenSortLength, buildFormRule, createRulesFromArrays } = languageProcessing;

/**
 * MIT License
 *
 * Adapted from: Copyright (c) 2013 Adinda Praditya
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of this software
 * and associated documentation files (the \"Software\"), to deal in the Software without restriction,
 * including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE."
 */

/**
 * Checks if a single syllable word has one of the suffixes/suffix combinations.
 *
 * Each entry of the suffixes array is passed to `String.prototype.match`, so string entries are interpreted as
 * regular expression patterns.
 *
 * @param {string} word          The word to check.
 * @param {Array}  suffixesArray The array of suffixes (regex patterns).
 *
 * @returns {boolean} Whether the word matches one of the suffixes or not.
 */
const checkSingleSyllableWordSuffix = function( word, suffixesArray ) {
	// Always return a real boolean, also when no suffix matches.
	return suffixesArray.some( suffix => Boolean( word.match( suffix ) ) );
};

/**
 * Stems the prefix of the single syllable words, i.e. di-/penge-/menge-
 *
 * @param {string} word           The word to check.
 * @param {Object} morphologyData The Indonesian morphology data file.
 *
 * @returns {string} The word without the prefix, or the unchanged word if no prefix was stemmed.
 */
const stemSingleSyllableWordsPrefixes = function( word, morphologyData ) {
	const singleSyllableWords = morphologyData.stemming.singleSyllableWords;

	// If the word gets prefix di-, stem the prefix here. E.g. dicekkanlah -> cekkanlah, dibomi -> bomi
	if ( word.startsWith( "di" ) && checkBeginningsList( word, 2, singleSyllableWords ) ) {
		return word.slice( 2 );
	}

	/*
	 * If the word gets prefix menge-/penge- and is followed by one of the words in the list, stem the prefix here.
	 * E.g. pengeboman -> boman
	 */
	if ( /^[mp]enge/i.test( word ) && checkBeginningsList( word, 5, singleSyllableWords ) ) {
		return word.slice( 5 );
	}

	return word;
};

/**
 * Stems Indonesian single syllable words. This function concerns single syllable words
 * with this possible word format [di/penge/menge] + single syllable word + [kan/an/i] + [ku/mu/nya] + [kah/lah/pun],
 * with [] being optional.
 * E.g. dipel -> pel, dipelkan -> pel, dipelkanlah -> pel, pelkan -> pel, pelmulah -> pel, pengeboman -> bom, mengesahkan -> sah
 *
 * @param {string} word           The word to check.
 * @param {Object} morphologyData The Indonesian morphology data file.
 *
 * @returns {string} The stemmed word, or the unchanged input word if the result is not a single syllable word.
 */
const stemSingleSyllableWords = function( word, morphologyData ) {
	const singleSyllableWords = morphologyData.stemming.singleSyllableWords;
	const suffixCombination = morphologyData.stemming.singleSyllableWordsSuffixes;
	const inputWord = word;

	// If the word starts with prefix di-/penge-/menge-, stem the prefix here. E.g. pengeboman -> boman, dipelkan -> pelkan
	word = stemSingleSyllableWordsPrefixes( word, morphologyData );

	// Check if a word starts with one of the words in the list, has maximum 3 syllables, and ends in one of the single syllable suffixes
	if ( singleSyllableWords.some( shortWord => word.startsWith( shortWord ) ) &&
		calculateTotalNumberOfSyllables( word ) <= 3 &&
		checkSingleSyllableWordSuffix( word, suffixCombination ) ) {
		const regexRules = morphologyData.stemming.regexRules;
		const doNotStemWords = morphologyData.stemming.doNotStemWords;

		// If the word gets a particle suffix, stem the particle. E.g. cekkanlah -> cekkan, pelkan -> pel, vasmulah -> vasmu
		word = removeEnding( word, regexRules.removeParticle, doNotStemWords.doNotStemParticle, morphologyData );

		// If the word gets a possessive pronoun suffix, stem the possessive pronoun. E.g. vasmu -> vas
		word = removeEnding( word, regexRules.removePronoun, doNotStemWords.doNotStemPronounSuffix, morphologyData );

		// If the word ends in -kan/-an/-i suffix, stem the suffix. E.g. cekkan -> cek, bomi -> bom
		const wordWithoutDerivationalSuffix = removeEnding(
			word,
			regexRules.removeSuffixes,
			doNotStemWords.doNotStemSuffix,
			morphologyData
		);

		// Only accept the suffix removal if what is left is a known single syllable word.
		if ( singleSyllableWords.includes( wordWithoutDerivationalSuffix ) ) {
			word = wordWithoutDerivationalSuffix;
		}
	}

	/*
	 * We only want to stem single syllable words here.
	 * Thus, if the output word has more than one syllable (or is a single character), we don't stem the input word at all.
	 */
	if ( calculateTotalNumberOfSyllables( word ) > 1 || word.length === 1 ) {
		word = inputWord;
	}

	return word;
};

/**
 * Tries stemming prefix ter-, optionally preceded by prefix ke- (keter-).
 * Ke- is removed when it is directly followed by ter-. Ter- is removed only if it is a prefix and not part of the stem.
 * Also, if the stem of the word begins with r-, only te- is stemmed, not ter-.
 *
 * @param {Object} morphologyData The Indonesian morphology data file.
 * @param {string} word           The word to check.
 *
 * @returns {string|null} The stem or null if the word did not start with ter/keter.
 */
const tryStemmingKeAndTer = function( morphologyData, word ) {
	const terException = morphologyData.stemming.doNotStemWords.doNotStemPrefix.doNotStemFirstOrderPrefix.doNotStemTer;

	// If prefix ter- is preceded by prefix ke-, remove ke- first.
	if ( word.startsWith( "keter" ) ) {
		word = word.slice( 2 );
	}

	if ( ! word.startsWith( "ter" ) ) {
		return null;
	}

	// If word is on an exception list of words where ter- should not be stemmed, do not stem ter- and return the word.
	if ( terException.some( wordWithTer => word.startsWith( wordWithTer ) ) ) {
		return word;
	}

	// If word (without prefixes) is on the list of words beginning with r-, remove only te- instead of ter-.
	if ( checkBeginningsList( word, 3, morphologyData.stemming.beginningModification.rBeginning ) ) {
		return word.replace( /^ter/i, "r" );
	}

	// Otherwise, remove ter-.
	return word.slice( 3 );
};

/**
 * Checks whether a word has a first order prefix and whether it is on an exception list of words which require a stem modification
 * after removing the prefix. Returns the stem if the prefix was found and the word was matched on an exception list.
 *
 * @param {string} word           The word to check.
 * @param {Object} morphologyData The Indonesian morphology data file.
 *
 * @returns {string|null} The stem or null if a prefix was not found, or was found but the word was not on the exception list.
 */
const checkFirstOrderPrefixExceptions = function( word, morphologyData ) {
	const beginningModification = morphologyData.stemming.beginningModification;

	// If a word starts with "men" or "pen" and is present in the nBeginning exception list, the prefix should be replaced with "n".
	if ( /^[mp]en/i.test( word ) && checkBeginningsList( word, 3, beginningModification.nBeginning ) ) {
		return word.replace( /^[mp]en/i, "n" );
	}

	// If a word starts with "meng" or "peng" and is present in the kBeginning exception list, the prefix should be replaced with "k".
	if ( /^[mp]eng/i.test( word ) && checkBeginningsList( word, 4, beginningModification.kBeginning ) ) {
		return word.replace( /^[mp]eng/i, "k" );
	}

	// If a word starts with "mem" or "pem", the prefix is replaced with "p" or "m", depending on the exception list it is found in.
	if ( /^[mp]em/i.test( word ) ) {
		if ( checkBeginningsList( word, 3, beginningModification.pBeginning ) ) {
			return word.replace( /^(mem|pem)/i, "p" );
		}
		if ( checkBeginningsList( word, 3, beginningModification.mBeginning ) ) {
			return word.replace( /^(mem|pem)/i, "m" );
		}
	}

	// Stem keter-/ter- if found, unless the word was found on the exception list of words with stem beginning in ter-.
	return tryStemmingKeAndTer( morphologyData, word ) || null;
};

/**
 * Stems the first-order prefix of a word based on regexRules. If the word is found in an exception list, implements a stem modification.
 *
 * @param {string} word           The word to stem.
 * @param {Object} morphologyData The object that contains regex-based rules and exception lists for Indonesian stemming.
 *
 * @returns {string} The stemmed word.
 */
const removeFirstOrderPrefix = function( word, morphologyData ) {
	// Checks whether the word has a first order prefix and requires a stem modification.
	const firstOrderPrefixException = checkFirstOrderPrefixExceptions( word, morphologyData );

	if ( firstOrderPrefixException ) {
		return firstOrderPrefixException;
	}

	const regex = createRulesFromArrays( morphologyData.stemming.regexRules.removeFirstOrderPrefixes );
	const withRemovedFirstOrderPrefix = buildFormRule( word, regex );

	// Fall back to the unchanged word when no rule produced a result.
	return withRemovedFirstOrderPrefix || word;
};

/**
 * Stems the second-order prefix of a word based on regexRules. If the word is found in an exception list, implements a stem modification.
 *
 * @param {string} word           The word to stem.
 * @param {Object} morphologyData The object that contains regex-based rules and exception lists for Indonesian stemming.
 *
 * @returns {string} The stemmed word.
 */
const removeSecondOrderPrefix = function( word, morphologyData ) {
	const beginningModification = morphologyData.stemming.beginningModification;

	// If a word starts with "ber" or "per" and is present in the rBeginning exception list, the prefix should be replaced with "r".
	if ( ( word.startsWith( "ber" ) || word.startsWith( "per" ) ) &&
		checkBeginningsList( word, 3, beginningModification.rBeginning ) ) {
		return word.replace( /^(ber|per)/i, "r" );
	}

	// If a word starts with a first order prefix followed by peng- and is in the kBeginning exception list, then peng- should be replaced with k.
	if ( /^peng/i.test( word ) && checkBeginningsList( word, 4, beginningModification.kBeginning ) ) {
		return word.replace( /^peng/i, "k" );
	}

	const regex = createRulesFromArrays( morphologyData.stemming.regexRules.removeSecondOrderPrefixes );
	const withRemovedSecondOrderPrefix = buildFormRule( word, regex );

	// Fall back to the unchanged word when no rule produced a result.
	return withRemovedSecondOrderPrefix || word;
};

/**
 * Stems derivational affixes of Indonesian words.
 *
 * @param {string} word           The word to stem.
 * @param {Object} morphologyData The object that contains regex-based rules and exception lists for Indonesian stemming.
 *
 * @returns {string} The stemmed word.
 */
const stemDerivational = function( word, morphologyData ) {
	let wordLength = word.length;
	const removeSuffixRules = morphologyData.stemming.regexRules.removeSuffixes;
	const removeSuffixExceptions = morphologyData.stemming.doNotStemWords.doNotStemSuffix;
	const doNotStemFirstOrderPrefix = flattenSortLength( morphologyData.stemming.doNotStemWords.doNotStemPrefix.doNotStemFirstOrderPrefix );
	const doNotStemSecondOrderPrefix = flattenSortLength( morphologyData.stemming.doNotStemWords.doNotStemPrefix.doNotStemSecondOrderPrefix );

	// If a word is in the list of words with a beginning that looks like a valid prefix, do not stem the prefix.
	if ( ! doNotStemFirstOrderPrefix.some( wordWithPrefixLookAlike => word.startsWith( wordWithPrefixLookAlike ) ) ) {
		/*
		 * If the word has more than 2 syllables and starts with one of first order prefixes (i.e. meng-, meny-, men-, mem-, me-,
		 * peng-, peny-, pen-, pem-, di-, ter-, ke- ), the prefix will be stemmed here. e.g. penyebaran -> sebaran, diperlebarkan -> perlebarkan
		 */
		word = removeFirstOrderPrefix( word, morphologyData );
	}

	// An unchanged length means that no first order prefix was removed.
	if ( wordLength === word.length ) {
		if ( ! doNotStemSecondOrderPrefix.some( wordWithPrefixLookAlike => word.startsWith( wordWithPrefixLookAlike ) ) ) {
			/*
			 * If the word does not start with one of the first order prefixes but starts with one of the second order prefixes,
			 * the prefix will be stemmed here, e.g., peranakan -> anakan
			 */
			word = removeSecondOrderPrefix( word, morphologyData );
		}
		// If the word has more than 2 syllables and ends in either -kan, -an, or -i suffixes, the suffix will be deleted here, e.g., anakan -> anak
		if ( calculateTotalNumberOfSyllables( word ) > 2 ) {
			word = removeEnding( word, removeSuffixRules, removeSuffixExceptions, morphologyData );
		}
	} else {
		// If the word previously had a first order prefix, assign wordLength to the length of the word after prefix deletion.
		wordLength = word.length;

		/*
		 * If the word after first order prefix deletion has more than 2 syllables and ends in either -kan, -an, or -i suffixes,
		 * the suffix will be stemmed here. e.g. penyebaran -> sebar.
		 */
		if ( calculateTotalNumberOfSyllables( word ) > 2 ) {
			word = removeEnding( word, removeSuffixRules, removeSuffixExceptions, morphologyData );
		}

		/*
		 * If the word previously had a suffix, we check further if the word after first order prefix and suffix deletion has more than 2 syllables.
		 * If it does have more than 2 syllables and starts with one of the second order prefixes (i.e. ber-, be-, per-, pe-), the prefix will
		 * be stemmed here unless the word is in the exception list of words with a beginning that looks like a second order prefix.
		 */
		if ( wordLength !== word.length && ! doNotStemSecondOrderPrefix.includes( word ) ) {
			if ( calculateTotalNumberOfSyllables( word ) > 2 ) {
				word = removeSecondOrderPrefix( word, morphologyData );
			}
		}
	}

	return word;
};

/**
 * Stems Indonesian singular words.
 *
 * @param {string} word           The singular word to stem.
 * @param {Object} morphologyData The object that contains regex-based rules and exception lists for Indonesian stemming.
 *
 * @returns {string} The stem of an Indonesian singular word.
 */
const stemSingular = function( word, morphologyData ) {
	// Stem the single syllable words.
	word = stemSingleSyllableWords( word, morphologyData );

	// Words with at most 2 syllables do not receive any further stemming.
	if ( calculateTotalNumberOfSyllables( word ) <= 2 ) {
		return word;
	}

	const doNotStemParticle = morphologyData.stemming.doNotStemWords.doNotStemParticle;
	const doNotStemPronoun = morphologyData.stemming.doNotStemWords.doNotStemPronounSuffix;

	// Check if a word after its derivational affixes stemmed exists in the exception list.
	const firstDerivationalStem = stemDerivational( word, morphologyData );

	if ( doNotStemParticle.includes( firstDerivationalStem ) || doNotStemPronoun.includes( firstDerivationalStem ) ) {
		// If it does exist in the exception list, the ending that looks like a particle or a pronoun suffix should not be stemmed.
		return firstDerivationalStem;
	}

	/*
	 * If the word has more than 2 syllables and ends in one of the particle endings (i.e. -kah, -lah, -pun), stem the particle here.
	 * e.g. bajumulah -> bajumu, bawalah -> bawa
	 */
	word = removeEnding( word, morphologyData.stemming.regexRules.removeParticle, doNotStemParticle, morphologyData );

	// If the word (still) has more than 2 syllables and ends in one of the possessive pronoun endings (i.e. -ku, -mu, -nya), stem the ending here.
	if ( calculateTotalNumberOfSyllables( word ) > 2 ) {
		// E.g. bajumu -> baju
		word = removeEnding( word, morphologyData.stemming.regexRules.removePronoun, doNotStemPronoun, morphologyData );
	}

	// If the word (still) has more than 2 syllables and has derivational affixes, the affix(es) will be stemmed here.
	if ( calculateTotalNumberOfSyllables( word ) > 2 ) {
		word = stemDerivational( word, morphologyData );
	}

	return word;
};

/**
 * Stems Indonesian plural words.
 *
 * @param {string} word           The plural word to stem.
 * @param {Object} morphologyData The object that contains regex-based rules and exception lists for Indonesian stemming.
 *
 * @returns {string|null} The stem of an Indonesian plural word or null if no plural was detected.
 */
const stemPlural = function( word, morphologyData ) {
	const splitWord = word.split( "-" );

	// Without exactly one hyphen (i.e. exactly two parts), the word can't be a reduplicated plural.
	if ( splitWord.length !== 2 ) {
		return null;
	}

	const firstPart = stemSingular( splitWord[ 0 ], morphologyData );
	const secondPart = stemSingular( splitWord[ 1 ], morphologyData );

	/*
	 * To compare the first and second part and see whether it's actually a reduplication:
	 * Trim the beginning of the word since it might be variable due to stem changes caused by prefixes.
	 * For example, in "meniru-nirukan" the singular stemmer will correctly stem the first "niru" to "tiru" because
	 * of the prefix "me". Since the second part of the word is stemmed individually, there is no "me" and hence
	 * "niru" remains "niru". To still be able to link these two forms to each other,
	 * we compare the two parts of the word after stripping the variable first or first and second letter.
	 */
	const firstPartBeginningTrimmed = firstPart.slice( 1 );
	const secondPartHasDigraph = secondPart.startsWith( "ng" ) || secondPart.startsWith( "ny" );
	const secondPartBeginningTrimmed = secondPart.slice( secondPartHasDigraph ? 2 : 1 );

	if ( firstPartBeginningTrimmed !== secondPartBeginningTrimmed ) {
		return null;
	}

	const nonPlurals = morphologyData.stemming.nonPluralReduplications;

	// Check non-plural reduplication.
	if ( nonPlurals.includes( firstPart ) && nonPlurals.includes( secondPart ) ) {
		/*
		 * In words such as "mengira-ngira" prefix "me" causes a modification on both words (k->ng). This will
		 * be correctly stemmed for the first word, but not the second. Therefore, the correct base form
		 * "kira-kira" is created based on a reduplication of the correctly stemmed first part, "kira".
		 */
		return firstPart + "-" + firstPart;
	}

	// Return the stemmed singular form of a reduplicated plural.
	return firstPart;
};

/**
 * Stems Indonesian words.
 *
 * @param {string} word           The word to stem.
 * @param {Object} morphologyData The object that contains regex-based rules and exception lists for Indonesian stemming.
 *
 * @returns {string} The stem of an Indonesian word.
 */
export default function stem( word, morphologyData ) {
	// Check words that shouldn't receive any stemming.
	if ( morphologyData.stemming.shouldNotBeStemmed.includes( word ) ) {
		return word;
	}

	// Reduplicated plurals (e.g. "buku-buku") are handled first; a falsy result means no plural was detected.
	const stemmedPlural = stemPlural( word, morphologyData );

	if ( stemmedPlural ) {
		return stemmedPlural;
	}

	return stemSingular( word, morphologyData );
}