UNPKG

yoastseo-dep

Version:

Yoast clientside page analysis

641 lines (558 loc) 21.7 kB
import { isNaN, isUndefined, map } from "lodash-es"; import core from "tokenizer2/core"; import { normalize as normalizeQuotes } from "../sanitize/quotes.js"; import abbreviations from "../../languages/en/config/abbreviations"; import createRegexFromArray from "../regex/createRegexFromArray"; import wordBoundaries from "../../../config/wordBoundaries"; // All characters that indicate a sentence delimiter. const fullStop = "."; const fullStopRegex = new RegExp( "^[" + fullStop + "]$" ); const smallerThanContentRegex = /^<[^><]*$/; const htmlStartRegex = /^<([^>\s/]+)[^>]*>$/mi; const htmlEndRegex = /^<\/([^>\s]+)[^>]*>$/mi; const blockStartRegex = /^\s*[[({]\s*$/; const blockEndRegex = /^\s*[\])}]\s*$/; const abbreviationsPreparedForRegex = abbreviations.map( ( abbreviation ) => abbreviation.replace( ".", "\\." ) ); const abbreviationsRegex = createRegexFromArray( abbreviationsPreparedForRegex ); const wordBoundariesForRegex = "(^|$|[" + wordBoundaries().map( ( boundary ) => "\\" + boundary ).join( "" ) + "])"; const lastCharacterPartOfInitialsRegex = new RegExp( wordBoundariesForRegex + "[A-Za-z]$" ); // Constants to be used in isValidTagPair. // A regex to get the tag type. const tagTypeRegex = /<\/?([^\s]+?)(\s|>)/; // Semantic tags (as opposed to style tags) are tags that are used to structure the text. const semanticTags = [ "p", "div", "h1", "h2", "h3", "h4", "h5", "h6", "span", "li", "main" ]; /** * Class for tokenizing a (html) text into sentences. */ export default class SentenceTokenizer { /** * Constructor * @constructor */ constructor() { /* * \u2026 - ellipsis. * \u06D4 - Urdu full stop. * \u061f - Arabic question mark. */ this.sentenceDelimiters = "”〞〟„』›»’‛`\"?!\u2026\u06d4\u061f"; } /** * Gets the sentence delimiters. * * @returns {string} The sentence delimiters. */ getSentenceDelimiters() { return this.sentenceDelimiters; } /** * Returns whether or not a certain character is a number. * * @param {string} character The character to check. * @returns {boolean} Whether or not the character is a capital letter. */ isNumber( character ) { return ! isNaN( parseInt( character, 10 ) ); } /** * Returns whether or not a given HTML tag is a break tag. * * @param {string} htmlTag The HTML tag to check. * @returns {boolean} Whether or not the given HTML tag is a break tag. */ isBreakTag( htmlTag ) { return /<\/?br/.test( htmlTag ); } /** * Returns whether or not a given character is quotation mark. * * @param {string} character The character to check. * @returns {boolean} Whether or not the given character is a quotation mark. */ isQuotation( character ) { character = normalizeQuotes( character ); return "'" === character || "\"" === character; } /** * A mock definition of this function. This function is only used in extensions for languages that use an ordinal dot. * * @returns {boolean} Always returns false as it is a language specific implementation if a language has an ordinal dot. */ endsWithOrdinalDot() { return false; } /** * Returns whether or not a given character is a punctuation mark that can be at the beginning * of a sentence, like ¿ and ¡ used in Spanish. * * @param {string} character The character to check. * @returns {boolean} Whether or not the given character is a punctuation mark. */ isPunctuation( character ) { return "¿" === character || "¡" === character; } /** * Removes duplicate whitespace from a given text. * * @param {string} text The text with duplicate whitespace. * @returns {string} The text without duplicate whitespace. */ removeDuplicateWhitespace( text ) { return text.replace( /\s+/, " " ); } /** * Returns whether or not a certain character is a capital letter. * * @param {string} character The character to check. * @returns {boolean} Whether or not the character is a capital letter. */ isCapitalLetter( character ) { return character !== character.toLocaleLowerCase(); } /** * Checks whether the given character is a smaller than sign. * * This function is used to make sure that tokenizing the content after * the smaller than sign works as expected. * E.g. 'A sentence. < Hello world!' = ['A sentence.', '< Hello world!']. * * @param {string} character The character to check. * @returns {boolean} Whether the character is a smaller than sign ('<') or not. */ isSmallerThanSign( character ) { return character === "<"; } /** * Retrieves the next two characters from an array with the two next tokens. * * @param {Array} nextTokens The two next tokens. Might be undefined. * @returns {string} The next two characters. */ getNextTwoCharacters( nextTokens ) { let next = ""; if ( ! isUndefined( nextTokens[ 0 ] ) ) { next += nextTokens[ 0 ].src; } if ( ! isUndefined( nextTokens[ 1 ] ) ) { next += nextTokens[ 1 ].src; } next = this.removeDuplicateWhitespace( next ); return next; } /** * Checks whether a character is from a language that's written from right to left. * These languages don't have capital letter forms. Therefore any letter from these languages is a * potential sentence beginning. * * @param {string} letter The letter to check. * * @returns {boolean} Whether the letter is from an LTR language. */ isLetterFromSpecificLanguage( letter ) { const ltrLetterRanges = [ // Hebrew characters. /^[\u0590-\u05fe]+$/i, // Arabic characters (used for Arabic, Farsi, Urdu). /^[\u0600-\u06FF]+$/i, // Additional Farsi characters. /^[\uFB8A\u067E\u0686\u06AF]+$/i, ]; return ( ltrLetterRanges.some( ltrLetterRange => ltrLetterRange.test( letter ) ) ); } /** * Checks if the sentenceBeginning beginning is a valid beginning. * * @param {string} sentenceBeginning The beginning of the sentence to validate. * @returns {boolean} Returns true if it is a valid beginning, false if it is not. */ isValidSentenceBeginning( sentenceBeginning ) { return ( this.isCapitalLetter( sentenceBeginning ) || this.isLetterFromSpecificLanguage( sentenceBeginning ) || this.isNumber( sentenceBeginning ) || this.isQuotation( sentenceBeginning ) || this.isPunctuation( sentenceBeginning ) || this.isSmallerThanSign( sentenceBeginning ) ); } /** * Checks if the token is a valid sentence start. * * @param {Object} token The token to validate. * @returns {boolean} Returns true if the token is valid sentence start, false if it is not. */ isSentenceStart( token ) { return ( ! isUndefined( token ) && ( "html-start" === token.type || "html-end" === token.type || "block-start" === token.type ) ); } /** * Checks if the token is a valid sentence ending. A valid sentence ending is either a full stop or another * delimiter such as "?", "!", etc. * * @param {Object} token The token to validate. * @returns {boolean} Returns true if the token is valid sentence ending, false if it is not. */ isSentenceEnding( token ) { return ( ! isUndefined( token ) && ( token.type === "full-stop" || token.type === "sentence-delimiter" ) ); } /** * Checks if a full stop is part of a person's initials. * * Tests if tokens exist. Then tests if the tokens are of the right type. * For previous token, it checks if the sentence ends with a single letter. * For nextToken it checks if it is a single letter. * Checks if next token is followed by a full stop. * * @param {object} token The current token (must be a full stop). * @param {object} previousToken The token before the full stop. * @param {object} nextToken The token following the full stop. * @param {object} secondToNextToken The second token after the full stop. * @returns {boolean} True if a full stop is part of a person's initials, False if the full stop is not part of a person's initials. */ isPartOfPersonInitial( token, previousToken, nextToken, secondToNextToken ) { return ( ! isUndefined( token ) && ! isUndefined( nextToken ) && ! isUndefined( secondToNextToken ) && ! isUndefined( previousToken ) && token.type === "full-stop" && previousToken.type === "sentence" && lastCharacterPartOfInitialsRegex.test( previousToken.src ) && nextToken.type === "sentence" && nextToken.src.trim().length === 1 && secondToNextToken.type === "full-stop" ); } /** * Tokens that represent a '<', followed by content until it enters another '<' or '>' * gets another pass by the tokenizer. * * @param {Object} token A token of type 'smaller-than-sign-content'. * @param {string[]} tokenSentences The current array of found sentences. Sentences may get added by this method. * @param {string} currentSentence The current sentence. Sentence parts may get appended by this method. * @returns {{tokenSentences, currentSentence}} The found sentences and the current sentence, appended when necessary. */ tokenizeSmallerThanContent( token, tokenSentences, currentSentence ) { /* Remove the '<' from the text, to avoid matching this rule recursively again and again. We add it again later on. */ const localText = token.src.substring( 1 ); // Tokenize the current smaller-than-content token without the first '<'. const tokenizerResult = this.createTokenizer(); this.tokenize( tokenizerResult.tokenizer, localText ); const localSentences = this.getSentencesFromTokens( tokenizerResult.tokens, false ); localSentences[ 0 ] = isUndefined( localSentences[ 0 ] ) ? "<" : "<" + localSentences[ 0 ]; /* * When the first sentence has a valid sentence beginning. * Add the currently build sentence to the sentences. * Start building the next sentence. */ if ( this.isValidSentenceBeginning( localSentences[ 0 ] ) ) { tokenSentences.push( currentSentence ); currentSentence = ""; } currentSentence += localSentences[ 0 ]; if ( localSentences.length > 1 ) { /* There is a new sentence after the first, add and reset the current sentence. */ tokenSentences.push( currentSentence ); currentSentence = ""; // Remove the first sentence (we do not need to add it again). localSentences.shift(); // Last sentence gets special treatment. const lastSentence = localSentences.pop(); // Add the remaining found sentences. localSentences.forEach( sentence => { tokenSentences.push( sentence ); } ); const sentenceEndRegex = new RegExp( "[" + fullStop + this.getSentenceDelimiters() + "]$" ); // Check if the last sentence has a valid sentence ending. if ( lastSentence.match( sentenceEndRegex ) ) { // If so, add it as a sentence. tokenSentences.push( lastSentence ); } else { // If not, start making a new one. currentSentence = lastSentence; } } return { tokenSentences, currentSentence, }; } /** * Creates a tokenizer. * * @returns {Object} The tokenizer and the tokens. */ createTokenizer() { const sentenceDelimiterRegex = new RegExp( "^[" + this.getSentenceDelimiters() + "]$" ); const sentenceRegex = new RegExp( "^[^" + fullStop + this.getSentenceDelimiters() + "<\\(\\)\\[\\]]+$" ); const tokens = []; const tokenizer = core( function( token ) { tokens.push( token ); } ); tokenizer.addRule( fullStopRegex, "full-stop" ); tokenizer.addRule( smallerThanContentRegex, "smaller-than-sign-content" ); tokenizer.addRule( htmlStartRegex, "html-start" ); tokenizer.addRule( htmlEndRegex, "html-end" ); tokenizer.addRule( blockStartRegex, "block-start" ); tokenizer.addRule( blockEndRegex, "block-end" ); tokenizer.addRule( sentenceDelimiterRegex, "sentence-delimiter" ); tokenizer.addRule( sentenceRegex, "sentence" ); return { tokenizer, tokens, }; } /** * Tokenizes the given text using the given tokenizer. * * @param {Object} tokenizer The tokenizer to use. * @param {string} text The text to tokenize. * @returns {void} */ tokenize( tokenizer, text ) { tokenizer.onText( text ); try { tokenizer.end(); } catch ( e ) { console.error( "Tokenizer end error:", e, e.tokenizer2 ); } } /** * Checks if a string ends with an abbreviation. * @param {string} currentSentence A (part of) a sentence. * @returns {boolean} True if the string ends with an abbreviation that is in abbreviations.js. Otherwise, False. */ endsWithAbbreviation( currentSentence ) { const matchedAbbreviations = currentSentence.match( abbreviationsRegex ); if ( ! matchedAbbreviations ) { return false; } const lastAbbreviation = matchedAbbreviations.pop(); return currentSentence.endsWith( lastAbbreviation ); } /** * Checks whether the given tokens are a valid html tag pair. * Note that this method is not a full html tag validator. It should be replaced with a better solution once the html parser is implemented. * * @param {object} firstToken The first token to check. It is asserted that this token contains/is an opening html tag. * @param {object} lastToken The last token to check. It is asserted that this token contains/is a closing html tag. * * @returns {boolean} True if the tokens are a valid html tag pair. Otherwise, False. */ isValidTagPair( firstToken, lastToken ) { const firstTokenText = firstToken.src; const lastTokenText = lastToken.src; // Get the tag types. const firstTagType = firstTokenText.match( tagTypeRegex )[ 1 ]; const lastTagType = lastTokenText.match( tagTypeRegex )[ 1 ]; // Check if the tags are the same and if they are a semantic tag (p, div, h1, h2, h3, h4, h5, h6, span). return firstTagType === lastTagType && semanticTags.includes( firstTagType ); } /** * Returns an array of sentences for a given array of tokens, assumes that the text has already been split into blocks. * * @param {Object[]} tokenArray The tokens from the sentence tokenizer. * @param {boolean} [trimSentences=true] Whether to trim the sentences at the end or not. * * @returns {string[]} A list of sentences. */ getSentencesFromTokens( tokenArray, trimSentences = true ) { let tokenSentences = [], currentSentence = "", nextSentenceStart, sliced; // Drop the first and last HTML tag if both are present. do { sliced = false; const firstToken = tokenArray[ 0 ]; const lastToken = tokenArray[ tokenArray.length - 1 ]; if ( firstToken && lastToken && firstToken.type === "html-start" && lastToken.type === "html-end" && this.isValidTagPair( firstToken, lastToken ) ) { tokenArray = tokenArray.slice( 1, tokenArray.length - 1 ); sliced = true; } } while ( sliced && tokenArray.length > 1 ); tokenArray.forEach( ( token, i ) => { let hasNextSentence, nextCharacters, tokenizeResults; const nextToken = tokenArray[ i + 1 ]; const previousToken = tokenArray[ i - 1 ]; const secondToNextToken = tokenArray[ i + 2 ]; nextCharacters = this.getNextTwoCharacters( [ nextToken, secondToNextToken ] ); // For a new sentence we need to check the next two characters. hasNextSentence = nextCharacters.length >= 2; nextSentenceStart = hasNextSentence ? nextCharacters[ 1 ] : ""; switch ( token.type ) { case "html-start": case "html-end": if ( this.isBreakTag( token.src ) ) { tokenSentences.push( currentSentence ); currentSentence = ""; } else { currentSentence += token.src; } break; case "smaller-than-sign-content": tokenizeResults = this.tokenizeSmallerThanContent( token, tokenSentences, currentSentence ); tokenSentences = tokenizeResults.tokenSentences; currentSentence = tokenizeResults.currentSentence; break; case "sentence": currentSentence += token.src; break; case "sentence-delimiter": currentSentence += token.src; /* * Only split text into sentences if: * the next token is defined, AND * the next token type is neither "block-end" nor "sentence-delimiter", AND * the next token first character is a white space */ if ( ! isUndefined( nextToken ) && "block-end" !== nextToken.type && "sentence-delimiter" !== nextToken.type && this.isCharacterASpace( nextToken.src[ 0 ] ) ) { // Don't split on quotation marks unless they're preceded by a full stop. if ( this.isQuotation( token.src ) && previousToken && previousToken.src !== "." ) { break; } /* * Only split on ellipsis or quotation marks when: * a) There is a next sentence, and the next character is a valid sentence beginning preceded by a white space, OR * b) The next token is a sentence start */ if ( this.isQuotation( token.src ) || token.src === "…" ) { currentSentence = this.getValidSentence( hasNextSentence, nextSentenceStart, nextCharacters, nextToken, tokenSentences, currentSentence ); } else { tokenSentences.push( currentSentence ); currentSentence = ""; } } break; case "full-stop": currentSentence += token.src; nextCharacters = this.getNextTwoCharacters( [ nextToken, secondToNextToken ] ); // For a new sentence we need to check the next two characters. hasNextSentence = nextCharacters.length >= 2; nextSentenceStart = hasNextSentence ? nextCharacters[ 1 ] : ""; // If the current sentence ends with an abbreviation, the full stop does not split the sentence. if ( this.endsWithAbbreviation( currentSentence ) ) { break; } // It should not split the text if the first character of the potential next sentence is a number. if ( hasNextSentence && this.isNumber( nextCharacters[ 0 ] ) ) { break; } // If the full stop is part of a person's initials, don't split sentence. if ( this.isPartOfPersonInitial( token, previousToken, nextToken, secondToNextToken ) ) { break; } // If the full stop is an ordinal dot (in German), then don't break the sentence. // This check should be done after hasNextSentence && this.isNumber( nextCharacters[ 0 ] ) (above). // Because otherwise it could break before that test. if ( this.endsWithOrdinalDot( currentSentence ) ) { break; } /* * Only split on full stop when: * a) There is a next sentence, and the next character is a valid sentence beginning preceded by a white space, OR * b) The next token is a sentence start */ currentSentence = this.getValidSentence( hasNextSentence, nextSentenceStart, nextCharacters, nextToken, tokenSentences, currentSentence ); break; case "block-start": currentSentence += token.src; break; case "block-end": currentSentence += token.src; nextCharacters = this.getNextTwoCharacters( [ nextToken, secondToNextToken ] ); // For a new sentence we need to check the next two characters. hasNextSentence = nextCharacters.length >= 2; nextSentenceStart = hasNextSentence ? nextCharacters[ 0 ] : ""; /* Don't split if: * - The next character is a number. For example: IPv4-numbers. * - The block end is preceded by a valid sentence ending, but not followed by a valid sentence beginning. */ if ( hasNextSentence && this.isNumber( nextCharacters[ 0 ] ) || ( this.isSentenceEnding( previousToken ) && ( ! ( this.isValidSentenceBeginning( nextSentenceStart ) || this.isSentenceStart( nextToken ) ) ) ) ) { break; } /* * Split if: * - The block end is preceded by a sentence ending and followed by a valid sentence beginning. */ if ( this.isSentenceEnding( previousToken ) && ( this.isSentenceStart( nextToken ) || this.isValidSentenceBeginning( nextSentenceStart ) ) ) { tokenSentences.push( currentSentence ); currentSentence = ""; } break; } } ); if ( "" !== currentSentence ) { tokenSentences.push( currentSentence ); } if ( trimSentences ) { tokenSentences = map( tokenSentences, function( sentence ) { return sentence.trim(); } ); } return tokenSentences; } /** * Gets the current sentence when: * a) There is a next sentence, and the next character is a valid sentence beginning preceded by a white space, OR * b) The next token is a sentence start * * @param {boolean} hasNextSentence Whether the next characters are more than two. * @param {string} nextSentenceStart The second character of the next characters. * @param {string} nextCharacters The string values of the next two tokens. * @param {object} nextToken The next token object. * @param {array} tokenSentences The array of pushed valid sentences. * @param {string} currentSentence The current sentence. * * @returns {string} The current sentence. */ getValidSentence( hasNextSentence, nextSentenceStart, nextCharacters, nextToken, tokenSentences, currentSentence ) { if ( ( hasNextSentence && this.isValidSentenceBeginning( nextSentenceStart ) && this.isCharacterASpace( nextCharacters[ 0 ] ) ) || this.isSentenceStart( nextToken ) ) { tokenSentences.push( currentSentence ); currentSentence = ""; } return currentSentence; } /** * Checks if the character is a whitespace. * * @param {string} character The character to check. * @returns {boolean} Whether the character is a whitespace. */ isCharacterASpace( character ) { return /\s/.test( character ); } }