yoastseo-dep
Version:
Yoast clientside page analysis
74 lines (67 loc) • 3.01 kB
JavaScript
import { Heading, Paragraph } from "../../structure";
import getTextElementPositions from "./getTextElementPositions";
import { hashedHtmlEntities } from "../../../helpers/htmlEntities";
/**
* Splits the sentence into tokens, determines their positions in the source code, and puts them on the sentence.
*
* @param {Paragraph|Heading} node The paragraph or heading node to split into sentences.
* @param {Sentence} sentence The sentence.
* @param {LanguageProcessor} languageProcessor The language processor for the current language.
*
* @returns {Sentence} The sentence, with tokens.
*/
function getTokens( node, sentence, languageProcessor ) {
sentence.tokens = languageProcessor.splitIntoTokens( sentence );
sentence.tokens = getTextElementPositions( node, sentence.tokens, sentence.sourceCodeRange.startOffset );
return sentence;
}
/**
* Splits the node's inner text into sentences, and the sentences into tokens,
* using the language processor.
*
* @param {Paragraph|Heading} node The paragraph or heading node to split into sentences.
* @param {LanguageProcessor} languageProcessor The language processor to use.
*
* @returns {Sentence[]} The node's sentences.
*/
function getSentences( node, languageProcessor ) {
// Split text into sentences.
let sentences = languageProcessor.splitIntoSentences( node.innerText() );
// Add position information to the sentences.
sentences = getTextElementPositions( node, sentences );
// Tokenize sentences into tokens.
return sentences.map( sentence => {
sentence = getTokens( node, sentence, languageProcessor );
// Now positions have been determined, change HTML entities that had earlier been converted to hashed versions back to their short version.
// For example, "&" was earlier converted into "#amp;" and is now converted into "&".
// We make this change in both the Sentence and the accompanying Tokens.
hashedHtmlEntities.forEach( ( character, hashedHtmlEntity ) => {
// We use split/join instead of replaceAll to support older browsers.
sentence.text = sentence.text.split( hashedHtmlEntity ).join( character );
sentence.tokens.map( token => {
token.text = token.text.split( hashedHtmlEntity ).join( character );
return token;
} );
} );
return sentence;
} );
}
/**
* Splits any Paragraph and Heading nodes in the tree into sentences and tokens.
* Excludes overarching Paragraphs, as those will have (implicit) paragraphs as their children.
*
* @param {Node} tree The tree to process.
* @param {LanguageProcessor} languageProcessor The language processor to use.
*
* @returns {Node} The processed tree.
*/
function tokenize( tree, languageProcessor ) {
if ( ( tree instanceof Paragraph && tree.name !== "p-overarching" ) || tree instanceof Heading ) {
tree.sentences = getSentences( tree, languageProcessor );
}
if ( tree.childNodes ) {
tree.childNodes = tree.childNodes.map( child => tokenize( child, languageProcessor ) );
}
return tree;
}
export default tokenize;