yoastseo-dep
Version:
Yoast clientside page analysis
75 lines (64 loc) • 2.27 kB
JavaScript
import Sentence from "../structure/Sentence";
import Token from "../structure/Token";
import getWordsForHTMLParser from "../../languageProcessing/helpers/word/getWordsForHTMLParser";
const whitespaceRegex = /^\s+$/;
/**
* Contains language-specific logic for splitting a text into sentences and tokens.
*/
class LanguageProcessor {
/**
* Creates a new language processor.
*
* @param {Researcher} researcher The researcher to use.
*/
constructor( researcher ) {
this.researcher = researcher;
}
/**
* Split text into sentences.
*
* @param {string} text The text to split into sentences.
*
* @returns {Sentence[]} The sentences.
*/
splitIntoSentences( text ) {
const memoizedTokenizer = this.researcher.getHelper( "memoizedTokenizer" );
/*
* Set the `trimSentences` flag to false. We want to keep whitespaces to be able to correctly assess the
* position of sentences within the source code.
*/
const sentences = memoizedTokenizer( text, false );
/*
* If the last element in the array of sentences contains only whitespaces, remove it.
* This will be the case if the text ends in a whitespace - that whitespace ends up being tokenized as a
* separate sentence. A space at the end of the text is not needed for calculating the position of
* sentences, so it can be safely removed.
*/
if ( whitespaceRegex.test( sentences[ sentences.length - 1 ] ) ) {
sentences.pop();
}
return sentences.map( function( sentence ) {
return new Sentence( sentence );
} );
}
/**
* Split sentence into tokens.
*
* @param {Sentence} sentence The sentence to split.
*
* @returns {Token[]} The tokens.
*/
splitIntoTokens( sentence ) {
// Retrieve sentence from sentence class
const sentenceText = sentence.text;
// If there is a custom getWords helper use its output for retrieving words/tokens.
const tokenTextsCustom = this.researcher.getHelper( "splitIntoTokensCustom" );
if ( tokenTextsCustom ) {
const tokensCustom = tokenTextsCustom( sentence );
return tokensCustom.map( tokenText => new Token( tokenText ) );
}
const tokenTexts = getWordsForHTMLParser( sentenceText );
return tokenTexts.map( tokenText => new Token( tokenText ) );
}
}
export default LanguageProcessor;