UNPKG

yoastseo-dep

Version:

Yoast clientside page analysis

42 lines (34 loc) 1.32 kB
/** @module stringProcessing/countWords */
import sanitizeString from "../sanitize/sanitizeString";
import { filter, flatMap } from "lodash-es";
import removePunctuation, { punctuationRegexString } from "../sanitize/removePunctuation.js";

// Wraps the imported punctuation pattern in a character class inside a capture group,
// so that each matched punctuation mark can be referenced as `$1` when replacing.
const interJectionRegexString = `([${punctuationRegexString}])`;
const interJectionRegex = new RegExp( interJectionRegexString, "g" );

/**
 * Splits a text into an array of its words.
 *
 * The text is first passed through `sanitizeString` and then split on whitespace.
 * Depending on `shouldRemovePunctuation`, every token is either passed through
 * `removePunctuation`, or punctuation marks are separated from the words and kept
 * as tokens of their own. Tokens that are empty or whitespace-only are discarded.
 *
 * @param {string} text The text to extract the words from.
 * @param {boolean} [shouldRemovePunctuation=true] Whether punctuation should be removed. Defaults to `true`.
 *
 * @returns {Array} The words (and, when punctuation is kept, the punctuation marks) found in the text.
 */
export default function( text, shouldRemovePunctuation = true ) {
	// Per the original note: sanitizing unifies whitespaces and non-breaking spaces, removes the
	// table of content and strips tags and multiple spaces (sanitizeString is defined elsewhere).
	const sanitizedText = sanitizeString( text );

	// Nothing left after sanitizing means there are no words at all.
	if ( sanitizedText === "" ) {
		return [];
	}

	const rawTokens = sanitizedText.split( /\s/g );

	const tokens = shouldRemovePunctuation
		? rawTokens.map( removePunctuation )
		// When punctuation is kept, pad every punctuation mark with spaces and split again,
		// so that each mark ends up as a separate token, as if it were a word.
		: flatMap( rawTokens, ( token ) => token.replace( interJectionRegex, " $1 " ).split( " " ) );

	// Drop the empty leftovers produced by removing punctuation or by the extra splitting.
	return filter( tokens, ( token ) => token.trim() !== "" );
}