/*
 * yoastseo-dep
 * Version:
 * Yoast clientside page analysis
 * 177 lines (146 loc) • 5.26 kB
 * JavaScript
 */
// Names of the HTML elements that are treated as block-level when splitting text into blocks.
const blockElements = [ "address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "fieldset", "figcaption",
	"figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr", "li", "main", "nav",
	"noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video" ];

// Names of the HTML elements that are treated as inline.
const inlineElements = [ "b", "big", "i", "small", "tt", "abbr", "acronym", "cite", "code", "dfn", "em", "kbd", "strong",
	"samp", "time", "var", "a", "bdo", "br", "img", "map", "object", "q", "script", "span", "sub", "sup", "button",
	"input", "label", "select", "textarea" ];

// Match a bare element name (case-insensitively) against the lists above.
const blockElementsRegex = new RegExp( "^(" + blockElements.join( "|" ) + ")$", "i" );
const inlineElementsRegex = new RegExp( "^(" + inlineElements.join( "|" ) + ")$", "i" );

/*
 * Match one complete opening or closing tag of a block or inline element.
 *
 * NOTE(review): the element name is not followed by a boundary, so a tag whose name merely starts with a listed
 * name also matches (for example `<link>` matches the block pattern through `li`). Confirm whether this is intended.
 */
const blockElementStartRegex = new RegExp( "^<(" + blockElements.join( "|" ) + ")[^>]*?>$", "i" );
const blockElementEndRegex = new RegExp( "^</(" + blockElements.join( "|" ) + ")[^>]*?>$", "i" );
const inlineElementStartRegex = new RegExp( "^<(" + inlineElements.join( "|" ) + ")[^>]*>$", "i" );
const inlineElementEndRegex = new RegExp( "^</(" + inlineElements.join( "|" ) + ")[^>]*>$", "i" );

// Match one complete opening or closing tag of any element name.
const otherElementStartRegex = /^<([^>\s/]+)[^>]*>$/;
const otherElementEndRegex = /^<\/([^>\s]+)[^>]*>$/;

// Text that contains no `<` at all.
const contentRegex = /^[^<]+$/;

// A `<` followed by text that contains neither `<` nor `>`, i.e. an angle bracket that is not closed.
const greaterThanContentRegex = /^<[^><]*$/;

/*
 * An HTML comment, including comments that span several lines.
 * `[\s\S]` is used instead of `.` because `.` does not match line terminators, which include U+2028 and U+2029
 * in addition to `\r` and `\n`; a comment containing those characters would otherwise not be matched.
 */
const commentRegex = /<!--[\s\S]*?-->/g;
import { forEach, memoize } from "lodash-es";
import core from "tokenizer2/core";
// Tokens emitted by the current tokenizer; replaced with a fresh array each time a tokenizer is created.
let tokens = [];
// The tokenizer instance built by createTokenizer().
let htmlBlockTokenizer;

/**
 * Builds a fresh HTML block tokenizer and clears the list of collected tokens.
 *
 * Every token the tokenizer emits is appended to the module-level `tokens` list. The rules are
 * registered in the order listed below.
 *
 * @returns {void}
 */
function createTokenizer() {
	tokens = [];

	htmlBlockTokenizer = core( ( emittedToken ) => {
		tokens.push( emittedToken );
	} );

	// Pairs of [ pattern, token type ], in registration order.
	const orderedRules = [
		[ contentRegex, "content" ],
		[ greaterThanContentRegex, "greater-than-sign-content" ],
		[ blockElementStartRegex, "block-start" ],
		[ blockElementEndRegex, "block-end" ],
		[ inlineElementStartRegex, "inline-start" ],
		[ inlineElementEndRegex, "inline-end" ],
		[ otherElementStartRegex, "other-element-start" ],
		[ otherElementEndRegex, "other-element-end" ],
	];

	for ( const [ pattern, tokenType ] of orderedRules ) {
		htmlBlockTokenizer.addRule( pattern, tokenType );
	}
}
/**
 * Checks whether an element name is one of the known block elements.
 *
 * The name is compared case-insensitively and must match a listed name in full.
 *
 * @param {string} htmlElementName The name of the HTML element.
 * @returns {boolean} True when the name is a block element name, false otherwise.
 */
function isBlockElement( htmlElementName ) {
	const matchesBlockName = blockElementsRegex.test( htmlElementName );
	return matchesBlockName;
}
/**
 * Checks whether an element name is one of the known inline elements.
 *
 * The name is compared case-insensitively and must match a listed name in full.
 *
 * @param {string} htmlElementName The name of the HTML element.
 * @returns {boolean} True when the name is an inline element name, false otherwise.
 */
function isInlineElement( htmlElementName ) {
	const matchesInlineName = inlineElementsRegex.test( htmlElementName );
	return matchesInlineName;
}
/**
 * Breaks a piece of HTML into blocks, using block elements as the boundaries.
 *
 * HTML comments are removed first. The remaining text is tokenized and the tokens are walked in order:
 * text, inline tags and unknown tags are gathered into the block under construction, while block tags
 * decide when a block is finished. When block elements are nested, the innermost ones are emitted together
 * with their tags; text gathered inside an outer block before a nested block opens is emitted without tags.
 *
 * @param {string} text The text to split.
 * @returns {Array} The blocks found in the text, in document order.
 */
function getBlocks( text ) {
	// Token types whose source is appended to the block that is being built.
	const gatheredTypes = [
		"content",
		"greater-than-sign-content",
		"inline-start",
		"inline-end",
		"other-tag",
		"other-element-start",
		"other-element-end",
		"greater than sign",
	];
	const blockTagTypes = [ "block-start", "block-end" ];

	const collected = [];
	let nesting = 0;
	let openingTag = "";
	let pending = "";
	let closingTag = "";

	const resetBlockState = () => {
		openingTag = "";
		pending = "";
		closingTag = "";
	};

	// Comments are stripped up front because they are very hard to tokenize.
	const withoutComments = text.replace( commentRegex, "" );

	createTokenizer();
	htmlBlockTokenizer.onText( withoutComments );
	htmlBlockTokenizer.end();

	const tokenCount = tokens.length;
	for ( let position = 0; position < tokenCount; position++ ) {
		const current = tokens[ position ];
		const upcoming = tokens[ position + 1 ];

		if ( gatheredTypes.includes( current.type ) ) {
			pending += current.src;

			// Finish the block at the end of the input, or when a block tag follows at the top level.
			const blockIsComplete = ! upcoming || ( nesting === 0 && blockTagTypes.includes( upcoming.type ) );
			if ( blockIsComplete ) {
				collected.push( pending );
				resetBlockState();
			}
		} else if ( current.type === "block-start" ) {
			// A nested block begins: emit what was gathered in the enclosing block, unless it is only whitespace.
			if ( nesting !== 0 ) {
				if ( pending.trim() !== "" ) {
					collected.push( pending );
				}
				pending = "";
				closingTag = "";
			}
			nesting += 1;
			openingTag = current.src;
		} else if ( current.type === "block-end" ) {
			// Never drop below zero, so HTML with too many closing tags does not skew later tokens.
			nesting = Math.max( nesting - 1, 0 );
			closingTag = current.src;

			/*
			 * Only the deepest blocks are emitted with their tags; a block that was started but whose
			 * start tag has already been discarded contributes just its non-blank remaining content.
			 */
			if ( openingTag !== "" && closingTag !== "" ) {
				collected.push( openingTag + pending + closingTag );
			} else if ( pending.trim() !== "" ) {
				collected.push( pending );
			}
			resetBlockState();
		}
	}

	return collected;
}
const memoizedGetBlocks = memoize( getBlocks );
export {
blockElements,
inlineElements,
isBlockElement,
isInlineElement,
memoizedGetBlocks as getBlocks,
};
export default {
blockElements: blockElements,
inlineElements: inlineElements,
isBlockElement: isBlockElement,
isInlineElement: isInlineElement,
getBlocks: memoizedGetBlocks,
};