UNPKG

yoastseo-dep

Version:

Yoast clientside page analysis

177 lines (146 loc) 5.26 kB
import { forEach, memoize } from "lodash-es";
import core from "tokenizer2/core";

/** Names of the HTML elements that are treated as block-level elements. */
const blockElements = [
	"address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "fieldset", "figcaption",
	"figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr", "li", "main",
	"nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video",
];

/** Names of the HTML elements that are treated as inline elements. */
const inlineElements = [
	"b", "big", "i", "small", "tt", "abbr", "acronym", "cite", "code", "dfn", "em", "kbd", "strong", "samp",
	"time", "var", "a", "bdo", "br", "img", "map", "object", "q", "script", "span", "sub", "sup", "button",
	"input", "label", "select", "textarea",
];

// Match a bare element name (case-insensitively) against the lists above.
const blockElementsRegex = new RegExp( "^(" + blockElements.join( "|" ) + ")$", "i" );
const inlineElementsRegex = new RegExp( "^(" + inlineElements.join( "|" ) + ")$", "i" );

// Match a complete opening or closing tag of a known block or inline element.
const blockElementStartRegex = new RegExp( "^<(" + blockElements.join( "|" ) + ")[^>]*?>$", "i" );
const blockElementEndRegex = new RegExp( "^</(" + blockElements.join( "|" ) + ")[^>]*?>$", "i" );

const inlineElementStartRegex = new RegExp( "^<(" + inlineElements.join( "|" ) + ")[^>]*>$", "i" );
const inlineElementEndRegex = new RegExp( "^</(" + inlineElements.join( "|" ) + ")[^>]*>$", "i" );

// Match an opening or closing tag of any other element.
const otherElementStartRegex = /^<([^>\s/]+)[^>]*>$/;
const otherElementEndRegex = /^<\/([^>\s]+)[^>]*>$/;

// Text without any "<", and text that starts with "<" but contains no further "<" or ">".
const contentRegex = /^[^<]+$/;
const greaterThanContentRegex = /^<[^><]*$/;

/*
 * Matches an HTML comment. [\s\S] is used instead of (.|[\r\n]) so that every character is covered,
 * including the U+2028 and U+2029 line terminators that "." does not match, and so that no capture
 * group has to be maintained for every single character of the comment.
 */
const commentRegex = /<!--[\s\S]*?-->/g;

/**
 * Creates a tokenizer to tokenize HTML into blocks.
 *
 * The rules are registered in the same order on every call.
 *
 * @param {Function} onToken Callback that is handed to the tokenizer to receive each token.
 *
 * @returns {Object} The configured tokenizer.
 */
function createTokenizer( onToken ) {
	const htmlBlockTokenizer = core( onToken );

	htmlBlockTokenizer.addRule( contentRegex, "content" );
	htmlBlockTokenizer.addRule( greaterThanContentRegex, "greater-than-sign-content" );

	htmlBlockTokenizer.addRule( blockElementStartRegex, "block-start" );
	htmlBlockTokenizer.addRule( blockElementEndRegex, "block-end" );
	htmlBlockTokenizer.addRule( inlineElementStartRegex, "inline-start" );
	htmlBlockTokenizer.addRule( inlineElementEndRegex, "inline-end" );

	htmlBlockTokenizer.addRule( otherElementStartRegex, "other-element-start" );
	htmlBlockTokenizer.addRule( otherElementEndRegex, "other-element-end" );

	return htmlBlockTokenizer;
}

/**
 * Tokenizes the given HTML with a freshly created tokenizer.
 *
 * The token list is local to each call, so nothing is shared or retained between calls.
 *
 * @param {string} html The HTML to tokenize.
 *
 * @returns {Array} The tokens, in the order in which the tokenizer emitted them.
 */
function tokenize( html ) {
	const tokens = [];
	const htmlBlockTokenizer = createTokenizer( function( token ) {
		tokens.push( token );
	} );

	htmlBlockTokenizer.onText( html );
	htmlBlockTokenizer.end();

	return tokens;
}

/**
 * Returns whether or not the given element name is a block element.
 *
 * @param {string} htmlElementName The name of the HTML element.
 *
 * @returns {boolean} Whether or not it is a block element.
 */
function isBlockElement( htmlElementName ) {
	return blockElementsRegex.test( htmlElementName );
}

/**
 * Returns whether or not the given element name is an inline element.
 *
 * @param {string} htmlElementName The name of the HTML element.
 *
 * @returns {boolean} Whether or not it is an inline element.
 */
function isInlineElement( htmlElementName ) {
	return inlineElementsRegex.test( htmlElementName );
}

/**
 * Splits a text into blocks based on HTML block elements.
 *
 * @param {string} text The text to split.
 *
 * @returns {Array} A list of blocks based on HTML block elements.
 */
function getBlocks( text ) {
	const blocks = [];
	let depth = 0;
	let blockStartTag = "";
	let currentBlock = "";

	// Remove all comments because it is very hard to tokenize them.
	const tokens = tokenize( text.replace( commentRegex, "" ) );

	forEach( tokens, function( token, i ) {
		const nextToken = tokens[ i + 1 ];

		switch ( token.type ) {
			case "content":
			case "greater-than-sign-content":
			case "inline-start":
			case "inline-end":
			// "other-tag" and "greater than sign" have no rule in createTokenizer; the labels are kept as they were.
			case "other-tag":
			case "other-element-start":
			case "other-element-end":
			case "greater than sign":
				currentBlock += token.src;

				/*
				 * Close the current block when this is the last token, or when we are at the top level and the
				 * next token opens or closes a block element.
				 */
				if ( ! nextToken || ( depth === 0 && ( nextToken.type === "block-start" || nextToken.type === "block-end" ) ) ) {
					blocks.push( currentBlock );
					blockStartTag = "";
					currentBlock = "";
				}
				break;

			case "block-start":
				// A nested block starts: keep any non-blank text gathered so far as a block of its own.
				if ( depth !== 0 ) {
					if ( currentBlock.trim() !== "" ) {
						blocks.push( currentBlock );
					}
					currentBlock = "";
				}

				depth++;
				blockStartTag = token.src;
				break;

			case "block-end":
				depth--;

				/*
				 * We try to match the most deep blocks so discard any other blocks that have been started but not
				 * finished.
				 */
				if ( blockStartTag !== "" ) {
					blocks.push( blockStartTag + currentBlock + token.src );
				} else if ( currentBlock.trim() !== "" ) {
					blocks.push( currentBlock );
				}

				blockStartTag = "";
				currentBlock = "";
				break;
		}

		// Handles HTML with too many closing tags.
		if ( depth < 0 ) {
			depth = 0;
		}
	} );

	return blocks;
}

/** Memoized version of getBlocks. */
const memoizedGetBlocks = memoize( getBlocks );

export {
	blockElements,
	inlineElements,
	isBlockElement,
	isInlineElement,
	memoizedGetBlocks as getBlocks,
};

export default {
	blockElements: blockElements,
	inlineElements: inlineElements,
	isBlockElement: isBlockElement,
	isInlineElement: isInlineElement,
	getBlocks: memoizedGetBlocks,
};