UNPKG

yoastseo-dep

Version:

Yoast clientside page analysis

231 lines (209 loc) 7.64 kB
import { FormattingElement, Heading, Paragraph, StructuredNode, List, ListItem } from "../../../structure/tree";
import LeafNode from "../../../structure/tree/nodes/LeafNode";

import { formattingElements, headings, ignoredHtmlElements } from "./htmlConstants";

/**
 * Converts a parse5 tree to a tree that can be analyzed.
 *
 * Block-level elements (`p`, headings, `li`, `ol`/`ul` and any other element) become nodes
 * in the resulting tree. Text and formatting elements are attached to a leaf node; when no
 * leaf node is available they are wrapped in an implicit paragraph.
 */
class HTMLTreeConverter {
	/**
	 * Converts the parse5 tree to a Yoast tree.
	 *
	 * @param {Object} parse5Tree The parse5 tree to convert.
	 *
	 * @returns {module:parsedPaper/structure.Node} The converted tree (a structured node of type "root").
	 */
	convert( parse5Tree ) {
		const root = new StructuredNode( "root", null );
		this._convert( parse5Tree, root );
		return root;
	}

	/**
	 * Converts the tree from a parse5 implementation to a Yoast tree.
	 *
	 * Walks the children of the given parse5 node recursively and adds the converted
	 * nodes to `parent`.
	 *
	 * @param {Object}                             parse5Tree The parse5 tree to convert.
	 * @param {module:parsedPaper/structure.Node} parent     The tree in progress.
	 *
	 * @returns {void}
	 *
	 * @private
	 */
	_convert( parse5Tree, parent ) {
		// The contents of ignored elements are never descended into.
		if ( ignoredHtmlElements.includes( parse5Tree.nodeName ) ) {
			return;
		}

		if ( ! parse5Tree.childNodes ) {
			return;
		}

		for ( const node of parse5Tree.childNodes ) {
			const child = this._createChild( node, parent );

			if ( child ) {
				parent.addChild( child );
				this._convert( node, child );
			} else {
				// No new tree node was created for this parse5 node, so its
				// descendants keep being attached relative to the current parent.
				this._convert( node, parent );
			}
		}
	}

	/**
	 * Creates a new node for in the structured tree from the given parse5 node.
	 * Returns `null` if no node should be added to the tree.
	 *
	 * @param {Object}                             parse5Node The parse5 node that should be parsed.
	 * @param {module:parsedPaper/structure.Node} parentNode The parent node.
	 *
	 * @returns {module:parsedPaper/structure.Node|null} The node that should be added to the tree,
	 *                                                   or `null` if no node should be added.
	 *
	 * @private
	 */
	_createChild( parse5Node, parentNode ) {
		const nodeType = parse5Node.nodeName;
		const location = parse5Node.sourceCodeLocation;

		if ( ignoredHtmlElements.includes( nodeType ) ) {
			// Ignored element: registered as formatting, but no tree node is returned for it.
			// NOTE(review): an implicit paragraph possibly created by `_addLeafNodeContent`
			// is discarded here, so it never ends up in the tree - confirm this is intended.
			const ignored = new FormattingElement( nodeType, location, this._parseAttributes( parse5Node.attrs ) );
			this._addLeafNodeContent( ignored, this._addFormatting, parentNode, location );
			return null;
		}

		if ( nodeType === "p" ) {
			// Paragraph.
			return new Paragraph( location );
		}

		if ( headings.includes( nodeType ) ) {
			// Heading. The second character of the node name (e.g. `h2`) is the heading level.
			return new Heading( parseInt( nodeType[ 1 ], 10 ), location );
		}

		if ( nodeType === "li" ) {
			// List item.
			return new ListItem( location );
		}

		if ( nodeType === "ol" || nodeType === "ul" ) {
			// List node. The first argument tells whether the list is ordered.
			return new List( nodeType === "ol", location );
		}

		if ( nodeType === "#text" ) {
			// Text (outside of an ignored element).
			return this._addLeafNodeContent( parse5Node.value, this._addText, parentNode, location );
		}

		if ( formattingElements.includes( nodeType ) ) {
			// Formatting element.
			const formatting = new FormattingElement( nodeType, location, this._parseAttributes( parse5Node.attrs ) );
			return this._addLeafNodeContent( formatting, this._addFormatting, parentNode, location );
		}

		// Other element (`div`, `section`, `article`, etc.).
		return new StructuredNode( nodeType, location );
	}

	/**
	 * Adds leaf node content (text or formatting) to the tree according to a predefined strategy:
	 *
	 *  1. If the parent is a leaf node: add the content to the parent.
	 *  2. Else, if the parent has a paragraph as its nearest leaf node ancestor: add the content to that paragraph.
	 *  3. Else, if the parent's last child is an implicit paragraph: add the content to it,
	 *     so that runs of phrasing content make up one implicit paragraph.
	 *  4. Else: wrap the content in a new implicit paragraph, unless it is whitespace-only text.
	 *
	 * @param {string|module:parsedPaper/structure.FormattingElement} contentToAdd The content to add.
	 * @param {function}                                              add          A function that adds the content to
	 *                                                                             the specified leaf node. Called as
	 *                                                                             `add( leafNode, contentToAdd )`.
	 * @param {module:parsedPaper/structure.Node}                     parent       The parent to which to try to add the content.
	 * @param {Object}                                                location     The location of the content as parsed by parse5.
	 *
	 * @returns {module:parsedPaper/structure.Paragraph|null} The implicit paragraph, if one was created, `null` otherwise.
	 *
	 * @private
	 */
	_addLeafNodeContent( contentToAdd, add, parent, location ) {
		if ( parent instanceof LeafNode ) {
			add( parent, contentToAdd );
			return null;
		}

		const leafNodeAncestor = this._leafNodeAncestor( parent );
		if ( leafNodeAncestor instanceof Paragraph ) {
			add( leafNodeAncestor, contentToAdd );
			return null;
		}

		const previousChild = this._previousChild( parent );
		if ( previousChild instanceof Paragraph && previousChild.isImplicit ) {
			add( previousChild, contentToAdd );
			return null;
		}

		// Don't create an implicit paragraph when the content is whitespace only.
		// Only text can be whitespace; formatting elements always get a paragraph.
		if ( typeof contentToAdd === "string" && /^\s*$/.test( contentToAdd ) ) {
			return null;
		}

		// The second constructor argument presumably marks the paragraph as implicit
		// (it is read back through `isImplicit` above) - TODO confirm against `Paragraph`.
		const implicitParagraph = new Paragraph( location, true );
		add( implicitParagraph, contentToAdd );
		return implicitParagraph;
	}

	/**
	 * Add text to the parent leaf node.
	 *
	 * @param {module:parsedPaper/structure.LeafNode} parent The leaf node to which to add the text.
	 * @param {string}                                text   The text to add to the node.
	 *
	 * @returns {void}
	 *
	 * @private
	 */
	_addText( parent, text ) {
		parent.appendText( text );
	}

	/**
	 * Add a formatting element to the parent leaf node.
	 *
	 * @param {module:parsedPaper/structure.LeafNode}          parent     The leaf node to which to add the formatting.
	 * @param {module:parsedPaper/structure.FormattingElement} formatting The formatting element to add.
	 *
	 * @returns {void}
	 *
	 * @private
	 */
	_addFormatting( parent, formatting ) {
		parent.addFormatting( formatting );
	}

	/**
	 * Parses the HTML element attributes from parse5's format to a plain JS object.
	 *
	 * @example
	 *
	 * const attributes = _parseAttributes( [ { name: "id", value: "an-id" } ] ) // becomes { id: "an-id" }.
	 *
	 * @param {Array<{ name: string, value: string }>} parse5attributes The attributes as parsed by parse5.
	 *
	 * @returns {Object|null} The parsed attributes, or `null` when there are no attributes.
	 *
	 * @private
	 */
	_parseAttributes( parse5attributes ) {
		if ( ! parse5attributes || parse5attributes.length === 0 ) {
			return null;
		}

		const attributes = {};
		for ( const attribute of parse5attributes ) {
			// When a name occurs more than once, the last value wins.
			attributes[ attribute.name ] = attribute.value;
		}
		return attributes;
	}

	/**
	 * Returns the last child of the given parent.
	 *
	 * @param {module:parsedPaper/structure.StructuredNode} parent The parent.
	 *
	 * @returns {module:parsedPaper/structure.Node|null} The parent's last child,
	 *                                                   or `null` if the parent has no children.
	 *
	 * @private
	 */
	_previousChild( parent ) {
		const children = parent.children;

		if ( ! children || children.length === 0 ) {
			return null;
		}

		return children[ children.length - 1 ];
	}

	/**
	 * Try to find the leaf node ancestor of this node. Returns `null` if the node does not have one.
	 *
	 * Follows the `parent` references upwards and returns the first ancestor that is a leaf node.
	 * The given element itself is never returned.
	 *
	 * @param {module:parsedPaper/structure.Node} element The node for which to find the leaf node ancestor.
	 *
	 * @returns {null|module:parsedPaper/structure.Node} The leaf node ancestor, if it has one, `null` if not.
	 *
	 * @private
	 */
	_leafNodeAncestor( element ) {
		let ancestor = element.parent;

		while ( ancestor ) {
			// Return the leaf node itself, not the node above it.
			if ( ancestor instanceof LeafNode ) {
				return ancestor;
			}
			ancestor = ancestor.parent;
		}

		return null;
	}
}

export default HTMLTreeConverter;