UNPKG

@desertnet/html-parser

Version:

HTML parser and non-strict validator

384 lines (338 loc) 9.68 kB
import Instr from './Instr'
import Compiler from './Compiler'
import {NodeType, RootNode, EntityNode} from './HTMLNode'
import HTMLParseError from './HTMLParseError'

// Lazily created parser instance shared by every HTMLParser.validate() call.
let _parserSingleton = null

/**
 * Basic HTML parser and validator. Call the parse() method with a
 * string of HTML, and it will return a Foundation.HTML.Parser.HTMLNode
 * object containing the parse tree.
 *
 * How does it work? It creates an instance of Compiler,
 * which in turn uses Foundation.Scanner to tokenize the string.
 * The compiler takes these tokens and generates arrays of
 * Op objects. The parser takes these arrays
 * of ops, and executes them.
 *
 * There are three op instructions: "push node", "pop node" and "add token".
 * Pushing a node adds a new Foundation.HTML.Parser.HTMLNode to the
 * parser's node stack. Adding a token adds the Foundation.Scanner.Token
 * to the node at the top of the node stack. And popping a node
 * pops the topmost node from the node stack, and applies it to the
 * parser's open element stack. This may just involve adding it to the
 * currently open element, or if it's a closing tag node it may modify
 * the open element stack in interesting ways.
 *
 * http://www.w3.org/TR/html5/syntax.html#parsing
 */
export default class HTMLParser {
  constructor () {
    /**
     * The string of HTML text we are going to parse.
     * @private
     * @type {string}
     */
    this._input = "";

    /**
     * The HTML parse tree.
     * @private
     * @type {RootNode}
     */
    this._root = null;

    /**
     * The stack of unfinished nodes. The top of the stack (the last
     * array entry) is whatever node we are working on assembling.
     * @private
     * @type {Array.<Foundation.HTML.Parser.HTMLNode>}
     */
    this._nodes = [];

    /**
     * The stack of open element nodes. When we consume a closing
     * tag, we use this stack to determine which nodes belong
     * as children of which elements. The current (innermost) open
     * element is stored at index 0 and the root node is always last.
     * @private
     * @type {Array.<TagNode|RootNode>}
     */
    this._openElements = [];

    /**
     * The compiler. Takes the string of HTML, and tells us
     * what to do.
     * @private
     * @type {Compiler}
     */
    this._compiler = new Compiler();

    // Must run after the compiler exists: reset() forwards the (empty)
    // input to it and builds the real root / stacks.
    this.reset();
  }

  /**
   * Parses the given HTML with a shared parser instance and returns the
   * `errors` property of the resulting root node.
   * @public
   * @param {string} html
   * @return {Array.<HTMLParseError>}
   */
  static validate (html) {
    if (!_parserSingleton) {
      _parserSingleton = new HTMLParser();
    }
    return _parserSingleton.parse(html).errors;
  }

  /**
   * @deprecated Extra validations are now standard.
   * @public
   * @return {Promise} An already-resolved promise.
   */
  static initializeExtraValidations () {
    return Promise.resolve();
  }

  /**
   * Parses the given html string, returning the HTML parse tree.
   * The parser is reset afterwards (even when an op throws), so the
   * returned tree is no longer referenced by this instance.
   * @public
   * @param {string} html
   * @return {RootNode}
   */
  parse (html) {
    this.setInput(html);

    try {
      let code = this.compiler().generateNextCodeFragment();
      while (code) {
        code.forEach(op => this.executeOp(op));
        code = this.compiler().generateNextCodeFragment();
      }

      this.finalize();

      // The return value is captured before the finally block swaps
      // this._root for a fresh RootNode.
      return this._root;
    }
    finally {
      // Always drop the input string and partial tree, so a failed parse
      // does not leave the (possibly shared) parser holding stale state.
      this.reset();
    }
  }

  /**
   * @return {Compiler}
   */
  compiler () {
    return this._compiler;
  }

  /**
   * Get unfinished nodes into the tree and forcibly close any open tags.
   * Only the first unfinished node popped (the top of the node stack)
   * receives the "Unexpected end of HTML." error.
   * @private
   */
  finalize () {
    let addedErrorToTopmostNode = false;

    while (this._nodes.length) {
      const unfinishedNode = this.popNode();
      if (!addedErrorToTopmostNode) {
        const error = new HTMLParseError();
        error.message = "Unexpected end of HTML.";
        error.addTokensFromNode(unfinishedNode);
        unfinishedNode.addError(error);
        addedErrorToTopmostNode = true;
      }
    }

    if (this.currentOpenElement() !== this._root) {
      // The root is the last entry of the open element stack, so the
      // entry just before it is the outermost still-open tag. Closing it
      // closes every element nested inside it as well.
      const outermostTag = /** @type {TagNode} */ (this._openElements[this._openElements.length - 2]);
      this.addClosedElementToParent(outermostTag);
    }
  }

  /**
   * Reset the parser to its default state.
   * @private
   */
  reset () {
    this.setInput("");
  }

  /**
   * Stores the input, hands it to the compiler, and starts a fresh
   * node stack, root node and open element stack.
   * @private
   * @param {string} input
   */
  setInput (input) {
    this._input = input;
    this._compiler.setInput(this._input);
    this._nodes = [];
    this._root = new RootNode();
    this._openElements = [this._root];
  }

  /**
   * Executes a single op produced by the compiler.
   * @private
   * @param {Op} op
   * @throws {Error} If the op's instruction is not one of the known Instr values.
   */
  executeOp (op) {
    switch (op.instruction()) {
      case Instr.PUSH_NODE:
        this.pushNode(op.node());
        break;

      case Instr.POP_NODE:
        this.popNode();
        break;

      case Instr.ADD_TOKEN:
        this.currentNode().addToken(op.token());
        break;

      default:
        throw new Error(`Unknown instruction in executeOp(): ${op.instruction()}`);
    }
  }

  /**
   * @private
   * @param {Foundation.HTML.Parser.HTMLNode} node
   */
  pushNode (node) {
    this._nodes.push(node);
  }

  /**
   * Pops the top node off the node stack and applies it to the tree.
   * @private
   * @return {Foundation.HTML.Parser.HTMLNode}
   * @throws {Error} If the node stack is empty.
   */
  popNode () {
    if (this._nodes.length === 0) {
      throw new Error("HTMLNode stack is unexpectedly empty.");
    }

    const poppedNode = this._nodes.pop();
    this.applyCompletedNode(poppedNode);
    return poppedNode;
  }

  /**
   * @private
   * @return {Foundation.HTML.Parser.HTMLNode} The node on top of the node stack.
   * @throws {Error} If the node stack is empty.
   */
  currentNode () {
    if (this._nodes.length === 0) {
      throw new Error("called currentNode() when node stack is empty.");
    }

    return this._nodes[this._nodes.length - 1];
  }

  /**
   * Attaches a node that has just been popped from the node stack to the
   * right place, depending on its type.
   * @private
   * @param {Foundation.HTML.Parser.HTMLNode} node
   * @throws {Error} On an unknown node type, or when an attribute node is
   *   completed while the node beneath it on the stack is not a tag.
   */
  applyCompletedNode (node) {
    switch (node.type) {
      case NodeType.TEXT:
      case NodeType.ENTITY:
      case NodeType.COMMENT:
        this.currentOpenElement().appendChild(node);
        break;

      case NodeType.ATTRIBUTE:
        // The attribute has already been popped, so currentNode() is the
        // node that was underneath it on the stack.
        if (this.currentNode().type !== NodeType.TAG) {
          throw new Error("Unexpected node type when applying attribute node.");
        }
        this.currentNode().addAttribute(node);
        break;

      case NodeType.TAG: {
        const tagNode = /** @type {TagNode} */ (node);
        if (tagNode.canHaveChildren) {
          // Not appended to its parent yet; that happens when it is closed.
          this.pushOpenElement(tagNode);
          if (tagNode.hasRawtextContent()) {
            this.compiler().setRawtextModeForTag(tagNode.tagName);
          }
        }
        else {
          this.currentOpenElement().appendChild(tagNode);
        }
        break;
      }

      case NodeType.CLOSETAG: {
        const tagToClose = this.mostRecentOpenElementWithName(node.tagName);
        if (tagToClose) {
          tagToClose.appendChild(node);
          this.addClosedElementToParent(tagToClose);
        }
        else {
          // No open element matches: keep the node in the tree, flagged.
          this.currentOpenElement().appendChild(node);
          const error = new HTMLParseError();
          error.message = `Found bogus closing tag "</${node.tagName}>".`;
          error.addTokensFromNode(node);
          node.addError(error);
        }
        break;
      }

      default:
        throw new Error(`Unknown node type in applyCompletedNode: ${node.type}`);
    }
  }

  /**
   * Pops every open element up to and including the given one, appends
   * each popped element to the element that enclosed it, and records
   * errors for elements that were closed without their own closing tag.
   * @private
   * @param {TagNode} element The open element being closed.
   */
  addClosedElementToParent (element) {
    const closedElements = this.popElementsToAndIncluding(element);

    // closedElements runs innermost -> `element`; after popping, the
    // current open element is `element`'s parent. Reducing without an
    // initial value walks the chain pairwise: each node is appended to
    // the next one in the list.
    const appendChain = closedElements.concat([this.currentOpenElement()]);
    appendChain.reduce((child, parent) => {
      parent.appendChild(child);
      return parent;
    });

    // Add error to close tag node if we had to forcibly close elements.
    if (closedElements.length > 1) {
      const closeTagNode = element.closingTag;
      const firstForciblyClosedElement = closedElements[0];
      if (closeTagNode) {
        const error = new HTMLParseError();
        error.message = `Unexpected closing tag, "</${element.tagName}>". ` +
          `Expected closing tag for "<${firstForciblyClosedElement.tagName}>".`;
        error.addTokensFromNode(closeTagNode);
        closeTagNode.addError(error);
      }
    }

    // Add errors to elements closed without a closing tag.
    closedElements.forEach(closedElement => {
      if (!closedElement.closingTag) {
        const error = new HTMLParseError();
        error.message = `Could not find closing tag for "<${closedElement.tagName}>".`;
        error.addTokensFromNode(closedElement);
        closedElement.addError(error);
      }
    });
  }

  /**
   * Pops open elements until the given element has been popped.
   * @private
   * @param {TagNode} element
   * @return {Array.<TagNode>} The popped elements, innermost first, ending
   *   with `element`.
   * @throws {Error} Via popOpenElement() if the root node is reached first.
   */
  popElementsToAndIncluding (element) {
    const closedElements = [];
    let closedElement;

    do {
      closedElement = this.popOpenElement();
      closedElements.push(closedElement);
    } while (closedElement !== element);

    return closedElements;
  }

  /**
   * Searches the open element stack, innermost first and excluding the
   * root node, for an element whose tagName equals the lowercased name.
   * @private
   * @param {string} name
   * @return {TagNode} The matching element, or null if there is none.
   */
  mostRecentOpenElementWithName (name) {
    // The root is the last entry, hence the `length - 1` bound.
    for (let i = 0; i < this._openElements.length - 1; i++) {
      const element = /** @type {TagNode} */ (this._openElements[i]);
      if (element.tagName === name.toLowerCase()) {
        return element;
      }
    }

    // Couldn't find a matching tag.
    return null;
  }

  /**
   * @private
   * @return {RootNode}
   */
  parseTree () {
    return this._root;
  }

  /**
   * @private
   * @return {Foundation.HTML.Parser.HTMLNode} The innermost open element
   *   (the root node when no tag is open).
   * @throws {Error} If the open element stack is empty.
   */
  currentOpenElement () {
    if (this._openElements.length === 0) {
      throw new Error("called currentOpenElement() when stack is empty");
    }

    return this._openElements[0];
  }

  /**
   * Removes and returns the innermost open element.
   * @private
   * @return {TagNode}
   * @throws {Error} If the removed element turns out to be the root node.
   */
  popOpenElement () {
    const element = this._openElements.shift();
    if (element instanceof RootNode) {
      throw new Error("Unexpectedly attempted to pop root node from open element stack.");
    }
    return element;
  }

  /**
   * Makes the given element the innermost open element.
   * @private
   * @param {Foundation.HTML.Parser.HTMLNode} elementNode
   */
  pushOpenElement (elementNode) {
    this._openElements.unshift(elementNode);
  }
}