UNPKG

html-dom-parser

Version:
717 lines (708 loc) 23.9 kB
(function (global, factory) { typeof exports === 'object' && typeof module !== 'undefined' ? module.exports = factory() : typeof define === 'function' && define.amd ? define(factory) : (global = typeof globalThis !== 'undefined' ? globalThis : global || self, global.HTMLDOMParser = factory()); })(this, (function () { 'use strict'; /** Types of elements found in htmlparser2's DOM */ var ElementType; (function (ElementType) { /** Type for the root element of a document */ ElementType["Root"] = "root"; /** Type for Text */ ElementType["Text"] = "text"; /** Type for <? ... ?> */ ElementType["Directive"] = "directive"; /** Type for <!-- ... --> */ ElementType["Comment"] = "comment"; /** Type for <script> tags */ ElementType["Script"] = "script"; /** Type for <style> tags */ ElementType["Style"] = "style"; /** Type for Any tag */ ElementType["Tag"] = "tag"; /** Type for <![CDATA[ ... ]]> */ ElementType["CDATA"] = "cdata"; /** Type for <!doctype ...> */ ElementType["Doctype"] = "doctype"; })(ElementType || (ElementType = {})); /** * Tests whether an element is a tag or not. * * @param elem Element to test */ function isTag$1(elem) { return (elem.type === ElementType.Tag || elem.type === ElementType.Script || elem.type === ElementType.Style); } // Exports for backwards compatibility /** Type for the root element of a document */ ElementType.Root; /** Type for Text */ ElementType.Text; /** Type for <? ... ?> */ ElementType.Directive; /** Type for <!-- ... --> */ ElementType.Comment; /** Type for <script> tags */ ElementType.Script; /** Type for <style> tags */ ElementType.Style; /** Type for Any tag */ ElementType.Tag; /** Type for <![CDATA[ ... ]]> */ ElementType.CDATA; /** Type for <!doctype ...> */ ElementType.Doctype; /** * This object will be used as the prototype for Nodes when creating a * DOM-Level-1-compliant structure. */ class Node { constructor() { /** Parent of the node */ this.parent = null; /** Previous sibling */ this.prev = null; /** Next sibling */ this.next = null; /** The start index of the node. Requires `withStartIndices` on the handler to be `true. */ this.startIndex = null; /** The end index of the node. Requires `withEndIndices` on the handler to be `true. */ this.endIndex = null; } // Read-write aliases for properties /** * Same as {@link parent}. * [DOM spec](https://dom.spec.whatwg.org)-compatible alias. */ get parentNode() { return this.parent; } set parentNode(parent) { this.parent = parent; } /** * Same as {@link prev}. * [DOM spec](https://dom.spec.whatwg.org)-compatible alias. */ get previousSibling() { return this.prev; } set previousSibling(prev) { this.prev = prev; } /** * Same as {@link next}. * [DOM spec](https://dom.spec.whatwg.org)-compatible alias. */ get nextSibling() { return this.next; } set nextSibling(next) { this.next = next; } /** * Clone this node, and optionally its children. * * @param recursive Clone child nodes as well. * @returns A clone of the node. */ cloneNode(recursive = false) { return cloneNode(this, recursive); } } /** * A node that contains some data. */ class DataNode extends Node { /** * @param data The content of the data node */ constructor(data) { super(); this.data = data; } /** * Same as {@link data}. * [DOM spec](https://dom.spec.whatwg.org)-compatible alias. */ get nodeValue() { return this.data; } set nodeValue(data) { this.data = data; } } /** * Text within the document. */ class Text extends DataNode { constructor() { super(...arguments); this.type = ElementType.Text; } get nodeType() { return 3; } } /** * Comments within the document. */ class Comment extends DataNode { constructor() { super(...arguments); this.type = ElementType.Comment; } get nodeType() { return 8; } } /** * Processing instructions, including doc types. */ class ProcessingInstruction extends DataNode { constructor(name, data) { super(data); this.name = name; this.type = ElementType.Directive; } get nodeType() { return 1; } } /** * A `Node` that can have children. */ class NodeWithChildren extends Node { /** * @param children Children of the node. Only certain node types can have children. */ constructor(children) { super(); this.children = children; } // Aliases /** First child of the node. */ get firstChild() { var _a; return (_a = this.children[0]) !== null && _a !== void 0 ? _a : null; } /** Last child of the node. */ get lastChild() { return this.children.length > 0 ? this.children[this.children.length - 1] : null; } /** * Same as {@link children}. * [DOM spec](https://dom.spec.whatwg.org)-compatible alias. */ get childNodes() { return this.children; } set childNodes(children) { this.children = children; } } class CDATA extends NodeWithChildren { constructor() { super(...arguments); this.type = ElementType.CDATA; } get nodeType() { return 4; } } /** * The root node of the document. */ class Document extends NodeWithChildren { constructor() { super(...arguments); this.type = ElementType.Root; } get nodeType() { return 9; } } /** * An element within the DOM. */ class Element extends NodeWithChildren { /** * @param name Name of the tag, eg. `div`, `span`. * @param attribs Object mapping attribute names to attribute values. * @param children Children of the node. */ constructor(name, attribs, children = [], type = name === "script" ? ElementType.Script : name === "style" ? ElementType.Style : ElementType.Tag) { super(children); this.name = name; this.attribs = attribs; this.type = type; } get nodeType() { return 1; } // DOM Level 1 aliases /** * Same as {@link name}. * [DOM spec](https://dom.spec.whatwg.org)-compatible alias. */ get tagName() { return this.name; } set tagName(name) { this.name = name; } get attributes() { return Object.keys(this.attribs).map((name) => { var _a, _b; return ({ name, value: this.attribs[name], namespace: (_a = this["x-attribsNamespace"]) === null || _a === void 0 ? void 0 : _a[name], prefix: (_b = this["x-attribsPrefix"]) === null || _b === void 0 ? void 0 : _b[name], }); }); } } /** * @param node Node to check. * @returns `true` if the node is a `Element`, `false` otherwise. */ function isTag(node) { return isTag$1(node); } /** * @param node Node to check. * @returns `true` if the node has the type `CDATA`, `false` otherwise. */ function isCDATA(node) { return node.type === ElementType.CDATA; } /** * @param node Node to check. * @returns `true` if the node has the type `Text`, `false` otherwise. */ function isText(node) { return node.type === ElementType.Text; } /** * @param node Node to check. * @returns `true` if the node has the type `Comment`, `false` otherwise. */ function isComment(node) { return node.type === ElementType.Comment; } /** * @param node Node to check. * @returns `true` if the node has the type `ProcessingInstruction`, `false` otherwise. */ function isDirective(node) { return node.type === ElementType.Directive; } /** * @param node Node to check. * @returns `true` if the node has the type `ProcessingInstruction`, `false` otherwise. */ function isDocument(node) { return node.type === ElementType.Root; } /** * Clone a node, and optionally its children. * * @param recursive Clone child nodes as well. * @returns A clone of the node. */ function cloneNode(node, recursive = false) { let result; if (isText(node)) { result = new Text(node.data); } else if (isComment(node)) { result = new Comment(node.data); } else if (isTag(node)) { const children = recursive ? cloneChildren(node.children) : []; const clone = new Element(node.name, { ...node.attribs }, children); children.forEach((child) => (child.parent = clone)); if (node.namespace != null) { clone.namespace = node.namespace; } if (node["x-attribsNamespace"]) { clone["x-attribsNamespace"] = { ...node["x-attribsNamespace"] }; } if (node["x-attribsPrefix"]) { clone["x-attribsPrefix"] = { ...node["x-attribsPrefix"] }; } result = clone; } else if (isCDATA(node)) { const children = recursive ? cloneChildren(node.children) : []; const clone = new CDATA(children); children.forEach((child) => (child.parent = clone)); result = clone; } else if (isDocument(node)) { const children = recursive ? cloneChildren(node.children) : []; const clone = new Document(children); children.forEach((child) => (child.parent = clone)); if (node["x-mode"]) { clone["x-mode"] = node["x-mode"]; } result = clone; } else if (isDirective(node)) { const instruction = new ProcessingInstruction(node.name, node.data); if (node["x-name"] != null) { instruction["x-name"] = node["x-name"]; instruction["x-publicId"] = node["x-publicId"]; instruction["x-systemId"] = node["x-systemId"]; } result = instruction; } else { throw new Error(`Not implemented yet: ${node.type}`); } result.startIndex = node.startIndex; result.endIndex = node.endIndex; if (node.sourceCodeLocation != null) { result.sourceCodeLocation = node.sourceCodeLocation; } return result; } function cloneChildren(childs) { const children = childs.map((child) => cloneNode(child, true)); for (let i = 1; i < children.length; i++) { children[i].prev = children[i - 1]; children[i - 1].next = children[i]; } return children; } /** * SVG elements are case-sensitive. * * @see https://developer.mozilla.org/docs/Web/SVG/Element#svg_elements_a_to_z */ var CASE_SENSITIVE_TAG_NAMES = [ 'animateMotion', 'animateTransform', 'clipPath', 'feBlend', 'feColorMatrix', 'feComponentTransfer', 'feComposite', 'feConvolveMatrix', 'feDiffuseLighting', 'feDisplacementMap', 'feDropShadow', 'feFlood', 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage', 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight', 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'foreignObject', 'linearGradient', 'radialGradient', 'textPath', ]; var CASE_SENSITIVE_TAG_NAMES_MAP = CASE_SENSITIVE_TAG_NAMES.reduce(function (accumulator, tagName) { accumulator[tagName.toLowerCase()] = tagName; return accumulator; }, {}); var CARRIAGE_RETURN = '\r'; var CARRIAGE_RETURN_REGEX = new RegExp(CARRIAGE_RETURN, 'g'); var CARRIAGE_RETURN_PLACEHOLDER = "__HTML_DOM_PARSER_CARRIAGE_RETURN_PLACEHOLDER_".concat(Date.now(), "__"); var CARRIAGE_RETURN_PLACEHOLDER_REGEX = new RegExp(CARRIAGE_RETURN_PLACEHOLDER, 'g'); /** * Gets case-sensitive tag name. * * @param tagName - Tag name in lowercase. * @returns - Case-sensitive tag name. */ function getCaseSensitiveTagName(tagName) { return CASE_SENSITIVE_TAG_NAMES_MAP[tagName]; } /** * Formats DOM attributes to a hash map. * * @param attributes - List of attributes. * @returns - Map of attribute name to value. */ function formatAttributes(attributes) { var map = {}; var index = 0; var attributesLength = attributes.length; // `NamedNodeMap` is array-like for (; index < attributesLength; index++) { var attribute = attributes[index]; map[attribute.name] = attribute.value; } return map; } /** * Corrects the tag name if it is case-sensitive (SVG). * Otherwise, returns the lowercase tag name (HTML). * * @param tagName - Lowercase tag name. * @returns - Formatted tag name. */ function formatTagName(tagName) { tagName = tagName.toLowerCase(); var caseSensitiveTagName = getCaseSensitiveTagName(tagName); if (caseSensitiveTagName) { return caseSensitiveTagName; } return tagName; } /** * Escapes special characters before parsing. * * @param html - The HTML string. * @returns - HTML string with escaped special characters. */ function escapeSpecialCharacters(html) { return html.replace(CARRIAGE_RETURN_REGEX, CARRIAGE_RETURN_PLACEHOLDER); } /** * Reverts escaped special characters back to actual characters. * * @param text - The text with escaped characters. * @returns - Text with escaped characters reverted. */ function revertEscapedCharacters(text) { return text.replace(CARRIAGE_RETURN_PLACEHOLDER_REGEX, CARRIAGE_RETURN); } /** * Transforms DOM nodes to `domhandler` nodes. * * @param nodes - DOM nodes. * @param parent - Parent node. * @param directive - Directive. * @returns - Nodes. */ function formatDOM(nodes, parent, directive) { if (parent === void 0) { parent = null; } var domNodes = []; var current; var index = 0; var nodesLength = nodes.length; for (; index < nodesLength; index++) { var node = nodes[index]; // set the node data given the type switch (node.nodeType) { case 1: { var tagName = formatTagName(node.nodeName); // script, style, or tag current = new Element(tagName, formatAttributes(node.attributes)); current.children = formatDOM( // template children are on content tagName === 'template' ? node.content.childNodes : node.childNodes, current); break; } case 3: current = new Text(revertEscapedCharacters(node.nodeValue)); break; case 8: current = new Comment(node.nodeValue); break; default: continue; } // set previous node next var prev = domNodes[index - 1] || null; if (prev) { prev.next = current; } // set properties for current node current.parent = parent; current.prev = prev; current.next = null; domNodes.push(current); } if (directive) { current = new ProcessingInstruction(directive.substring(0, directive.indexOf(' ')).toLowerCase(), directive); current.next = domNodes[0] || null; current.parent = parent; domNodes.unshift(current); if (domNodes[1]) { domNodes[1].prev = domNodes[0]; } } return domNodes; } // constants var HTML = 'html'; var HEAD = 'head'; var BODY = 'body'; var FIRST_TAG_REGEX = /<([a-zA-Z]+[0-9]?)/; // e.g., <h1> // match-all-characters in case of newlines (DOTALL) var HEAD_TAG_REGEX = /<head[^]*>/i; var BODY_TAG_REGEX = /<body[^]*>/i; // falls back to `parseFromString` if `createHTMLDocument` cannot be used // eslint-disable-next-line @typescript-eslint/no-unused-vars var parseFromDocument = function (html, tagName) { /* istanbul ignore next */ throw new Error('This browser does not support `document.implementation.createHTMLDocument`'); }; // eslint-disable-next-line @typescript-eslint/no-unused-vars var parseFromString = function (html, tagName) { /* istanbul ignore next */ throw new Error('This browser does not support `DOMParser.prototype.parseFromString`'); }; var DOMParser = typeof window === 'object' && window.DOMParser; /** * DOMParser (performance: slow). * * @see https://developer.mozilla.org/docs/Web/API/DOMParser#Parsing_an_SVG_or_HTML_document */ if (typeof DOMParser === 'function') { var domParser_1 = new DOMParser(); var mimeType_1 = 'text/html'; /** * Creates an HTML document using `DOMParser.parseFromString`. * * @param html - The HTML string. * @param tagName - The element to render the HTML (with 'body' as fallback). * @returns - Document. */ parseFromString = function (html, tagName) { if (tagName) { /* istanbul ignore next */ html = "<".concat(tagName, ">").concat(html, "</").concat(tagName, ">"); } return domParser_1.parseFromString(html, mimeType_1); }; parseFromDocument = parseFromString; } /** * DOMImplementation (performance: fair). * * @see https://developer.mozilla.org/docs/Web/API/DOMImplementation/createHTMLDocument */ if (typeof document === 'object' && document.implementation) { var htmlDocument_1 = document.implementation.createHTMLDocument(); /** * Use HTML document created by `document.implementation.createHTMLDocument`. * * @param html - The HTML string. * @param tagName - The element to render the HTML (with 'body' as fallback). * @returns - Document */ parseFromDocument = function (html, tagName) { if (tagName) { var element = htmlDocument_1.documentElement.querySelector(tagName); if (element) { element.innerHTML = html; } return htmlDocument_1; } htmlDocument_1.documentElement.innerHTML = html; return htmlDocument_1; }; } /** * Template (performance: fast). * * @see https://developer.mozilla.org/docs/Web/HTML/Element/template */ var template = typeof document === 'object' && document.createElement('template'); var parseFromTemplate; if (template && template.content) { /** * Uses a template element (content fragment) to parse HTML. * * @param html - HTML string. * @returns - Nodes. */ parseFromTemplate = function (html) { template.innerHTML = html; return template.content.childNodes; }; } /** * Parses HTML string to DOM nodes. * * @param html - HTML markup. * @returns - DOM nodes. */ function domparser(html) { var _a, _b; // Escape special characters before parsing html = escapeSpecialCharacters(html); var match = html.match(FIRST_TAG_REGEX); var firstTagName = match && match[1] ? match[1].toLowerCase() : ''; switch (firstTagName) { case HTML: { var doc = parseFromString(html); // the created document may come with filler head/body elements, // so make sure to remove them if they don't actually exist if (!HEAD_TAG_REGEX.test(html)) { var element = doc.querySelector(HEAD); (_a = element === null || element === void 0 ? void 0 : element.parentNode) === null || _a === void 0 ? void 0 : _a.removeChild(element); } if (!BODY_TAG_REGEX.test(html)) { var element = doc.querySelector(BODY); (_b = element === null || element === void 0 ? void 0 : element.parentNode) === null || _b === void 0 ? void 0 : _b.removeChild(element); } return doc.querySelectorAll(HTML); } case HEAD: case BODY: { var elements = parseFromDocument(html).querySelectorAll(firstTagName); // if there's a sibling element, then return both elements if (BODY_TAG_REGEX.test(html) && HEAD_TAG_REGEX.test(html)) { return elements[0].parentNode.childNodes; } return elements; } // low-level tag or text default: { if (parseFromTemplate) { return parseFromTemplate(html); } var element = parseFromDocument(html, BODY).querySelector(BODY); return element.childNodes; } } } var DIRECTIVE_REGEX = /<(![a-zA-Z\s]+)>/; // e.g., <!doctype html> /** * Parses HTML string to DOM nodes in browser. * * @param html - HTML markup. * @returns - DOM elements. */ function HTMLDOMParser(html) { if (typeof html !== 'string') { throw new TypeError('First argument must be a string'); } if (!html) { return []; } // match directive var match = html.match(DIRECTIVE_REGEX); var directive = match ? match[1] : undefined; return formatDOM(domparser(html), null, directive); } return HTMLDOMParser; })); //# sourceMappingURL=html-dom-parser.js.map