UNPKG

html-dom-parser

Version:
774 lines (765 loc) 26.5 kB
(function (global, factory) { typeof exports === 'object' && typeof module !== 'undefined' ? module.exports = factory() : typeof define === 'function' && define.amd ? define(factory) : (global = typeof globalThis !== 'undefined' ? globalThis : global || self, global.HTMLDOMParser = factory()); })(this, (function () { 'use strict'; /** Types of elements found in htmlparser2's DOM */ var ElementType; (function (ElementType) { /** Type for the root element of a document */ ElementType["Root"] = "root"; /** Type for Text */ ElementType["Text"] = "text"; /** Type for <? ... ?> */ ElementType["Directive"] = "directive"; /** Type for <!-- ... --> */ ElementType["Comment"] = "comment"; /** Type for <script> tags */ ElementType["Script"] = "script"; /** Type for <style> tags */ ElementType["Style"] = "style"; /** Type for Any tag */ ElementType["Tag"] = "tag"; /** Type for <![CDATA[ ... ]]> */ ElementType["CDATA"] = "cdata"; /** Type for <!doctype ...> */ ElementType["Doctype"] = "doctype"; })(ElementType || (ElementType = {})); /** * Tests whether an element is a tag or not. * @param element Element to test * @param element.type Node type discriminator to check. */ function isTag$1(element) { return (element.type === ElementType.Tag || element.type === ElementType.Script || element.type === ElementType.Style); } // Exports for backwards compatibility /** Type for the root element of a document */ // eslint-disable-next-line prefer-destructuring ElementType.Root; /** Type for Text */ // eslint-disable-next-line prefer-destructuring ElementType.Text; /** Type for <? ... ?> */ // eslint-disable-next-line prefer-destructuring ElementType.Directive; /** Type for <!-- ... --> */ // eslint-disable-next-line prefer-destructuring ElementType.Comment; /** Type for <script> tags */ // eslint-disable-next-line prefer-destructuring ElementType.Script; /** Type for <style> tags */ // eslint-disable-next-line prefer-destructuring ElementType.Style; /** Type for Any tag */ // eslint-disable-next-line prefer-destructuring ElementType.Tag; /** Type for <![CDATA[ ... ]]> */ // eslint-disable-next-line prefer-destructuring ElementType.CDATA; /** Type for <!doctype ...> */ // eslint-disable-next-line prefer-destructuring ElementType.Doctype; /** * This object will be used as the prototype for Nodes when creating a * DOM-Level-1-compliant structure. */ class Node { /** Parent of the node */ parent = null; /** Previous sibling */ prev = null; /** Next sibling */ next = null; /** The start index of the node. Requires `withStartIndices` on the handler to be `true. */ startIndex = null; /** The end index of the node. Requires `withEndIndices` on the handler to be `true. */ endIndex = null; // Read-write aliases for properties /** * Same as {@link parent}. * [DOM spec](https://dom.spec.whatwg.org)-compatible alias. */ get parentNode() { return this.parent; } set parentNode(parent) { this.parent = parent; } /** * Same as {@link prev}. * [DOM spec](https://dom.spec.whatwg.org)-compatible alias. */ get previousSibling() { return this.prev; } set previousSibling(previous) { this.prev = previous; } /** * Same as {@link next}. * [DOM spec](https://dom.spec.whatwg.org)-compatible alias. */ get nextSibling() { return this.next; } set nextSibling(next) { this.next = next; } /** * Clone this node, and optionally its children. * @param recursive Clone child nodes as well. * @returns A clone of the node. */ cloneNode(recursive = false) { return cloneNode(this, recursive); } } /** * A node that contains some data. */ class DataNode extends Node { data; /** * @param data The content of the data node */ constructor(data) { super(); this.data = data; } /** * Same as {@link data}. * [DOM spec](https://dom.spec.whatwg.org)-compatible alias. */ get nodeValue() { return this.data; } set nodeValue(data) { this.data = data; } } /** * Text within the document. */ class Text extends DataNode { type = ElementType.Text; get nodeType() { return 3; } } /** * Comments within the document. */ class Comment extends DataNode { type = ElementType.Comment; get nodeType() { return 8; } } /** * Processing instructions, including doc types. */ class ProcessingInstruction extends DataNode { type = ElementType.Directive; name; constructor(name, data) { super(data); this.name = name; } get nodeType() { return 1; } /** If this is a doctype, the document type name (parse5 only). */ "x-name"; /** If this is a doctype, the document type public identifier (parse5 only). */ "x-publicId"; /** If this is a doctype, the document type system identifier (parse5 only). */ "x-systemId"; } /** * A node that can have children. */ class NodeWithChildren extends Node { children; /** * @param children Children of the node. Only certain node types can have children. */ constructor(children) { super(); this.children = children; } // Aliases /** First child of the node. */ get firstChild() { return this.children[0] ?? null; } /** Last child of the node. */ get lastChild() { return this.children.length > 0 ? this.children[this.children.length - 1] : null; } /** * Same as {@link children}. * [DOM spec](https://dom.spec.whatwg.org)-compatible alias. */ get childNodes() { return this.children; } set childNodes(children) { this.children = children; } } /** * CDATA nodes. */ class CDATA extends NodeWithChildren { type = ElementType.CDATA; get nodeType() { return 4; } } /** * The root node of the document. */ class Document extends NodeWithChildren { type = ElementType.Root; get nodeType() { return 9; } } /** * An element within the DOM. */ class Element extends NodeWithChildren { name; attribs; type; /** * @param name Name of the tag, eg. `div`, `span`. * @param attribs Object mapping attribute names to attribute values. * @param children Children of the node. * @param type Node type used for the new node instance. */ constructor(name, attribs, children = [], type = name === "script" ? ElementType.Script : name === "style" ? ElementType.Style : ElementType.Tag) { super(children); this.name = name; this.attribs = attribs; this.type = type; } get nodeType() { return 1; } // DOM Level 1 aliases /** * Same as {@link name}. * [DOM spec](https://dom.spec.whatwg.org)-compatible alias. */ get tagName() { return this.name; } set tagName(name) { this.name = name; } get attributes() { return Object.keys(this.attribs).map((name) => ({ name, value: this.attribs[name], namespace: this["x-attribsNamespace"]?.[name], prefix: this["x-attribsPrefix"]?.[name], })); } /** Element namespace (parse5 only). */ namespace; /** Element attribute namespaces (parse5 only). */ "x-attribsNamespace"; /** Element attribute namespace-related prefixes (parse5 only). */ "x-attribsPrefix"; } /** * Checks if `node` is an element node. * @param node Node to check. * @returns `true` if the node is an element node. */ function isTag(node) { return isTag$1(node); } /** * Checks if `node` is a CDATA node. * @param node Node to check. * @returns `true` if the node is a CDATA node. */ function isCDATA(node) { return node.type === ElementType.CDATA; } /** * Checks if `node` is a text node. * @param node Node to check. * @returns `true` if the node is a text node. */ function isText(node) { return node.type === ElementType.Text; } /** * Checks if `node` is a comment node. * @param node Node to check. * @returns `true` if the node is a comment node. */ function isComment(node) { return node.type === ElementType.Comment; } /** * Checks if `node` is a directive node. * @param node Node to check. * @returns `true` if the node is a directive node. */ function isDirective(node) { return node.type === ElementType.Directive; } /** * Checks if `node` is a document node. * @param node Node to check. * @returns `true` if the node is a document node. */ function isDocument(node) { return node.type === ElementType.Root; } /** * Clone a node, and optionally its children. * @param node Node to clone. * @param recursive Clone child nodes as well. * @returns A clone of the node. */ function cloneNode(node, recursive = false) { let result; if (isText(node)) { result = new Text(node.data); } else if (isComment(node)) { result = new Comment(node.data); } else if (isTag(node)) { const children = recursive ? cloneChildren(node.children) : []; const clone = new Element(node.name, { ...node.attribs }, children); for (const child of children) { child.parent = clone; } if (node.namespace != null) { clone.namespace = node.namespace; } if (node["x-attribsNamespace"]) { clone["x-attribsNamespace"] = { ...node["x-attribsNamespace"] }; } if (node["x-attribsPrefix"]) { clone["x-attribsPrefix"] = { ...node["x-attribsPrefix"] }; } result = clone; } else if (isCDATA(node)) { const children = recursive ? cloneChildren(node.children) : []; const clone = new CDATA(children); for (const child of children) { child.parent = clone; } result = clone; } else if (isDocument(node)) { const children = recursive ? cloneChildren(node.children) : []; const clone = new Document(children); for (const child of children) { child.parent = clone; } if (node["x-mode"]) { clone["x-mode"] = node["x-mode"]; } result = clone; } else if (isDirective(node)) { const instruction = new ProcessingInstruction(node.name, node.data); if (node["x-name"] != null) { instruction["x-name"] = node["x-name"]; instruction["x-publicId"] = node["x-publicId"]; instruction["x-systemId"] = node["x-systemId"]; } result = instruction; } else { throw new Error(`Not implemented yet: ${node.type}`); } result.startIndex = node.startIndex; result.endIndex = node.endIndex; if (node.sourceCodeLocation != null) { result.sourceCodeLocation = node.sourceCodeLocation; } return result; } /** * Clone a list of child nodes. * @param childs The child nodes to clone. * @returns A list of cloned child nodes. */ function cloneChildren(childs) { const children = childs.map((child) => cloneNode(child, true)); for (let index = 1; index < children.length; index++) { children[index].prev = children[index - 1]; children[index - 1].next = children[index]; } return children; } /** * SVG elements are case-sensitive. * * @see https://developer.mozilla.org/docs/Web/SVG/Element#svg_elements_a_to_z */ var CASE_SENSITIVE_TAG_NAMES = [ 'animateMotion', 'animateTransform', 'clipPath', 'feBlend', 'feColorMatrix', 'feComponentTransfer', 'feComposite', 'feConvolveMatrix', 'feDiffuseLighting', 'feDisplacementMap', 'feDropShadow', 'feFlood', 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage', 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight', 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'foreignObject', 'linearGradient', 'radialGradient', 'textPath', ]; var CASE_SENSITIVE_TAG_NAMES_MAP = CASE_SENSITIVE_TAG_NAMES.reduce(function (accumulator, tagName) { accumulator[tagName.toLowerCase()] = tagName; return accumulator; }, {}); var CARRIAGE_RETURN = '\r'; var CARRIAGE_RETURN_REGEX = new RegExp(CARRIAGE_RETURN, 'g'); var CARRIAGE_RETURN_PLACEHOLDER = "__HTML_DOM_PARSER_CARRIAGE_RETURN_PLACEHOLDER_".concat(Date.now().toString(), "__"); var CARRIAGE_RETURN_PLACEHOLDER_REGEX = new RegExp(CARRIAGE_RETURN_PLACEHOLDER, 'g'); /** * Gets case-sensitive tag name. * * @param tagName - Tag name in lowercase. * @returns - Case-sensitive tag name. */ function getCaseSensitiveTagName(tagName) { return CASE_SENSITIVE_TAG_NAMES_MAP[tagName]; } /** * Formats DOM attributes to a hash map. * * @param attributes - List of attributes. * @returns - Map of attribute name to value. */ function formatAttributes(attributes) { var map = {}; var index = 0; var attributesLength = attributes.length; // `NamedNodeMap` is array-like for (; index < attributesLength; index++) { var attribute = attributes[index]; map[attribute.name] = attribute.value; } return map; } /** * Corrects the tag name if it is case-sensitive (SVG). * Otherwise, returns the lowercase tag name (HTML). * * @param tagName - Lowercase tag name. * @returns - Formatted tag name. */ function formatTagName(tagName) { tagName = tagName.toLowerCase(); var caseSensitiveTagName = getCaseSensitiveTagName(tagName); if (caseSensitiveTagName) { return caseSensitiveTagName; } return tagName; } /** * Checks if an HTML string contains an opening tag (case-insensitive). * * @param html - HTML string. * @param tagName - Tag name to search for (e.g., 'head' or 'body'). * @returns - Whether the tag is found. */ function hasOpenTag(html, tagName) { var openTag = '<' + tagName; var index = html.toLowerCase().indexOf(openTag); if (index === -1) { return false; } var char = html[index + openTag.length]; // the character after the tag name must be '>' or whitespace (for attributes) return (char === '>' || char === ' ' || char === '\t' || char === '\n' || char === '\r' || char === '/'); } /** * Escapes special characters before parsing. * * @param html - The HTML string. * @returns - HTML string with escaped special characters. */ function escapeSpecialCharacters(html) { return html.replace(CARRIAGE_RETURN_REGEX, CARRIAGE_RETURN_PLACEHOLDER); } /** * Reverts escaped special characters back to actual characters. * * @param text - The text with escaped characters. * @returns - Text with escaped characters reverted. */ function revertEscapedCharacters(text) { return text.replace(CARRIAGE_RETURN_PLACEHOLDER_REGEX, CARRIAGE_RETURN); } /** * Transforms DOM nodes to `domhandler` nodes. * * @param nodes - DOM nodes. * @param parent - Parent node. * @param directive - Directive. * @returns - Nodes. */ function formatDOM(nodes, parent, directive) { var _a, _b, _c, _d; if (parent === void 0) { parent = null; } var domNodes = []; var current; var index = 0; var nodesLength = nodes.length; for (; index < nodesLength; index++) { var node = nodes[index]; // set the node data given the type switch (node.nodeType) { case 1: { var tagName = formatTagName(node.nodeName); // script, style, or tag current = new Element(tagName, formatAttributes(node.attributes)); current.children = formatDOM( // template children are on content tagName === 'template' ? node.content.childNodes : node.childNodes, current); break; } /* v8 ignore start */ case 3: current = new Text(revertEscapedCharacters((_a = node.nodeValue) !== null && _a !== void 0 ? _a : '')); break; case 8: current = new Comment((_b = node.nodeValue) !== null && _b !== void 0 ? _b : ''); break; /* v8 ignore stop */ default: continue; } // set previous node next var prev = (_c = domNodes[index - 1]) !== null && _c !== void 0 ? _c : null; // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition if (prev) { prev.next = current; } // set properties for current node current.parent = parent; current.prev = prev; current.next = null; domNodes.push(current); } if (directive) { current = new ProcessingInstruction(directive.substring(0, directive.indexOf(' ')).toLowerCase(), directive); current.next = (_d = domNodes[0]) !== null && _d !== void 0 ? _d : null; current.parent = parent; domNodes.unshift(current); if (domNodes[1]) { domNodes[1].prev = domNodes[0]; } } return domNodes; } // constants var HTML = 'html'; var HEAD = 'head'; var BODY = 'body'; var FIRST_TAG_REGEX = /<([a-zA-Z]+[0-9]?)/; // e.g., <h1> // falls back to `parseFromString` if `createHTMLDocument` cannot be used /* eslint-disable @typescript-eslint/no-unused-vars */ /* v8 ignore start */ var parseFromDocument = function (html, tagName) { throw new Error('This browser does not support `document.implementation.createHTMLDocument`'); }; var parseFromString = function (html, tagName) { throw new Error('This browser does not support `DOMParser.prototype.parseFromString`'); }; var DOMParser = typeof window === 'object' && window.DOMParser; /** * DOMParser (performance: slow). * * @see https://developer.mozilla.org/docs/Web/API/DOMParser#Parsing_an_SVG_or_HTML_document */ if (typeof DOMParser === 'function') { var domParser_1 = new DOMParser(); var mimeType_1 = 'text/html'; /** * Creates an HTML document using `DOMParser.parseFromString`. * * @param html - The HTML string. * @param tagName - The element to render the HTML (with 'body' as fallback). * @returns - Document. */ parseFromString = function (html, tagName) { if (tagName) { html = "<".concat(tagName, ">").concat(html, "</").concat(tagName, ">"); } return domParser_1.parseFromString(html, mimeType_1); }; parseFromDocument = parseFromString; } /** * DOMImplementation (performance: fair). * * @see https://developer.mozilla.org/docs/Web/API/DOMImplementation/createHTMLDocument */ // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition if (typeof document === 'object' && document.implementation) { var htmlDocument_1 = document.implementation.createHTMLDocument(); /** * Use HTML document created by `document.implementation.createHTMLDocument`. * * @param html - The HTML string. * @param tagName - The element to render the HTML (with 'body' as fallback). * @returns - Document */ parseFromDocument = function (html, tagName) { if (tagName) { var element = htmlDocument_1.documentElement.querySelector(tagName); if (element) { element.innerHTML = html; } return htmlDocument_1; } htmlDocument_1.documentElement.innerHTML = html; return htmlDocument_1; }; } /** * Template (performance: fast). * * @see https://developer.mozilla.org/docs/Web/HTML/Element/template */ var template = typeof document === 'object' && document.createElement('template'); var parseFromTemplate; // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition if (template && template.content) { /** * Uses a template element (content fragment) to parse HTML. * * @param html - HTML string. * @returns - Nodes. */ parseFromTemplate = function (html) { template.innerHTML = html; return template.content.childNodes; }; } var createNodeList = function () { return document.createDocumentFragment().childNodes; }; /* v8 ignore stop */ /** * Parses HTML string to DOM nodes. * * @param html - HTML markup. * @returns - DOM nodes. */ function domparser(html) { var _a, _b, _c, _d, _e, _f; // Escape special characters before parsing html = escapeSpecialCharacters(html); var match = FIRST_TAG_REGEX.exec(html); var firstTagName = (_a = match === null || match === void 0 ? void 0 : match[1]) === null || _a === void 0 ? void 0 : _a.toLowerCase(); switch (firstTagName) { case HTML: { var doc = parseFromString(html); // the created document may come with filler head/body elements, // so make sure to remove them if they don't actually exist if (!hasOpenTag(html, HEAD)) { var element = doc.querySelector(HEAD); (_b = element === null || element === void 0 ? void 0 : element.parentNode) === null || _b === void 0 ? void 0 : _b.removeChild(element); } if (!hasOpenTag(html, BODY)) { var element = doc.querySelector(BODY); (_c = element === null || element === void 0 ? void 0 : element.parentNode) === null || _c === void 0 ? void 0 : _c.removeChild(element); } return doc.querySelectorAll(HTML); } case HEAD: case BODY: { var elements = parseFromDocument(html).querySelectorAll(firstTagName); // if there's a sibling element, then return both elements /* v8 ignore next */ if (hasOpenTag(html, BODY) && hasOpenTag(html, HEAD)) { return (_e = (_d = elements[0].parentNode) === null || _d === void 0 ? void 0 : _d.childNodes) !== null && _e !== void 0 ? _e : createNodeList(); } return elements; } // low-level tag or text /* v8 ignore start */ default: { // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition if (parseFromTemplate) { return parseFromTemplate(html); } var element = parseFromDocument(html, BODY).querySelector(BODY); return (_f = element === null || element === void 0 ? void 0 : element.childNodes) !== null && _f !== void 0 ? _f : createNodeList(); } /* v8 ignore stop */ } } var DIRECTIVE_REGEX = /<(![a-zA-Z\s]+)>/; // e.g., <!doctype html> /** * Parses HTML string to DOM nodes in browser. * * @param html - HTML markup. * @returns - DOM elements. */ function HTMLDOMParser(html) { if (typeof html !== 'string') { throw new TypeError('First argument must be a string'); } if (!html) { return []; } // match directive var match = DIRECTIVE_REGEX.exec(html); var directive = match ? match[1] : undefined; return formatDOM(domparser(html), null, directive); } return HTMLDOMParser; })); //# sourceMappingURL=html-dom-parser.js.map