/*
 * html-dom-parser
 * Version:
 * HTML to DOM parser.
 * 660 lines (659 loc) • 19.4 kB
 * JavaScript
 */
(function(global, factory) {
typeof exports === "object" && typeof module !== "undefined" ? module.exports = factory() : typeof define === "function" && define.amd ? define([], factory) : (global = typeof globalThis !== "undefined" ? globalThis : global || self, global.HTMLDOMParser = factory());
})(this, function() {
//#region node_modules/domelementtype/dist/index.js
/**
 * Node type discriminators used by htmlparser2-style DOM trees.
 * Each member maps a symbolic name to the string stored in a node's `type`.
 */
var ElementType = {
  /** Root element of a document. */
  Root: "root",
  /** Text node. */
  Text: "text",
  /** `<? ... ?>` directive. */
  Directive: "directive",
  /** `<!-- ... -->` comment. */
  Comment: "comment",
  /** `<script>` tag. */
  Script: "script",
  /** `<style>` tag. */
  Style: "style",
  /** Any other tag. */
  Tag: "tag",
  /** `<![CDATA[ ... ]]>` section. */
  CDATA: "cdata",
  /** `<!doctype ...>` declaration. */
  Doctype: "doctype"
};
/**
 * Reports whether a node is tag-like, i.e. its `type` is one of
 * `ElementType.Tag`, `ElementType.Script` or `ElementType.Style`.
 * @param element Node to inspect.
 * @param element.type Type discriminator compared against the enum members.
 * @returns `true` for tag, script and style nodes; `false` otherwise.
 */
function isTag$1(element) {
  switch (element.type) {
    case ElementType.Tag:
    case ElementType.Script:
    case ElementType.Style:
      return true;
    default:
      return false;
  }
}
// The statements below only read enum members and discard the result, so they
// have no runtime effect.
// NOTE(review): presumably bundler residue from the upstream module's
// per-member exports — confirm before removing.
ElementType.Root;
ElementType.Text;
ElementType.Directive;
ElementType.Comment;
ElementType.Script;
ElementType.Style;
ElementType.Tag;
ElementType.CDATA;
ElementType.Doctype;
//#endregion
//#region node_modules/domhandler/dist/node.js
/**
 * Base class for every node in the tree. Holds the parent/sibling links and
 * the optional source indices, and exposes DOM-style aliases for the links.
 */
var Node = class {
  /** Owning node, or `null` when detached. */
  parent = null;
  /** Sibling immediately before this node, or `null`. */
  prev = null;
  /** Sibling immediately after this node, or `null`. */
  next = null;
  /** Start index of the node. Requires `withStartIndices` on the handler to be `true`. */
  startIndex = null;
  /** End index of the node. Requires `withEndIndices` on the handler to be `true`. */
  endIndex = null;

  /**
   * Alias of {@link parent}, named as in the
   * [DOM spec](https://dom.spec.whatwg.org).
   */
  get parentNode() {
    return this.parent;
  }
  set parentNode(value) {
    this.parent = value;
  }

  /**
   * Alias of {@link prev}, named as in the
   * [DOM spec](https://dom.spec.whatwg.org).
   */
  get previousSibling() {
    return this.prev;
  }
  set previousSibling(value) {
    this.prev = value;
  }

  /**
   * Alias of {@link next}, named as in the
   * [DOM spec](https://dom.spec.whatwg.org).
   */
  get nextSibling() {
    return this.next;
  }
  set nextSibling(value) {
    this.next = value;
  }

  /**
   * Produces a copy of this node by delegating to the module-level `cloneNode`.
   * @param recursive When `true`, child nodes are copied as well.
   * @returns The copy.
   */
  cloneNode(recursive = false) {
    return cloneNode(this, recursive);
  }
};
/**
 * Base class for nodes whose payload is a single `data` value.
 */
var DataNode = class extends Node {
  /** Payload of the node. */
  data;

  /**
   * @param data The content of the data node.
   */
  constructor(data) {
    super();
    this.data = data;
  }

  /**
   * Alias of {@link data}, named as in the
   * [DOM spec](https://dom.spec.whatwg.org).
   */
  get nodeValue() {
    return this.data;
  }
  set nodeValue(value) {
    this.data = value;
  }
};
/**
* Text within the document.
*/
var Text = class extends DataNode {
type = ElementType.Text;
get nodeType() {
return 3;
}
};
/**
* Comments within the document.
*/
var Comment = class extends DataNode {
type = ElementType.Comment;
get nodeType() {
return 8;
}
};
/**
* Processing instructions, including doc types.
*/
var ProcessingInstruction = class extends DataNode {
type = ElementType.Directive;
name;
constructor(name, data) {
super(data);
this.name = name;
}
get nodeType() {
return 1;
}
/** If this is a doctype, the document type name (parse5 only). */
"x-name";
/** If this is a doctype, the document type public identifier (parse5 only). */
"x-publicId";
/** If this is a doctype, the document type system identifier (parse5 only). */
"x-systemId";
};
/**
 * Base class for nodes that own a list of child nodes.
 */
var NodeWithChildren = class extends Node {
  /** Child nodes, in document order. */
  children;

  /**
   * @param children Children of the node. Only certain node types can have children.
   */
  constructor(children) {
    super();
    this.children = children;
  }

  /** First child of the node, or `null` when there is none. */
  get firstChild() {
    const first = this.children[0];
    return first ?? null;
  }

  /** Last child of the node, or `null` when the list is empty. */
  get lastChild() {
    if (this.children.length > 0) {
      return this.children[this.children.length - 1];
    }
    return null;
  }

  /**
   * Alias of {@link children}, named as in the
   * [DOM spec](https://dom.spec.whatwg.org).
   */
  get childNodes() {
    return this.children;
  }
  set childNodes(value) {
    this.children = value;
  }
};
/**
* CDATA nodes.
*/
var CDATA = class extends NodeWithChildren {
type = ElementType.CDATA;
get nodeType() {
return 4;
}
};
/**
* The root node of the document.
*/
var Document = class extends NodeWithChildren {
type = ElementType.Root;
get nodeType() {
return 9;
}
};
/**
 * An element node: a tag name, an attribute map and a list of children.
 */
var Element = class extends NodeWithChildren {
  /** Tag name, e.g. `div`. */
  name;
  /** Object mapping attribute names to attribute values. */
  attribs;
  /** Type discriminator (tag, script or style unless overridden). */
  type;
  /** Element namespace (parse5 only). */
  namespace;
  /** Element attribute namespaces (parse5 only). */
  "x-attribsNamespace";
  /** Element attribute namespace-related prefixes (parse5 only). */
  "x-attribsPrefix";

  /**
   * @param name Name of the tag, eg. `div`, `span`.
   * @param attribs Object mapping attribute names to attribute values.
   * @param children Children of the node; defaults to a new empty array.
   * @param type Type discriminator; by default derived from `name`
   * (`script` and `style` get their own types, everything else is a tag).
   */
  constructor(name, attribs, children = [], type = name === "script" ? ElementType.Script : name === "style" ? ElementType.Style : ElementType.Tag) {
    super(children);
    this.name = name;
    this.attribs = attribs;
    this.type = type;
  }

  /** Numeric node type; always `1` for elements. */
  get nodeType() {
    return 1;
  }

  /**
   * Alias of {@link name}, named as in the
   * [DOM spec](https://dom.spec.whatwg.org).
   */
  get tagName() {
    return this.name;
  }
  set tagName(value) {
    this.name = value;
  }

  /**
   * Attribute list view of `attribs`: one `{ name, value, namespace, prefix }`
   * record per own enumerable key, built fresh on every access.
   */
  get attributes() {
    const records = [];
    for (const name of Object.keys(this.attribs)) {
      records.push({
        name,
        value: this.attribs[name],
        namespace: this["x-attribsNamespace"]?.[name],
        prefix: this["x-attribsPrefix"]?.[name]
      });
    }
    return records;
  }
};
/**
* Checks if `node` is an element node.
*
* Thin delegate: forwards `node` unchanged to `isTag$1`, which accepts tag,
* script and style node types.
* @param node Node to check.
* @returns `true` if the node is an element node.
*/
function isTag(node) {
return isTag$1(node);
}
/**
 * Tells whether `node` carries the CDATA type discriminator.
 * @param node Node to check.
 * @returns `true` when `node.type` is `ElementType.CDATA`.
 */
function isCDATA(node) {
  const { type } = node;
  return type === ElementType.CDATA;
}
/**
 * Tells whether `node` carries the text type discriminator.
 * @param node Node to check.
 * @returns `true` when `node.type` is `ElementType.Text`.
 */
function isText(node) {
  const { type } = node;
  return type === ElementType.Text;
}
/**
 * Tells whether `node` carries the comment type discriminator.
 * @param node Node to check.
 * @returns `true` when `node.type` is `ElementType.Comment`.
 */
function isComment(node) {
  const { type } = node;
  return type === ElementType.Comment;
}
/**
 * Tells whether `node` carries the directive type discriminator.
 * @param node Node to check.
 * @returns `true` when `node.type` is `ElementType.Directive`.
 */
function isDirective(node) {
  const { type } = node;
  return type === ElementType.Directive;
}
/**
 * Tells whether `node` carries the root (document) type discriminator.
 * @param node Node to check.
 * @returns `true` when `node.type` is `ElementType.Root`.
 */
function isDocument(node) {
  const { type } = node;
  return type === ElementType.Root;
}
/**
 * Copies a node, dispatching on its `type` discriminator.
 *
 * Container nodes (elements, CDATA, documents) receive copies of their
 * children only when `recursive` is set; otherwise the copy starts with an
 * empty child list. Source indices and `sourceCodeLocation` are carried over.
 * @param node Node to clone.
 * @param recursive Clone child nodes as well.
 * @returns A clone of the node.
 * @throws Error when the node's type is not one of the handled kinds.
 */
function cloneNode(node, recursive = false) {
  // Shared by all container kinds: deep-copy the children only on request.
  const copyChildren = () => (recursive ? cloneChildren(node.children) : []);
  // Point every copied child at its new container.
  const adopt = (owner, kids) => {
    for (const kid of kids) kid.parent = owner;
  };

  let copy;
  switch (node.type) {
    case ElementType.Text:
      copy = new Text(node.data);
      break;
    case ElementType.Comment:
      copy = new Comment(node.data);
      break;
    case ElementType.Tag:
    case ElementType.Script:
    case ElementType.Style: {
      const kids = copyChildren();
      const element = new Element(node.name, { ...node.attribs }, kids);
      adopt(element, kids);
      if (node.namespace != null) element.namespace = node.namespace;
      if (node["x-attribsNamespace"]) element["x-attribsNamespace"] = { ...node["x-attribsNamespace"] };
      if (node["x-attribsPrefix"]) element["x-attribsPrefix"] = { ...node["x-attribsPrefix"] };
      copy = element;
      break;
    }
    case ElementType.CDATA: {
      const kids = copyChildren();
      const section = new CDATA(kids);
      adopt(section, kids);
      copy = section;
      break;
    }
    case ElementType.Root: {
      const kids = copyChildren();
      const root = new Document(kids);
      adopt(root, kids);
      if (node["x-mode"]) root["x-mode"] = node["x-mode"];
      copy = root;
      break;
    }
    case ElementType.Directive: {
      const instruction = new ProcessingInstruction(node.name, node.data);
      // The three doctype fields are copied together, keyed on `x-name`.
      if (node["x-name"] != null) {
        instruction["x-name"] = node["x-name"];
        instruction["x-publicId"] = node["x-publicId"];
        instruction["x-systemId"] = node["x-systemId"];
      }
      copy = instruction;
      break;
    }
    default:
      throw new Error(`Not implemented yet: ${node.type}`);
  }

  copy.startIndex = node.startIndex;
  copy.endIndex = node.endIndex;
  if (node.sourceCodeLocation != null) copy.sourceCodeLocation = node.sourceCodeLocation;
  return copy;
}
/**
 * Deep-copies a list of sibling nodes and links the copies to each other
 * through `prev`/`next`. Parent links are left for the caller to set.
 * @param childs The child nodes to clone.
 * @returns A list of cloned child nodes.
 */
function cloneChildren(childs) {
  const copies = childs.map((original) => cloneNode(original, true));
  for (const [position, copy] of copies.entries()) {
    if (position === 0) continue;
    const before = copies[position - 1];
    copy.prev = before;
    before.next = copy;
  }
  return copies;
}
/**
 * Lookup from a lowercased SVG tag name to its canonical mixed-case spelling
 * (for example `clippath` → `clipPath`).
 *
 * The table is built on a prototype-less object so that looking up a tag name
 * that matches an `Object.prototype` member (such as `constructor`) yields
 * `undefined` instead of an inherited function.
 */
const CASE_SENSITIVE_TAG_NAMES_MAP = [
  "animateMotion",
  "animateTransform",
  "clipPath",
  "feBlend",
  "feColorMatrix",
  "feComponentTransfer",
  "feComposite",
  "feConvolveMatrix",
  "feDiffuseLighting",
  "feDisplacementMap",
  "feDropShadow",
  "feFlood",
  "feFuncA",
  "feFuncB",
  "feFuncG",
  "feFuncR",
  "feGaussianBlur",
  "feImage",
  "feMerge",
  "feMergeNode",
  "feMorphology",
  "feOffset",
  "fePointLight",
  "feSpecularLighting",
  "feSpotLight",
  "feTile",
  "feTurbulence",
  "foreignObject",
  "linearGradient",
  "radialGradient",
  "textPath"
].reduce((accumulator, tagName) => {
  accumulator[tagName.toLowerCase()] = tagName;
  return accumulator;
}, Object.create(null));
//#endregion
//#region src/client/utilities.ts
/** The carriage return character that `escapeSpecialCharacters` swaps out before parsing. */
const CARRIAGE_RETURN = "\r";
/** Matches every carriage return in a string. */
const CARRIAGE_RETURN_REGEX = new RegExp(CARRIAGE_RETURN, "g");
/** Stand-in text for a carriage return; made distinct per load by a timestamp. */
const CARRIAGE_RETURN_PLACEHOLDER = `__HTML_DOM_PARSER_CARRIAGE_RETURN_PLACEHOLDER_${Date.now()}__`;
/** Matches every occurrence of the placeholder in a string. */
const CARRIAGE_RETURN_PLACEHOLDER_REGEX = new RegExp(CARRIAGE_RETURN_PLACEHOLDER, "g");
/**
 * Looks up the canonical mixed-case spelling of a tag name.
 *
 * @param tagName - Tag name in lowercase.
 * @returns - The entry stored in `CASE_SENSITIVE_TAG_NAMES_MAP` under `tagName`.
 */
function getCaseSensitiveTagName(tagName) {
  const canonicalName = CASE_SENSITIVE_TAG_NAMES_MAP[tagName];
  return canonicalName;
}
/**
 * Converts an indexed list of `{ name, value }` attribute records into a
 * plain object keyed by attribute name. A later record with the same name
 * overwrites an earlier one.
 *
 * @param attributes - List of attributes (any object with `length` and indices).
 * @returns - Map of attribute name to value.
 */
function formatAttributes(attributes) {
  const result = {};
  for (let position = 0, total = attributes.length; position < total; position += 1) {
    const { name, value } = attributes[position];
    result[name] = value;
  }
  return result;
}
/**
 * Normalizes a tag name: lowercases it, then swaps in the canonical
 * mixed-case spelling when `getCaseSensitiveTagName` knows one (SVG);
 * otherwise the lowercased name is returned (HTML).
 *
 * @param tagName - Tag name in any letter case.
 * @returns - Formatted tag name.
 */
function formatTagName(tagName) {
  const lowerCased = tagName.toLowerCase();
  return getCaseSensitiveTagName(lowerCased) || lowerCased;
}
/**
 * Checks if an HTML string contains an opening tag (case-insensitive).
 *
 * Every occurrence of `<tagName` is examined, so a longer tag name that
 * merely starts with `tagName` (such as `<header` when searching for `head`)
 * does not hide a real match later in the string. An occurrence counts only
 * when the character right after the name ends a tag name: `>`, `/`, space,
 * tab, line feed, form feed or carriage return.
 *
 * @param html - HTML string.
 * @param tagName - Tag name to search for (e.g., 'head' or 'body').
 * @returns - Whether the tag is found.
 */
function hasOpenTag(html, tagName) {
  const openTag = `<${String(tagName).toLowerCase()}`;
  // Search and index the same lowercased copy: lowercasing can change the
  // length of a string, so indices from it are not valid for `html` itself.
  const lowerCaseHtml = html.toLowerCase();
  let index = lowerCaseHtml.indexOf(openTag);
  while (index !== -1) {
    const end = index + openTag.length;
    const char = lowerCaseHtml[end];
    if (char === ">" || char === " " || char === "\t" || char === "\n" || char === "\f" || char === "\r" || char === "/") {
      return true;
    }
    index = lowerCaseHtml.indexOf(openTag, end);
  }
  return false;
}
/**
 * Prepares markup for parsing by swapping every carriage return for the
 * module's placeholder text.
 *
 * @param html - The HTML string.
 * @returns - HTML string with escaped special characters.
 */
function escapeSpecialCharacters(html) {
  const escaped = html.replaceAll(CARRIAGE_RETURN, CARRIAGE_RETURN_PLACEHOLDER);
  return escaped;
}
/**
 * Undoes `escapeSpecialCharacters`: every placeholder occurrence becomes a
 * carriage return again.
 *
 * @param text - The text with escaped characters.
 * @returns - Text with escaped characters reverted.
 */
function revertEscapedCharacters(text) {
  const restored = text.replaceAll(CARRIAGE_RETURN_PLACEHOLDER, CARRIAGE_RETURN);
  return restored;
}
/**
 * Transforms DOM nodes to `domhandler` nodes.
 *
 * Element (1), text (3) and comment (8) nodes are converted; every other
 * node type is skipped. Converted siblings are linked through `prev`/`next`
 * and attached to `parent`. When `directive` is given, a
 * `ProcessingInstruction` built from it is placed first in the returned list.
 *
 * @param nodes - DOM nodes.
 * @param parent - Parent node.
 * @param directive - Directive text without the surrounding `<` and `>`.
 * @returns - Nodes.
 */
function formatDOM(nodes, parent = null, directive) {
  const domNodes = [];
  const nodesLength = nodes.length;
  for (let index = 0; index < nodesLength; index++) {
    const node = nodes[index];
    let current;
    switch (node.nodeType) {
      case 1: {
        const tagName = formatTagName(node.nodeName);
        current = new Element(tagName, formatAttributes(node.attributes));
        // A <template> keeps its children in `content`, not in `childNodes`.
        current.children = formatDOM(tagName === "template" ? node.content.childNodes : node.childNodes, current);
        break;
      }
      /* v8 ignore start */
      case 3:
        current = new Text(revertEscapedCharacters(node.nodeValue ?? ""));
        break;
      case 8:
        // The carriage-return placeholder is applied to the whole markup, so
        // it must be reverted in comment data as well as in text.
        current = new Comment(revertEscapedCharacters(node.nodeValue ?? ""));
        break;
      /* v8 ignore stop */
      default:
        continue;
    }
    // The previous sibling is the last node actually emitted. Input indices
    // and output indices diverge as soon as one input node is skipped.
    const prev = domNodes[domNodes.length - 1] ?? null;
    if (prev) prev.next = current;
    current.parent = parent;
    current.prev = prev;
    current.next = null;
    domNodes.push(current);
  }
  if (directive) {
    // The name runs up to the first whitespace character; a directive with
    // no whitespace at all (e.g. `!doctype`) is its own name.
    const nameEnd = directive.search(/\s/);
    const name = nameEnd === -1 ? directive : directive.substring(0, nameEnd);
    const instruction = new ProcessingInstruction(name.toLowerCase(), directive);
    const first = domNodes[0] ?? null;
    instruction.next = first;
    instruction.parent = parent;
    if (first) first.prev = instruction;
    domNodes.unshift(instruction);
  }
  return domNodes;
}
//#endregion
//#region src/client/domparser.ts
const HTML = "html";
const HEAD = "head";
const BODY = "body";
const FIRST_TAG_REGEX = /<([a-zA-Z]+[0-9]?)/;
/**
 * Produces the value to assign to `innerHTML`.
 *
 * @param html - The HTML string.
 * @param trustedTypePolicy - Optional policy; when truthy, its `createHTML`
 * result is returned instead of the raw string.
 * @returns - `html` itself, or whatever `trustedTypePolicy.createHTML(html)` returns.
 */
function getHTMLForInnerHTML(html, trustedTypePolicy) {
  if (trustedTypePolicy) {
    return trustedTypePolicy.createHTML(html);
  }
  return html;
}
/* v8 ignore start */
// Default implementations: both parsers throw until one of the feature
// checks below replaces them with a working implementation.
let parseFromDocument = (html, tagName, trustedTypePolicy) => {
throw new Error("This browser does not support `document.implementation.createHTMLDocument`");
};
let parseFromString = (html, tagName, trustedTypePolicy) => {
throw new Error("This browser does not support `DOMParser.prototype.parseFromString`");
};
// `false` when there is no `window` object; otherwise whatever `window.DOMParser` holds.
const DOMParser = typeof window === "object" && window.DOMParser;
/**
* DOMParser (performance: slow).
*
* @see https://developer.mozilla.org/docs/Web/API/DOMParser#Parsing_an_SVG_or_HTML_document
*/
if (typeof DOMParser === "function") {
// One parser instance is created up front and reused for every call.
const domParser = new DOMParser();
const mimeType = "text/html";
/**
* Creates an HTML document using `DOMParser.parseFromString`.
*
* @param html - The HTML string.
* @param tagName - When provided, the markup is wrapped in `<tagName>…</tagName>` before parsing.
* @param trustedTypePolicy - Accepted for signature parity; not used by this implementation.
* @returns - Document.
*/
parseFromString = (html, tagName, trustedTypePolicy) => {
if (tagName) html = `<${tagName}>${html}</${tagName}>`;
return domParser.parseFromString(html, mimeType);
};
// Also serves as the document parser unless the DOMImplementation check
// that follows installs its own version.
parseFromDocument = parseFromString;
}
/**
 * DOMImplementation (performance: fair).
 *
 * @see https://developer.mozilla.org/docs/Web/API/DOMImplementation/createHTMLDocument
 */
if (typeof document === "object" && document.implementation) {
  // A single scratch document is created once and reused by every call.
  const htmlDocument = document.implementation.createHTMLDocument();
  /**
   * Parses markup by assigning it to `innerHTML` inside the scratch document.
   *
   * @param html - The HTML string.
   * @param tagName - Selector of the element that receives the markup; when
   * omitted, the markup is assigned to the document element itself.
   * @param trustedTypePolicy - Optional Trusted Types policy applied to `html`.
   * @returns - The scratch document.
   */
  parseFromDocument = function(html, tagName, trustedTypePolicy) {
    const root = htmlDocument.documentElement;
    if (!tagName) {
      root.innerHTML = getHTMLForInnerHTML(html, trustedTypePolicy);
      return htmlDocument;
    }
    const element = root.querySelector(tagName);
    // If nothing matches the selector, the document is returned untouched.
    if (element) element.innerHTML = getHTMLForInnerHTML(html, trustedTypePolicy);
    return htmlDocument;
  };
}
/**
 * Template (performance: fast).
 *
 * `template` is `false` when there is no `document` object; otherwise it is a
 * single `<template>` element reused by every call.
 *
 * @see https://developer.mozilla.org/docs/Web/HTML/Element/template
 */
const template = typeof document === "object" && document.createElement("template");
let parseFromTemplate;
if (template && template.content) {
  /**
   * Uses a template element (content fragment) to parse HTML.
   *
   * @param html - HTML string.
   * @param trustedTypePolicy - Optional Trusted Types policy applied to `html`.
   * @returns - Nodes.
   */
  parseFromTemplate = (html, trustedTypePolicy) => {
    template.innerHTML = getHTMLForInnerHTML(html, trustedTypePolicy);
    const { childNodes } = template.content;
    return childNodes;
  };
}
/**
 * Returns the `childNodes` list of a newly created document fragment; used
 * as the fallback result when a parse yields no node list.
 */
const createNodeList = () => {
  const fragment = document.createDocumentFragment();
  return fragment.childNodes;
};
/* v8 ignore stop */
/**
 * Parses HTML string to DOM nodes.
 *
 * The strategy is chosen from the first tag found in the markup:
 * - `html`: parse a whole document, then drop the `head`/`body` elements
 *   that the markup itself did not open;
 * - `head` or `body`: parse into the scratch document and return the
 *   matching elements (or all siblings when both tags are present);
 * - anything else: parse as a fragment, preferring the template parser.
 *
 * @param html - HTML markup.
 * @param trustedTypePolicy - Trusted Types policy.
 * @returns - DOM nodes.
 */
function domparser(html, trustedTypePolicy) {
  const markup = escapeSpecialCharacters(html);
  const firstTagName = FIRST_TAG_REGEX.exec(markup)?.[1]?.toLowerCase();

  if (firstTagName === HTML) {
    const doc = parseFromString(markup);
    // Remove whichever of <head>/<body> the markup did not open itself.
    for (const optionalTag of [HEAD, BODY]) {
      if (hasOpenTag(markup, optionalTag)) continue;
      const element = doc.querySelector(optionalTag);
      element?.parentNode?.removeChild(element);
    }
    return doc.querySelectorAll(HTML);
  }

  if (firstTagName === HEAD || firstTagName === BODY) {
    const elements = parseFromDocument(markup, void 0, trustedTypePolicy).querySelectorAll(firstTagName);
    const hasBoth = hasOpenTag(markup, BODY) && hasOpenTag(markup, HEAD);
    if (!hasBoth) return elements;
    /* v8 ignore next */
    return elements[0].parentNode?.childNodes ?? createNodeList();
  }

  /* v8 ignore start */
  if (parseFromTemplate) return parseFromTemplate(markup, trustedTypePolicy);
  const body = parseFromDocument(markup, BODY, trustedTypePolicy).querySelector(BODY);
  return body?.childNodes ?? createNodeList();
}
//#endregion
//#region src/client/html-to-dom.ts
/** Captures the text between `<` and `>` of a `<!...>` directive made of letters and whitespace. */
const DIRECTIVE_REGEX = /<(![a-zA-Z\s]+)>/;
/**
 * Parses HTML string to DOM nodes in browser.
 *
 * @param html - HTML markup.
 * @param options - Parser options; only `trustedTypePolicy` is read.
 * @returns - DOM elements; an empty array for an empty string.
 * @throws TypeError when `html` is not a string.
 */
function HTMLDOMParser(html, options) {
  if (typeof html !== "string") {
    throw new TypeError("First argument must be a string");
  }
  if (html === "") {
    return [];
  }
  const [, directive] = DIRECTIVE_REGEX.exec(html) ?? [];
  const nodes = domparser(html, options?.trustedTypePolicy);
  return formatDOM(nodes, null, directive);
}
//#endregion
return HTMLDOMParser;
});
//# sourceMappingURL=html-dom-parser.js.map