/*
 * html-dom-parser
 * Version:
 * HTML to DOM parser.
 * 774 lines (765 loc) • 26.5 kB
 * JavaScript
 */
(function (global, factory) {
typeof exports === 'object' && typeof module !== 'undefined' ? module.exports = factory() :
typeof define === 'function' && define.amd ? define(factory) :
(global = typeof globalThis !== 'undefined' ? globalThis : global || self, global.HTMLDOMParser = factory());
})(this, (function () { 'use strict';
/**
 * Node type discriminators used by htmlparser2's DOM.
 * Each value is the string stored in a node's `type` field.
 */
var ElementType = {
    /** The root element of a document */
    Root: "root",
    /** Plain text */
    Text: "text",
    /** <? ... ?> */
    Directive: "directive",
    /** <!-- ... --> */
    Comment: "comment",
    /** <script> tags */
    Script: "script",
    /** <style> tags */
    Style: "style",
    /** Any other tag */
    Tag: "tag",
    /** <![CDATA[ ... ]]> */
    CDATA: "cdata",
    /** <!doctype ...> */
    Doctype: "doctype",
};
/**
 * Reports whether an element counts as a tag: a regular tag, a `<script>`
 * or a `<style>`.
 * @param element Element to test
 * @param element.type Node type discriminator to check.
 * @returns `true` for tag, script and style types; `false` otherwise.
 */
function isTag$1(element) {
    switch (element.type) {
        case ElementType.Tag:
        case ElementType.Script:
        case ElementType.Style:
            return true;
        default:
            return false;
    }
}
// Exports for backwards compatibility
// NOTE(review): in this bundle each statement below is a bare property read
// whose value is discarded; presumably this is what remains of the original
// module's named type exports after bundling — confirm before removing.
/** Type for the root element of a document */
// eslint-disable-next-line prefer-destructuring
ElementType.Root;
/** Type for Text */
// eslint-disable-next-line prefer-destructuring
ElementType.Text;
/** Type for <? ... ?> */
// eslint-disable-next-line prefer-destructuring
ElementType.Directive;
/** Type for <!-- ... --> */
// eslint-disable-next-line prefer-destructuring
ElementType.Comment;
/** Type for <script> tags */
// eslint-disable-next-line prefer-destructuring
ElementType.Script;
/** Type for <style> tags */
// eslint-disable-next-line prefer-destructuring
ElementType.Style;
/** Type for Any tag */
// eslint-disable-next-line prefer-destructuring
ElementType.Tag;
/** Type for <![CDATA[ ... ]]> */
// eslint-disable-next-line prefer-destructuring
ElementType.CDATA;
/** Type for <!doctype ...> */
// eslint-disable-next-line prefer-destructuring
ElementType.Doctype;
/**
 * Base class for every node in the tree. Holds the links to the parent and
 * the neighbouring siblings, plus optional source positions, and exposes
 * DOM-style read/write aliases for the link fields.
 */
class Node {
    /** Containing node, or `null` when detached. */
    parent = null;
    /** Sibling immediately before this node, or `null`. */
    prev = null;
    /** Sibling immediately after this node, or `null`. */
    next = null;
    /** Start index of the node. Requires `withStartIndices` on the handler to be `true`. */
    startIndex = null;
    /** End index of the node. Requires `withEndIndices` on the handler to be `true`. */
    endIndex = null;
    /**
     * [DOM spec](https://dom.spec.whatwg.org)-compatible alias of {@link parent}.
     */
    get parentNode() {
        return this.parent;
    }
    set parentNode(value) {
        this.parent = value;
    }
    /**
     * [DOM spec](https://dom.spec.whatwg.org)-compatible alias of {@link prev}.
     */
    get previousSibling() {
        return this.prev;
    }
    set previousSibling(value) {
        this.prev = value;
    }
    /**
     * [DOM spec](https://dom.spec.whatwg.org)-compatible alias of {@link next}.
     */
    get nextSibling() {
        return this.next;
    }
    set nextSibling(value) {
        this.next = value;
    }
    /**
     * Produces a copy of this node by delegating to the module-level
     * `cloneNode` function.
     * @param recursive Clone child nodes as well.
     * @returns A clone of the node.
     */
    cloneNode(recursive = false) {
        return cloneNode(this, recursive);
    }
}
/**
 * A node whose payload is a single `data` value (used by the text, comment
 * and directive node classes in this file).
 */
class DataNode extends Node {
    /** Content carried by the node. */
    data;
    /**
     * @param data The content of the data node
     */
    constructor(data) {
        super();
        this.data = data;
    }
    /**
     * [DOM spec](https://dom.spec.whatwg.org)-compatible alias of {@link data}.
     */
    get nodeValue() {
        return this.data;
    }
    set nodeValue(value) {
        this.data = value;
    }
}
/**
 * Text within the document.
 * Tagged with `ElementType.Text`; `nodeType` reports 3.
 */
class Text extends DataNode {
type = ElementType.Text;
get nodeType() {
return 3;
}
}
/**
 * Comments within the document.
 * Tagged with `ElementType.Comment`; `nodeType` reports 8.
 */
class Comment extends DataNode {
type = ElementType.Comment;
get nodeType() {
return 8;
}
}
/**
 * Processing instructions, including doc types.
 * Carries a `name` next to the inherited `data`.
 */
class ProcessingInstruction extends DataNode {
    type = ElementType.Directive;
    /** Name of the instruction. */
    name;
    /** If this is a doctype, the document type name (parse5 only). */
    "x-name";
    /** If this is a doctype, the document type public identifier (parse5 only). */
    "x-publicId";
    /** If this is a doctype, the document type system identifier (parse5 only). */
    "x-systemId";
    /**
     * @param name Name of the instruction.
     * @param data Content of the instruction.
     */
    constructor(name, data) {
        super(data);
        this.name = name;
    }
    // NOTE(review): reports 1 here, not the DOM's 7 for processing
    // instructions — kept as is; confirm whether consumers rely on it.
    get nodeType() {
        return 1;
    }
}
/**
 * Base class for nodes that own a list of child nodes.
 */
class NodeWithChildren extends Node {
    /** Child nodes, in document order. */
    children;
    /**
     * @param children Children of the node. Only certain node types can have children.
     */
    constructor(children) {
        super();
        this.children = children;
    }
    /** First child of the node, or `null` when there is none. */
    get firstChild() {
        const first = this.children[0];
        return first ?? null;
    }
    /** Last child of the node, or `null` when there is none. */
    get lastChild() {
        const count = this.children.length;
        if (count > 0) {
            return this.children[count - 1];
        }
        return null;
    }
    /**
     * [DOM spec](https://dom.spec.whatwg.org)-compatible alias of {@link children}.
     */
    get childNodes() {
        return this.children;
    }
    set childNodes(value) {
        this.children = value;
    }
}
/**
 * CDATA nodes.
 * Tagged with `ElementType.CDATA`; `nodeType` reports 4. Content is held as
 * child nodes (inherited from `NodeWithChildren`).
 */
class CDATA extends NodeWithChildren {
type = ElementType.CDATA;
get nodeType() {
return 4;
}
}
/**
 * The root node of the document.
 * Tagged with `ElementType.Root`; `nodeType` reports 9.
 */
class Document extends NodeWithChildren {
type = ElementType.Root;
get nodeType() {
return 9;
}
}
/**
 * An element within the DOM: a named node with attributes and children.
 */
class Element extends NodeWithChildren {
    /** Tag name, e.g. `div`. */
    name;
    /** Object mapping attribute names to attribute values. */
    attribs;
    /** Node type discriminator (tag, script or style unless overridden). */
    type;
    /** Element namespace (parse5 only). */
    namespace;
    /** Element attribute namespaces (parse5 only). */
    "x-attribsNamespace";
    /** Element attribute namespace-related prefixes (parse5 only). */
    "x-attribsPrefix";
    /**
     * @param name Name of the tag, eg. `div`, `span`.
     * @param attribs Object mapping attribute names to attribute values.
     * @param children Children of the node.
     * @param type Node type used for the new node instance. When omitted it
     * is derived from `name`: script, style, or a generic tag.
     */
    constructor(name, attribs, children = [], type = name === "script"
        ? ElementType.Script
        : name === "style"
            ? ElementType.Style
            : ElementType.Tag) {
        super(children);
        this.name = name;
        this.attribs = attribs;
        this.type = type;
    }
    get nodeType() {
        return 1;
    }
    /**
     * [DOM spec](https://dom.spec.whatwg.org)-compatible alias of {@link name}.
     */
    get tagName() {
        return this.name;
    }
    set tagName(value) {
        this.name = value;
    }
    /**
     * Attribute list built from `attribs`, one `{ name, value, namespace, prefix }`
     * record per key. `namespace` and `prefix` are `undefined` unless the
     * matching `x-attribs*` map is set and has an entry for that name.
     */
    get attributes() {
        const namespaces = this["x-attribsNamespace"];
        const prefixes = this["x-attribsPrefix"];
        const records = [];
        for (const name of Object.keys(this.attribs)) {
            records.push({
                name,
                value: this.attribs[name],
                namespace: namespaces?.[name],
                prefix: prefixes?.[name],
            });
        }
        return records;
    }
}
/**
 * Tells whether `node` is an element node, i.e. its type is tag, script or
 * style.
 * @param node Node to check.
 * @returns `true` if the node is an element node.
 */
function isTag(node) {
    const elementTypes = [ElementType.Tag, ElementType.Script, ElementType.Style];
    return elementTypes.includes(node.type);
}
/**
 * Tells whether `node` is a CDATA node.
 * @param node Node to check.
 * @returns `true` if the node's type is the CDATA type.
 */
function isCDATA(node) {
    const { type } = node;
    return type === ElementType.CDATA;
}
/**
 * Tells whether `node` is a text node.
 * @param node Node to check.
 * @returns `true` if the node's type is the text type.
 */
function isText(node) {
    const { type } = node;
    return type === ElementType.Text;
}
/**
 * Tells whether `node` is a comment node.
 * @param node Node to check.
 * @returns `true` if the node's type is the comment type.
 */
function isComment(node) {
    const { type } = node;
    return type === ElementType.Comment;
}
/**
 * Tells whether `node` is a directive node.
 * @param node Node to check.
 * @returns `true` if the node's type is the directive type.
 */
function isDirective(node) {
    const { type } = node;
    return type === ElementType.Directive;
}
/**
 * Tells whether `node` is a document (root) node.
 * @param node Node to check.
 * @returns `true` if the node's type is the root type.
 */
function isDocument(node) {
    const { type } = node;
    return type === ElementType.Root;
}
/**
 * Builds a copy of `node`, choosing the concrete class from the node's type.
 * Attribute maps are shallow-copied; source positions and, when present,
 * `sourceCodeLocation` are carried over to the copy.
 * @param node Node to clone.
 * @param recursive Clone child nodes as well; otherwise the copy has no children.
 * @returns A clone of the node.
 * @throws {Error} When the node's type is not one of the handled types.
 */
function cloneNode(node, recursive = false) {
    // Children for the copy: deep clones when requested, otherwise none.
    const cloneKids = () => (recursive ? cloneChildren(node.children) : []);
    // Points every cloned child at its new container.
    const adoptAll = (container, kids) => {
        for (const kid of kids) {
            kid.parent = container;
        }
        return container;
    };
    let copy;
    if (isText(node)) {
        copy = new Text(node.data);
    }
    else if (isComment(node)) {
        copy = new Comment(node.data);
    }
    else if (isTag(node)) {
        const kids = cloneKids();
        const element = adoptAll(new Element(node.name, { ...node.attribs }, kids), kids);
        if (node.namespace != null) {
            element.namespace = node.namespace;
        }
        const attribsNamespace = node["x-attribsNamespace"];
        if (attribsNamespace) {
            element["x-attribsNamespace"] = { ...attribsNamespace };
        }
        const attribsPrefix = node["x-attribsPrefix"];
        if (attribsPrefix) {
            element["x-attribsPrefix"] = { ...attribsPrefix };
        }
        copy = element;
    }
    else if (isCDATA(node)) {
        const kids = cloneKids();
        copy = adoptAll(new CDATA(kids), kids);
    }
    else if (isDocument(node)) {
        const kids = cloneKids();
        const doc = adoptAll(new Document(kids), kids);
        if (node["x-mode"]) {
            doc["x-mode"] = node["x-mode"];
        }
        copy = doc;
    }
    else if (isDirective(node)) {
        const directive = new ProcessingInstruction(node.name, node.data);
        // The doctype identifiers are copied together, keyed on `x-name`.
        if (node["x-name"] != null) {
            directive["x-name"] = node["x-name"];
            directive["x-publicId"] = node["x-publicId"];
            directive["x-systemId"] = node["x-systemId"];
        }
        copy = directive;
    }
    else {
        throw new Error(`Not implemented yet: ${node.type}`);
    }
    copy.startIndex = node.startIndex;
    copy.endIndex = node.endIndex;
    if (node.sourceCodeLocation != null) {
        copy.sourceCodeLocation = node.sourceCodeLocation;
    }
    return copy;
}
/**
 * Deep-clones a list of sibling nodes and wires the copies together through
 * their `prev`/`next` links.
 * @param childs The child nodes to clone.
 * @returns A list of cloned child nodes.
 */
function cloneChildren(childs) {
    const clones = childs.map((original) => cloneNode(original, true));
    for (const [position, clone] of clones.entries()) {
        if (position === 0) {
            continue;
        }
        const before = clones[position - 1];
        clone.prev = before;
        before.next = clone;
    }
    return clones;
}
/**
 * SVG element names whose casing must be kept (SVG elements are
 * case-sensitive).
 *
 * @see https://developer.mozilla.org/docs/Web/SVG/Element#svg_elements_a_to_z
 */
var CASE_SENSITIVE_TAG_NAMES = [
    'animateMotion',
    'animateTransform',
    'clipPath',
    'feBlend',
    'feColorMatrix',
    'feComponentTransfer',
    'feComposite',
    'feConvolveMatrix',
    'feDiffuseLighting',
    'feDisplacementMap',
    'feDropShadow',
    'feFlood',
    'feFuncA',
    'feFuncB',
    'feFuncG',
    'feFuncR',
    'feGaussianBlur',
    'feImage',
    'feMerge',
    'feMergeNode',
    'feMorphology',
    'feOffset',
    'fePointLight',
    'feSpecularLighting',
    'feSpotLight',
    'feTile',
    'feTurbulence',
    'foreignObject',
    'linearGradient',
    'radialGradient',
    'textPath',
];
// Lookup from the all-lowercase spelling to the correctly cased name.
var CASE_SENSITIVE_TAG_NAMES_MAP = Object.fromEntries(CASE_SENSITIVE_TAG_NAMES.map(function (tagName) {
    return [tagName.toLowerCase(), tagName];
}));
// Carriage returns are swapped for a placeholder before parsing and swapped
// back afterwards (see escapeSpecialCharacters / revertEscapedCharacters).
var CARRIAGE_RETURN = '\r';
var CARRIAGE_RETURN_REGEX = /\r/g;
// The current timestamp is embedded in the placeholder text.
var CARRIAGE_RETURN_PLACEHOLDER = `__HTML_DOM_PARSER_CARRIAGE_RETURN_PLACEHOLDER_${Date.now().toString()}__`;
var CARRIAGE_RETURN_PLACEHOLDER_REGEX = new RegExp(CARRIAGE_RETURN_PLACEHOLDER, 'g');
/**
 * Gets case-sensitive tag name.
 *
 * Only the map's own entries are consulted. A plain property read would also
 * return inherited members (for example `constructor`), which would then be
 * mistaken for a tag name by the caller.
 *
 * @param tagName - Tag name in lowercase.
 * @returns - Case-sensitive tag name, or `undefined` when the tag has none.
 */
function getCaseSensitiveTagName(tagName) {
    if (Object.prototype.hasOwnProperty.call(CASE_SENSITIVE_TAG_NAMES_MAP, tagName)) {
        return CASE_SENSITIVE_TAG_NAMES_MAP[tagName];
    }
    return undefined;
}
/**
 * Formats DOM attributes to a hash map.
 *
 * Attribute values are passed through `revertEscapedCharacters` so that the
 * carriage-return placeholder inserted before parsing does not leak into the
 * resulting values.
 *
 * @param attributes - List of attributes (array-like, e.g. `NamedNodeMap`).
 * @returns - Map of attribute name to value.
 */
function formatAttributes(attributes) {
    const map = {};
    const attributesLength = attributes.length;
    // `NamedNodeMap` is array-like, so it is walked by index
    for (let index = 0; index < attributesLength; index++) {
        const { name, value } = attributes[index];
        map[name] = revertEscapedCharacters(value);
    }
    return map;
}
/**
 * Normalises a tag name: the input is lowercased, then swapped for its
 * case-sensitive (SVG) spelling when one is known; otherwise the lowercase
 * name is returned as is (HTML).
 *
 * @param tagName - Tag name in any casing.
 * @returns - Formatted tag name.
 */
function formatTagName(tagName) {
    const lowered = tagName.toLowerCase();
    return getCaseSensitiveTagName(lowered) || lowered;
}
/**
 * Checks if an HTML string contains an opening tag (case-insensitive).
 *
 * Every occurrence of `<tagName` is examined, not just the first one, so a
 * longer tag sharing the prefix (e.g. `<header>` when looking for `head`)
 * does not hide a real match later in the string.
 *
 * @param html - HTML string.
 * @param tagName - Tag name to search for (e.g., 'head' or 'body').
 * @returns - Whether the tag is found.
 */
function hasOpenTag(html, tagName) {
    const openTag = '<' + tagName.toLowerCase();
    // search and inspect the same lowercased copy so that indices always agree
    const haystack = html.toLowerCase();
    let index = haystack.indexOf(openTag);
    while (index !== -1) {
        const char = haystack[index + openTag.length];
        // the tag name must be followed by '>', '/', or whitespace (before attributes)
        if (char === '>' ||
            char === ' ' ||
            char === '\t' ||
            char === '\n' ||
            char === '\r' ||
            char === '\f' ||
            char === '/') {
            return true;
        }
        index = haystack.indexOf(openTag, index + 1);
    }
    return false;
}
/**
 * Swaps every carriage return in the markup for the placeholder token so it
 * can be restored after parsing.
 *
 * @param html - The HTML string.
 * @returns - HTML string with escaped special characters.
 */
function escapeSpecialCharacters(html) {
    const segments = html.split(CARRIAGE_RETURN);
    return segments.join(CARRIAGE_RETURN_PLACEHOLDER);
}
/**
 * Restores carriage returns: every placeholder token in the text is turned
 * back into the actual character.
 *
 * @param text - The text with escaped characters.
 * @returns - Text with escaped characters reverted.
 */
function revertEscapedCharacters(text) {
    const segments = text.split(CARRIAGE_RETURN_PLACEHOLDER);
    return segments.join(CARRIAGE_RETURN);
}
/**
 * Transforms DOM nodes to `domhandler` nodes.
 *
 * Elements (nodeType 1), text (3) and comments (8) are converted; any other
 * node type is skipped. Converted siblings are linked through `prev`/`next`
 * and all of them get `parent` as their parent. When a directive is given it
 * is placed in front of the converted nodes.
 *
 * @param nodes - DOM nodes (array-like).
 * @param parent - Parent node (defaults to `null`).
 * @param directive - Directive text without the angle brackets, e.g. `!doctype html`.
 * @returns - Nodes.
 */
function formatDOM(nodes, parent, directive) {
    if (parent === undefined) {
        parent = null;
    }
    const domNodes = [];
    const nodesLength = nodes.length;
    for (let index = 0; index < nodesLength; index++) {
        const node = nodes[index];
        let current;
        // set the node data given the type
        switch (node.nodeType) {
            case 1: {
                const tagName = formatTagName(node.nodeName);
                // script, style, or tag
                current = new Element(tagName, formatAttributes(node.attributes));
                current.children = formatDOM(
                // template children are on content; an element that is merely
                // named `template` but has no content fragment falls back to
                // its own child nodes instead of throwing
                tagName === 'template' && node.content
                    ? node.content.childNodes
                    : node.childNodes, current);
                break;
            }
            /* v8 ignore start */
            case 3:
                current = new Text(revertEscapedCharacters(node.nodeValue ?? ''));
                break;
            case 8:
                // comments went through the same escaping as text, so revert them too
                current = new Comment(revertEscapedCharacters(node.nodeValue ?? ''));
                break;
            /* v8 ignore stop */
            default:
                continue;
        }
        // The previous sibling is the last node actually emitted. Indexing by
        // the input position would break the chain once a node is skipped.
        const prev = domNodes.length > 0 ? domNodes[domNodes.length - 1] : null;
        if (prev) {
            prev.next = current;
        }
        // set properties for current node
        current.parent = parent;
        current.prev = prev;
        current.next = null;
        domNodes.push(current);
    }
    if (directive) {
        // The name runs up to the first whitespace of any kind; a directive
        // without whitespace (e.g. `!doctype`) is its own name.
        const nameEnd = directive.search(/\s/);
        const name = (nameEnd === -1 ? directive : directive.substring(0, nameEnd)).toLowerCase();
        const instruction = new ProcessingInstruction(name, directive);
        const first = domNodes[0] ?? null;
        instruction.next = first;
        instruction.parent = parent;
        if (first) {
            first.prev = instruction;
        }
        domNodes.unshift(instruction);
    }
    return domNodes;
}
// constants
// Tag names that `domparser` compares against the first tag found in the markup.
var HTML = 'html';
var HEAD = 'head';
var BODY = 'body';
// Captures the name of the first `<tag` in a string: one or more letters,
// optionally followed by a single digit.
var FIRST_TAG_REGEX = /<([a-zA-Z]+[0-9]?)/; // e.g., <h1>
// falls back to `parseFromString` if `createHTMLDocument` cannot be used
/* eslint-disable @typescript-eslint/no-unused-vars */
/* v8 ignore start */
// Default implementations: both throw until one of the feature checks further
// down replaces them with a working parser.
var parseFromDocument = function (html, tagName) {
throw new Error('This browser does not support `document.implementation.createHTMLDocument`');
};
var parseFromString = function (html, tagName) {
throw new Error('This browser does not support `DOMParser.prototype.parseFromString`');
};
// `false` when there is no `window` object; otherwise whatever `window.DOMParser` holds.
var DOMParser = typeof window === 'object' && window.DOMParser;
/**
 * DOMParser (performance: slow).
 *
 * @see https://developer.mozilla.org/docs/Web/API/DOMParser#Parsing_an_SVG_or_HTML_document
 */
if (typeof DOMParser === 'function') {
// a single parser instance is created once and reused by every call
var domParser_1 = new DOMParser();
var mimeType_1 = 'text/html';
/**
 * Creates an HTML document using `DOMParser.parseFromString`.
 *
 * @param html - The HTML string.
 * @param tagName - Optional element name; when given, the markup is wrapped
 * in `<tagName>…</tagName>` before parsing.
 * @returns - Document.
 */
parseFromString = function (html, tagName) {
if (tagName) {
html = "<".concat(tagName, ">").concat(html, "</").concat(tagName, ">");
}
return domParser_1.parseFromString(html, mimeType_1);
};
// used as the document parser too, unless the next feature check overrides it
parseFromDocument = parseFromString;
}
/**
 * DOMImplementation (performance: fair).
 *
 * When available, this replaces whatever `parseFromDocument` was set to above.
 *
 * @see https://developer.mozilla.org/docs/Web/API/DOMImplementation/createHTMLDocument
 */
// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
if (typeof document === 'object' && document.implementation) {
// one document is created up front and shared by every call, so each call
// overwrites the markup left by the previous one
var htmlDocument_1 = document.implementation.createHTMLDocument();
/**
 * Use HTML document created by `document.implementation.createHTMLDocument`.
 *
 * @param html - The HTML string.
 * @param tagName - Optional selector of the element that receives the markup;
 * without it the markup is assigned to the document element itself.
 * @returns - Document
 */
parseFromDocument = function (html, tagName) {
if (tagName) {
var element = htmlDocument_1.documentElement.querySelector(tagName);
// if no such element exists, the document is returned unchanged
if (element) {
element.innerHTML = html;
}
return htmlDocument_1;
}
htmlDocument_1.documentElement.innerHTML = html;
return htmlDocument_1;
};
}
/**
 * Template (performance: fast).
 *
 * @see https://developer.mozilla.org/docs/Web/HTML/Element/template
 */
// `false` when there is no `document` object; otherwise a single shared
// `<template>` element that every parse reuses.
var template = typeof document === 'object' && document.createElement('template');
// stays `undefined` unless the template element exposes `content`
var parseFromTemplate;
// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
if (template && template.content) {
/**
 * Uses a template element (content fragment) to parse HTML.
 *
 * @param html - HTML string.
 * @returns - Nodes.
 */
parseFromTemplate = function (html) {
template.innerHTML = html;
return template.content.childNodes;
};
}
// Returns the child node list of a freshly created document fragment; used by
// `domparser` as the fallback return value.
var createNodeList = function () { return document.createDocumentFragment().childNodes; };
/* v8 ignore stop */
/**
 * Parses HTML string to DOM nodes.
 *
 * The first tag name found in the markup decides the strategy: a full
 * document for `<html>`, a document lookup for `<head>`/`<body>`, and the
 * template parser (or a body fallback) for everything else.
 *
 * @param html - HTML markup.
 * @returns - DOM nodes.
 */
function domparser(html) {
    // Escape special characters before parsing
    const markup = escapeSpecialCharacters(html);
    const firstTagName = FIRST_TAG_REGEX.exec(markup)?.[1]?.toLowerCase();
    if (firstTagName === HTML) {
        const doc = parseFromString(markup);
        // the created document may come with filler head/body elements,
        // so make sure to remove them if they don't actually exist
        for (const fillerTag of [HEAD, BODY]) {
            if (!hasOpenTag(markup, fillerTag)) {
                const filler = doc.querySelector(fillerTag);
                filler?.parentNode?.removeChild(filler);
            }
        }
        return doc.querySelectorAll(HTML);
    }
    if (firstTagName === HEAD || firstTagName === BODY) {
        const elements = parseFromDocument(markup).querySelectorAll(firstTagName);
        // if there's a sibling element, then return both elements
        /* v8 ignore next */
        if (hasOpenTag(markup, BODY) && hasOpenTag(markup, HEAD)) {
            return elements[0].parentNode?.childNodes ?? createNodeList();
        }
        return elements;
    }
    // low-level tag or text
    /* v8 ignore start */
    // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
    if (parseFromTemplate) {
        return parseFromTemplate(markup);
    }
    const body = parseFromDocument(markup, BODY).querySelector(BODY);
    return body?.childNodes ?? createNodeList();
    /* v8 ignore stop */
}
// Captures the inside of a `<!...>` directive made of letters and whitespace.
var DIRECTIVE_REGEX = /<(![a-zA-Z\s]+)>/; // e.g., <!doctype html>
/**
 * Parses HTML string to DOM nodes in browser.
 *
 * @param html - HTML markup.
 * @returns - DOM elements; an empty array for an empty string.
 * @throws {TypeError} When `html` is not a string.
 */
function HTMLDOMParser(html) {
    if (typeof html !== 'string') {
        throw new TypeError('First argument must be a string');
    }
    if (html.length === 0) {
        return [];
    }
    // match directive
    const directive = DIRECTIVE_REGEX.exec(html)?.[1];
    const nodes = domparser(html);
    return formatDOM(nodes, null, directive);
}
return HTMLDOMParser;
}));
//# sourceMappingURL=html-dom-parser.js.map