html-dom-parser
Version:
HTML to DOM parser.
168 lines (140 loc) • 4.68 kB
text/typescript
import { escapeSpecialCharacters } from './utilities';
// constants
const HTML = 'html';
const HEAD = 'head';
const BODY = 'body';

// Captures the name of the first tag in the markup (letters plus an optional
// single trailing digit), e.g., <h1>
const FIRST_TAG_REGEX = /<([a-zA-Z]+[0-9]?)/;

// Detect an opening <head> / <body> tag. The tag name must be followed by
// whitespace, `/`, or `>` so that look-alike tags such as <header> are not
// mistaken for <head>. `\s` and `[^>]` both match newlines, so attributes that
// span multiple lines are still recognized.
const HEAD_TAG_REGEX = /<head(?:[\s/][^>]*)?>/i;
const BODY_TAG_REGEX = /<body(?:[\s/][^>]*)?>/i;
/**
 * Placeholder parser used until a real implementation is installed by the
 * feature-detection blocks below (`createHTMLDocument`, or `parseFromString`
 * as the fallback). Calling it as-is always throws.
 *
 * @param html - The HTML string (unused by the placeholder).
 * @param tagName - Optional target element name (unused by the placeholder).
 * @throws Error - Always, when no replacement implementation was assigned.
 */
// eslint-disable-next-line @typescript-eslint/no-unused-vars
let parseFromDocument = function (html: string, tagName?: string): Document {
  /* istanbul ignore next */
  throw new Error(
    'This browser does not support `document.implementation.createHTMLDocument`',
  );
};
/**
 * Placeholder parser used until the `DOMParser` feature-detection block
 * installs a real implementation. Calling it as-is always throws.
 *
 * @param html - The HTML string (unused by the placeholder).
 * @param tagName - Optional wrapper element name (unused by the placeholder).
 * @throws Error - Always, when no replacement implementation was assigned.
 */
// eslint-disable-next-line @typescript-eslint/no-unused-vars
let parseFromString = function (html: string, tagName?: string): Document {
  /* istanbul ignore next */
  throw new Error(
    'This browser does not support `DOMParser.prototype.parseFromString`',
  );
};
// `false` outside a window context, otherwise whatever `window.DOMParser` is
const DOMParser = typeof window === 'object' && window.DOMParser;

/**
 * DOMParser (performance: slow).
 *
 * @see https://developer.mozilla.org/docs/Web/API/DOMParser#Parsing_an_SVG_or_HTML_document
 */
if (typeof DOMParser === 'function') {
  // a single parser instance is shared by every call
  const htmlParser = new DOMParser();

  /**
   * Creates an HTML document using `DOMParser.parseFromString` with the
   * `text/html` MIME type.
   *
   * @param html - The HTML string.
   * @param tagName - When provided, the HTML is wrapped in an element of this
   *   name before it is parsed.
   * @returns - Document.
   */
  parseFromString = function (html: string, tagName?: string): Document {
    let markup = html;

    if (tagName) {
      /* istanbul ignore next */
      markup = `<${tagName}>${html}</${tagName}>`;
    }

    return htmlParser.parseFromString(markup, 'text/html');
  };

  // default for `parseFromDocument`; the `createHTMLDocument` block below may
  // replace it
  parseFromDocument = parseFromString;
}
/**
 * DOMImplementation (performance: fair).
 *
 * @see https://developer.mozilla.org/docs/Web/API/DOMImplementation/createHTMLDocument
 */
if (typeof document === 'object' && document.implementation) {
  // one document is created up front and reused (and mutated) by every call
  const sharedDocument = document.implementation.createHTMLDocument();

  /**
   * Writes the HTML into the document created by
   * `document.implementation.createHTMLDocument` and returns that document.
   *
   * @param html - The HTML string.
   * @param tagName - When provided, the HTML is written into the first
   *   descendant of the root element matching this selector (the document is
   *   left untouched if nothing matches); otherwise the HTML replaces the
   *   content of the root element itself.
   * @returns - Document (the same instance on every call).
   */
  parseFromDocument = function (html: string, tagName?: string): Document {
    const root = sharedDocument.documentElement;
    const target = tagName ? root.querySelector(tagName) : root;

    if (target) {
      target.innerHTML = html;
    }

    return sharedDocument;
  };
}
/**
 * Template (performance: fast).
 *
 * @see https://developer.mozilla.org/docs/Web/HTML/Element/template
 */
// `false` when there is no `document`, otherwise a reusable <template> element
const template =
  typeof document === 'object' ? document.createElement('template') : false;

// stays unassigned when the <template> element (or its `content`) is unavailable
let parseFromTemplate: (html: string) => NodeList;

if (template && template.content) {
  /**
   * Parses HTML by assigning it to the shared template element and reading
   * back the child nodes of its content fragment.
   *
   * @param html - HTML string.
   * @returns - Nodes.
   */
  parseFromTemplate = function (html: string): NodeList {
    template.innerHTML = html;
    return template.content.childNodes;
  };
}
/**
 * Parses an HTML string to DOM nodes, picking the parsing strategy from the
 * first tag found in the markup:
 *
 * - `<html>`: parsed with `parseFromString`; head/body elements that the
 *   markup does not contain are removed from the resulting document.
 * - `<head>` / `<body>`: parsed with `parseFromDocument`.
 * - anything else (other tags or text): parsed with `parseFromTemplate` when
 *   it is available, otherwise with `parseFromDocument` inside the body.
 *
 * @param html - HTML markup.
 * @returns - DOM nodes.
 */
export default function domparser(html: string): NodeList {
  // Escape special characters before parsing
  const markup = escapeSpecialCharacters(html);

  const tagMatch = markup.match(FIRST_TAG_REGEX);
  const firstTagName = tagMatch?.[1]?.toLowerCase() ?? '';

  if (firstTagName === HTML) {
    const doc = parseFromString(markup);

    // the created document may come with filler head/body elements,
    // so drop each one whose tag does not actually appear in the markup
    const dropFiller = (tagName: string, tagPattern: RegExp): void => {
      if (tagPattern.test(markup)) {
        return;
      }
      const filler = doc.querySelector(tagName);
      filler?.parentNode?.removeChild(filler);
    };

    dropFiller(HEAD, HEAD_TAG_REGEX);
    dropFiller(BODY, BODY_TAG_REGEX);

    return doc.querySelectorAll(HTML);
  }

  if (firstTagName === HEAD || firstTagName === BODY) {
    const elements = parseFromDocument(markup).querySelectorAll(firstTagName);
    const hasSibling =
      BODY_TAG_REGEX.test(markup) && HEAD_TAG_REGEX.test(markup);

    // if there's a sibling element, then return both elements
    return hasSibling ? elements[0].parentNode!.childNodes : elements;
  }

  // low-level tag or text
  if (parseFromTemplate) {
    return parseFromTemplate(markup);
  }

  return parseFromDocument(markup, BODY).querySelector(BODY)!.childNodes;
}