UNPKG

html-dom-parser

Version:
168 lines (140 loc) 4.68 kB
import { escapeSpecialCharacters } from './utilities';

// constants
const HTML = 'html';
const HEAD = 'head';
const BODY = 'body';
const FIRST_TAG_REGEX = /<([a-zA-Z]+[0-9]?)/; // e.g., <h1>

// `[^]*` matches every character, including newlines (DOTALL).
// The lookahead requires the tag name to end right after `head`/`body`
// (whitespace, `/` or `>`), so that `<header>` is not mistaken for `<head>`
// and a `<body…`-prefixed custom tag is not mistaken for `<body>`.
const HEAD_TAG_REGEX = /<head(?=[\s/>])[^]*>/i;
const BODY_TAG_REGEX = /<body(?=[\s/>])[^]*>/i;

/**
 * Parses HTML into a document.
 *
 * Starts as a stub that throws; it is replaced below by `parseFromString`
 * when `DOMParser` is available, and then by the `createHTMLDocument`-based
 * implementation when `document.implementation` is available.
 *
 * @param html - The HTML string.
 * @param tagName - The element to render the HTML (with 'body' as fallback).
 * @returns - Document.
 * @throws Error when neither implementation has been installed.
 */
// eslint-disable-next-line @typescript-eslint/no-unused-vars
let parseFromDocument = (html: string, tagName?: string): Document => {
  /* istanbul ignore next */
  throw new Error(
    'This browser does not support `document.implementation.createHTMLDocument`',
  );
};

/**
 * Parses HTML into a document using `DOMParser`.
 *
 * Starts as a stub that throws; it is replaced below when `DOMParser` is
 * available.
 *
 * @param html - The HTML string.
 * @param tagName - The element to render the HTML (with 'body' as fallback).
 * @returns - Document.
 * @throws Error when `DOMParser` is not available.
 */
// eslint-disable-next-line @typescript-eslint/no-unused-vars
let parseFromString = (html: string, tagName?: string): Document => {
  /* istanbul ignore next */
  throw new Error(
    'This browser does not support `DOMParser.prototype.parseFromString`',
  );
};

// `false` outside of a window context, otherwise the `DOMParser` constructor
const DOMParser = typeof window === 'object' && window.DOMParser;

/**
 * DOMParser (performance: slow).
 *
 * @see https://developer.mozilla.org/docs/Web/API/DOMParser#Parsing_an_SVG_or_HTML_document
 */
if (typeof DOMParser === 'function') {
  const domParser = new DOMParser();
  const mimeType = 'text/html';

  /**
   * Creates an HTML document using `DOMParser.parseFromString`.
   *
   * @param html - The HTML string.
   * @param tagName - The element to render the HTML (with 'body' as fallback).
   * @returns - Document.
   */
  parseFromString = (html: string, tagName?: string): Document => {
    if (tagName) {
      /* istanbul ignore next */
      html = `<${tagName}>${html}</${tagName}>`;
    }

    return domParser.parseFromString(html, mimeType);
  };

  // used as-is unless the `createHTMLDocument` branch below overrides it
  parseFromDocument = parseFromString;
}

/**
 * DOMImplementation (performance: fair).
 *
 * @see https://developer.mozilla.org/docs/Web/API/DOMImplementation/createHTMLDocument
 */
if (typeof document === 'object' && document.implementation) {
  // a single document is created once and reused by every call
  const htmlDocument = document.implementation.createHTMLDocument();

  /**
   * Use HTML document created by `document.implementation.createHTMLDocument`.
   *
   * @param html - The HTML string.
   * @param tagName - The element to render the HTML (with 'body' as fallback).
   * @returns - Document
   */
  parseFromDocument = function (html: string, tagName?: string): Document {
    if (tagName) {
      const element = htmlDocument.documentElement.querySelector(tagName);

      // when the element is missing, the document is returned unchanged
      if (element) {
        element.innerHTML = html;
      }

      return htmlDocument;
    }

    htmlDocument.documentElement.innerHTML = html;
    return htmlDocument;
  };
}

/**
 * Template (performance: fast).
 *
 * @see https://developer.mozilla.org/docs/Web/HTML/Element/template
 */
const template =
  typeof document === 'object' && document.createElement('template');

// stays `undefined` when the template element (or its `content`) is unavailable
let parseFromTemplate: ((html: string) => NodeList) | undefined;

if (template && template.content) {
  /**
   * Uses a template element (content fragment) to parse HTML.
   *
   * The same template element is reused, so each call overwrites the
   * contents produced by the previous call.
   *
   * @param html - HTML string.
   * @returns - Nodes.
   */
  parseFromTemplate = (html: string): NodeList => {
    template.innerHTML = html;
    return template.content.childNodes;
  };
}

/**
 * Parses HTML string to DOM nodes.
 *
 * The first tag name found in the markup selects the strategy:
 * - `html`: parse with `parseFromString` and drop the `head`/`body` elements
 *   that the markup did not contain.
 * - `head` / `body`: parse with `parseFromDocument`; when the markup has both
 *   tags, return both sibling elements.
 * - anything else (low-level tag or text): parse with the template element,
 *   falling back to the body of `parseFromDocument`.
 *
 * @param html - HTML markup.
 * @returns - DOM nodes.
 * @throws Error when the required browser parsing API is unavailable.
 */
export default function domparser(html: string): NodeList {
  // Escape special characters before parsing
  html = escapeSpecialCharacters(html);

  const match = html.match(FIRST_TAG_REGEX);
  const firstTagName = match && match[1] ? match[1].toLowerCase() : '';

  switch (firstTagName) {
    case HTML: {
      const doc = parseFromString(html);

      // the created document may come with filler head/body elements,
      // so make sure to remove them if they don't actually exist
      if (!HEAD_TAG_REGEX.test(html)) {
        const element = doc.querySelector(HEAD);
        element?.parentNode?.removeChild(element);
      }

      if (!BODY_TAG_REGEX.test(html)) {
        const element = doc.querySelector(BODY);
        element?.parentNode?.removeChild(element);
      }

      return doc.querySelectorAll(HTML);
    }

    case HEAD:
    case BODY: {
      const elements = parseFromDocument(html).querySelectorAll(firstTagName);

      // if there's a sibling element, then return both elements
      if (BODY_TAG_REGEX.test(html) && HEAD_TAG_REGEX.test(html)) {
        return elements[0].parentNode!.childNodes;
      }

      return elements;
    }

    // low-level tag or text
    default: {
      if (parseFromTemplate) {
        return parseFromTemplate(html);
      }

      const element = parseFromDocument(html, BODY).querySelector(BODY);
      return element!.childNodes;
    }
  }
}