UNPKG

defuddle

Version:

Extract article content and metadata from web pages.

96 lines 2.97 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.transferContent = transferContent; exports.serializeHTML = serializeHTML; exports.decodeHTMLEntities = decodeHTMLEntities; exports.escapeHtml = escapeHtml; exports.isDangerousUrl = isDangerousUrl; exports.isDirectTableChild = isDirectTableChild; exports.parseHTML = parseHTML; /** * Move all child nodes from source to target. * Clears target first, then moves each child node from source. */ function transferContent(source, target) { if ('replaceChildren' in target) { target.replaceChildren(); } else { while (target.firstChild) { target.removeChild(target.firstChild); } } while (source.firstChild) { target.appendChild(source.firstChild); } } /** * Read an element's inner HTML. */ function serializeHTML(el) { return el.innerHTML; } /** * Decode HTML entities in a string (e.g. `&amp;` → `&`). * Uses a <textarea> element which is safe for entity decoding. */ function decodeHTMLEntities(doc, text) { const textarea = doc.createElement('textarea'); textarea.innerHTML = text; return textarea.value; } /** * Escape HTML special characters in a string. */ function escapeHtml(text) { return text .replace(/&/g, '&amp;') .replace(/</g, '&lt;') .replace(/>/g, '&gt;') .replace(/"/g, '&quot;'); } /** * Check if a URL uses a dangerous protocol (javascript:, data:text/html). * Strips whitespace and control characters before checking. */ function isDangerousUrl(url) { const normalized = url.replace(/[\s\u0000-\u001F]+/g, '').toLowerCase(); return normalized.startsWith('javascript:') || normalized.startsWith('data:text/html'); } /** * Check if an element belongs directly to an ancestor table, * not to an intervening nested TABLE. */ function isDirectTableChild(el, ancestor) { let parent = el.parentNode; while (parent && parent !== ancestor) { if (parent.nodeName === 'TABLE') return false; parent = parent.parentNode; } return parent === ancestor; } /** * Parse an HTML string into a DocumentFragment. * Uses a <template> element when available (safer: no script execution, * no resource loading). Falls back to a <div> for environments that * don't support template.content (e.g. some server-side DOM libraries). */ function parseHTML(doc, html) { if (!html) return doc.createDocumentFragment(); const template = doc.createElement('template'); template.innerHTML = html; if (template.content) { return template.content; } // Fallback for environments without template.content support const div = doc.createElement('div'); div.innerHTML = html; const fragment = doc.createDocumentFragment(); while (div.firstChild) { fragment.appendChild(div.firstChild); } return fragment; } //# sourceMappingURL=dom.js.map