UNPKG

clean-html

Version:
400 lines (319 loc) 9.28 kB
const htmlparser = require('htmlparser2');

// Elements that are rendered as a lone opening tag, with no children and no
// closing tag.
const voidElements = [
  'area',
  'base',
  'basefont',
  'br',
  'col',
  'command',
  'embed',
  'frame',
  'hr',
  'img',
  'input',
  'isindex',
  'keygen',
  'link',
  'meta',
  'param',
  'source',
  'track',
  'wbr'
];

// Effective options of the most recent `clean` call; populated by `setup`.
let options = {};

/**
 * Builds the module-level `options` object from user-supplied options,
 * filling in a default for every option that was not provided.
 *
 * @param {Object} opt - User options keyed by their hyphenated names. The
 *   `add-*` keys append to the corresponding list instead of replacing it.
 */
function setup(opt) {
  options = {
    'allow-attributes-without-values': opt['allow-attributes-without-values'] === true,
    'break-around-comments': opt['break-around-comments'] !== false,
    'break-around-tags': opt['break-around-tags'] || [
      'blockquote',
      'body',
      'br',
      'div',
      'h1',
      'h2',
      'h3',
      'h4',
      'h5',
      'h6',
      'head',
      'hr',
      'link',
      'meta',
      'p',
      'table',
      'td',
      'title',
      'tr'
    ],
    'decode-entities': opt['decode-entities'] === true,
    // NOTE(review): this listing appears whitespace-collapsed; confirm the
    // intended default indent against the package documentation.
    'indent': opt['indent'] || ' ',
    'lower-case-tags': opt['lower-case-tags'] !== false,
    'lower-case-attribute-names': opt['lower-case-attribute-names'] !== false,
    'preserve-tags': opt['preserve-tags'] || [
      'math',
      'script',
      'style',
      'svg'
    ],
    'remove-attributes': opt['remove-attributes'] || [
      'align',
      'bgcolor',
      'border',
      'cellpadding',
      'cellspacing',
      'color',
      'height',
      'target',
      'valign',
      'width'
    ],
    'remove-comments': opt['remove-comments'] === true,
    'remove-empty-tags': opt['remove-empty-tags'] || [],
    'remove-tags': opt['remove-tags'] || [
      'center',
      'font'
    ],
    // A wrap of 0 is kept as-is; `indent` treats it as "do not wrap".
    'wrap': opt['wrap'] >= 0 ? opt['wrap'] : 120
  };

  if (opt['add-break-around-tags']) {
    options['break-around-tags'] = options['break-around-tags'].concat(opt['add-break-around-tags']);
  }

  if (opt['add-remove-attributes']) {
    options['remove-attributes'] = options['remove-attributes'].concat(opt['add-remove-attributes']);
  }

  if (opt['add-remove-tags']) {
    options['remove-tags'] = options['remove-tags'].concat(opt['add-remove-tags']);
  }
}

/**
 * Decides whether line breaks are emitted before and after a node.
 *
 * @param {Object} node - DOM node produced by the parser.
 * @returns {boolean} True when the node is rendered on its own line(s).
 */
function breakAround(node) {
  if (shouldRemove(node)) {
    return false;
  }

  if (node.type === 'text') {
    return false;
  }

  if (node.type === 'comment') {
    return options['break-around-comments'];
  }

  if (options['break-around-tags'].includes(node.name)) {
    return true;
  }

  return breakWithin(node);
}

/**
 * Decides whether line breaks are emitted just inside a tag's opening and
 * closing tags, i.e. whether any child is rendered with breaks around it.
 *
 * @param {Object} node - DOM node produced by the parser.
 * @returns {boolean} True when the tag's content goes on separate lines.
 */
function breakWithin(node) {
  if (shouldRemove(node)) {
    return false;
  }

  if (node.type !== 'tag') {
    return false;
  }

  // `breakAround` already falls through to `breakWithin` for each child, so
  // a second `some(breakWithin)` pass could never change the result; it only
  // made the recursion exponential in the nesting depth.
  return node.children.some(breakAround);
}

/**
 * Tells whether a node has no meaningful content: blank text, a blank
 * comment, or a non-void element whose children are all empty.
 *
 * @param {Object} node - DOM node produced by the parser.
 * @returns {boolean} True when the node is empty.
 */
function isEmpty(node) {
  if (node.type === 'text') {
    return !node.data.trim();
  }

  if (node.type === 'comment') {
    return !node.data.trim();
  }

  // Void elements never have children but still count as content.
  if (voidElements.includes(node.name)) {
    return false;
  }

  return !node.children.length || node.children.every(isEmpty);
}

/**
 * Collapses every run of whitespace in `text` into a single space.
 *
 * @param {string} text
 * @returns {string}
 */
function removeExtraSpace(text) {
  return text.replace(/\s+/g, ' ');
}

/**
 * Decides whether a node is dropped from the output. For tags listed in
 * `remove-tags` only the tag itself is dropped; `renderTag` still renders
 * its children.
 *
 * @param {Object} node - DOM node produced by the parser.
 * @returns {boolean}
 */
function shouldRemove(node) {
  if (node.type === 'text') {
    return isEmpty(node);
  }

  if (node.type === 'comment') {
    return options['remove-comments'] || isEmpty(node);
  }

  if (isListedInOptions('remove-empty-tags', node.name)) {
    return isEmpty(node);
  }

  return isListedInOptions('remove-tags', node.name);
}

/**
 * Checks whether `name` matches any entry of one of the list options.
 * Entries may be strings (exact match) or RegExp objects.
 *
 * @param {string} optionsArrayName - Key of the list in `options`.
 * @param {string} name - Tag or attribute name to look up.
 * @returns {boolean}
 */
function isListedInOptions(optionsArrayName, name) {
  return options[optionsArrayName].some(option => {
    if (option instanceof RegExp) {
      // A /g or /y regex remembers `lastIndex` between `test` calls, which
      // would make repeated look-ups of the same name alternate.
      option.lastIndex = 0;

      if (option.test(name)) {
        return true;
      }
    }

    return option === name;
  });
}

/**
 * Renders a text node with whitespace collapsed. Leading/trailing space is
 * trimmed at the edges of the parent and next to siblings that are rendered
 * with line breaks around them.
 *
 * @param {Object} node - Text node.
 * @returns {string}
 */
function renderText(node) {
  if (shouldRemove(node)) {
    return '';
  }

  let text = removeExtraSpace(node.data);

  if (!node.prev || breakAround(node.prev)) {
    text = text.trimLeft();
  }

  if (!node.next || breakAround(node.next)) {
    text = text.trimRight();
  }

  return text;
}

/**
 * Renders a comment node, on its own line when `break-around-comments`
 * is enabled.
 *
 * @param {Object} node - Comment node.
 * @returns {string}
 */
function renderComment(node) {
  if (shouldRemove(node)) {
    return '';
  }

  const comment = '<!--' + removeExtraSpace(node.data) + '-->';

  if (breakAround(node)) {
    return '\n' + comment + '\n';
  }

  return comment;
}

/**
 * Renders an element: opening tag with its surviving attributes, rendered
 * children and closing tag, with line breaks as decided by `breakAround`
 * and `breakWithin`.
 *
 * @param {Object} node - Element node.
 * @returns {string}
 */
function renderTag(node) {
  if (shouldRemove(node)) {
    if (isEmpty(node)) {
      return '';
    }

    // The tag is stripped but its content is kept.
    return render(node.children);
  }

  let openTag = '<' + node.name;

  for (let attrib in node.attribs) {
    if (!isListedInOptions('remove-attributes', attrib)) {
      if (!node.attribs[attrib] && options['allow-attributes-without-values']) {
        openTag += ' ' + attrib;
      } else {
        openTag += ` ${attrib}="${removeExtraSpace(node.attribs[attrib])}"`;
      }
    }
  }

  openTag += '>';

  if (voidElements.includes(node.name)) {
    if (breakAround(node)) {
      return '\n' + openTag + '\n';
    }

    return openTag;
  }

  let closeTag = '</' + node.name + '>';

  if (breakAround(node)) {
    openTag = '\n' + openTag;
    closeTag = closeTag + '\n';
  }

  if (breakWithin(node)) {
    openTag = openTag + '\n';
    closeTag = '\n' + closeTag;
  }

  return openTag + render(node.children) + closeTag;
}

/**
 * Renders a directive node by wrapping its data in angle brackets.
 *
 * @param {Object} node - Directive node.
 * @returns {string}
 */
function renderDirective(node) {
  return '<' + node.data + '>';
}

/**
 * Renders a list of sibling nodes and collapses consecutive line breaks.
 *
 * @param {Object[]} nodes - DOM nodes to render in order.
 * @returns {string}
 */
function render(nodes) {
  let html = '';

  nodes.forEach(node => {
    if (node.type === 'root') {
      html += render(node.children);
      return;
    }

    if (node.type === 'text') {
      html += renderText(node);
      return;
    }

    if (node.type === 'comment') {
      html += renderComment(node);
      return;
    }

    if (node.type === 'directive') {
      html += renderDirective(node);
      return;
    }

    html += renderTag(node);
  });

  // remove extra line breaks
  return html.replace(/\n+/g, '\n');
}

/**
 * Breaks an over-long line at a space. Continuation lines get the original
 * indent plus two extra indent units and are wrapped recursively.
 *
 * @param {string} line - Already indented line.
 * @param {string} indent - The indent string that prefixes `line`.
 * @returns {string} One or more lines joined by '\n', or `line` unchanged
 *   when no usable space exists.
 */
function wrap(line, indent) {
  const limit = options['wrap'];

  // find the last space before the column limit
  let bound = line.lastIndexOf(' ', limit);

  if (bound === -1) {
    // there are no spaces before the column limit
    // so find the first space after it
    bound = line.indexOf(' ', limit);

    if (bound === -1) {
      // there are no spaces in the line
      // so we can't wrap it
      return line;
    }
  }

  const line1 = line.slice(0, bound);
  let line2 = indent + options['indent'].repeat(2) + line.slice(bound + 1);

  if (line1.trim().length === 0) {
    // there are no spaces in the line other than the indent
    // so we can't wrap it
    return line;
  }

  if (line2.length > limit) {
    line2 = wrap(line2, indent);
  }

  return line1 + '\n' + line2;
}

/**
 * Indents rendered HTML line by line. A line consisting solely of an
 * opening tag increases the level for the following lines; a line
 * consisting solely of a closing tag decreases it. Lines longer than the
 * `wrap` option are wrapped.
 *
 * @param {string} html - Output of `render`.
 * @returns {string}
 */
function indent(html) {
  let indentLevel = 0;
  const openTagRe = /^<(\w+)[^>]*>$/;
  const closeTagRe = /^<\/(\w+)>$/;

  return html.split('\n').map(line => {
    if (closeTagRe.test(line)) {
      // Never drop below zero: a closing-tag line is not always matched by a
      // recognised opening-tag line (e.g. an attribute value containing '>'
      // defeats openTagRe), and String.prototype.repeat throws a RangeError
      // for a negative count.
      indentLevel = Math.max(0, indentLevel - 1);
    }

    const prefix = options['indent'].repeat(indentLevel);
    const indented = prefix + line;

    const openTagMatch = line.match(openTagRe);

    if (openTagMatch && !voidElements.includes(openTagMatch[1])) {
      indentLevel++;
    }

    if (options['wrap'] && indented.length > options['wrap']) {
      return wrap(indented, prefix);
    }

    return indented;
  }).join('\n');
}

// Default store for preserved markup, keyed by the numeric offset at which
// the markup was found. `clean` passes its own per-call store instead so
// nothing accumulates across calls.
const preserveTagReplacements = new Map();

/**
 * Replaces every element listed in `preserve-tags` (including its content)
 * with a placeholder `<meta>` tag so the parser and renderer leave the
 * original markup untouched.
 *
 * @param {string} html - Raw input HTML.
 * @param {Map<number, string>} [replacements] - Store that receives the
 *   removed markup.
 * @returns {string} HTML with placeholders in place of preserved elements.
 */
function preserveTags(html, replacements = preserveTagReplacements) {
  const tagNames = options['preserve-tags'];

  if (!tagNames.length) {
    // An empty alternation would produce a pattern with empty tag names.
    return html;
  }

  const tagPattern = tagNames.join('|');
  const re = new RegExp(`<(?:${tagPattern})[^>]*>.*?<\/(?:${tagPattern})>`, 'gs');

  return html.replace(re, (match, offset) => {
    replacements.set(offset, match);

    return `<meta name="clean-html-replacement" offset="${offset}">`;
  });
}

/**
 * Puts the markup removed by `preserveTags` back in place of its
 * placeholder.
 *
 * @param {string} html - Rendered and indented HTML.
 * @param {Map<number, string>} [replacements] - Store filled by
 *   `preserveTags`.
 * @returns {string}
 */
function undoPreserveTags(html, replacements = preserveTagReplacements) {
  // `wrap` may break a long line at either space inside the placeholder, so
  // accept any whitespace between its attributes.
  const re = /<meta\s+name="clean-html-replacement"\s+offset="(\d+)">/g;

  return html.replace(re, (placeholder, offset) => {
    const key = Number(offset);

    // Leave look-alike tags that were not produced by `preserveTags` alone
    // rather than replacing them with the string "undefined".
    return replacements.has(key) ? replacements.get(key) : placeholder;
  });
}

/**
 * Cleans and re-formats an HTML string.
 *
 * @param {string} html - HTML to clean.
 * @param {Object|Function} [opt] - Options (see `setup`), or the callback
 *   when no options are given.
 * @param {Function} callback - Called with the cleaned HTML string.
 * @throws Rethrows any error the DOM handler reports.
 */
function clean(html, opt, callback) {
  if (typeof opt === 'function') {
    callback = opt;
    opt = null;
  }

  setup(opt || {});

  // Per-call store: preserved markup from one document can never leak into
  // another, and nothing is retained after the call.
  const replacements = new Map();

  const handler = new htmlparser.DomHandler((err, dom) => {
    if (err) {
      throw err;
    }

    callback(
      undoPreserveTags(
        indent(
          render(dom)
        ).trim(),
        replacements
      )
    );
  });

  const parser = new htmlparser.Parser(handler, {
    decodeEntities: options['decode-entities'],
    lowerCaseTags: options['lower-case-tags'],
    lowerCaseAttributeNames: options['lower-case-attribute-names'],
  });

  parser.write(
    preserveTags(html, replacements)
  );
  parser.end();
}

module.exports = {clean};