UNPKG

sanitize-html

Version:

Clean up user-submitted HTML, preserving whitelisted elements and whitelisted attributes on a per-element basis

638 lines (581 loc) 22.5 kB
'use strict';

var htmlparser = require('htmlparser2');
var extend = require('xtend');
var quoteRegexp = require('lodash.escaperegexp');
var cloneDeep = require('lodash.clonedeep');
var mergeWith = require('lodash.mergewith');
var isString = require('lodash.isstring');
var isPlainObject = require('lodash.isplainobject');
var srcset = require('srcset');
var postcss = require('postcss');
var url = require('url');

/**
 * Invokes `cb(value, key)` once for every own enumerable key of `obj`.
 * A falsy `obj` is tolerated and simply results in no calls.
 */
function each(obj, cb) {
  if (!obj) {
    return;
  }
  var keys = Object.keys(obj);
  for (var i = 0; i < keys.length; i++) {
    var key = keys[i];
    cb(obj[key], key);
  }
}

/**
 * Own-property test that goes through Object.prototype, so it avoids false
 * positives with .__proto__, .hasOwnProperty, etc.
 */
function has(obj, key) {
  return Object.prototype.hasOwnProperty.call(obj, key);
}

/**
 * Returns a new array holding those values of `a` for which `cb(value)`
 * returns truthy. `a` may be an array or a plain object; a falsy `a`
 * yields an empty array.
 */
function filter(a, cb) {
  var kept = [];
  each(a, function (value) {
    if (!cb(value)) {
      return;
    }
    kept.push(value);
  });
  return kept;
}

module.exports = sanitizeHtml;

// A valid attribute name.
// We use a tolerant definition based on the set of strings defined by
// html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
// and html.spec.whatwg.org/multipage/parsing.html#attribute-name-state .
// The characters accepted are ones which can be appended to the attribute
// name buffer without triggering a parse error:
//   * unexpected-equals-sign-before-attribute-name
//   * unexpected-null-character
//   * unexpected-character-in-attribute-name
// We exclude the empty string because it's impossible to get to the after
// attribute name state with an empty attribute name buffer.
var VALID_HTML_ATTRIBUTE_NAME = /^[^\0\t\n\f\r /<=>]+$/;

/**
 * Sanitizes an HTML string, emitting only the tags and attributes that the
 * options allow and escaping all text content.
 *
 * Options read by this function (each falls back to `sanitizeHtml.defaults`):
 *   - allowedTags: array of tag names to keep; a falsy value keeps every tag.
 *   - allowedAttributes: map of tag name (or '*') to a list of attribute
 *     names, glob patterns containing '*', or `{ name, values, multiple }`
 *     objects; a falsy value keeps every attribute.
 *   - allowedClasses: map of tag name to permitted class names.
 *   - allowedStyles: map of tag name (or '*') to `{ property: [RegExp] }`.
 *   - allowedSchemes / allowedSchemesByTag / allowedSchemesAppliedToAttributes
 *     / allowProtocolRelative: URL scheme policy.
 *   - allowedIframeHostnames / allowIframeRelativeUrls: iframe `src` policy.
 *   - nonTextTags: disallowed tags whose text content is dropped as well.
 *   - selfClosing: tags written as `<tag />` with no closing tag.
 *   - transformTags: map of tag name (or '*') to a transform function or a
 *     replacement tag name.
 *   - exclusiveFilter(frame): return truthy to drop an element entirely.
 *   - textFilter(text): post-processes escaped text.
 *   - parser: options passed to htmlparser2, merged over `{ decodeEntities: true }`.
 *
 * @param {string} html - Markup to sanitize.
 * @param {object} [options] - Overrides for `sanitizeHtml.defaults`.
 * @param {*} [_recursing] - Ignore the _recursing flag; it's there for
 *   recursive invocation as a guard against this exploit:
 *   https://github.com/fb55/htmlparser2/issues/105
 * @return {string} The sanitized markup.
 */
function sanitizeHtml(html, options, _recursing) {
  var result = '';

  // One Frame per open element that the parser is currently inside.
  function Frame(tag, attribs) {
    var that = this;
    this.tag = tag;
    this.attribs = attribs || {};
    // Offset into `result` where this element's output starts, so the whole
    // element can be cut out again if exclusiveFilter rejects it.
    this.tagPosition = result.length;
    this.text = ''; // Node inner text

    this.updateParentNodeText = function () {
      if (stack.length) {
        var parentFrame = stack[stack.length - 1];
        parentFrame.text += that.text;
      }
    };
  }

  if (!options) {
    // Build a fresh object rather than writing `parser` onto the shared,
    // publicly exposed sanitizeHtml.defaults.
    options = extend(sanitizeHtml.defaults, { parser: htmlParserDefaults });
  } else {
    options = extend(sanitizeHtml.defaults, options);
    if (options.parser) {
      options.parser = extend(htmlParserDefaults, options.parser);
    } else {
      options.parser = htmlParserDefaults;
    }
  }

  // Tags that contain something other than HTML, or where discarding
  // the text when the tag is disallowed makes sense for other reasons.
  // If we are not allowing these tags, we should drop their content too.
  // For other tags you would drop the tag but keep its content.
  var nonTextTagsArray = options.nonTextTags || ['script', 'style', 'textarea'];

  var allowedAttributesMap;
  var allowedAttributesGlobMap;
  if (options.allowedAttributes) {
    allowedAttributesMap = {};
    allowedAttributesGlobMap = {};
    each(options.allowedAttributes, function (attributes, tag) {
      allowedAttributesMap[tag] = [];
      var globRegex = [];
      attributes.forEach(function (obj) {
        if (isString(obj) && obj.indexOf('*') >= 0) {
          // Escape the pattern, then turn each escaped '*' into a wildcard.
          globRegex.push(quoteRegexp(obj).replace(/\\\*/g, '.*'));
        } else {
          allowedAttributesMap[tag].push(obj);
        }
      });
      allowedAttributesGlobMap[tag] = new RegExp('^(' + globRegex.join('|') + ')$');
    });
  }

  var allowedClassesMap = {};
  each(options.allowedClasses, function (classes, tag) {
    // Implicitly allows the class attribute
    if (allowedAttributesMap) {
      if (!has(allowedAttributesMap, tag)) {
        allowedAttributesMap[tag] = [];
      }
      allowedAttributesMap[tag].push('class');
    }
    allowedClassesMap[tag] = classes;
  });

  var transformTagsMap = {};
  var transformTagsAll;
  each(options.transformTags, function (transform, tag) {
    var transFun;
    if (typeof transform === 'function') {
      transFun = transform;
    } else if (typeof transform === "string") {
      transFun = sanitizeHtml.simpleTransform(transform);
    }
    if (tag === '*') {
      transformTagsAll = transFun;
    } else {
      transformTagsMap[tag] = transFun;
    }
  });

  var depth = 0;
  var stack = [];
  var skipMap = {}; // depth -> true when the tag at that depth was dropped
  var transformMap = {}; // depth -> replacement tag name for the close tag
  var skipText = false;
  var skipTextDepth = 0;

  var parser = new htmlparser.Parser({
    onopentag: function onopentag(name, attribs) {
      if (skipText) {
        skipTextDepth++;
        return;
      }
      var frame = new Frame(name, attribs);
      stack.push(frame);

      var skip = false;
      var transformedTag;
      if (has(transformTagsMap, name)) {
        transformedTag = transformTagsMap[name](name, attribs);

        frame.attribs = attribs = transformedTag.attribs;

        if (transformedTag.text !== undefined) {
          frame.innerText = transformedTag.text;
        }

        if (name !== transformedTag.tagName) {
          frame.name = name = transformedTag.tagName;
          transformMap[depth] = transformedTag.tagName;
        }
      }
      if (transformTagsAll) {
        transformedTag = transformTagsAll(name, attribs);

        frame.attribs = attribs = transformedTag.attribs;
        if (name !== transformedTag.tagName) {
          frame.name = name = transformedTag.tagName;
          transformMap[depth] = transformedTag.tagName;
        }
      }

      if (options.allowedTags && options.allowedTags.indexOf(name) === -1) {
        skip = true;
        if (nonTextTagsArray.indexOf(name) !== -1) {
          skipText = true;
          skipTextDepth = 1;
        }
        skipMap[depth] = true;
      }
      depth++;
      if (skip) {
        // We want the contents but not this tag
        return;
      }
      result += '<' + name;
      if (!allowedAttributesMap || has(allowedAttributesMap, name) || allowedAttributesMap['*']) {
        each(attribs, function (value, a) {
          if (!VALID_HTML_ATTRIBUTE_NAME.test(a)) {
            // This prevents part of an attribute name in the output from being
            // interpreted as the end of an attribute, or end of a tag.
            delete frame.attribs[a];
            return;
          }
          var parsed;
          // check allowedAttributesMap for the element and attribute and modify the value
          // as necessary if there are specific values defined.
          var passedAllowedAttributesMapCheck = false;
          if (!allowedAttributesMap ||
            (has(allowedAttributesMap, name) && allowedAttributesMap[name].indexOf(a) !== -1) ||
            (allowedAttributesMap['*'] && allowedAttributesMap['*'].indexOf(a) !== -1) ||
            (has(allowedAttributesGlobMap, name) && allowedAttributesGlobMap[name].test(a)) ||
            (allowedAttributesGlobMap['*'] && allowedAttributesGlobMap['*'].test(a))) {
            passedAllowedAttributesMapCheck = true;
          } else if (has(allowedAttributesMap, name)) {
            // Own-property check: a tag named e.g. `constructor` must not
            // resolve to an Object.prototype member and crash the loop below.
            var rules = allowedAttributesMap[name];
            for (var i = 0; i < rules.length; i++) {
              var o = rules[i];
              if (isPlainObject(o) && o.name && o.name === a) {
                passedAllowedAttributesMapCheck = true;
                var newValue = '';
                if (o.multiple === true) {
                  // verify the values that are allowed
                  var splitStrArray = value.split(' ');
                  for (var j = 0; j < splitStrArray.length; j++) {
                    var s = splitStrArray[j];
                    if (o.values.indexOf(s) !== -1) {
                      if (newValue === '') {
                        newValue = s;
                      } else {
                        newValue += ' ' + s;
                      }
                    }
                  }
                } else if (o.values.indexOf(value) >= 0) {
                  // verified an allowed value matches the entire attribute value
                  newValue = value;
                }
                value = newValue;
              }
            }
          }
          if (passedAllowedAttributesMapCheck) {
            if (options.allowedSchemesAppliedToAttributes.indexOf(a) !== -1) {
              if (naughtyHref(name, value)) {
                delete frame.attribs[a];
                return;
              }
            }
            if (name === 'iframe' && a === 'src') {
              var allowed = true;
              try {
                // naughtyHref is in charge of whether protocol relative URLs
                // are cool. We should just accept them
                parsed = url.parse(value, false, true);
                var isRelativeUrl = parsed && parsed.host === null && parsed.protocol === null;
                if (isRelativeUrl) {
                  // default value of allowIframeRelativeUrls is true unless allowIframeHostnames specified
                  allowed = has(options, "allowIframeRelativeUrls") ?
                    options.allowIframeRelativeUrls : !options.allowedIframeHostnames;
                } else if (options.allowedIframeHostnames) {
                  allowed = options.allowedIframeHostnames.find(function (hostname) {
                    return hostname === parsed.hostname;
                  });
                }
              } catch (e) {
                // Unparseable iframe src
                allowed = false;
              }
              if (!allowed) {
                delete frame.attribs[a];
                return;
              }
            }
            if (a === 'srcset') {
              try {
                parsed = srcset.parse(value);
                each(parsed, function (candidate) {
                  if (naughtyHref('srcset', candidate.url)) {
                    candidate.evil = true;
                  }
                });
                parsed = filter(parsed, function (v) {
                  return !v.evil;
                });
                if (!parsed.length) {
                  delete frame.attribs[a];
                  return;
                } else {
                  value = srcset.stringify(parsed);
                  frame.attribs[a] = value;
                }
              } catch (e) {
                // Unparseable srcset
                delete frame.attribs[a];
                return;
              }
            }
            if (a === 'class') {
              // Own-property check for the same reason as above.
              value = filterClasses(value, has(allowedClassesMap, name) ? allowedClassesMap[name] : undefined);
              if (!value.length) {
                delete frame.attribs[a];
                return;
              }
            }
            if (a === 'style') {
              try {
                // Wrap the inline declarations in a rule so postcss can parse them.
                var abstractSyntaxTree = postcss.parse(name + " {" + value + "}");
                var filteredAST = filterCss(abstractSyntaxTree, options.allowedStyles);

                value = stringifyStyleAttributes(filteredAST);

                if (value.length === 0) {
                  delete frame.attribs[a];
                  return;
                }
              } catch (e) {
                // Unparseable style
                delete frame.attribs[a];
                return;
              }
            }
            result += ' ' + a;
            if (value.length) {
              result += '="' + escapeHtml(value, true) + '"';
            }
          } else {
            delete frame.attribs[a];
          }
        });
      }
      if (options.selfClosing.indexOf(name) !== -1) {
        result += " />";
      } else {
        result += ">";
        // A freshly created frame never has text yet, so only the transform's
        // innerText and the textFilter option decide this.
        if (frame.innerText && !options.textFilter) {
          result += frame.innerText;
        }
      }
    },
    ontext: function ontext(text) {
      if (skipText) {
        return;
      }
      var lastFrame = stack[stack.length - 1];
      var tag;

      if (lastFrame) {
        tag = lastFrame.tag;
        // If inner text was set by transform function then let's use it
        text = lastFrame.innerText !== undefined ? lastFrame.innerText : text;
      }

      if (tag === 'script' || tag === 'style') {
        // htmlparser2 gives us these as-is. Escaping them ruins the content. Allowing
        // script tags is, by definition, game over for XSS protection, so if that's
        // your concern, don't allow them. The same is essentially true for style tags
        // which have their own collection of XSS vectors.
        result += text;
      } else {
        var escaped = escapeHtml(text, false);
        if (options.textFilter) {
          result += options.textFilter(escaped);
        } else {
          result += escaped;
        }
      }
      if (stack.length) {
        var frame = stack[stack.length - 1];
        frame.text += text;
      }
    },
    onclosetag: function onclosetag(name) {
      if (skipText) {
        skipTextDepth--;
        if (!skipTextDepth) {
          skipText = false;
        } else {
          return;
        }
      }

      var frame = stack.pop();
      if (!frame) {
        // Do not crash on bad markup
        return;
      }
      skipText = false;
      depth--;
      if (skipMap[depth]) {
        delete skipMap[depth];
        frame.updateParentNodeText();
        return;
      }

      if (transformMap[depth]) {
        name = transformMap[depth];
        delete transformMap[depth];
      }

      if (options.exclusiveFilter && options.exclusiveFilter(frame)) {
        // Discard everything emitted since this element was opened.
        result = result.substr(0, frame.tagPosition);
        return;
      }

      frame.updateParentNodeText();

      if (options.selfClosing.indexOf(name) !== -1) {
        // Already output />
        return;
      }

      result += "</" + name + ">";
    }
  }, options.parser);
  parser.write(html);
  parser.end();

  return result;

  /**
   * Escapes text for output as HTML.
   *
   * @param {*} s - Value to escape; non-strings are coerced to string.
   * @param {boolean} quote - Also escape double quotes (attribute values).
   * @return {string}
   */
  function escapeHtml(s, quote) {
    if (typeof s !== 'string') {
      s = s + '';
    }
    if (options.parser.decodeEntities) {
      // The parser already decoded entities, so every '&' is literal.
      s = s.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/\>/g, '&gt;');
    } else {
      // TODO: this is inadequate because it will pass `&0;`. This approach
      // will not work, each & must be considered with regard to whether it
      // is followed by a 100% syntactically valid entity or not, and escaped
      // if it is not. If this bothers you, don't set parser.decodeEntities
      // to false. (The default is true.)
      s = s.replace(/&(?![a-zA-Z0-9#]{1,20};)/g, '&amp;') // Match ampersands not part of existing HTML entity
        .replace(/</g, '&lt;')
        .replace(/\>/g, '&gt;');
    }
    if (quote) {
      s = s.replace(/\"/g, '&quot;');
    }
    return s;
  }

  /**
   * Decides whether a URL must be rejected under the scheme policy.
   *
   * @param {string} name - Tag name (or 'srcset') used for allowedSchemesByTag.
   * @param {string} href - The URL to inspect.
   * @return {boolean} true when the URL is NOT acceptable.
   */
  function naughtyHref(name, href) {
    // Browsers ignore character codes of 32 (space) and below in a surprising
    // number of situations. Start reading here:
    // https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet#Embedded_tab
    href = href.replace(/[\x00-\x20]+/g, '');
    // Clobber any comments in URLs, which the browser might
    // interpret inside an XML data island, allowing
    // a javascript: URL to be snuck through
    href = href.replace(/<\!\-\-.*?\-\-\>/g, '');
    // Schemes may contain digits, '+', '-' and '.' after the first letter
    // (RFC 3986 section 3.1). Matching letters only would treat e.g.
    // `x-foo:` as "no scheme" and let it bypass the whitelist.
    // The scheme is lower-cased below so we don't get faked out by JAVASCRIPT #1
    var matches = href.match(/^([a-zA-Z][a-zA-Z0-9.\-+]*)\:/);
    if (!matches) {
      // Protocol-relative URL starting with any combination of '/' and '\'
      if (href.match(/^[\/\\]{2}/)) {
        return !options.allowProtocolRelative;
      }

      // No scheme
      return false;
    }
    var scheme = matches[1].toLowerCase();

    if (has(options.allowedSchemesByTag, name)) {
      return options.allowedSchemesByTag[name].indexOf(scheme) === -1;
    }

    return !options.allowedSchemes || options.allowedSchemes.indexOf(scheme) === -1;
  }

  /**
   * Filters user input css properties by whitelisted regex attributes.
   *
   * @param {object} abstractSyntaxTree - Object representation of CSS attributes.
   * @property {array[Declaration]} abstractSyntaxTree.nodes[0] - Each object contains prop and value key, i.e { prop: 'color', value: 'red' }.
   * @param {object} allowedStyles - Keys are properties (i.e color), value is list of permitted regex rules (i.e /green/i).
   * @return {object} - Abstract Syntax Tree with filtered style attributes.
   */
  function filterCss(abstractSyntaxTree, allowedStyles) {
    if (!allowedStyles) {
      return abstractSyntaxTree;
    }

    var filteredAST = cloneDeep(abstractSyntaxTree);
    var astRules = abstractSyntaxTree.nodes[0];
    var selectedRule;

    // Merge global and tag-specific styles into new AST.
    if (allowedStyles[astRules.selector] && allowedStyles['*']) {
      selectedRule = mergeWith(cloneDeep(allowedStyles[astRules.selector]), allowedStyles['*'], function (objValue, srcValue) {
        if (Array.isArray(objValue)) {
          return objValue.concat(srcValue);
        }
      });
    } else {
      selectedRule = allowedStyles[astRules.selector] || allowedStyles['*'];
    }

    if (selectedRule) {
      filteredAST.nodes[0].nodes = astRules.nodes.reduce(filterDeclarations(selectedRule), []);
    }

    return filteredAST;
  }

  /**
   * Extracts the style attributes from an AbstractSyntaxTree and formats those
   * values in the inline style attribute format.
   *
   * @param {AbstractSyntaxTree} filteredAST
   * @return {string} - Example: "color:yellow;text-align:center;font-family:helvetica"
   */
  function stringifyStyleAttributes(filteredAST) {
    return filteredAST.nodes[0].nodes.map(function (attributeObject) {
      return attributeObject.prop + ':' + attributeObject.value;
    }).join(';');
  }

  /**
   * Filters the existing attributes for the given property. Discards any attributes
   * which don't match the whitelist.
   *
   * @param {object} selectedRule - Example: { color: [/red/], 'font-family': [/helvetica/] }
   * @return {function} - A reducer `(allowedDeclarationsList, attributeObject)`
   *   which, when used in Array.reduce, returns an array of the Declaration
   *   objects whose `prop` is whitelisted and whose `value` matches at least
   *   one of that property's regular expressions.
   */
  function filterDeclarations(selectedRule) {
    return function (allowedDeclarationsList, attributeObject) {
      // If this property is whitelisted...
      if (has(selectedRule, attributeObject.prop)) {
        var matchesRegex = selectedRule[attributeObject.prop].some(function (regularExpression) {
          return regularExpression.test(attributeObject.value);
        });

        if (matchesRegex) {
          allowedDeclarationsList.push(attributeObject);
        }
      }
      return allowedDeclarationsList;
    };
  }

  /**
   * Keeps only the whitelisted class names of a `class` attribute value.
   *
   * @param {string} classes - Raw attribute value.
   * @param {array} [allowed] - Permitted class names; when falsy the value is
   *   returned untouched.
   * @return {string} Space-separated surviving class names.
   */
  function filterClasses(classes, allowed) {
    if (!allowed) {
      // The class attribute is allowed without filtering on this tag
      return classes;
    }
    classes = classes.split(/\s+/);
    return classes.filter(function (clss) {
      return allowed.indexOf(clss) !== -1;
    }).join(' ');
  }
}

// Defaults are accessible to you so that you can use them as a starting point
// programmatically if you wish

var htmlParserDefaults = {
  decodeEntities: true
};

sanitizeHtml.defaults = {
  allowedTags: ['h3', 'h4', 'h5', 'h6', 'blockquote', 'p', 'a', 'ul', 'ol', 'nl', 'li', 'b', 'i', 'strong', 'em', 'strike', 'code', 'hr', 'br', 'div', 'table', 'thead', 'caption', 'tbody', 'tr', 'th', 'td', 'pre', 'iframe'],
  allowedAttributes: {
    a: ['href', 'name', 'target'],
    // We don't currently allow img itself by default, but this
    // would make sense if we did. You could add srcset here,
    // and if you do the URL is checked for safety
    img: ['src']
  },
  // Lots of these won't come up by default because we don't allow them
  selfClosing: ['img', 'br', 'hr', 'area', 'base', 'basefont', 'input', 'link', 'meta'],
  // URL schemes we permit
  allowedSchemes: ['http', 'https', 'ftp', 'mailto'],
  allowedSchemesByTag: {},
  allowedSchemesAppliedToAttributes: ['href', 'src', 'cite'],
  allowProtocolRelative: true
};

/**
 * Builds a transform function for `transformTags` that renames a tag and
 * optionally sets attributes on it.
 *
 * @param {string} newTagName - Tag name the element is rewritten to.
 * @param {object} [newAttribs] - Attributes to apply (defaults to none).
 * @param {boolean} [merge=true] - When true the new attributes are copied onto
 *   the element's existing attribute object; when false they replace it.
 * @return {function} `(tagName, attribs) => { tagName, attribs }`
 */
sanitizeHtml.simpleTransform = function (newTagName, newAttribs, merge) {
  merge = merge === undefined ? true : merge;
  newAttribs = newAttribs || {};

  return function (tagName, attribs) {
    var attrib;
    if (merge) {
      for (attrib in newAttribs) {
        attribs[attrib] = newAttribs[attrib];
      }
    } else {
      attribs = newAttribs;
    }

    return {
      tagName: newTagName,
      attribs: attribs
    };
  };
};