sanitize-html
Version:
Clean up user-submitted HTML, preserving whitelisted elements and whitelisted attributes on a per-element basis
638 lines (581 loc) • 22.5 kB
JavaScript
var htmlparser = require('htmlparser2');
var extend = require('xtend');
var quoteRegexp = require('lodash.escaperegexp');
var cloneDeep = require('lodash.clonedeep');
var mergeWith = require('lodash.mergewith');
var isString = require('lodash.isstring');
var isPlainObject = require('lodash.isplainobject');
var srcset = require('srcset');
var postcss = require('postcss');
var url = require('url');
function each(obj, cb) {
if (obj) Object.keys(obj).forEach(function (key) {
cb(obj[key], key);
});
}
// Avoid false positives with .__proto__, .hasOwnProperty, etc.
function has(obj, key) {
return {}.hasOwnProperty.call(obj, key);
}
// Returns those elements of `a` for which `cb(a)` returns truthy
function filter(a, cb) {
var n = [];
each(a, function (v) {
if (cb(v)) {
n.push(v);
}
});
return n;
}
module.exports = sanitizeHtml;
// A valid attribute name.
// We use a tolerant definition based on the set of strings defined by
// html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
// and html.spec.whatwg.org/multipage/parsing.html#attribute-name-state .
// The characters accepted are ones which can be appended to the attribute
// name buffer without triggering a parse error:
// * unexpected-equals-sign-before-attribute-name
// * unexpected-null-character
// * unexpected-character-in-attribute-name
// We exclude the empty string because it's impossible to get to the after
// attribute name state with an empty attribute name buffer.
var VALID_HTML_ATTRIBUTE_NAME = /^[^\0\t\n\f\r /<=>]+$/;
// Ignore the _recursing flag; it's there for recursive
// invocation as a guard against this exploit:
// https://github.com/fb55/htmlparser2/issues/105
function sanitizeHtml(html, options, _recursing) {
var result = '';
function Frame(tag, attribs) {
var that = this;
this.tag = tag;
this.attribs = attribs || {};
this.tagPosition = result.length;
this.text = ''; // Node inner text
this.updateParentNodeText = function () {
if (stack.length) {
var parentFrame = stack[stack.length - 1];
parentFrame.text += that.text;
}
};
}
if (!options) {
options = sanitizeHtml.defaults;
options.parser = htmlParserDefaults;
} else {
options = extend(sanitizeHtml.defaults, options);
if (options.parser) {
options.parser = extend(htmlParserDefaults, options.parser);
} else {
options.parser = htmlParserDefaults;
}
}
// Tags that contain something other than HTML, or where discarding
// the text when the tag is disallowed makes sense for other reasons.
// If we are not allowing these tags, we should drop their content too.
// For other tags you would drop the tag but keep its content.
var nonTextTagsArray = options.nonTextTags || ['script', 'style', 'textarea'];
var allowedAttributesMap;
var allowedAttributesGlobMap;
if (options.allowedAttributes) {
allowedAttributesMap = {};
allowedAttributesGlobMap = {};
each(options.allowedAttributes, function (attributes, tag) {
allowedAttributesMap[tag] = [];
var globRegex = [];
attributes.forEach(function (obj) {
if (isString(obj) && obj.indexOf('*') >= 0) {
globRegex.push(quoteRegexp(obj).replace(/\\\*/g, '.*'));
} else {
allowedAttributesMap[tag].push(obj);
}
});
allowedAttributesGlobMap[tag] = new RegExp('^(' + globRegex.join('|') + ')$');
});
}
var allowedClassesMap = {};
each(options.allowedClasses, function (classes, tag) {
// Implicitly allows the class attribute
if (allowedAttributesMap) {
if (!has(allowedAttributesMap, tag)) {
allowedAttributesMap[tag] = [];
}
allowedAttributesMap[tag].push('class');
}
allowedClassesMap[tag] = classes;
});
var transformTagsMap = {};
var transformTagsAll;
each(options.transformTags, function (transform, tag) {
var transFun;
if (typeof transform === 'function') {
transFun = transform;
} else if (typeof transform === "string") {
transFun = sanitizeHtml.simpleTransform(transform);
}
if (tag === '*') {
transformTagsAll = transFun;
} else {
transformTagsMap[tag] = transFun;
}
});
var depth = 0;
var stack = [];
var skipMap = {};
var transformMap = {};
var skipText = false;
var skipTextDepth = 0;
var parser = new htmlparser.Parser({
onopentag: function onopentag(name, attribs) {
if (skipText) {
skipTextDepth++;
return;
}
var frame = new Frame(name, attribs);
stack.push(frame);
var skip = false;
var hasText = frame.text ? true : false;
var transformedTag;
if (has(transformTagsMap, name)) {
transformedTag = transformTagsMap[name](name, attribs);
frame.attribs = attribs = transformedTag.attribs;
if (transformedTag.text !== undefined) {
frame.innerText = transformedTag.text;
}
if (name !== transformedTag.tagName) {
frame.name = name = transformedTag.tagName;
transformMap[depth] = transformedTag.tagName;
}
}
if (transformTagsAll) {
transformedTag = transformTagsAll(name, attribs);
frame.attribs = attribs = transformedTag.attribs;
if (name !== transformedTag.tagName) {
frame.name = name = transformedTag.tagName;
transformMap[depth] = transformedTag.tagName;
}
}
if (options.allowedTags && options.allowedTags.indexOf(name) === -1) {
skip = true;
if (nonTextTagsArray.indexOf(name) !== -1) {
skipText = true;
skipTextDepth = 1;
}
skipMap[depth] = true;
}
depth++;
if (skip) {
// We want the contents but not this tag
return;
}
result += '<' + name;
if (!allowedAttributesMap || has(allowedAttributesMap, name) || allowedAttributesMap['*']) {
each(attribs, function (value, a) {
if (!VALID_HTML_ATTRIBUTE_NAME.test(a)) {
// This prevents part of an attribute name in the output from being
// interpreted as the end of an attribute, or end of a tag.
delete frame.attribs[a];
return;
}
var parsed;
// check allowedAttributesMap for the element and attribute and modify the value
// as necessary if there are specific values defined.
var passedAllowedAttributesMapCheck = false;
if (!allowedAttributesMap || has(allowedAttributesMap, name) && allowedAttributesMap[name].indexOf(a) !== -1 || allowedAttributesMap['*'] && allowedAttributesMap['*'].indexOf(a) !== -1 || has(allowedAttributesGlobMap, name) && allowedAttributesGlobMap[name].test(a) || allowedAttributesGlobMap['*'] && allowedAttributesGlobMap['*'].test(a)) {
passedAllowedAttributesMapCheck = true;
} else if (allowedAttributesMap && allowedAttributesMap[name]) {
var _iteratorNormalCompletion = true;
var _didIteratorError = false;
var _iteratorError = undefined;
try {
for (var _iterator = allowedAttributesMap[name][Symbol.iterator](), _step; !(_iteratorNormalCompletion = (_step = _iterator.next()).done); _iteratorNormalCompletion = true) {
var o = _step.value;
if (isPlainObject(o) && o.name && o.name === a) {
passedAllowedAttributesMapCheck = true;
var newValue = '';
if (o.multiple === true) {
// verify the values that are allowed
var splitStrArray = value.split(' ');
var _iteratorNormalCompletion2 = true;
var _didIteratorError2 = false;
var _iteratorError2 = undefined;
try {
for (var _iterator2 = splitStrArray[Symbol.iterator](), _step2; !(_iteratorNormalCompletion2 = (_step2 = _iterator2.next()).done); _iteratorNormalCompletion2 = true) {
var s = _step2.value;
if (o.values.indexOf(s) !== -1) {
if (newValue === '') {
newValue = s;
} else {
newValue += ' ' + s;
}
}
}
} catch (err) {
_didIteratorError2 = true;
_iteratorError2 = err;
} finally {
try {
if (!_iteratorNormalCompletion2 && _iterator2.return) {
_iterator2.return();
}
} finally {
if (_didIteratorError2) {
throw _iteratorError2;
}
}
}
} else if (o.values.indexOf(value) >= 0) {
// verified an allowed value matches the entire attribute value
newValue = value;
}
value = newValue;
}
}
} catch (err) {
_didIteratorError = true;
_iteratorError = err;
} finally {
try {
if (!_iteratorNormalCompletion && _iterator.return) {
_iterator.return();
}
} finally {
if (_didIteratorError) {
throw _iteratorError;
}
}
}
}
if (passedAllowedAttributesMapCheck) {
if (options.allowedSchemesAppliedToAttributes.indexOf(a) !== -1) {
if (naughtyHref(name, value)) {
delete frame.attribs[a];
return;
}
}
if (name === 'iframe' && a === 'src') {
var allowed = true;
try {
// naughtyHref is in charge of whether protocol relative URLs
// are cool. We should just accept them
parsed = url.parse(value, false, true);
var isRelativeUrl = parsed && parsed.host === null && parsed.protocol === null;
if (isRelativeUrl) {
// default value of allowIframeRelativeUrls is true unless allowIframeHostnames specified
allowed = has(options, "allowIframeRelativeUrls") ? options.allowIframeRelativeUrls : !options.allowedIframeHostnames;
} else if (options.allowedIframeHostnames) {
allowed = options.allowedIframeHostnames.find(function (hostname) {
return hostname === parsed.hostname;
});
}
} catch (e) {
// Unparseable iframe src
allowed = false;
}
if (!allowed) {
delete frame.attribs[a];
return;
}
}
if (a === 'srcset') {
try {
parsed = srcset.parse(value);
each(parsed, function (value) {
if (naughtyHref('srcset', value.url)) {
value.evil = true;
}
});
parsed = filter(parsed, function (v) {
return !v.evil;
});
if (!parsed.length) {
delete frame.attribs[a];
return;
} else {
value = srcset.stringify(filter(parsed, function (v) {
return !v.evil;
}));
frame.attribs[a] = value;
}
} catch (e) {
// Unparseable srcset
delete frame.attribs[a];
return;
}
}
if (a === 'class') {
value = filterClasses(value, allowedClassesMap[name]);
if (!value.length) {
delete frame.attribs[a];
return;
}
}
if (a === 'style') {
try {
var abstractSyntaxTree = postcss.parse(name + " {" + value + "}");
var filteredAST = filterCss(abstractSyntaxTree, options.allowedStyles);
value = stringifyStyleAttributes(filteredAST);
if (value.length === 0) {
delete frame.attribs[a];
return;
}
} catch (e) {
delete frame.attribs[a];
return;
}
}
result += ' ' + a;
if (value.length) {
result += '="' + escapeHtml(value, true) + '"';
}
} else {
delete frame.attribs[a];
}
});
}
if (options.selfClosing.indexOf(name) !== -1) {
result += " />";
} else {
result += ">";
if (frame.innerText && !hasText && !options.textFilter) {
result += frame.innerText;
}
}
},
ontext: function ontext(text) {
if (skipText) {
return;
}
var lastFrame = stack[stack.length - 1];
var tag;
if (lastFrame) {
tag = lastFrame.tag;
// If inner text was set by transform function then let's use it
text = lastFrame.innerText !== undefined ? lastFrame.innerText : text;
}
if (tag === 'script' || tag === 'style') {
// htmlparser2 gives us these as-is. Escaping them ruins the content. Allowing
// script tags is, by definition, game over for XSS protection, so if that's
// your concern, don't allow them. The same is essentially true for style tags
// which have their own collection of XSS vectors.
result += text;
} else {
var escaped = escapeHtml(text, false);
if (options.textFilter) {
result += options.textFilter(escaped);
} else {
result += escaped;
}
}
if (stack.length) {
var frame = stack[stack.length - 1];
frame.text += text;
}
},
onclosetag: function onclosetag(name) {
if (skipText) {
skipTextDepth--;
if (!skipTextDepth) {
skipText = false;
} else {
return;
}
}
var frame = stack.pop();
if (!frame) {
// Do not crash on bad markup
return;
}
skipText = false;
depth--;
if (skipMap[depth]) {
delete skipMap[depth];
frame.updateParentNodeText();
return;
}
if (transformMap[depth]) {
name = transformMap[depth];
delete transformMap[depth];
}
if (options.exclusiveFilter && options.exclusiveFilter(frame)) {
result = result.substr(0, frame.tagPosition);
return;
}
frame.updateParentNodeText();
if (options.selfClosing.indexOf(name) !== -1) {
// Already output />
return;
}
result += "</" + name + ">";
}
}, options.parser);
parser.write(html);
parser.end();
return result;
function escapeHtml(s, quote) {
if (typeof s !== 'string') {
s = s + '';
}
if (options.parser.decodeEntities) {
s = s.replace(/&/g, '&').replace(/</g, '<').replace(/\>/g, '>');
if (quote) {
s = s.replace(/\"/g, '"');
}
}
// TODO: this is inadequate because it will pass `&0;`. This approach
// will not work, each & must be considered with regard to whether it
// is followed by a 100% syntactically valid entity or not, and escaped
// if it is not. If this bothers you, don't set parser.decodeEntities
// to false. (The default is true.)
s = s.replace(/&(?![a-zA-Z0-9#]{1,20};)/g, '&') // Match ampersands not part of existing HTML entity
.replace(/</g, '<').replace(/\>/g, '>');
if (quote) {
s = s.replace(/\"/g, '"');
}
return s;
}
function naughtyHref(name, href) {
// Browsers ignore character codes of 32 (space) and below in a surprising
// number of situations. Start reading here:
// https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet#Embedded_tab
href = href.replace(/[\x00-\x20]+/g, '');
// Clobber any comments in URLs, which the browser might
// interpret inside an XML data island, allowing
// a javascript: URL to be snuck through
href = href.replace(/<\!\-\-.*?\-\-\>/g, '');
// Case insensitive so we don't get faked out by JAVASCRIPT #1
var matches = href.match(/^([a-zA-Z]+)\:/);
if (!matches) {
// Protocol-relative URL starting with any combination of '/' and '\'
if (href.match(/^[\/\\]{2}/)) {
return !options.allowProtocolRelative;
}
// No scheme
return false;
}
var scheme = matches[1].toLowerCase();
if (has(options.allowedSchemesByTag, name)) {
return options.allowedSchemesByTag[name].indexOf(scheme) === -1;
}
return !options.allowedSchemes || options.allowedSchemes.indexOf(scheme) === -1;
}
/**
* Filters user input css properties by whitelisted regex attributes.
*
* @param {object} abstractSyntaxTree - Object representation of CSS attributes.
* @property {array[Declaration]} abstractSyntaxTree.nodes[0] - Each object cointains prop and value key, i.e { prop: 'color', value: 'red' }.
* @param {object} allowedStyles - Keys are properties (i.e color), value is list of permitted regex rules (i.e /green/i).
* @return {object} - Abstract Syntax Tree with filtered style attributes.
*/
function filterCss(abstractSyntaxTree, allowedStyles) {
if (!allowedStyles) {
return abstractSyntaxTree;
}
var filteredAST = cloneDeep(abstractSyntaxTree);
var astRules = abstractSyntaxTree.nodes[0];
var selectedRule;
// Merge global and tag-specific styles into new AST.
if (allowedStyles[astRules.selector] && allowedStyles['*']) {
selectedRule = mergeWith(cloneDeep(allowedStyles[astRules.selector]), allowedStyles['*'], function (objValue, srcValue) {
if (Array.isArray(objValue)) {
return objValue.concat(srcValue);
}
});
} else {
selectedRule = allowedStyles[astRules.selector] || allowedStyles['*'];
}
if (selectedRule) {
filteredAST.nodes[0].nodes = astRules.nodes.reduce(filterDeclarations(selectedRule), []);
}
return filteredAST;
}
/**
* Extracts the style attribues from an AbstractSyntaxTree and formats those
* values in the inline style attribute format.
*
* @param {AbstractSyntaxTree} filteredAST
* @return {string} - Example: "color:yellow;text-align:center;font-family:helvetica;"
*/
function stringifyStyleAttributes(filteredAST) {
return filteredAST.nodes[0].nodes.reduce(function (extractedAttributes, attributeObject) {
extractedAttributes.push(attributeObject.prop + ':' + attributeObject.value);
return extractedAttributes;
}, []).join(';');
}
/**
* Filters the existing attributes for the given property. Discards any attributes
* which don't match the whitelist.
*
* @param {object} selectedRule - Example: { color: red, font-family: helvetica }
* @param {array} allowedDeclarationsList - List of declarations which pass whitelisting.
* @param {object} attributeObject - Object representing the current css property.
* @property {string} attributeObject.type - Typically 'declaration'.
* @property {string} attributeObject.prop - The CSS property, i.e 'color'.
* @property {string} attributeObject.value - The corresponding value to the css property, i.e 'red'.
* @return {function} - When used in Array.reduce, will return an array of Declaration objects
*/
function filterDeclarations(selectedRule) {
return function (allowedDeclarationsList, attributeObject) {
// If this property is whitelisted...
if (selectedRule.hasOwnProperty(attributeObject.prop)) {
var matchesRegex = selectedRule[attributeObject.prop].some(function (regularExpression) {
return regularExpression.test(attributeObject.value);
});
if (matchesRegex) {
allowedDeclarationsList.push(attributeObject);
}
}
return allowedDeclarationsList;
};
}
function filterClasses(classes, allowed) {
if (!allowed) {
// The class attribute is allowed without filtering on this tag
return classes;
}
classes = classes.split(/\s+/);
return classes.filter(function (clss) {
return allowed.indexOf(clss) !== -1;
}).join(' ');
}
}
// Defaults are accessible to you so that you can use them as a starting point
// programmatically if you wish
var htmlParserDefaults = {
decodeEntities: true
};
sanitizeHtml.defaults = {
allowedTags: ['h3', 'h4', 'h5', 'h6', 'blockquote', 'p', 'a', 'ul', 'ol', 'nl', 'li', 'b', 'i', 'strong', 'em', 'strike', 'code', 'hr', 'br', 'div', 'table', 'thead', 'caption', 'tbody', 'tr', 'th', 'td', 'pre', 'iframe'],
allowedAttributes: {
a: ['href', 'name', 'target'],
// We don't currently allow img itself by default, but this
// would make sense if we did. You could add srcset here,
// and if you do the URL is checked for safety
img: ['src']
},
// Lots of these won't come up by default because we don't allow them
selfClosing: ['img', 'br', 'hr', 'area', 'base', 'basefont', 'input', 'link', 'meta'],
// URL schemes we permit
allowedSchemes: ['http', 'https', 'ftp', 'mailto'],
allowedSchemesByTag: {},
allowedSchemesAppliedToAttributes: ['href', 'src', 'cite'],
allowProtocolRelative: true
};
sanitizeHtml.simpleTransform = function (newTagName, newAttribs, merge) {
merge = merge === undefined ? true : merge;
newAttribs = newAttribs || {};
return function (tagName, attribs) {
var attrib;
if (merge) {
for (attrib in newAttribs) {
attribs[attrib] = newAttribs[attrib];
}
} else {
attribs = newAttribs;
}
return {
tagName: newTagName,
attribs: attribs
};
};
};
;