UNPKG

simple-xss

Version:

Simple whitelist-based html sanitizer for node and browser.

266 lines (243 loc) 6.08 kB
// xss.js VERSION // Copyright (c) 2014 Kumu // Freely distributed under the MIT license. (function($) { function xss(html, options) { return sanitizeHtml(html, options || xss.defaults); } xss.url = function(url, options) { return sanitizeResource(url, options || xss.defaults); }; // Exposed for testing xss._sanitizeAttributes = sanitizeAttributes; xss._getAttributeName = getAttributeName; xss.defaults = { elements: [ "a", "aside", "b", "blockquote", "br", "caption", "code", "del", "dd", "dfn", "div", "dl", "dt", "em", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "i", "img", "ins", "kbd", "li", "ol", "p", "pre", "q", "samp", "strike", "strong", "sub", "sup", "table", "tbody", "td", "tfoot", "th", "thead", "tr", "tt", "ul", "var" ], attributes: { "a": ["href"], "img": ["src"], "div": ["itemscope", "itemtype"], "all": [ "abbr", "accept", "accept-charset", "accesskey", "action", "align", "alt", "axis", "border", "cellpadding", "cellspacing", "char", "charoff", "charset", "checked", "cite", "clear", "cols", "colspan", "color", "compact", "coords", // "data-[a-z0-9-]+", "datetime", "dir", "disabled", "enctype", "for", "frame", "headers", "height", "hreflang", "hspace", "ismap", "label", "lang", "longdesc", "maxlength", "media", "method", "multiple", "name", "nohref", "noshade", "nowrap", "prompt", "readonly", "rel", "rev", "rows", "rowspan", "rules", "scope", "selected", "shape", "size", "span", "start", "summary", "tabindex", "target", "title", "type", "usemap", "valign", "value", "vspace", "width", "itemprop" ] }, // Default protocol support includes http(s), mailto, and relative. // TODO: Support protocol resolution too? //example.com protocols: /^(http|https|mailto|#|\/)/i }; var CONTAINED = {}; CONTAINED.thead = CONTAINED.tbody = CONTAINED.tfoot = /^table$/i; CONTAINED.tr = /^(table|thead|tbody|tfoot)$/i; CONTAINED.th = CONTAINED.td = /^tr$/i; CONTAINED.li = /^(ul|ol)$/i; // src: img, iframe // href: a var RESOURCEFUL = /^(src|href)$/; function sanitizeHtml(html, options) { var $wrapper = $("<body>").html(html); sanitizeChildren($wrapper, initializeOptions(options)); return $wrapper.html(); } function initializeOptions(options) { var opts = {}; opts.protocols = options.protocols; opts.elements = arrayToRegExp(options.elements); opts.attributes = {}; for (var tagName in options.attributes) { var attributes = options.attributes[tagName]; if (tagName != "all") attributes = attributes.concat(options.attributes.all); opts.attributes[tagName] = arrayToRegExp(attributes); } return opts; } function arrayToRegExp(array) { return new RegExp("^(" + array.join("|") + ")$", "i"); } function sanitizeElement($el, options) { if (options.elements.test(getTagName($el)) && isContained($el)) { sanitizeAttributes($el, options); sanitizeChildren($el, options); return $el; } else { $el.remove(); } } function sanitizeChildren($el, options) { $el.children().each(function() { sanitizeElement($(this), options); }); } // List and table items must be contained or they can break out. function isContained($el) { var requiredParent = CONTAINED[getTagNameLower($el)]; return !requiredParent || requiredParent.test(getTagName($el.parent())); } function sanitizeAttributes($el, options) { var tagName = getTagNameLower($el); var attribute, attributes = getAttributes($el); var whitelist = options.attributes[tagName] || options.attributes.all; for (var index in attributes) { if (attributes.hasOwnProperty(index)) { if ((attribute = getAttributeName(attributes, index))) { if (whitelist.test(attribute)) { sanitizeAttribute($el, attribute, options); } else { $el.removeAttr(attribute); } } } } } function sanitizeAttribute($el, attribute, options) { if (RESOURCEFUL.test(attribute)) { $el.attr(attribute, sanitizeResource($el.attr(attribute), options)); } } function sanitizeResource(value, options) { return (value && options.protocols.test(value)) ? value : ''; } // Conformity helpers since cheerio couldn't go the easy route and just // use the same variable names browsers do. function getTagName($el) { return $el[0].tagName || $el[0].name; } function getTagNameLower($el) { return getTagName($el).toLowerCase(); } function getAttributes($el) { return $el[0].attributes || $el[0].attribs; } // In the browser the attributes object looks like: // {"0": {"name": "class"}, "1": ...} // // In node / cheerio the attributes are keyed by name instead. // // - in IE9 it's possible for attribute to be undefined (issue #1) function getAttributeName(attributes, index) { if (Number(index) == index) { var attribute = attributes[String(index)]; return attribute && attribute.name; } else { return index; } } if (typeof window == "undefined") { module.exports = xss; } else { window.xss = xss; } })(typeof window == "undefined" ? require("cheerio") : $);