/*
 * marked-sanitizer-github
 * Version:
 * HTML tag sanitizer for marked
 * 364 lines • 11.5 kB
 * JavaScript
 */
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.SanitizeConfig = exports.SanitizeWhitelist = void 0;
const url_1 = require("url");
const path_1 = require("path");
const htmlparser2_1 = require("htmlparser2");
const he_1 = require("he");
const voidElements = require("html-void-elements");
// Names of void elements, taken from the `html-void-elements` package and
// wrapped in a Set so the sanitizer can test membership with `.has()`.
// Open tags of these elements are never pushed to the tag stack and their
// closing tags are dropped from the output.
const VOID_ELEMENTS = new Set(voidElements);
// Matches a closing tag and captures everything between `</` and the first
// `>` as group 1. The capture is taken verbatim: it is not lowercased and may
// include whitespace.
const RE_EXTRACT_TAG_NAME = /<\/([^>]+)>/;
/**
 * Decision taken for an open tag, stored on the tag stack so the matching
 * closing tag can be treated the same way.
 *
 * The object is a two-way lookup: `HowToSanitize.Escape === 0` and
 * `HowToSanitize[0] === 'Escape'`.
 *
 * - Escape:            the tag is emitted HTML-escaped.
 * - Remove:            the tag is replaced by an empty string.
 * - DoNothing:         the tag is emitted unchanged.
 * - EscapeWithoutPush: the tag is emitted HTML-escaped and is not recorded
 *                      on the tag stack.
 */
const HowToSanitize = {};
['Escape', 'Remove', 'DoNothing', 'EscapeWithoutPush'].forEach((label, code) => {
    // Forward mapping (name -> number) first, then the reverse mapping.
    HowToSanitize[label] = code;
    HowToSanitize[code] = label;
});
/**
 * Default whitelist consulted by the sanitizer.
 *
 * Fields (all plain, mutable data created fresh for every instance):
 * - ELEMENTS:        Set of element names that may pass through unescaped.
 * - REMOVE_CONTENTS: list of element names; not read anywhere else in this
 *                    file (see the TODO in SanitizeState#howToSanitize).
 * - ATTRIBUTES:      per-element arrays of permitted attribute names, plus a
 *                    Set under the '*' key that applies to every element.
 * - PROTOCOLS:       element -> attribute -> array of permitted URL schemes
 *                    (scheme names are written without the trailing ':').
 *
 * NOTE(review): the lists are presumably modelled on GitHub's HTML
 * sanitization rules (going by the package name) — confirm upstream.
 */
class SanitizeWhitelist {
    constructor() {
        // Factories instead of shared constants: each entry must own its own
        // array so that mutating one entry never affects another.
        const citeOnly = () => ['cite'];
        const webSchemes = () => ['http', 'https'];
        const citeOverWeb = () => ({ cite: webSchemes() });

        this.ELEMENTS = new Set([
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'h8',
            'br', 'b', 'i', 'strong', 'em', 'a', 'pre', 'code', 'img', 'tt',
            'div', 'ins', 'del', 'sup', 'sub', 'p', 'ol', 'ul',
            'table', 'thead', 'tbody', 'tfoot',
            'blockquote', 'dl', 'dt', 'dd', 'kbd', 'q', 'samp', 'var', 'hr',
            'ruby', 'rt', 'rp',
            'li', 'tr', 'td', 'th',
            's', 'strike', 'summary', 'details',
        ]);

        this.REMOVE_CONTENTS = ['script'];

        this.ATTRIBUTES = {
            a: ['href'],
            img: ['src', 'longdesc'],
            div: ['itemscope', 'itemtype'],
            blockquote: citeOnly(),
            del: citeOnly(),
            ins: citeOnly(),
            q: citeOnly(),
            '*': new Set([
                'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align', 'alt', 'axis',
                'border', 'cellpadding', 'cellspacing', 'char', 'charoff', 'charset', 'checked',
                'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime', 'dir',
                'disabled', 'enctype', 'for', 'frame', 'headers', 'height', 'hreflang', 'hspace',
                'ismap', 'label', 'lang', 'maxlength', 'media', 'method', 'multiple', 'name',
                'nohref', 'noshade', 'nowrap', 'open', 'prompt', 'readonly', 'rel', 'rev', 'rows',
                'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'start',
                'summary', 'tabindex', 'target', 'title', 'type', 'usemap', 'valign', 'value',
                'vspace', 'width', 'itemprop',
            ]),
        };

        // Note: Relative path also should be allowed
        this.PROTOCOLS = {
            a: {
                href: [...webSchemes(), 'mailto', 'github-windows', 'github-mac'],
            },
            blockquote: citeOverWeb(),
            del: citeOverWeb(),
            ins: citeOverWeb(),
            q: citeOverWeb(),
            img: {
                src: webSchemes(),
                longdesc: webSchemes(),
            },
        };
    }
}
exports.SanitizeWhitelist = SanitizeWhitelist;
/**
 * Configuration bundle read by the sanitizer.
 *
 * - LIST / LIST_ITEM: list container tag names and the list item tag name;
 *   howToSanitize() removes an item tag that has no LIST ancestor on the
 *   tag stack.
 * - TABLE / TABLE_SECTIONS / TABLE_ITEMS: the table tag name and the tag
 *   names of its sections and rows/cells; howToSanitize() removes those
 *   that have no TABLE ancestor on the tag stack.
 * - whitelist: a fresh SanitizeWhitelist holding the allowed elements,
 *   attributes and protocols.
 */
class SanitizeConfig {
    constructor() {
        Object.assign(this, {
            LIST: ['ul', 'ol'],
            LIST_ITEM: 'li',
            TABLE_ITEMS: ['tr', 'td', 'th'],
            TABLE: 'table',
            TABLE_SECTIONS: ['thead', 'tbody', 'tfoot'],
            whitelist: new SanitizeWhitelist(),
        });
    }
}
exports.SanitizeConfig = SanitizeConfig;
/**
 * Stateful sanitizer for individual HTML tags.
 *
 * An instance is fed one tag string at a time through sanitize() (or the
 * bound function returned by getSanitizer()). It keeps a stack of the open
 * tags it has seen so that each closing tag is treated the same way as its
 * opening tag. When the tag sequence turns out to be inconsistent the
 * instance switches to "broken" mode, in which every further tag is escaped
 * until reset() is called.
 *
 * Public fields:
 * - config:           SanitizeConfig instance; may be customised by callers.
 * - onDetectedBroken: optional callback `(msg, tag) => void`, invoked once
 *                     when the instance first becomes broken.
 */
class SanitizeState {
    constructor() {
        this.config = new SanitizeConfig();
        this.onDetectedBroken = null;
        this.broken = false;
        // Stack of [tagName, HowToSanitize] pairs for currently open tags.
        this.tagStack = [];
        // Result slot written by the parser callbacks below:
        // `{ name, attrs }` for an open tag, `null` for a comment,
        // `undefined` when the parser reported neither.
        this.parsed = undefined;
        this.parser = new htmlparser2_1.Parser({
            onopentag: (name, attrs) => {
                this.parsed = { name, attrs };
            },
            oncomment: () => {
                this.parsed = null;
            },
        }, {
            // Attribute values must be entity-decoded before the protocol
            // check in howToSanitize(); otherwise a value such as
            // `javascript&#58;...` would not be recognised as a URL with a
            // forbidden scheme although browsers decode it to one.
            // NOTE(review): set explicitly because the default of this option
            // is believed to differ between htmlparser2 major versions —
            // confirm against the installed version.
            decodeEntities: true,
        });
    }
    /** Clears the tag stack and leaves broken mode. */
    reset() {
        this.tagStack = [];
        this.broken = false;
    }
    /** @returns {boolean} true while broken or while any tag is still open. */
    isInUse() {
        return this.broken || this.tagStack.length !== 0;
    }
    /** @returns {boolean} true once an inconsistency has been detected. */
    isBroken() {
        return this.broken;
    }
    /** @returns {function(string): string} sanitize() bound to this instance. */
    getSanitizer() {
        return this.sanitize.bind(this);
    }
    /**
     * Sanitizes a single tag.
     *
     * @param {string} tag - Source text of one opening or closing tag.
     * @returns {string} The tag unchanged, HTML-escaped, or an empty string.
     */
    sanitize(tag) {
        if (this.broken) {
            return he_1.escape(tag);
        }
        if (tag.startsWith('</')) {
            return this.sanitizeCloseTag(tag);
        }
        else {
            return this.sanitizeOpenTag(tag);
        }
    }
    /**
     * Enters broken mode and notifies onDetectedBroken. Only the first call
     * after construction/reset() has any effect.
     *
     * @param {string} msg - Human readable reason.
     * @param {string} tag - The tag that triggered the detection.
     */
    itsBroken(msg, tag) {
        if (this.broken) {
            // Already broken
            return;
        }
        this.broken = true;
        if (this.onDetectedBroken !== null) {
            this.onDetectedBroken(msg, tag);
        }
    }
    /**
     * Handles a closing tag: void elements are dropped, non-whitelisted
     * elements are escaped, and everything else must match the top of the
     * tag stack and is then treated like its opening tag was.
     *
     * @param {string} tag - Source text of a closing tag.
     * @returns {string} Sanitized output for the tag.
     * @throws {Error} If the stack holds an EscapeWithoutPush entry, which
     *     sanitizeOpenTag() never pushes.
     */
    sanitizeCloseTag(tag) {
        const matched = tag.match(RE_EXTRACT_TAG_NAME);
        if (matched === null) {
            this.itsBroken(`Closing HTML tag is broken: '${tag}'`, tag);
            return he_1.escape(tag);
        }
        // An end tag may carry whitespace before '>' (e.g. `</div >`), so
        // trailing HTML whitespace is stripped from the captured name.
        // Anything else (leading whitespace, attributes, ...) is left in
        // place and therefore fails the whitelist lookup below.
        const stripped = /^([^\t\n\f\r ]+)[\t\n\f\r ]*$/.exec(matched[1]);
        const tagName = (stripped === null ? matched[1] : stripped[1]).toLowerCase();
        if (VOID_ELEMENTS.has(tagName)) {
            return '';
        }
        if (!this.config.whitelist.ELEMENTS.has(tagName)) {
            // If tag name is not allowed, it is always escaped
            return he_1.escape(tag);
        }
        // Note: This check must be done after above void element check because tag history
        // stack is empty when a void element is at toplevel and with a closing tag.
        if (this.tagStack.length === 0) {
            this.itsBroken('Extra closing HTML tags in the document', tag);
            return he_1.escape(tag);
        }
        // Check top
        const [name, how] = this.tagStack[this.tagStack.length - 1];
        if (tagName !== name) {
            this.itsBroken(`Open/Closing HTML tag mismatch: </${name}> v.s. ${tag}`, tag);
            return he_1.escape(tag);
        }
        // Pop
        this.tagStack.pop();
        switch (how) {
            case HowToSanitize.Remove:
                return '';
            case HowToSanitize.Escape:
                return he_1.escape(tag);
            case HowToSanitize.DoNothing:
                return tag;
            case HowToSanitize.EscapeWithoutPush:
                throw new Error('NEVER REACH HERE');
        }
    }
    /**
     * Handles an opening tag (or a comment): decides how to treat it, records
     * the decision on the tag stack when a closing tag is expected, and
     * returns the sanitized text.
     *
     * @param {string} tag - Source text of an opening tag or comment.
     * @returns {string} Sanitized output for the tag.
     */
    sanitizeOpenTag(tag) {
        const elem = this.parseOpenTag(tag);
        switch (elem) {
            case null:
                // null means comment
                return '';
            case undefined:
                this.itsBroken(`Failed to parse open HTML tag: '${tag}'`, tag);
                return he_1.escape(tag);
            default:
                break;
        }
        const how = this.howToSanitize(elem);
        if (how !== HowToSanitize.EscapeWithoutPush && !tag.endsWith('/>') && !VOID_ELEMENTS.has(elem.name)) {
            // Note: If it's not an empty element, push history stack
            // Note: If the element is void element, we don't push it to tag history stack.
            //       On sanitizeCloseTag(), closing tags for void elements are simply skipped and they
            //       never appear in converted HTML document.
            this.tagStack.push([elem.name, how]);
        }
        switch (how) {
            case HowToSanitize.Remove:
                return '';
            case HowToSanitize.Escape:
            case HowToSanitize.EscapeWithoutPush:
                return he_1.escape(tag);
            case HowToSanitize.DoNothing:
                return tag;
        }
    }
    /**
     * Runs the tag through the parser and returns what its callbacks stored.
     *
     * @param {string} tag - Source text to parse.
     * @returns {{name: string, attrs: Object}|null|undefined} The parsed
     *     element, `null` for a comment, `undefined` if nothing was reported.
     */
    parseOpenTag(tag) {
        this.parser.reset();
        this.parser.write(tag);
        this.parser.end();
        const parsed = this.parsed;
        // Clear the slot so a later failed parse cannot see a stale result.
        this.parsed = undefined;
        return parsed;
    }
    /**
     * Decides how an opening tag must be treated.
     *
     * @param {{name: string, attrs: Object}} elem - Parsed element.
     * @returns {number} One of the HowToSanitize values.
     */
    howToSanitize(elem) {
        // Top-level <li> elements are removed because they can break out of
        // containing markup.
        if (elem.name === this.config.LIST_ITEM &&
            this.tagStack.every(([name]) => this.config.LIST.indexOf(name) === -1)) {
            return HowToSanitize.Remove;
        }
        // Table child elements that are not contained by a <table> are removed.
        if ((this.config.TABLE_SECTIONS.indexOf(elem.name) !== -1 ||
            this.config.TABLE_ITEMS.indexOf(elem.name) !== -1) &&
            this.tagStack.every(([name]) => this.config.TABLE !== name)) {
            return HowToSanitize.Remove;
        }
        const wl = this.config.whitelist;
        // Check allowed (non escaped) elements
        if (!wl.ELEMENTS.has(elem.name)) {
            return HowToSanitize.EscapeWithoutPush;
        }
        // TODO: Check elements should be removed (not escaped, but just removed)
        // It's hard to remove content of the element with current markedjs implementation.
        const allowedAttrs = wl.ATTRIBUTES[elem.name];
        for (const attr of Object.keys(elem.attrs)) {
            // Check allowed attributes
            const isAllowedOwnAttr = allowedAttrs !== undefined && allowedAttrs.indexOf(attr) !== -1;
            const isAllowedAllAttr = wl.ATTRIBUTES['*'].has(attr);
            if (!isAllowedOwnAttr && !isAllowedAllAttr) {
                return HowToSanitize.Escape;
            }
            // Check allowed protocols (e.g. 'href' of <a/>)
            if (elem.name in wl.PROTOCOLS && attr in wl.PROTOCOLS[elem.name]) {
                const value = elem.attrs[attr];
                try {
                    const u = new url_1.URL(value);
                    const protocol = u.protocol.slice(0, -1); // Omit last ':'
                    const allowedProtocols = wl.PROTOCOLS[elem.name][attr];
                    if (allowedProtocols.every(p => p !== protocol)) {
                        return HowToSanitize.Escape;
                    }
                }
                catch (_) {
                    // Not a URL: the value is accepted as a relative path,
                    // while an absolute path is escaped.
                    if (path_1.isAbsolute(value)) {
                        return HowToSanitize.Escape;
                    }
                }
            }
        }
        return HowToSanitize.DoNothing;
    }
}
exports.default = SanitizeState;
//# sourceMappingURL=index.js.map