UNPKG

@diplodoc/transform

Version:

A simple transformer of text in YFM (Yandex Flavored Markdown) to HTML

810 lines (747 loc) 17.1 kB
import type {Attributes, Tag} from 'sanitize-html'; import type {CssWhiteList} from './typings'; import sanitizeHtml from 'sanitize-html'; // @ts-ignore import cssfilter from 'cssfilter'; import * as cheerio from 'cheerio'; import css from 'css'; import log from './log'; export type SanitizeFunction = ( html: string, options?: SanitizeOptions, additionalOptions?: SanitizeOptions, ) => string; const htmlTags = [ 'a', 'abbr', 'acronym', 'address', 'area', 'article', 'aside', 'audio', 'b', 'bdi', 'bdo', 'big', 'blink', 'blockquote', 'body', 'br', 'button', 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'content', 'data', 'datalist', 'dd', 'decorator', 'del', 'details', 'dfn', 'dialog', 'dir', 'div', 'dl', 'dt', 'element', 'em', 'fieldset', 'figcaption', 'figure', 'font', 'footer', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hgroup', 'hr', 'html', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'main', 'map', 'mark', 'marquee', 'menu', 'menuitem', 'meter', 'nav', 'nobr', 'ol', 'optgroup', 'option', 'output', 'p', 'picture', 'pre', 'progress', 'q', 'rp', 'rt', 'ruby', 's', 'samp', 'section', 'select', 'shadow', 'small', 'source', 'spacer', 'span', 'strike', 'strong', 'sub', 'summary', 'sup', 'table', 'tbody', 'td', 'template', 'textarea', 'tfoot', 'th', 'thead', 'time', 'tr', 'track', 'tt', 'u', 'ul', 'var', 'video', 'wbr', 'iframe', 'style', ]; const svgTags = [ 'svg', 'altglyph', 'altglyphdef', 'altglyphitem', 'animatecolor', 'animatemotion', 'animatetransform', 'circle', 'clippath', 'defs', 'desc', 'ellipse', 'filter', 'font', 'g', 'glyph', 'glyphref', 'hkern', 'image', 'line', 'lineargradient', 'marker', 'mask', 'metadata', 'mpath', 'path', 'pattern', 'polygon', 'polyline', 'radialgradient', 'rect', 'stop', 'switch', 'symbol', 'text', 'textpath', 'title', 'tref', 'tspan', 'view', 'vkern', 'animate', 'use', ]; const htmlAttrs = [ 'accept', 'action', 'align', 'alt', 'autocapitalize', 'autocomplete', 'autopictureinpicture', 'autoplay', 'background', 'bgcolor', 'border', 'capture', 'cellpadding', 'cellspacing', 'checked', 'cite', 'class', 'clear', 'color', 'cols', 'colspan', 'controls', 'controlslist', 'coords', 'crossorigin', 'datetime', 'decoding', 'default', 'dir', 'disabled', 'disablepictureinpicture', 'disableremoteplayback', 'download', 'draggable', 'enctype', 'enterkeyhint', 'face', 'for', 'headers', 'height', 'hidden', 'high', 'href', 'hreflang', 'id', 'inputmode', 'integrity', 'ismap', 'kind', 'label', 'lang', 'list', 'loading', 'loop', 'low', 'max', 'maxlength', 'media', 'method', 'min', 'minlength', 'multiple', 'muted', 'name', 'nonce', 'noshade', 'novalidate', 'nowrap', 'open', 'optimum', 'pattern', 'placeholder', 'playsinline', 'poster', 'preload', 'pubdate', 'radiogroup', 'readonly', 'rel', 'required', 'rev', 'reversed', 'role', 'rows', 'rowspan', 'spellcheck', 'scope', 'selected', 'shape', 'size', 'sizes', 'span', 'srclang', 'start', 'src', 'srcset', 'step', 'style', 'summary', 'tabindex', 'title', 'translate', 'type', 'usemap', 'valign', 'value', 'width', 'xmlns', 'slot', 'frameborder', 'scrolling', 'allow', 'target', 'attributeName', 'aria-hidden', 'referrerpolicy', 'aria-describedby', 'data-*', 'wide-content', 'sticky-header', ]; const svgAttrs = [ 'viewBox', 'accent-height', 'accumulate', 'additive', 'alignment-baseline', 'ascent', 'attributename', 'attributetype', 'azimuth', 'basefrequency', 'baseline-shift', 'begin', 'bias', 'by', 'class', 'clip', 'clippathunits', 'clip-path', 'clip-rule', 'color', 'color-interpolation', 'color-interpolation-filters', 'color-profile', 'color-rendering', 'cx', 'cy', 'd', 'dx', 'dy', 'diffuseconstant', 'direction', 'display', 'divisor', 'dur', 'edgemode', 'elevation', 'end', 'fill', 'fill-opacity', 'fill-rule', 'filter', 'filterunits', 'flood-color', 'flood-opacity', 'font-family', 'font-size', 'font-size-adjust', 'font-stretch', 'font-style', 'font-variant', 'font-weight', 'fx', 'fy', 'g1', 'g2', 'glyph-name', 'glyphref', 'gradientunits', 'gradienttransform', 'height', 'href', 'id', 'image-rendering', 'in', 'in2', 'k', 'k1', 'k2', 'k3', 'k4', 'kerning', 'keypoints', 'keysplines', 'keytimes', 'lang', 'lengthadjust', 'letter-spacing', 'kernelmatrix', 'kernelunitlength', 'lighting-color', 'local', 'marker-end', 'marker-mid', 'marker-start', 'markerheight', 'markerunits', 'markerwidth', 'maskcontentunits', 'maskunits', 'max', 'mask', 'media', 'method', 'mode', 'min', 'name', 'numoctaves', 'offset', 'operator', 'opacity', 'order', 'orient', 'orientation', 'origin', 'overflow', 'paint-order', 'path', 'pathlength', 'patterncontentunits', 'patterntransform', 'patternunits', 'points', 'preservealpha', 'preserveaspectratio', 'primitiveunits', 'r', 'rx', 'ry', 'radius', 'refx', 'refy', 'repeatcount', 'repeatdur', 'restart', 'result', 'rotate', 'scale', 'seed', 'shape-rendering', 'specularconstant', 'specularexponent', 'spreadmethod', 'startoffset', 'stddeviation', 'stitchtiles', 'stop-color', 'stop-opacity', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity', 'stroke', 'stroke-width', 'style', 'surfacescale', 'systemlanguage', 'tabindex', 'targetx', 'targety', 'transform', 'text-anchor', 'text-decoration', 'text-rendering', 'textlength', 'type', 'u1', 'u2', 'unicode', 'values', 'viewbox', 'visibility', 'version', 'vert-adv-y', 'vert-origin-x', 'vert-origin-y', 'width', 'word-spacing', 'wrap', 'writing-mode', 'xchannelselector', 'ychannelselector', 'x', 'x1', 'x2', 'xmlns', 'y', 'y1', 'y2', 'z', 'zoomandpan', 'from', 'to', 'xlink:href', 'use', ]; const defaultCssWhitelist = { ...cssfilter.whiteList, '--method': true, }; const yfmHtmlAttrs = ['note-type', 'term-key']; const allowedTags = Array.from( new Set([...htmlTags, ...svgTags, ...sanitizeHtml.defaults.allowedTags]), ); const allowedAttributes = Array.from(new Set([...htmlAttrs, ...svgAttrs, ...yfmHtmlAttrs])); // For hrefs within "use" only allow local links to ids that start with "#" const useTagTransformer = (tagName: string, attribs: Attributes): Tag => { const cleanHref = (href: string) => { if (href.startsWith('#')) { return href; } else { return null; } }; const cleanAttrs = (attrs: Attributes): Attributes => { const HREF_ATTRIBUTES = ['xlink:href', 'href']; return Object.fromEntries( Object.entries(attrs) .map(([key, value]) => { if (HREF_ATTRIBUTES.includes(key)) { return [key, cleanHref(value)]; } return [key, value]; }) .filter(([_, value]) => value !== null), ); }; return { tagName, attribs: cleanAttrs(attribs), }; }; export interface SanitizeOptions extends sanitizeHtml.IOptions { cssWhiteList?: CssWhiteList; disableStyleSanitizer?: boolean; } export const defaultParseOptions = { lowerCaseAttributeNames: false, }; export const defaultOptions: SanitizeOptions = { ...sanitizeHtml.defaults, allowedTags, allowedAttributes: { ...sanitizeHtml.defaults.allowedAttributes, '*': allowedAttributes, }, allowedSchemesAppliedToAttributes: [ ...sanitizeHtml.defaults.allowedSchemesAppliedToAttributes, 'xlink:href', 'from', 'to', ], allowVulnerableTags: true, parser: defaultParseOptions, cssWhiteList: defaultCssWhitelist, transformTags: { use: useTagTransformer, }, }; // dangerous patterns const DANGEROUS_TAGS_RE = /<\s*(script|iframe|object|embed|svg|img|video|audio|link|meta|base|form|style|template|math|foreignobject)\b/i; const CLOSE_STYLE_RE = /<\s*\/\s*style/i; const DANGEROUS_URL_RE = /url\s*\(\s*['"]?\s*(?:javascript:|vbscript:|data\s*:\s*(?:text\/html|application\/xhtml\+xml|image\/svg\+xml))/i; const IE_EXPR_RE = /expression\s*\(/i; const IE_BEHAVIOR_RE = /behavior\s*:/i; const MOZ_BINDING_RE = /-moz-binding\s*:/i; const AT_RULES_RE = /@(?:import|charset|namespace)\b/i; const COMMENTS_RE = /\/\*[^]*?\*\//g; // CSS comments: /* ... */ // control characters (C0/C1) and BiDi override characters that can hide malicious content const CTRL_BIDI_RE = new RegExp( [ String.raw`[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F]`, // C0/C1 controls String.raw`[\u202A-\u202E\u2066-\u2069]`, // BiDi overrides ].join('|'), 'g', ); const SAFE_VALUE_FAST_CHECK_RE = /[<&\\/]|@|url\s*\(|expression|behavior|-moz-binding/i; // backslash (CSS escapes), ampersand (HTML entities), BiDi overrides const FAST_PATH_RE = /[\\&\u202A-\u202E\u2066-\u2069]/; // combined regex for decoding CSS escapes and HTML entities const RE_DECODE = new RegExp( [ String.raw`\\([0-9A-Fa-f]{1,6})\s?`, // CSS hex escapes: \41 or \000041 → 'A' String.raw`&#x([0-9A-Fa-f]{1,6});`, // HTML hex entities: &#x41; or &#X41; → 'A' String.raw`&#(\d{1,7});`, // HTML decimal entities: &#65; → 'A' String.raw`&([a-zA-Z][a-zA-Z0-9]{1,31});`, // HTML named entities: &lt; or &amp; → '<' or '&' ].join('|'), 'g', ); const htmlEntities: Record<string, string> = { lt: '<', gt: '>', quot: '"', apos: "'", amp: '&', newline: '\n', tab: '\t', colon: ':', sol: '/', lpar: '(', rpar: ')', }; // Decodes a single escaped or encoded token function decodeToken( whole: string, cssHex?: string, htmlHex?: string, htmlDec?: string, named?: string, ): string { if (cssHex) { return String.fromCodePoint(parseInt(cssHex, 16) || 0); } if (htmlHex) { return String.fromCodePoint(parseInt(htmlHex, 16) || 0); } if (htmlDec) { return String.fromCodePoint(parseInt(htmlDec, 10) || 0); } if (named) { const rep = htmlEntities[named] ?? htmlEntities[named.toLowerCase()]; if (rep) { return rep; } } return whole; } // Normalize CSS value by decoding HTML entities and CSS escapes function normalizeCssValue(value: string): string { let normalized = String(value ?? ''); // early-exit if no special chars if (!FAST_PATH_RE.test(normalized)) { return normalized; } // 1. remove CSS comments to prevent hiding escapes inside /* ... */ // 2. strip control characters and BiDi overrides // 3. decode all CSS escapes and HTML entities in one pass normalized = normalized .replace(COMMENTS_RE, '') .replace(CTRL_BIDI_RE, '') .replace(RE_DECODE, decodeToken); // unicode normalization (NFKC) to prevent homograph attacks try { normalized = normalized.normalize('NFKC'); } catch (_) { // silent fail: logging the value could expose sensitive data } return normalized; } // checks if a CSS value is safe from XSS attacks function isSafeCssValue(property: string, value: string): boolean { const prop = property.toLowerCase(); const isContentProperty = prop === 'content'; // normalize first to prevent bypasses via comments/escapes const normalized = normalizeCssValue(value); // early-exit for trivial safe values if (!SAFE_VALUE_FAST_CHECK_RE.test(normalized)) { return true; } // сheck if normalized value looks like an HTML tag const looksLikeTag = /<[^>]{0,128}>/i.test(normalized); const dangerousPatterns = [ looksLikeTag && CLOSE_STYLE_RE, // </style> tag closure !isContentProperty && looksLikeTag && DANGEROUS_TAGS_RE, // dangerous HTML tags DANGEROUS_URL_RE, // javascript:, data:, vbscript: URLs IE_EXPR_RE, // IE expression() IE_BEHAVIOR_RE, // IE behavior: MOZ_BINDING_RE, // FF -moz-binding AT_RULES_RE, // @import, @charset, @namespace ].filter(Boolean) as RegExp[]; return !dangerousPatterns.some((pattern) => pattern.test(normalized)); } function sanitizeStyleTags(dom: cheerio.CheerioAPI, cssWhiteList: CssWhiteList) { const styleTags = dom('style'); styleTags.each((_index, element) => { const styleText = dom(element).text(); try { const parsedCSS = css.parse(styleText); if (!parsedCSS.stylesheet) { return; } parsedCSS.stylesheet.rules = parsedCSS.stylesheet.rules.filter( (rule: css.Rule) => rule.type === 'rule', ); parsedCSS.stylesheet.rules.forEach((rule: css.Rule) => { if (!rule.declarations) { return; } rule.declarations = rule.declarations.filter((declaration: css.Declaration) => { if (!declaration.property || !declaration.value) { return false; } const prop = String(declaration.property).toLowerCase(); const val = String(declaration.value); if (!isSafeCssValue(prop, val)) { return false; } const isWhiteListed = Boolean(cssWhiteList[prop]); if (isWhiteListed) { declaration.value = cssfilter.safeAttrValue(prop, val); } if (!declaration.value) { return false; } return isWhiteListed; }); }); dom(element).text(css.stringify(parsedCSS)); } catch (error) { dom(element).remove(); const errorMessage = error instanceof Error ? error.message : `${error}`; log.info(errorMessage); } }); } function sanitizeStyleAttrs(dom: cheerio.CheerioAPI, cssWhiteList: CssWhiteList) { const options = { whiteList: cssWhiteList, }; const cssSanitizer = new cssfilter.FilterCSS(options); dom('*').each((_index, element) => { const styleAttrValue = dom(element).attr('style'); if (!styleAttrValue) { return; } dom(element).attr('style', cssSanitizer.process(styleAttrValue)); }); } export function sanitizeStyles(html: string, options: SanitizeOptions) { const cssWhiteList = options.cssWhiteList || {}; const $ = cheerio.load(html); sanitizeStyleTags($, cssWhiteList); sanitizeStyleAttrs($, cssWhiteList); const styles = $('head').html() || ''; const content = $('body').html() || ''; return styles + content; } export function sanitize( html: string, options?: SanitizeOptions, additionalOptions?: SanitizeOptions, ) { const sanitizeOptions = options || defaultOptions; if (additionalOptions?.cssWhiteList) { sanitizeOptions.cssWhiteList = { ...sanitizeOptions.cssWhiteList, ...additionalOptions.cssWhiteList, }; } const needToSanitizeStyles = !(sanitizeOptions.disableStyleSanitizer ?? false); const modifiedHtml = needToSanitizeStyles ? sanitizeStyles(html, sanitizeOptions) : html; return sanitizeHtml(modifiedHtml, sanitizeOptions); } export default sanitize;