UNPKG

mcp-sanitizer

Version:

Comprehensive security sanitization library for Model Context Protocol (MCP) servers with trusted security libraries

471 lines (408 loc) 13.7 kB
/** * Security Decoder Module for MCP Sanitizer * * This module provides comprehensive decoding and normalization functions * to prevent encoding-based bypass attacks. It implements defense-in-depth * by handling multiple encoding layers and normalization techniques. * * Security Priority: #1 * Performance Priority: #2 * Developer Experience: #3 */ const unorm = require('unorm'); // Homograph mapping for common attack vectors // Maps confusable Unicode characters to their ASCII equivalents const HOMOGRAPH_MAP = { // Cyrillic lookalikes (most common in attacks) а: 'a', // U+0430 е: 'e', // U+0435 о: 'o', // U+043E р: 'p', // U+0440 с: 'c', // U+0441 у: 'y', // U+0443 х: 'x', // U+0445 А: 'A', // U+0410 В: 'B', // U+0412 Е: 'E', // U+0415 К: 'K', // U+041A М: 'M', // U+041C Н: 'H', // U+041D О: 'O', // U+041E Р: 'P', // U+0420 С: 'C', // U+0421 Т: 'T', // U+0422 Х: 'X', // U+0425 // Greek lookalikes α: 'a', // U+03B1 ο: 'o', // U+03BF ρ: 'p', // U+03C1 τ: 't', // U+03C4 υ: 'u', // U+03C5 χ: 'x', // U+03C7 // Mathematical alphanumeric symbols (various styles) '𝐚': 'a', '𝐛': 'b', '𝐜': 'c', '𝐝': 'd', '𝐞': 'e', '𝐟': 'f', '𝐀': 'A', '𝐁': 'B', '𝐂': 'C', '𝐃': 'D', '𝐄': 'E', '𝐅': 'F', '𝒂': 'a', '𝒃': 'b', '𝒄': 'c', '𝒅': 'd', '𝒆': 'e', '𝒇': 'f', '𝒸': 'c', '𝒶': 'a', '𝓉': 't', '𝓅': 'p', '𝓈': 's', '𝓌': 'w', '𝓬': 'c', '𝓪': 'a', '𝓽': 't' }; /** * Normalize Unicode and replace homographs * @param {string} input - Input string to normalize * @returns {string} Normalized string with homographs replaced */ function normalizeUnicode (input) { if (typeof input !== 'string') return input; // First normalize to NFC (Canonical Composition) let normalized = unorm.nfc(input); // Replace homographs with ASCII equivalents for (const [homograph, ascii] of Object.entries(HOMOGRAPH_MAP)) { normalized = normalized.replace(new RegExp(homograph, 'g'), ascii); } return normalized; } /** * Decode Unicode escape sequences in a string * Handles: \u0041, \U00000041, \x41, etc. * @param {string} input - Input string potentially containing Unicode escapes * @returns {string} Decoded string */ function decodeUnicode (input) { if (typeof input !== 'string') return input; let decoded = input; // Handle \uXXXX format (JavaScript Unicode) decoded = decoded.replace(/\\u([0-9a-fA-F]{4})/g, (match, hex) => { return String.fromCharCode(parseInt(hex, 16)); }); // Handle \UXXXXXXXX format (8-digit Unicode) decoded = decoded.replace(/\\U([0-9a-fA-F]{8})/g, (match, hex) => { const codePoint = parseInt(hex, 16); return String.fromCodePoint(codePoint); }); // Handle \xXX format (hex escape) decoded = decoded.replace(/\\x([0-9a-fA-F]{2})/g, (match, hex) => { return String.fromCharCode(parseInt(hex, 16)); }); // Handle HTML numeric entities &#xHH; and &#DD; decoded = decoded.replace(/&#x([0-9a-fA-F]+);/gi, (match, hex) => { return String.fromCharCode(parseInt(hex, 16)); }); decoded = decoded.replace(/&#(\d+);/g, (match, dec) => { return String.fromCharCode(parseInt(dec, 10)); }); return decoded; } /** * Decode URL-encoded sequences recursively * Handles single, double, and triple encoding * @param {string} input - URL-encoded string * @param {number} maxDepth - Maximum decoding depth (default 3) * @returns {string} Decoded string */ function decodeUrl (input, maxDepth = 3) { if (typeof input !== 'string') return input; let decoded = input; let previousDecoded = ''; let depth = 0; // Recursively decode until no more changes or max depth reached while (decoded !== previousDecoded && depth < maxDepth) { previousDecoded = decoded; try { decoded = decodeURIComponent(decoded); } catch (e) { // If decoding fails, try partial decoding decoded = decoded.replace(/%([0-9a-fA-F]{2})/g, (match, hex) => { return String.fromCharCode(parseInt(hex, 16)); }); } depth++; } return decoded; } /** * Normalize path separators and remove null bytes * Handles Windows backslashes, null bytes, and mixed separators * @param {string} input - Path string * @returns {string} Normalized path */ function normalizePath (input) { if (typeof input !== 'string') return input; let normalized = input; // Remove null bytes normalized = normalized.replace(/\0/g, ''); // Convert all backslashes to forward slashes normalized = normalized.replace(/\\/g, '/'); // Remove multiple consecutive slashes normalized = normalized.replace(/\/+/g, '/'); // Decode Unicode path separators normalized = normalized.replace(/\\u002f/gi, '/'); normalized = normalized.replace(/\\u005c/gi, '/'); return normalized; } /** * Strip dangerous characters for command execution * SECURITY FIX: Replace newlines with spaces instead of removing them * to prevent command concatenation attacks like "ls\nrm -rf /" -> "lsrm -rf /" * @param {string} input - Command string * @returns {string} Sanitized command */ function stripDangerousChars (input) { if (typeof input !== 'string') return input; let sanitized = input; // CRITICAL: Remove ALL control characters first, including null bytes // This MUST happen before any other processing // eslint-disable-next-line no-control-regex sanitized = sanitized.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]/g, ''); // SECURITY FIX: Replace newlines and carriage returns with spaces // This prevents "cmd1\ncmd2" from becoming "cmd1cmd2" sanitized = sanitized.replace(/[\r\n]/g, ' '); // CVE-TBD-001 FIX: Enhanced removal of directional and zero-width characters // U+200B-U+200F: Zero-width spaces and joiners // U+202A-U+202E: Directional formatting (LTR, RTL overrides) - CRITICAL for CVE fix // U+2060-U+2069: Word joiners and directional isolates // U+FEFF: Zero-width no-break space // U+061C: Arabic letter mark // U+2066-U+2069: Directional isolates (LRI, RLI, FSI, PDI) sanitized = sanitized.replace(/[\u200B-\u200F\u202A-\u202E\u2060-\u2069\uFEFF\u061C\u2066-\u2069]/g, ''); // Remove other dangerous Unicode categories sanitized = sanitized.replace(/[\uFFF0-\uFFFF]/g, ''); // Specials block sanitized = sanitized.replace(/[\uDB40-\uDB7F]/g, ''); // High surrogates for private use // Clean up multiple spaces sanitized = sanitized.replace(/\s+/g, ' ').trim(); return sanitized; } /** * Comprehensive input decoder that applies all decoding techniques * This is the main entry point for security decoding * @param {string} input - Input string to decode * @param {Object} options - Decoding options * @returns {Object} Decoded result with metadata */ function securityDecode (input, options = {}) { const { decodeUnicode: doUnicode = true, decodeUrl: doUrl = true, normalizePath: doPath = true, stripDangerous: doStrip = true, normalizeUnicode: doNormalize = true, maxIterations = 3 } = options; if (typeof input !== 'string') { return { decoded: input, wasDecoded: false, decodingSteps: [], originalInput: input }; } const decodingSteps = []; let decoded = input; let previousDecoded = ''; let iterations = 0; // CRITICAL: Apply Unicode normalization FIRST to handle homographs if (doNormalize) { const normalized = normalizeUnicode(decoded); if (normalized !== decoded) { decodingSteps.push('unicode-normalize'); decoded = normalized; } } // Apply decoding in multiple passes to handle nested encoding while (decoded !== previousDecoded && iterations < maxIterations) { previousDecoded = decoded; // Step 1: URL decoding (deepest layer first) if (doUrl) { const urlDecoded = decodeUrl(decoded); if (urlDecoded !== decoded) { decodingSteps.push('url-decode'); decoded = urlDecoded; } } // Step 2: Unicode decoding if (doUnicode) { const unicodeDecoded = decodeUnicode(decoded); if (unicodeDecoded !== decoded) { decodingSteps.push('unicode-decode'); decoded = unicodeDecoded; } } // Step 3: Path normalization if (doPath) { const pathNormalized = normalizePath(decoded); if (pathNormalized !== decoded) { decodingSteps.push('path-normalize'); decoded = pathNormalized; } } // Step 4: Strip dangerous characters if (doStrip) { const stripped = stripDangerousChars(decoded); if (stripped !== decoded) { decodingSteps.push('strip-dangerous'); decoded = stripped; } } // Step 5: Re-normalize after decoding (catches encoded homographs) if (doNormalize) { const reNormalized = normalizeUnicode(decoded); if (reNormalized !== decoded) { decodingSteps.push('unicode-renormalize'); decoded = reNormalized; } } iterations++; } return { decoded, wasDecoded: decoded !== input, decodingSteps, originalInput: input, iterations }; } /** * Check if input contains any encoded sequences * Used to detect potential bypass attempts * @param {string} input - Input to check * @returns {boolean} True if encoding detected */ function hasEncoding (input) { if (typeof input !== 'string') return false; const encodingPatterns = [ /%[0-9a-fA-F]{2}/, // URL encoding /\\u[0-9a-fA-F]{4}/, // Unicode \uXXXX /\\U[0-9a-fA-F]{8}/, // Unicode \UXXXXXXXX /\\x[0-9a-fA-F]{2}/, // Hex \xXX /&#x[0-9a-fA-F]+;/, // HTML hex entity /&#\d+;/, // HTML decimal entity /\0/, // Null byte /[\r\n]/, // Newlines /\\/ // Backslash (potential path separator) ]; return encodingPatterns.some(pattern => pattern.test(input)); } // Timing attack prevention functions removed - not applicable for middleware sanitization // Lazy load security enhancements to avoid circular dependency function getSecurityEnhancements () { return require('./security-enhancements'); } /** * Enhanced security decode with all security checks * @param {string} input - Input to decode * @param {Object} options - Decoding and security options * @returns {Promise<Object>} Enhanced decode result */ async function enhancedSecurityDecode (input, options = {}) { const { // Existing decode options decodeUnicode: doUnicode = true, decodeUrl: doUrl = true, normalizePath: doPath = true, stripDangerous: doStrip = true, normalizeUnicode: doNormalize = true, maxIterations = 3, // New security enhancement options checkDirectionalOverrides = true, checkNullBytes = true, checkMultipleEncoding = true, checkCyrillicHomographs = true, // Timing consistency removed - not applicable for middleware maxEncodingDepth = 4 } = options; // Timing consistency removed - execute directly return performEnhancedDecode(); async function performEnhancedDecode () { // Start with basic security decode const basicResult = securityDecode(input, { decodeUnicode: doUnicode, decodeUrl: doUrl, normalizePath: doPath, stripDangerous: doStrip, normalizeUnicode: doNormalize, maxIterations }); const result = { decoded: basicResult.decoded, wasDecoded: basicResult.wasDecoded, decodingSteps: basicResult.decodingSteps, originalInput: basicResult.originalInput, iterations: basicResult.iterations, warnings: [], securityChecks: {} }; // Apply security enhancements if (checkDirectionalOverrides) { const { detectDirectionalOverrides } = getSecurityEnhancements(); const dirResult = detectDirectionalOverrides(result.decoded); if (dirResult.detected) { result.warnings.push(...dirResult.warnings); result.decoded = dirResult.sanitized; result.securityChecks.directionalOverrides = dirResult.metadata; } } if (checkNullBytes) { const { detectNullBytes } = getSecurityEnhancements(); const nullResult = detectNullBytes(result.decoded); if (nullResult.detected) { result.warnings.push(...nullResult.warnings); result.decoded = nullResult.sanitized; result.securityChecks.nullBytes = nullResult.metadata; } } if (checkMultipleEncoding) { const { detectMultipleUrlEncoding } = getSecurityEnhancements(); const encodingResult = detectMultipleUrlEncoding(input, maxEncodingDepth); if (encodingResult.detected) { result.warnings.push(...encodingResult.warnings); result.securityChecks.multipleEncoding = encodingResult.metadata; // Use the more thoroughly decoded version if different if (encodingResult.decoded !== result.decoded) { result.decoded = encodingResult.decoded; result.wasDecoded = true; } } } if (checkCyrillicHomographs) { const { detectCyrillicHomographs } = getSecurityEnhancements(); const homographResult = detectCyrillicHomographs(result.decoded); if (homographResult.detected) { result.warnings.push(...homographResult.warnings); result.decoded = homographResult.normalized; result.securityChecks.cyrillicHomographs = homographResult.metadata; } } return result; } } module.exports = { normalizeUnicode, decodeUnicode, decodeUrl, normalizePath, stripDangerousChars, securityDecode, hasEncoding, enhancedSecurityDecode };