UNPKG

docwriter-mcp-server

Version:

A Model Context Protocol (MCP) server for programmatic creation, modification, and compilation of structured LaTeX documents. Enables AI agents and automated workflows to generate reports, articles, and papers from templates, with secure, structured conte

537 lines (536 loc) 22.7 kB
/**
 * @fileoverview Provides a comprehensive `Sanitization` class for various input cleaning and validation tasks.
 * This module includes utilities for sanitizing HTML, LaTeX, strings, URLs, file paths, JSON, numbers,
 * and for redacting sensitive information from data intended for logging.
 * @module src/utils/security/sanitization
 */
import path from "path";
import sanitizeHtml from "sanitize-html";
import validator from "validator";
import { BaseErrorCode, McpError } from "../../types-global/errors.js";
import { logger, requestContextService } from "../index.js";

/**
 * A singleton class providing various methods for input sanitization.
 * Aims to protect against common vulnerabilities like XSS, path traversal, and LaTeX injection.
 * Obtain the shared instance through `Sanitization.getInstance()` or the exported `sanitization`.
 */
export class Sanitization {
  /** @private */
  constructor() {
    /**
     * Default list of field names considered sensitive for log redaction.
     * Matching is a case-insensitive substring match against object keys.
     * @private
     */
    this.sensitiveFields = [
      "password",
      "token",
      "secret",
      "key",
      "apiKey",
      "auth",
      "credential",
      "jwt",
      "ssn",
      "credit",
      "card",
      "cvv",
      "authorization",
    ];
    /**
     * A list of dangerous LaTeX commands that should be neutralized.
     * This list targets commands that can read/write files or execute shell commands.
     * @private
     */
    this.dangerousLatexCommands = [
      "input",
      "include",
      "openin",
      "read",
      "write",
      "write18",
      "immediate",
      "ShellEscape",
      "def",
      "gdef",
      "edef",
      "xdef",
      "let",
      "futurelet",
      "catcode",
      "newread",
      "newwrite",
      "closein",
      "closeout",
      "usepackage",
      "documentclass",
      "includeonly",
      "typeout",
      "message",
      "errmessage",
      "shipout",
      "special",
    ];
    /**
     * Default configuration for HTML sanitization.
     * @private
     */
    this.defaultHtmlSanitizeConfig = {
      allowedTags: [
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "p",
        "a",
        "ul",
        "ol",
        "li",
        "b",
        "i",
        "strong",
        "em",
        "strike",
        "code",
        "hr",
        "br",
        "div",
        "table",
        "thead",
        "tbody",
        "tr",
        "th",
        "td",
        "pre",
      ],
      allowedAttributes: {
        a: ["href", "name", "target"],
        img: ["src", "alt", "title", "width", "height"],
        "*": ["class", "id", "style"],
      },
      preserveComments: false,
    };
  }

  /**
   * Retrieves the singleton instance of the `Sanitization` class, creating it on first use.
   * @returns The singleton `Sanitization` instance.
   */
  static getInstance() {
    if (!Sanitization.instance) {
      Sanitization.instance = new Sanitization();
    }
    return Sanitization.instance;
  }

  /**
   * Extends the list of sensitive field names used for log sanitization.
   * New names are lowercased and merged with the existing list; duplicates are dropped.
   * @param fields - An array of field names to add to the sensitive list.
   */
  setSensitiveFields(fields) {
    this.sensitiveFields = [
      ...new Set([
        ...this.sensitiveFields,
        ...fields.map((f) => f.toLowerCase()),
      ]),
    ];
    const logContext = requestContextService.createRequestContext({
      operation: "Sanitization.setSensitiveFields",
      newSensitiveFieldCount: this.sensitiveFields.length,
    });
    logger.debug("Updated sensitive fields list for log sanitization", logContext);
  }

  /**
   * Gets a copy of the current list of sensitive field names.
   * @returns An array of sensitive field names (a shallow copy; mutating it does not affect the instance).
   */
  getSensitiveFields() {
    return [...this.sensitiveFields];
  }

  /**
   * Sanitizes an HTML string by removing potentially malicious tags and attributes.
   *
   * Each option falls back to the class default when the caller leaves it `undefined`/`null`.
   * This matters because `sanitizeString` always passes both `allowedTags` and
   * `allowedAttributes`, possibly as `undefined`; a plain object spread would let those
   * `undefined` values overwrite the defaults.
   *
   * @param input - The HTML string to sanitize.
   * @param config - Optional custom configuration (`allowedTags`, `allowedAttributes`,
   *   `transformTags`, `preserveComments`).
   * @returns The sanitized HTML string. Returns an empty string if input is falsy.
   */
  sanitizeHtml(input, config) {
    if (!input) return "";
    const defaults = this.defaultHtmlSanitizeConfig;
    const preserveComments = config?.preserveComments ?? defaults.preserveComments;
    const transformTags = config?.transformTags ?? defaults.transformTags;
    const options = {
      allowedTags: config?.allowedTags ?? defaults.allowedTags,
      allowedAttributes: config?.allowedAttributes ?? defaults.allowedAttributes,
    };
    // Only forward transformTags when one was actually supplied.
    if (transformTags !== undefined) {
      options.transformTags = transformTags;
    }
    if (preserveComments) {
      options.allowedTags = [...(options.allowedTags || []), "!--"];
    }
    return sanitizeHtml(input, options);
  }

  /**
   * Sanitizes a string based on its intended context (e.g., HTML, URL, text).
   * **Important:** `context: 'javascript'` is disallowed due to security risks.
   *
   * - `html`: delegates to {@link Sanitization#sanitizeHtml}.
   * - `attribute` / `text` / unspecified: strips all tags and attributes.
   * - `url`: returns the trimmed URL if it is a valid http(s) URL, otherwise logs a warning
   *   and returns an empty string.
   *
   * @param input - The string to sanitize.
   * @param options - Options specifying the sanitization context.
   * @returns The sanitized string. Returns an empty string if input is falsy.
   * @throws {McpError} If `options.context` is 'javascript'.
   */
  sanitizeString(input, options = {}) {
    if (!input) return "";
    switch (options.context) {
      case "html":
        return this.sanitizeHtml(input, {
          allowedTags: options.allowedTags,
          allowedAttributes: options.allowedAttributes
            ? this.convertAttributesFormat(options.allowedAttributes)
            : undefined,
        });
      case "attribute":
        return sanitizeHtml(input, { allowedTags: [], allowedAttributes: {} });
      case "url":
        if (
          !validator.isURL(input, {
            protocols: ["http", "https"],
            require_protocol: true,
            require_host: true,
          })
        ) {
          logger.warning(
            "Potentially invalid URL detected during string sanitization (context: url)",
            requestContextService.createRequestContext({
              operation: "Sanitization.sanitizeString.urlWarning",
              invalidUrlAttempt: input,
            }),
          );
          return "";
        }
        return validator.trim(input);
      case "javascript":
        logger.error(
          "Attempted JavaScript sanitization via sanitizeString, which is disallowed.",
          requestContextService.createRequestContext({
            operation: "Sanitization.sanitizeString.jsAttempt",
            inputSnippet: input.substring(0, 50),
          }),
        );
        throw new McpError(
          BaseErrorCode.VALIDATION_ERROR,
          "JavaScript sanitization is not supported through sanitizeString due to security risks.",
        );
      case "text":
      default:
        return sanitizeHtml(input, { allowedTags: [], allowedAttributes: {} });
    }
  }

  /**
   * Converts attribute format for `sanitizeHtml`.
   * Currently an identity function: the input format is passed through unchanged.
   * @param attrs - Attributes in `{ tagName: ['attr1'] }` format.
   * @returns Attributes in `sanitize-html` expected format.
   * @private
   */
  convertAttributesFormat(attrs) {
    return attrs;
  }

  /**
   * Sanitizes a URL string by validating its format and protocol.
   * @param input - The URL string to sanitize.
   * @param allowedProtocols - Array of allowed URL protocols. Default: `['http', 'https']`.
   * @returns The sanitized and trimmed URL string.
   * @throws {McpError} If the input is not a string, the URL is invalid, or it uses a
   *   disallowed protocol.
   */
  sanitizeUrl(input, allowedProtocols = ["http", "https"]) {
    try {
      if (typeof input !== "string") {
        throw new Error("Invalid URL input: must be a string.");
      }
      const trimmedInput = input.trim();
      if (
        !validator.isURL(trimmedInput, {
          protocols: allowedProtocols,
          require_protocol: true,
          require_host: true,
        })
      ) {
        throw new Error("Invalid URL format or protocol not in allowed list.");
      }
      // Rejected even when a caller explicitly lists them in allowedProtocols.
      const lowercasedInput = trimmedInput.toLowerCase();
      const pseudoProtocols = ["javascript:", "data:", "vbscript:"];
      if (pseudoProtocols.some((proto) => lowercasedInput.startsWith(proto))) {
        throw new Error("Disallowed pseudo-protocol (javascript:, data:, or vbscript:) in URL.");
      }
      return trimmedInput;
    } catch (error) {
      throw new McpError(
        BaseErrorCode.VALIDATION_ERROR,
        error instanceof Error ? error.message : "Invalid or unsafe URL provided.",
        { input },
      );
    }
  }

  /**
   * Sanitizes a string containing LaTeX code to prevent common security vulnerabilities.
   * This method escapes special LaTeX characters and neutralizes potentially dangerous commands
   * that could allow file access or command execution. This is intended for user-provided
   * content, not for structural document commands.
   *
   * Escaping is done in a single pass over the input so that the braces introduced by a
   * replacement such as `\textbackslash{}` are not themselves escaped by a later step.
   *
   * @param input - The LaTeX string to sanitize.
   * @param options - Configuration for the sanitization process. `strategy` defaults to
   *   `"escape"`; `"strict"` currently only adds a debug log entry.
   * @returns The sanitized LaTeX string. Returns an empty string if input is falsy.
   */
  sanitizeLatex(input, options = {}) {
    if (!input) return "";
    const { strategy = "escape" } = options;

    // Escapes regex metacharacters so names such as "filecontents*" match literally.
    const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

    // 1. Escape special LaTeX characters to prevent them from being interpreted as commands.
    //    This is the primary defense for user-provided content.
    const latexEscapes = {
      "\\": "\\textbackslash{}",
      "&": "\\&",
      "%": "\\%",
      "$": "\\$",
      "#": "\\#",
      "_": "\\_",
      "{": "\\{",
      "}": "\\}",
      "~": "\\textasciitilde{}",
      "^": "\\textasciicircum{}",
    };
    let sanitized = input.replace(/[\\&%$#_{}~^]/g, (ch) => latexEscapes[ch]);

    // 2. Neutralize known dangerous commands as a second layer of defense.
    //    After step 1 every backslash in the text starts one of the escape sequences above,
    //    so this is a safety net in case the escaping step is ever relaxed.
    for (const cmd of this.dangerousLatexCommands) {
      const commandRegex = new RegExp(`\\\\${escapeRegExp(cmd)}(?![a-zA-Z])`, "g");
      // The `{}` keeps \textbackslash from fusing with the command name into one control word.
      sanitized = sanitized.replace(commandRegex, () => `\\texttt{\\textbackslash{}${cmd}}`);
    }

    // 3. Neutralize dangerous environments like `filecontents`.
    for (const env of ["filecontents", "filecontents*"]) {
      const envPattern = escapeRegExp(env);
      const beginRegex = new RegExp(`\\\\begin\\{${envPattern}\\}`, "g");
      const endRegex = new RegExp(`\\\\end\\{${envPattern}\\}`, "g");
      sanitized = sanitized.replace(beginRegex, () => `\\begin{verbatim} % Blocked Environment: ${env}`);
      sanitized = sanitized.replace(endRegex, () => `\\end{verbatim} % Blocked Environment: ${env}`);
    }

    if (strategy === "strict") {
      logger.debug(
        "Using strict LaTeX sanitization strategy.",
        requestContextService.createRequestContext({
          operation: "Sanitization.sanitizeLatex.strict",
        }),
      );
    }
    return sanitized;
  }

  /**
   * Sanitizes a file path to prevent path traversal and normalize format.
   *
   * Behavior by option:
   * - `rootDir`: the path is resolved against the root and must stay inside it; the result
   *   is returned relative to the root (`"."` for the root itself).
   * - no `rootDir`, absolute input: returned as-is when `allowAbsolute` is true, otherwise
   *   the leading drive/slashes are stripped to make it relative.
   * - no `rootDir`, relative input: must not escape the current working directory.
   * - `toPosix`: backslashes are converted to forward slashes in the returned path.
   *
   * @param input - The file path string to sanitize.
   * @param options - Options to control sanitization behavior (`toPosix`, `allowAbsolute`, `rootDir`).
   * @returns An object with the sanitized path and sanitization metadata.
   * @throws {McpError} If the path is invalid or unsafe.
   */
  sanitizePath(input, options = {}) {
    const originalInput = input;
    const effectiveOptions = {
      toPosix: options.toPosix ?? false,
      allowAbsolute: options.allowAbsolute ?? false,
      rootDir: options.rootDir ? path.resolve(options.rootDir) : undefined,
    };
    const toPosixSeparators = (p) => p.replace(/\\/g, "/");
    // Containment test based on path.relative: unlike a `startsWith(root + sep)` prefix test
    // it also works when `root` is the filesystem root (where root + sep would be "//").
    const isInside = (root, target) => {
      const rel = path.relative(root, target);
      return (
        rel === "" ||
        (rel !== ".." && !rel.startsWith(`..${path.sep}`) && !path.isAbsolute(rel))
      );
    };
    try {
      if (!input || typeof input !== "string") {
        throw new Error("Invalid path input: must be a non-empty string.");
      }
      if (input.includes("\0")) {
        throw new Error("Path contains null byte, which is disallowed.");
      }
      // Convert separators before normalizing so that `..` segments written with
      // backslashes are collapsed on platforms where `\` is not a separator.
      let normalized = path.normalize(effectiveOptions.toPosix ? toPosixSeparators(input) : input);
      const wasAbsoluteInitially = path.isAbsolute(normalized);
      if (effectiveOptions.toPosix) {
        // path.normalize may have reintroduced platform separators.
        normalized = toPosixSeparators(normalized);
      }

      let finalSanitizedPath;
      if (effectiveOptions.rootDir) {
        const fullPath = path.resolve(effectiveOptions.rootDir, normalized);
        if (!isInside(effectiveOptions.rootDir, fullPath)) {
          throw new Error("Path traversal detected: attempts to escape the defined root directory.");
        }
        finalSanitizedPath = path.relative(effectiveOptions.rootDir, fullPath) || ".";
      } else if (path.isAbsolute(normalized)) {
        finalSanitizedPath = effectiveOptions.allowAbsolute
          ? normalized
          : normalized.replace(/^(?:[A-Za-z]:)?[/\\]+/, "");
      } else {
        const resolvedAgainstCwd = path.resolve(normalized);
        const currentWorkingDir = path.resolve(".");
        if (!isInside(currentWorkingDir, resolvedAgainstCwd)) {
          throw new Error("Relative path traversal detected (escapes current working directory context).");
        }
        finalSanitizedPath = normalized;
      }

      if (effectiveOptions.toPosix) {
        // path.relative returns platform separators; honor toPosix on the final result too.
        finalSanitizedPath = toPosixSeparators(finalSanitizedPath);
      }

      return {
        sanitizedPath: finalSanitizedPath,
        originalInput,
        wasAbsolute: wasAbsoluteInitially,
        convertedToRelative:
          wasAbsoluteInitially &&
          !path.isAbsolute(finalSanitizedPath) &&
          !effectiveOptions.allowAbsolute,
        optionsUsed: effectiveOptions,
      };
    } catch (error) {
      logger.warning(
        "Path sanitization error",
        requestContextService.createRequestContext({
          operation: "Sanitization.sanitizePath.error",
          originalPathInput: originalInput,
          pathOptionsUsed: effectiveOptions,
          errorMessage: error instanceof Error ? error.message : String(error),
        }),
      );
      throw new McpError(
        BaseErrorCode.VALIDATION_ERROR,
        error instanceof Error ? error.message : "Invalid or unsafe path provided.",
        { input: originalInput },
      );
    }
  }

  /**
   * Sanitizes a JSON string by parsing it to validate its format.
   * Optionally checks if the JSON string exceeds a maximum allowed size.
   * @template T The expected type of the parsed JSON object. Defaults to `unknown`.
   * @param input - The JSON string to sanitize/validate.
   * @param maxSize - Optional maximum allowed size of the JSON string in bytes (UTF-8).
   * @returns The parsed JavaScript value.
   * @throws {McpError} If input is not a string, too large, or invalid JSON.
   */
  sanitizeJson(input, maxSize) {
    try {
      if (typeof input !== "string") {
        throw new Error("Invalid input: expected a JSON string.");
      }
      if (maxSize !== undefined) {
        const actualSize = Buffer.byteLength(input, "utf8");
        if (actualSize > maxSize) {
          throw new McpError(
            BaseErrorCode.VALIDATION_ERROR,
            `JSON string exceeds maximum allowed size of ${maxSize} bytes.`,
            { actualSize, maxSize },
          );
        }
      }
      return JSON.parse(input);
    } catch (error) {
      if (error instanceof McpError) throw error;
      // Build the preview defensively: `input` may be null/undefined/non-string here, and
      // reading `.length` on it would replace the McpError with a raw TypeError.
      let inputPreview;
      if (typeof input === "string") {
        inputPreview = input.length > 100 ? `${input.substring(0, 100)}...` : input;
      } else {
        inputPreview = String(input);
      }
      throw new McpError(
        BaseErrorCode.VALIDATION_ERROR,
        error instanceof Error ? error.message : "Invalid JSON format.",
        { inputPreview },
      );
    }
  }

  /**
   * Validates and sanitizes a numeric input, converting strings to numbers.
   * Clamps the number to `min`/`max` if provided, logging at debug level when it does.
   * @param input - The number or string to validate and sanitize.
   * @param min - Minimum allowed value (inclusive).
   * @param max - Maximum allowed value (inclusive).
   * @returns The sanitized (and potentially clamped) number.
   * @throws {McpError} If input is not a valid number, NaN, or Infinity.
   */
  sanitizeNumber(input, min, max) {
    let value;
    if (typeof input === "string") {
      const trimmedInput = input.trim();
      if (trimmedInput === "" || !validator.isNumeric(trimmedInput)) {
        throw new McpError(
          BaseErrorCode.VALIDATION_ERROR,
          "Invalid number format: input is empty or not numeric.",
          { input },
        );
      }
      value = parseFloat(trimmedInput);
    } else if (typeof input === "number") {
      value = input;
    } else {
      throw new McpError(
        BaseErrorCode.VALIDATION_ERROR,
        "Invalid input type: expected number or string.",
        { input: String(input) },
      );
    }
    // Number.isFinite is false for NaN and for +/-Infinity, without coercion.
    if (!Number.isFinite(value)) {
      throw new McpError(
        BaseErrorCode.VALIDATION_ERROR,
        "Invalid number value (NaN or Infinity).",
        { input },
      );
    }
    const originalValueForLog = value;
    let clamped = false;
    if (min !== undefined && value < min) {
      value = min;
      clamped = true;
    }
    if (max !== undefined && value > max) {
      value = max;
      clamped = true;
    }
    if (clamped) {
      logger.debug(
        "Number clamped to range.",
        requestContextService.createRequestContext({
          operation: "Sanitization.sanitizeNumber.clamped",
          originalInput: String(input),
          parsedValue: originalValueForLog,
          minValue: min,
          maxValue: max,
          clampedValue: value,
        }),
      );
    }
    return value;
  }

  /**
   * Sanitizes input for logging by redacting sensitive fields.
   * Creates a deep clone and replaces values of fields matching `this.sensitiveFields`
   * (case-insensitive substring match) with "[REDACTED]".
   *
   * It uses `structuredClone` if available for a high-fidelity deep clone.
   * If `structuredClone` is not available (e.g., in older Node.js environments),
   * it falls back to `JSON.parse(JSON.stringify(input))`. This fallback has limitations:
   * - `Date` objects are converted to ISO date strings.
   * - `undefined` values within objects are removed.
   * - `Map`, `Set`, `RegExp` objects are converted to empty objects (`{}`).
   * - Functions are removed.
   * - `BigInt` values will throw an error during `JSON.stringify` unless a `toJSON` method is provided.
   * - Circular references will cause `JSON.stringify` to throw an error.
   *
   * @param input - The input data to sanitize for logging.
   * @returns A sanitized (deep cloned) version of the input, safe for logging.
   *   Returns original input if not object/array, or "[Log Sanitization Failed]" on error.
   */
  sanitizeForLogging(input) {
    try {
      if (!input || typeof input !== "object") return input;
      const clonedInput =
        typeof structuredClone === "function"
          ? structuredClone(input)
          : JSON.parse(JSON.stringify(input));
      this.redactSensitiveFields(clonedInput);
      return clonedInput;
    } catch (error) {
      logger.error(
        "Error during log sanitization, returning placeholder.",
        requestContextService.createRequestContext({
          operation: "Sanitization.sanitizeForLogging.error",
          errorMessage: error instanceof Error ? error.message : String(error),
        }),
      );
      return "[Log Sanitization Failed]";
    }
  }

  /**
   * Recursively redacts sensitive fields in an object or array in place.
   *
   * Both the key and each configured field name are lowercased before the substring
   * comparison, so mixed-case entries such as the default `"apiKey"` match as intended.
   * Already-visited objects are skipped, which keeps the walk finite on cyclic graphs
   * (`structuredClone` preserves cycles).
   *
   * @param obj - The object or array to redact.
   * @private
   */
  redactSensitiveFields(obj) {
    const needles = this.sensitiveFields.map((field) => String(field).toLowerCase());
    const visited = new WeakSet();
    const visit = (node) => {
      if (!node || typeof node !== "object" || visited.has(node)) return;
      visited.add(node);
      if (Array.isArray(node)) {
        node.forEach((item) => visit(item));
        return;
      }
      for (const key of Object.keys(node)) {
        const lowerKey = key.toLowerCase();
        if (needles.some((needle) => lowerKey.includes(needle))) {
          node[key] = "[REDACTED]";
        } else {
          visit(node[key]);
        }
      }
    };
    visit(obj);
  }
}

/**
 * Singleton instance of the `Sanitization` class.
 * Use this for all input sanitization tasks.
 */
export const sanitization = Sanitization.getInstance();

/**
 * Convenience function calling `sanitization.sanitizeForLogging`.
 * @param input - The input data to sanitize.
 * @returns A sanitized version of the input, safe for logging.
 */
export const sanitizeInputForLogging = (input) => sanitization.sanitizeForLogging(input);