UNPKG

@brrock/vard

Version:

Lightweight prompt injection detection for LLM applications. Zod-inspired chainable API for prompt security.

1,522 lines (1,510 loc) 51.2 kB
// src/errors.ts var PromptInjectionError = class _PromptInjectionError extends Error { /** * Array of detected threats. Each threat contains: * - `type`: Type of attack detected * - `severity`: Severity score (0-1) * - `match`: The matched string that triggered detection * - `position`: Character position where threat was found * * @remarks * **Security**: Never expose this to end users. Use `getUserMessage()` instead. */ threats; /** * Creates a new PromptInjectionError. * * @param threats - Array of detected threats (must not be empty) */ constructor(threats) { super("Invalid input detected"); this.name = "PromptInjectionError"; this.threats = threats; if (Error.captureStackTrace) { Error.captureStackTrace(this, _PromptInjectionError); } } /** * Returns a generic, user-safe error message. * * This message intentionally does NOT reveal what was detected or why. * Use this for user-facing error messages. * * @param locale - Language for the message ('en' or 'no') * @returns Generic error message in the specified language * * @example * **English message (default)** * ```typescript * error.getUserMessage('en'); * // Returns: "Invalid input detected. Please try again." * ``` * * @example * **Norwegian message** * ```typescript * error.getUserMessage('no'); * // Returns: "Ugyldig innhold oppdaget. Vennligst prøv igjen." * ``` * * @see {@link getDebugInfo} for detailed threat information (server-side only) */ getUserMessage(locale = "en") { return locale === "no" ? "Ugyldig innhold oppdaget. Vennligst pr\xF8v igjen." : "Invalid input detected. Please try again."; } /** * Returns detailed threat information for logging and debugging. * * @remarks * **Security Warning**: This method returns detailed information about detected * threats including attack types, severity scores, and matched patterns. * **NEVER expose this to end users** as it reveals your security measures. * * Use this only for: * - Server-side logging * - Security monitoring * - Debugging during development * * @returns Formatted string with detailed threat information * * @example * **Server-side logging** * ```typescript * try { * vard(userInput); * } catch (error) { * if (error instanceof PromptInjectionError) { * // Log detailed info server-side (safe) * console.error('[SECURITY]', error.getDebugInfo()); * * // Return generic message to user (safe) * return { error: error.getUserMessage() }; * } * } * ``` * * @example * **Example output** * ``` * Threats detected: * - instructionOverride (severity: 0.90, match: "ignore all previous instr...", position: 0) * - delimiterInjection (severity: 0.95, match: "<system>", position: 45) * ``` * * @see {@link getUserMessage} for safe, user-facing error messages */ getDebugInfo() { const threatList = this.threats.map( (t) => `- ${t.type} (severity: ${t.severity.toFixed(2)}, match: "${t.match.substring(0, 30)}${t.match.length > 30 ? "..." : ""}", position: ${t.position})` ).join("\n"); return `Threats detected: ${threatList}`; } }; // src/patterns/instruction.ts var instructionPatterns = [ // "ignore (all/previous/prior/above) (previous/prior) (instructions/directives/commands)" // Supports: "ignore all previous instructions", "ignore previous instructions", "ignore instructions" { regex: /\bignore\s+(?:(?:all|the)\s+)?(?:previous|prior|above|remaining|other)?\s*(?:instructions|directives|commands|rules|prompts?|context)\b/i, severity: 0.9, type: "instructionOverride" }, // "disregard (the) (system/previous) (prompt/instructions)" { regex: /\bdisregard\s+(?:the\s+)?(?:system|previous)\s+(?:prompt|instructions)\b/i, severity: 0.9, type: "instructionOverride" }, // "forget (everything/all/what) (you/i) (said/told/were told)" { regex: /\bforget\s+(?:everything|all|what)\s+(?:you|i)\s+(?:said|told|mentioned|were\s+told)\b/i, severity: 0.85, type: "instructionOverride" }, // "new (instructions/task/directive)" { regex: /\bnew\s+(?:instructions?|tasks?|directives?)\b/i, severity: 0.8, type: "instructionOverride" }, // "override (previous/system) (instructions/rules)" { regex: /\boverride\s+(?:previous|system|all)\s+(?:instructions|rules|directives)\b/i, severity: 0.9, type: "instructionOverride" }, // "instead (follow/use/do)" { regex: /\binstead\s+(?:follow|use|do|obey)\b/i, severity: 0.7, type: "instructionOverride" }, // "start over" / "begin again" / "restart" - excludes "start over with" (legitimate) { regex: /\b(?:start\s+over(?:\s+from\s+scratch)?(?!\s+with)|begin\s+again|restart)\b/i, severity: 0.75, // Increased from 0.6 to be above moderate threshold type: "instructionOverride" }, // "reset (to/your) (default/initial)" { regex: /\breset\s+(?:to\s+)?(?:default|initial|factory)\b/i, severity: 0.75, type: "instructionOverride" }, // "spell backward(s)" - obfuscation technique { regex: /\bspell(?:ed)?\s+(?:(?:it|this|that)\s+)?(?:backwards?|in\s+reverse)\s+(?:and\s+)?(?:execute|run|follow|do|perform|obey)/i, severity: 0.9, type: "instructionOverride" }, // "decode (base64/hex/rot13/etc) and (execute/follow/run)" { regex: /\b(?:decode|decrypt|decipher)\s+(?:this\s+|the\s+)?(?:base64|base-64|b64|hex|hexadecimal|rot13|rot-13|binary|unicode)\s+(?:and\s+)?(?:execute|follow|run|do|perform|obey)/i, severity: 0.9, type: "instructionOverride" }, // "rot13/base64/hex decode" (reversed order) { regex: /\b(?:base64|base-64|b64|hex|hexadecimal|rot13|rot-13|binary|unicode)\s+(?:decode|decrypt|decipher)\s+(?:and\s+)?(?:execute|follow|run|do|perform|obey)/i, severity: 0.9, type: "instructionOverride" }, // "translate from (hex/base64/etc)" - obfuscation { regex: /\btranslate\s+from\s+(?:hex|hexadecimal|base64|base-64|binary|morse|rot13|rot-13)\s+(?:to\s+text\s+)?(?:and\s+)?(?:execute|follow|run|do|perform)?/i, severity: 0.85, type: "instructionOverride" }, // "unscramble/rearrange and (execute/follow)" { regex: /\b(?:unscramble|rearrange|reorder|decode)\s+(?:these\s+|the\s+)?(?:letters?|words?|characters?)\s+(?:and\s+)?(?:execute|follow|run|do|perform|obey)/i, severity: 0.85, type: "instructionOverride" }, // "do not (say/respond/reply)" + "instead" - negative directive { regex: /\b(?:do\s+not|don't|never)\s+(?:say|respond|reply|answer|tell|refuse|comply|follow)\s+.{1,50}?\binstead\b/i, severity: 0.85, type: "instructionOverride" }, // "don't/do not (follow/comply with) (your/the) (rules/guidelines)" { regex: /\b(?:do\s+not|don't)\s+(?:follow|comply\s+with|obey|adhere\s+to)\s+(?:your|the)?\s*(?:rules|guidelines|instructions|constraints|restrictions)\b/i, severity: 0.8, type: "instructionOverride" }, // "don't respond with (safety) warnings/filters/checks" { regex: /\b(?:do\s+not|don't)\s+respond\s+with\s+(?:safety\s+)?(?:warnings?|filters?|checks?|restrictions?|limitations?)\b/i, severity: 0.75, type: "instructionOverride" }, // "never refuse" / "always (answer/comply)" { regex: /\b(?:never\s+refuse|always\s+(?:answer|comply|obey|respond|follow))\b/i, severity: 0.8, type: "instructionOverride" } ]; // src/patterns/role.ts var rolePatterns = [ // "you are (now) (a/an) [role]" { regex: /\byou\s+are\s+(?:now\s+)?(?:a|an)\s+\w{3,20}\b/i, severity: 0.85, type: "roleManipulation" }, // "you're (now) (a/an) [role]" { regex: /\byou're\s+(?:now\s+)?(?:a|an)\s+\w{3,20}\b/i, severity: 0.85, type: "roleManipulation" }, // "pretend (you are/to be)" { regex: /\bpretend\s+(?:you\s+are|to\s+be|that\s+you)\b/i, severity: 0.8, type: "roleManipulation" }, // "roleplay as" { regex: /\broleplay\s+as\b/i, severity: 0.8, type: "roleManipulation" }, // "act (as/like)" - updated to handle multi-word roles { regex: /\bact\s+(?:as|like)\s+(?:a|an)?\s*\w+(?:\s+\w+){0,3}\b/i, severity: 0.75, type: "roleManipulation" }, // "behave (as/like)" - updated to handle multi-word roles { regex: /\bbehave\s+(?:as|like)\s+(?:a|an)?\s*\w+(?:\s+\w+){0,3}\b/i, severity: 0.75, type: "roleManipulation" }, // "from now on, you (are/will be)" { regex: /\bfrom\s+now\s+on,?\s+you\s+(?:are|will\s+be)\b/i, severity: 0.85, type: "roleManipulation" }, // "simulate (being/that you are)" { regex: /\bsimulate\s+(?:being|that\s+you\s+are)\b/i, severity: 0.8, type: "roleManipulation" }, // "your new role is" { regex: /\byour\s+new\s+role\s+is\b/i, severity: 0.9, type: "roleManipulation" }, // "assume the role of" { regex: /\bassume\s+the\s+role\s+of\b/i, severity: 0.85, type: "roleManipulation" }, // "developer mode" / "dev mode" / "test mode" { regex: /\b(?:developer|dev|test|testing|debug|admin|god|research)\s+mode\b/i, severity: 0.85, type: "roleManipulation" }, // "enable/activate (developer/test) mode" { regex: /\b(?:enable|activate|turn\s+on|switch\s+to)\s+(?:developer|dev|test|debug|admin|god)\s+mode\b/i, severity: 0.9, type: "roleManipulation" }, // "for educational purposes" / "for research purposes" { regex: /\bfor\s+(?:educational|research|testing|academic|scientific)\s+purposes\s+only\b/i, severity: 0.7, type: "roleManipulation" }, // "hypothetically/theoretically speaking" - hypothetical framing { regex: /\b(?:hypothetically|theoretically)\s+speaking[,\s]+(?:if|what|how)/i, severity: 0.75, type: "roleManipulation" }, // "imagine (if/that/you)" / "suppose (that/you)" { regex: /\b(?:imagine|suppose)\s+(?:if|that|you|for\s+a\s+moment)/i, severity: 0.7, type: "roleManipulation" }, // "this is (just) a (test/simulation/game)" { regex: /\bthis\s+is\s+(?:just\s+)?(?:a\s+)?(?:test|simulation|game|exercise|scenario)\b/i, severity: 0.75, type: "roleManipulation" }, // "DAN mode" / "jailbreak mode" / "unrestricted mode" { regex: /\b(?:DAN|jailbreak|unrestricted|unfiltered|uncensored)\s+(?:mode|version)\b/i, severity: 0.95, type: "roleManipulation" }, // Temporal exploitation: "legal now" / "allowed now" { regex: /\b(?:legal|allowed|permitted|acceptable)\s+now\b|\bnow\s+(?:legal|allowed|permitted)\b/i, severity: 0.8, type: "roleManipulation" } ]; // src/patterns/delimiter.ts var delimiterPatterns = [ // XML-style tags: <system>, </system>, <user>, <assistant> { regex: /<\/?(?:system|user|assistant|human|ai|context|instruction|prompt|user_query)>/gi, severity: 0.95, type: "delimiterInjection" }, // Bracket-style markers: [SYSTEM], [USER], [/SYSTEM], [START], [START OUTPUT] { regex: /\[\/?\s*(?:system|user|assistant|human|ai|context|instruction|prompt|start|stop|begin|end|start\s+output|begin\s+output|output)\s*\]/gi, severity: 0.95, type: "delimiterInjection" }, // Hash-style markers: ###SYSTEM###, ###ADMIN### { regex: /#{2,}\s*(?:system|admin|root|user|assistant|instruction|prompt)\s*#{2,}/gi, severity: 0.9, type: "delimiterInjection" }, // Markdown-style: ## SYSTEM, ## USER { regex: /^#{1,6}\s+(?:system|user|assistant|human|ai|context|instruction|prompt)\s*$/gim, severity: 0.8, type: "delimiterInjection" }, // Colon-style: SYSTEM:, USER:, ASSISTANT:, Query:, godmode:, UserQuery: { regex: /\b(?:system|user|assistant|human|ai|context|instruction|prompt|query|godmode|UserQuery)\s*:/gi, severity: 0.7, type: "delimiterInjection" }, // Role indicators in caps: SYSTEM, USER (standalone) { regex: /\b(?:SYSTEM|USER|ASSISTANT|HUMAN|AI|CONTEXT|INSTRUCTION|PROMPT)\b/gi, severity: 0.65, type: "delimiterInjection" } ]; // src/patterns/leak.ts var leakPatterns = [ // "repeat/reveal/show (me) (the) (system/initial/original) (prompt/instructions)" { regex: /\b(?:repeat|reveal|show|tell\s+me|display|output|print)\s+(?:me\s+)?(?:the\s+)?(?:your\s+)?(?:system|initial|original|above|previous)?\s*(?:prompt|instructions|guidelines|rules|directives)\b/i, severity: 0.95, type: "systemPromptLeak" }, // "what (were/are) (your/the) (original) (instructions/guidelines/rules)" { regex: /\bwhat\s+(?:were|are|is)\s+(?:your|the)\s+(?:original\s+)?(?:instructions|guidelines|rules|directives|prompt)\b/i, severity: 0.9, type: "systemPromptLeak" }, // "print/output/display (your/the) (system) (prompt/instructions)" { regex: /\b(?:print|output|display|show|reveal)\s+(?:your|the)\s+(?:system\s+)?(?:prompt|instructions|guidelines|rules)\b/i, severity: 0.95, type: "systemPromptLeak" }, // "what is your system prompt" { regex: /\bwhat\s+(?:is|was)\s+your\s+system\s+prompt\b/i, severity: 0.95, type: "systemPromptLeak" }, // "tell me about your instructions" { regex: /\btell\s+me\s+about\s+your\s+(?:instructions|guidelines|rules|directives|prompt)\b/i, severity: 0.85, type: "systemPromptLeak" }, // "how were you configured" { regex: /\bhow\s+were\s+you\s+(?:configured|programmed|instructed|set\s+up)\b/i, severity: 0.8, type: "systemPromptLeak" }, // "what are you not allowed to do" { regex: /\bwhat\s+(?:are|were)\s+you\s+(?:not\s+)?(?:allowed|forbidden|prohibited|told)\s+to\s+(?:do|say)\b/i, severity: 0.75, type: "systemPromptLeak" }, // "fence/format/wrap the prompt in markdown/code/text block" { regex: /\b(?:fence|format|wrap|enclose|put)\s+(?:the\s+)?(?:above\s+)?(?:prompt|instructions|content|text)\s+(?:in|inside|within)\s+(?:a\s+)?(?:markdown|code|text|raw)\s*(?:block|fence)?\b/i, severity: 0.9, type: "systemPromptLeak" }, // "re-transcript/transcribe/copy the above content/text/prompt" { regex: /\b(?:re-?transcript|re-?transcribe|copy|reproduce|replicate|duplicate)\s+(?:the\s+)?(?:above|previous|prior)\s+(?:content|text|prompt|instructions|message|conversation)\b/i, severity: 0.9, type: "systemPromptLeak" }, // "output/return/give the above/previous (in) format" { regex: /\b(?:output|return|give|provide)\s+(?:the\s+)?(?:above|previous|prior)\s+(?:content|text|prompt|instructions|message)?\s*(?:in|inside|as)?\s*(?:markdown|code|text|raw|verbatim)?\b/i, severity: 0.85, type: "systemPromptLeak" }, // "repeat (all) (previous/prior) (messages/conversation)" { regex: /\brepeat\s+(?:all\s+)?(?:the\s+)?(?:previous|prior|earlier|past)\s+(?:messages?|conversations?|inputs?|prompts?|exchanges?)\b/i, severity: 0.9, type: "systemPromptLeak" }, // "what did (the) (last/previous/other) user(s) (ask/say)" { regex: /\bwhat\s+did\s+(?:the\s+)?(?:last|previous|prior|other|earlier)\s+(?:users?|persons?|people)\s+(?:ask|say|tell|request|want)\b/i, severity: 0.85, type: "systemPromptLeak" }, // "show/display (me) (the) conversation history/log/messages" { regex: /\b(?:show|display)\s+(?:me\s+)?(?:the\s+)?(?:full\s+)?(?:conversation|chat|message|dialogue|earlier)\s+(?:history|log|record|transcript|messages)\b/i, severity: 0.9, type: "systemPromptLeak" }, // "display the message/chat transcript" { regex: /\bdisplay\s+(?:the\s+)?(?:message|chat|conversation)\s+(?:transcript|history|log|record)\b/i, severity: 0.9, type: "systemPromptLeak" }, // "summarize (everything) (we've) discussed" { regex: /\bsummarize\s+(?:everything|all)\s+(?:we've|we\s+have|that\s+we've|that\s+we\s+have)\s+(?:discussed|talked\s+about|covered)\b/i, severity: 0.8, type: "systemPromptLeak" }, // "print (all) (prior/previous) (inputs/messages)" { regex: /\bprint\s+(?:all\s+)?(?:prior|previous|past|earlier)\s+(?:inputs?|messages?|prompts?|conversations?|exchanges?)\b/i, severity: 0.9, type: "systemPromptLeak" } ]; // src/patterns/encoding.ts var encodingPatterns = [ // Base64 encoded content (long sequences that look like base64) // Min 40 chars to avoid false positives on short strings { regex: /[A-Za-z0-9+/]{40,}={0,2}/g, severity: 0.7, type: "encoding" }, // Hex escape sequences: \xNN (multiple in sequence) { regex: /(?:\\x[0-9A-Fa-f]{2}){5,}/g, severity: 0.85, type: "encoding" }, // Hex values: 0xNN (multiple in sequence) { regex: /(?:0x[0-9A-Fa-f]{2,}\s*){5,}/g, severity: 0.8, type: "encoding" }, // Unicode escape sequences: \uNNNN (multiple in sequence) { regex: /(?:\\u[0-9A-Fa-f]{4}){5,}/g, severity: 0.85, type: "encoding" }, // HTML entities: &# sequences (multiple) { regex: /(?:&#{1,2}[xX]?[0-9A-Fa-f]+;){5,}/g, severity: 0.8, type: "encoding" }, // URL encoded: % sequences (multiple) { regex: /(?:%[0-9A-Fa-f]{2}){5,}/g, severity: 0.75, type: "encoding" }, // Null bytes (suspicious) { regex: /\x00+/g, severity: 0.95, type: "encoding" }, // Unicode directional override characters (used for obfuscation) { regex: /[\u202A-\u202E\u2066-\u2069]+/g, severity: 0.9, type: "encoding" }, // Zalgo text (combining diacriticals) { regex: /[\u0300-\u036F]{3,}/g, severity: 0.85, type: "encoding" } ]; // src/patterns/obfuscation.ts var obfuscationPatterns = [ // Zero-width character attacks - highly suspicious // U+200B (zero-width space), U+200C (zero-width non-joiner), // U+200D (zero-width joiner), U+FEFF (zero-width no-break space) { regex: /[\u200B\u200C\u200D\uFEFF]/g, severity: 0.85, type: "encoding" }, // Right-to-left override markers - almost always malicious // U+202E (RTL override), U+202D (LTR override) { regex: /[\u202E\u202D]/g, severity: 0.95, type: "encoding" }, // Character insertion with underscores - "i_g_n_o_r_e" // Only flag if combined with suspicious keywords { regex: /\bi[\s._-]g[\s._-]n[\s._-]o[\s._-]r[\s._-]e\b/i, severity: 0.9, type: "instructionOverride" }, // Homoglyph: "ignore" with any homoglyph character // Matches: Ιgnore, іgnore, ignοre, ignоre, etc. { regex: /[ΙІі]gn[oοо]re?|ign[οо]re/i, severity: 0.95, type: "instructionOverride" }, // Excessive spacing (4+ spaces between words) - suspicious { regex: /\w+\s{4,}\w+/, severity: 0.6, type: "encoding" }, // Full-width Unicode Latin letters (3+ in sequence) { regex: /[\uFF21-\uFF3A\uFF41-\uFF5A]{3,}/, severity: 0.7, type: "encoding" }, // Uncommon Unicode spaces { regex: /[\u2000-\u200A\u202F\u205F]/g, severity: 0.65, type: "encoding" } ]; // src/patterns/index.ts var allPatterns = [ ...instructionPatterns, ...rolePatterns, ...delimiterPatterns, ...leakPatterns, ...encodingPatterns, ...obfuscationPatterns ]; // src/detectors/index.ts function detect(input, patterns) { const threats = []; for (const pattern of patterns) { const flags = pattern.regex.flags.includes("g") ? pattern.regex.flags : pattern.regex.flags + "g"; const regex = new RegExp(pattern.regex.source, flags); const matches = Array.from(input.matchAll(regex)); for (const match of matches) { threats.push({ type: pattern.type, severity: pattern.severity, match: match[0], position: match.index ?? 0 }); } } return threats; } function checkLength(input, maxLength) { if (input.length > maxLength) { return { type: "instructionOverride", // Categorize as instruction override severity: 0.8, match: `Input exceeds ${maxLength} characters`, position: maxLength }; } return null; } function detectCustomDelimiters(input, delimiters) { const threats = []; for (const delimiter of delimiters) { const index = input.indexOf(delimiter); if (index !== -1) { threats.push({ type: "delimiterInjection", severity: 0.95, match: delimiter, position: index }); } } return threats; } // src/sanitizers/index.ts function sanitize(input, threats) { let sanitized = input; let iterations = 0; const maxIterations = 5; while (iterations < maxIterations) { const before = sanitized; sanitized = applySanitizationPass(sanitized, threats); if (sanitized === before) { break; } iterations++; } return sanitized; } function applySanitizationPass(input, threats) { let sanitized = input; const threatsByType = /* @__PURE__ */ new Map(); for (const threat of threats) { const existing = threatsByType.get(threat.type); if (existing) { existing.push(threat); } else { threatsByType.set(threat.type, [threat]); } } for (const [type, typeThreats] of threatsByType) { sanitized = sanitizeByType(sanitized, type, typeThreats); } return sanitized; } function sanitizeByType(input, type, threats) { switch (type) { case "delimiterInjection": return sanitizeDelimiters(input); case "encoding": return sanitizeEncoding(input); case "instructionOverride": return sanitizeInstructions(input, threats); case "roleManipulation": return sanitizeRoles(input, threats); case "systemPromptLeak": return sanitizeLeaks(input, threats); default: return input; } } function sanitizeDelimiters(input) { let sanitized = input; sanitized = sanitized.replace( /<\/?(?:system|user|assistant|human|ai|context|instruction|prompt)>/gi, "" ); sanitized = sanitized.replace( /\[\/?\s*(?:system|user|assistant|human|ai|context|instruction|prompt)\s*\]/gi, "" ); sanitized = sanitized.replace( /#{2,}\s*(?:system|admin|root|user|assistant|instruction|prompt)\s*#{2,}/gi, "" ); sanitized = sanitized.replace( /\b(system|user|assistant|human|ai|context|instruction|prompt)\s*:/gi, "$1-" ); sanitized = sanitized.replace( /\b(?:SYSTEM|USER|ASSISTANT|HUMAN|AI|CONTEXT|INSTRUCTION|PROMPT)\b/g, "" ); return sanitized; } function sanitizeEncoding(input) { let sanitized = input; sanitized = sanitized.replace(/\x00+/g, ""); sanitized = sanitized.replace(/[\u200B\u200C\u200D\uFEFF]+/g, ""); sanitized = sanitized.replace(/[\u202A-\u202E\u2066-\u2069]+/g, ""); sanitized = sanitized.replace(/[\u0300-\u036F]{3,}/g, ""); sanitized = sanitized.replace( /[A-Za-z0-9+/]{40,}={0,2}/g, "[ENCODED_REMOVED]" ); sanitized = sanitized.replace(/(?:\\x[0-9A-Fa-f]{2}){5,}/g, "[HEX_REMOVED]"); sanitized = sanitized.replace( /(?:\\u[0-9A-Fa-f]{4}){5,}/g, "[UNICODE_REMOVED]" ); sanitized = sanitized.replace( /(?:&#{1,2}[xX]?[0-9A-Fa-f]+;){5,}/g, "[ENTITY_REMOVED]" ); sanitized = sanitized.replace(/[\u2000-\u200A\u202F\u205F]+/g, " "); sanitized = sanitized.replace( /[\uFF21-\uFF3A]/g, (match) => String.fromCharCode(match.charCodeAt(0) - 65248) ); sanitized = sanitized.replace( /[\uFF41-\uFF5A]/g, (match) => String.fromCharCode(match.charCodeAt(0) - 65248) ); return sanitized; } function sanitizeInstructions(input, threats) { let sanitized = input; for (const threat of threats) { if (threat.match && threat.match.length > 0) { sanitized = sanitized.replace(threat.match, ""); } } return sanitized; } function sanitizeRoles(input, threats) { let sanitized = input; for (const threat of threats) { if (threat.match && threat.match.length > 0) { sanitized = sanitized.replace(threat.match, ""); } } return sanitized; } function sanitizeLeaks(input, threats) { let sanitized = input; for (const threat of threats) { if (threat.match && threat.match.length > 0) { sanitized = sanitized.replace(threat.match, ""); } } return sanitized; } // src/presets.ts var STRICT_PRESET = { threshold: 0.5, maxLength: 1e4, customDelimiters: [], customPatterns: [], threatActions: { instructionOverride: "block", roleManipulation: "block", delimiterInjection: "block", systemPromptLeak: "block", encoding: "block" } }; var MODERATE_PRESET = { threshold: 0.7, maxLength: 1e4, customDelimiters: [], customPatterns: [], threatActions: { instructionOverride: "block", roleManipulation: "block", delimiterInjection: "sanitize", systemPromptLeak: "block", encoding: "sanitize" } }; var LENIENT_PRESET = { threshold: 0.85, maxLength: 1e4, customDelimiters: [], customPatterns: [], threatActions: { instructionOverride: "sanitize", roleManipulation: "warn", delimiterInjection: "sanitize", systemPromptLeak: "sanitize", encoding: "sanitize" } }; function getPreset(name) { switch (name) { case "strict": return { ...STRICT_PRESET }; case "moderate": return { ...MODERATE_PRESET }; case "lenient": return { ...LENIENT_PRESET }; default: return { ...MODERATE_PRESET }; } } // src/vard.ts var VardBuilder = class _VardBuilder { config; constructor(config) { const defaultConfig = getPreset("moderate"); this.config = { ...defaultConfig, ...config }; } /** * Create a callable vard instance from this builder * Allows using vard as a function: vard(input) instead of vard.parse(input) */ static createCallable(builder) { const callable = ((input) => builder.parse(input)); callable.parse = builder.parse.bind(builder); callable.safeParse = builder.safeParse.bind(builder); callable.delimiters = (delims) => builder.delimiters(delims); callable.pattern = (regex, severity, type) => builder.pattern(regex, severity, type); callable.patterns = (patterns) => builder.patterns(patterns); callable.maxLength = (length) => builder.maxLength(length); callable.threshold = (value) => builder.threshold(value); callable.block = (threat) => builder.block(threat); callable.sanitize = (threat) => builder.sanitize(threat); callable.warn = (threat) => builder.warn(threat); callable.allow = (threat) => builder.allow(threat); callable.onWarn = (callback) => builder.onWarn(callback); return callable; } /** * Configures custom prompt delimiters to detect and protect against. * * Use this when your prompts use specific delimiters to separate sections * (e.g., RAG context, user input, system instructions). The vard will detect * if user input contains these delimiters, preventing context injection. * * @param delims - Array of delimiter strings to protect (case-sensitive, exact match) * @returns New vard instance with custom delimiters configured (immutable) * * @example * **Protect RAG delimiters** * ```typescript * const chatVard = vard() * .delimiters(['CONTEXT:', 'USER:', 'SYSTEM:']) * .block('delimiterInjection'); * * // This will throw * chatVard.parse('Hello CONTEXT: fake data'); * // Throws: PromptInjectionError (delimiter injection detected) * ``` * * @example * **Multiple delimiter formats** * ```typescript * const myVard = vard.strict() * .delimiters([ * '### CONTEXT ###', * '### USER ###', * '<system>', * '</system>', * ]); * * const safe = myVard.parse(userInput); * ``` * * @see {@link block} to throw on delimiter detection * @see {@link sanitize} to remove delimiters instead of throwing */ delimiters(delims) { const newBuilder = new _VardBuilder({ ...this.config, customDelimiters: [...delims] }); return _VardBuilder.createCallable(newBuilder); } /** * Adds a custom detection pattern for language-specific or domain-specific threats. * * Use this to detect attacks in non-English languages or add patterns specific * to your application. Custom patterns are checked in addition to built-in patterns. * * @param regex - Regular expression to match threats (use bounded quantifiers to avoid ReDoS) * @param severity - Severity score from 0-1 (default: 0.8). Higher = more severe. * @param type - Type of threat this pattern detects (default: 'instructionOverride') * @returns New vard instance with custom pattern added (immutable) * * @example * **Norwegian attack patterns** * ```typescript * const norwegianVard = vard.moderate() * .pattern(/ignorer.*instruksjoner/i, 0.9, 'instructionOverride') * .pattern(/du er nå/i, 0.85, 'roleManipulation') * .pattern(/vis systemprompten/i, 0.95, 'systemPromptLeak'); * * norwegianVard.parse('ignorer alle instruksjoner'); * // Throws: PromptInjectionError * ``` * * @example * **Domain-specific patterns** * ```typescript * const medicalVard = vard.strict() * .pattern(/\bsudowoodo\b/i, 0.95, 'instructionOverride') // Custom trigger word * .pattern(/override\s+diagnosis/i, 0.9, 'instructionOverride'); * * const safe = medicalVard.parse(patientInput); * ``` * * @remarks * **ReDoS Warning**: Always use bounded quantifiers in your regex to prevent * catastrophic backtracking. Bad: `/(a+)+/`. Good: `/a{1,50}/`. * * @see {@link patterns} to add multiple patterns at once * @see {@link threshold} to adjust sensitivity */ pattern(regex, severity = 0.8, type = "instructionOverride") { const newPattern = { regex, severity, type }; const newBuilder = new _VardBuilder({ ...this.config, customPatterns: [...this.config.customPatterns, newPattern] }); return _VardBuilder.createCallable(newBuilder); } /** * Adds multiple custom detection patterns at once. * * Convenience method for bulk pattern registration. Each pattern must specify * a regex, severity score (0-1), and threat type. * * @param patterns - Array of custom patterns to add * @returns New vard instance with patterns added (immutable) * * @example * **Add multiple domain-specific patterns** * ```typescript * import vard from '@andersmyrmel/vard'; * import type { Pattern } from '@andersmyrmel/vard'; * * const medicalPatterns: Pattern[] = [ * { * regex: /reveal\s+patient\s+data/i, * severity: 0.95, * type: 'systemPromptLeak', * }, * { * regex: /bypass\s+hipaa/i, * severity: 0.9, * type: 'instructionOverride', * }, * ]; * * const medicalVard = vard() * .patterns(medicalPatterns) * .block('systemPromptLeak') * .block('instructionOverride'); * ``` * * @example * **Combine with single pattern() method** * ```typescript * const myVard = vard() * .patterns(bulkPatterns) // Add 10 patterns at once * .pattern(/special-case/i, 0.8, 'instructionOverride'); // Add 1 more * ``` * * @see {@link pattern} to add a single pattern */ patterns(patterns) { const newBuilder = new _VardBuilder({ ...this.config, customPatterns: [...this.config.customPatterns, ...patterns] }); return _VardBuilder.createCallable(newBuilder); } /** * Sets the maximum allowed input length in characters. * * Inputs longer than this limit will throw `PromptInjectionError`. * Useful for preventing resource exhaustion and limiting token costs. * * @param length - Maximum number of characters allowed (must be positive) * @returns New vard instance with max length configured (immutable) * * @example * **Limit user input length** * ```typescript * const chatVard = vard.moderate() * .maxLength(5000); // ~1250 tokens for GPT models * * chatVard.parse('a'.repeat(10000)); * // Throws: PromptInjectionError (input exceeds 5000 characters) * ``` * * @example * **Different limits for different contexts** * ```typescript * const shortFormVard = vard().maxLength(500); * const longFormVard = vard().maxLength(10000); * * shortFormVard.parse(feedbackInput); * longFormVard.parse(documentInput); * ``` * * @remarks * Default max length is 10,000 characters (~2,500 tokens for GPT models). * This prevents DoS attacks and excessive token costs. */ maxLength(length) { const newBuilder = new _VardBuilder({ ...this.config, maxLength: length }); return _VardBuilder.createCallable(newBuilder); } /** * Sets the detection threshold for blocking threats. * * Only threats with severity >= threshold will trigger their configured action. * Lower threshold = more sensitive (more false positives). * Higher threshold = less sensitive (may miss attacks). * * @param value - Threshold from 0-1 (automatically clamped to this range) * @returns New vard instance with threshold configured (immutable) * * @example * **Adjust sensitivity** * ```typescript * // Strict: catch everything (more false positives) * const strict = vard().threshold(0.5); * * // Balanced (default for moderate preset) * const balanced = vard().threshold(0.7); * * // Lenient: only high-confidence threats * const lenient = vard().threshold(0.9); * ``` * * @example * **Threshold affects which patterns trigger** * ```typescript * const myVard = vard().threshold(0.8); * * // Pattern with severity 0.75 - IGNORED (below threshold) * vard.parse('start over'); // Passes * * // Pattern with severity 0.9 - DETECTED (above threshold) * vard.parse('ignore all instructions'); // Throws * ``` * * @remarks * Recommended thresholds: * - **0.5-0.6**: High security, expect false positives * - **0.7**: Balanced (default) * - **0.85-0.9**: Permissive, technical content * * @see {@link vard.strict} for preset with 0.5 threshold * @see {@link vard.moderate} for preset with 0.7 threshold * @see {@link vard.lenient} for preset with 0.85 threshold */ threshold(value) { const newBuilder = new _VardBuilder({ ...this.config, threshold: Math.max(0, Math.min(1, value)) }); return _VardBuilder.createCallable(newBuilder); } /** * Set action for a specific threat type */ setThreatAction(threat, action) { const newBuilder = new _VardBuilder({ ...this.config, threatActions: { ...this.config.threatActions, [threat]: action } }); return _VardBuilder.createCallable(newBuilder); } /** * Configures the vard to throw an error when detecting the specified threat type. * * Use this when you want to reject input completely rather than attempting * to sanitize it. Recommended for high-severity threats. * * @param threat - Type of threat to block ('instructionOverride', 'roleManipulation', etc.) * @returns New vard instance with block action configured (immutable) * * @example * **Block specific threats** * ```typescript * const myVard = vard.moderate() * .block('instructionOverride') * .block('systemPromptLeak') * .sanitize('delimiterInjection'); // Mix with other actions * * myVard.parse('ignore all instructions'); * // Throws: PromptInjectionError * ``` * * @example * **Override preset behavior** * ```typescript * // Moderate preset sanitizes delimiters, but we want to block them * const strictDelimiters = vard.moderate() * .delimiters(['CONTEXT:', 'USER:']) * .block('delimiterInjection'); * * strictDelimiters.parse('CONTEXT: fake data'); * // Throws: PromptInjectionError * ``` * * @see {@link sanitize} to remove threats instead of blocking * @see {@link warn} to log but allow threats (use with {@link onWarn} callback) * @see {@link allow} to ignore threats completely */ block(threat) { return this.setThreatAction(threat, "block"); } /** * Configures the vard to remove/clean threats instead of throwing an error. * * Use this for threats that can be safely removed from input (like delimiters) * or when you want to be permissive rather than blocking users. * * **Important**: Sanitized input is re-validated to catch bypass attempts. * If sanitization fails to remove threats, an error will still be thrown. * * @param threat - Type of threat to sanitize ('delimiterInjection', 'encoding', etc.) * @returns New vard instance with sanitize action configured (immutable) * * @example * **Sanitize instead of block** * ```typescript * const lenientVard = vard() * .sanitize('delimiterInjection') * .sanitize('encoding') * .block('instructionOverride'); // Still block severe threats * * const result = lenientVard.parse('<system>Hello</system>'); * console.log(result); // "Hello" (delimiters removed) * ``` * * @example * **Handles nested attacks** * ```typescript * const myVard = vard().sanitize('delimiterInjection'); * * // Nested attack: <sy<system>stem> * // After removing inner <system>: <system> * // Re-validation catches this and re-sanitizes * const safe = myVard.parse('<sy<system>stem>text</system>'); * console.log(safe); // "text" (fully sanitized) * ``` * * @remarks * Sanitization uses iterative cleaning (max 5 passes) to prevent bypass * attempts with nested delimiters or patterns. * * @see {@link block} to throw errors instead of sanitizing * @see {@link warn} to log but allow threats * @see {@link allow} to ignore threats */ sanitize(threat) { return this.setThreatAction(threat, "sanitize"); } /** * Configures the vard to categorize threats for logging without blocking or sanitizing. * * Useful for monitoring potential threats in production without disrupting users. * Use with `.onWarn()` to set a callback that will be invoked for each warning-level threat. * * @param threat - Type of threat to warn about ('instructionOverride', 'roleManipulation', etc.) * @returns New vard instance with warn action configured (immutable) * * @example * **Monitor without blocking** * ```typescript * const monitor = vard() * .warn('instructionOverride') // Categorize but don't block * .onWarn((threat) => console.log('Warning:', threat.type)) * .block('systemPromptLeak'); // Still block this * * // This passes through but invokes the onWarn callback * const result = monitor.parse('ignore previous instructions'); * console.log(result); // Original input unchanged * ``` * * @example * **Gradual rollout strategy** * ```typescript * // Phase 1: Monitor in production * const phase1 = vard().warn('instructionOverride'); * * // Phase 2: Sanitize after analyzing logs * const phase2 = vard().sanitize('instructionOverride'); * * // Phase 3: Block if sanitization isn't enough * const phase3 = vard().block('instructionOverride'); * ``` * * @remarks * Warnings are categorized and passed to the `.onWarn()` callback if configured. * Without a callback, warnings are silently allowed through. * * @see {@link onWarn} to set a callback for warning-level threats * @see {@link block} to throw errors for threats * @see {@link sanitize} to remove threats from input * @see {@link allow} to ignore threats completely */ warn(threat) { return this.setThreatAction(threat, "warn"); } /** * Configures the vard to completely ignore a specific threat type. * * Use this when you've determined a threat type produces too many false positives * in your domain, or when certain patterns are expected in your use case. * * @param threat - Type of threat to allow/ignore ('instructionOverride', 'roleManipulation', etc.) * @returns New vard instance with allow action configured (immutable) * * @example * **Disable specific threat detection** * ```typescript * // Technical documentation contains instruction-like language * const docVard = vard() * .allow('instructionOverride') // Don't flag "start over", "ignore this" * .block('systemPromptLeak') // Still protect against prompt leaks * .block('delimiterInjection'); * * const safe = docVard.parse('Step 1: Start over with a clean slate'); * console.log(safe); // Passes through unchanged * ``` * * @example * **Domain-specific false positives** * ```typescript * // Customer support chat allows role-playing scenarios * const supportVard = vard() * .allow('roleManipulation') // "act as", "pretend you are" are ok * .block('instructionOverride') // Still block instruction overrides * .sanitize('delimiterInjection'); * * supportVard.parse('Can you act as a technical expert?'); * // Passes - roleManipulation is allowed * ``` * * @remarks * Use sparingly - each allowed threat type reduces your security posture. * Consider using `.sanitize()` or `.warn()` instead when possible. * * @see {@link block} to throw errors for threats * @see {@link sanitize} to remove threats from input * @see {@link warn} to monitor threats without blocking */ allow(threat) { return this.setThreatAction(threat, "allow"); } /** * Sets a callback function to be invoked when warning-level threats are detected. * * Use this to log or monitor threats without blocking user input. The callback * is invoked for each threat with the 'warn' action that meets the threshold. * * @param callback - Function to call for each warning-level threat * @returns New vard instance with callback configured (immutable) * * @example * **Log warnings to console** * ```typescript * const myVard = vard() * .warn('instructionOverride') * .onWarn((threat) => { * console.log(`[SECURITY WARNING] ${threat.type}: ${threat.match}`); * console.log(`Severity: ${threat.severity}, Position: ${threat.position}`); * }); * * myVard.parse('ignore previous instructions'); * // Logs: [SECURITY WARNING] instructionOverride: ignore previous instructions * // Returns: input passes through unchanged * ``` * * @example * **Send warnings to monitoring service** * ```typescript * const chatVard = vard() * .warn('roleManipulation') * .onWarn(async (threat) => { * await analytics.track('prompt_injection_warning', { * type: threat.type, * severity: threat.severity, * timestamp: Date.now(), * }); * }); * ``` * * @example * **Gradual rollout with monitoring** * ```typescript * // Phase 1: Monitor suspicious patterns without blocking * const phase1Vard = vard() * .warn('instructionOverride') * .onWarn((threat) => { * // Collect data to tune threshold * logger.info({ threat, userId: currentUser.id }); * }); * * // Phase 2: After analysis, switch to blocking * const phase2Vard = vard().block('instructionOverride'); * ``` * * @remarks * The callback is called synchronously during validation. For expensive operations * (like API calls), consider using a queue or async wrapper to avoid blocking. * * @see {@link warn} to configure threat types for warning */ onWarn(callback) { const newBuilder = new _VardBuilder({ ...this.config, onWarn: callback }); return _VardBuilder.createCallable(newBuilder); } /** * Validates input and returns the safe string. * * This is the primary validation method. It detects threats, applies configured * actions (block/sanitize/warn/allow), and either returns safe input or throws * `PromptInjectionError`. * * @param input - User input to validate (must be a string) * @returns Validated (and possibly sanitized) input string * @throws {PromptInjectionError} When threats with 'block' action are detected above threshold * @throws {TypeError} When input is not a string * * @example * **Basic usage (throws on detection)** * ```typescript * import vard, { PromptInjectionError } from '@andersmyrmel/vard'; * * try { * const safe = vard.moderate().parse(userInput); * // Use safe input in your LLM prompt * await llm.generate(`Context: ${safe}`); * } catch (error) { * if (error instanceof PromptInjectionError) { * console.error('[SECURITY]', error.getDebugInfo()); * return { error: 'Invalid input detected' }; * } * } * ``` * * @example * **Sanitization example** * ```typescript * const chatVard = vard() * .delimiters(['CONTEXT:', 'USER:']) * .sanitize('delimiterInjection') * .block('instructionOverride'); * * // Delimiters are removed * const result = chatVard.parse('Hello CONTEXT: fake data'); * console.log(result); // "Hello fake data" * * // Instruction override is blocked * chatVard.parse('ignore all previous instructions'); * // Throws: PromptInjectionError * ``` * * @remarks * **Security Features**: * - Re-validates after sanitization to catch nested attacks * - Iterative sanitization (max 5 passes) prevents bypass attempts * - Threshold filtering: only threats >= threshold trigger their action * * @see {@link safeParse} for non-throwing alternative (returns result object) * @see {@link PromptInjectionError} for error details and logging */ parse(input) { if (typeof input !== "string") { throw new TypeError("Input must be a string"); } if (input.trim() === "") { return ""; } const lengthThreat = checkLength(input, this.config.maxLength); if (lengthThreat) { throw new PromptInjectionError([lengthThreat]); } const allPatternsToCheck = [...allPatterns, ...this.config.customPatterns]; let threats = detect(input, allPatternsToCheck); if (this.config.customDelimiters.length > 0) { const delimiterThreats = detectCustomDelimiters( input, this.config.customDelimiters ); threats = [...threats, ...delimiterThreats]; } const { toBlock, toSanitize, toWarn } = this.categorizeThreats(threats); if (toBlock.length > 0) { throw new PromptInjectionError(toBlock); } if (toWarn.length > 0 && this.config.onWarn) { for (const threat of toWarn) { this.config.onWarn(threat); } } let result = input; if (toSanitize.length > 0) { result = sanitize(input, toSanitize); const recheck = detect(result, allPatternsToCheck); const recheckDelimiters = this.config.customDelimiters.length > 0 ? detectCustomDelimiters(result, this.config.customDelimiters) : []; const allRecheckThreats = [...recheck, ...recheckDelimiters]; const { toBlock: recheckBlock } = this.categorizeThreats(allRecheckThreats); if (recheckBlock.length > 0) { throw new PromptInjectionError(recheckBlock); } } return result; } /** * Validates input without throwing - returns a result object instead. * * Use this when you want to handle threats gracefully without try/catch blocks. * Returns a discriminated union that TypeScript can narrow based on the `safe` property. * * @param input - User input to validate (must be a string) * @returns Result object: * - `{ safe: true, data: string }` if input is valid * - `{ safe: false, threats: Threat[] }` if threats were detected * * @example * **Graceful error handling (no try/catch)** * ```typescript * import vard from '@andersmyrmel/vard'; * * const result = vard.moderate().safeParse(userInput); * * if (result.safe) { * // TypeScript knows result.data is string * await llm.generate(`Context: ${result.data}`); * } else { * // TypeScript knows result.threats is Threat[] * console.error('Threats detected:', result.threats.length); * result.threats.forEach(t => { * console.log(`- ${t.type} (severity: ${t.severity.toFixed(2)})`); * }); * } * ``` * * @example * **Conditional processing based on threats** * ```typescript * const chatVard = vard() * .sanitize('delimiterInjection') * .block('instructionOverride'); * * const result = chatVard.safeParse(userMessage); * * if (!result.safe) { * // Log for security monitoring * logSecurityEvent({ * threats: result.threats.map(t => t.type), * severity: Math.max(...result.threats.map(t => t.severity)), * }); * * return { error: 'Invalid input detected' }; * } * * return { message: result.data }; * ``` * * @remarks * **Type Safety**: The return type is a discriminated union. TypeScript will * automatically narrow the type based on the `safe` property, giving you * type-safe access to either `data` or `threats`. * * @see {@link parse} for throwing alternative * @see {@link VardResult} type definition */ safeParse(input) { try { const data = this.parse(input); return { safe: true, data }; } catch (error) { if (error instanceof PromptInjectionError) { return { safe: false, threats: error.threats }; } throw error; } } /** * Categorize threats by configured action and threshold * Returns threats grouped by action: block (throw error), sanitize (clean), warn (log) */ categorizeThreats(threats) { const toBlock = []; const toSanitize = []; const toWarn = []; for (const threat of threats) { if (threat.severity < this.config.threshold) { continue; } const action = this.config.threatActions[threat.type]; switch (action) { case "block":