UNPKG

@andersmyrmel/vard

Version:

Lightweight prompt injection detection for LLM applications. Zod-inspired chainable API for prompt security.

642 lines (637 loc) 20.4 kB
// Public type declarations for `@andersmyrmel/vard`: threat/result types, the
// chainable `CallableVard` API, the `PromptInjectionError` class and the
// `vard` / `v` / `createVard` entry points.

/**
 * Types of prompt injection threats that can be detected.
 *
 * @remarks
 * Each threat type represents a different attack vector:
 * - **instructionOverride**: Attempts to replace or modify system instructions
 * - **roleManipulation**: Tries to change the AI's role or persona
 * - **delimiterInjection**: Injects fake delimiters to confuse prompt structure
 * - **systemPromptLeak**: Attempts to reveal the system prompt or internal instructions
 * - **encoding**: Uses special encoding to bypass detection (base64, hex, unicode)
 *
 * @example
 * **Configure actions per threat type**
 * ```typescript
 * import vard from '@andersmyrmel/vard';
 * import type { ThreatType } from '@andersmyrmel/vard';
 *
 * const threats: ThreatType[] = [
 *   'instructionOverride',
 *   'roleManipulation',
 *   'systemPromptLeak',
 * ];
 *
 * const myVard = vard();
 * threats.forEach(threat => myVard.block(threat));
 * ```
 *
 * @see {@link https://github.com/andersmyrmel/vard#threat-types | Full threat type documentation}
 */
type ThreatType = "instructionOverride" | "roleManipulation" | "delimiterInjection" | "systemPromptLeak" | "encoding";

/**
 * Actions to take when a threat is detected.
 *
 * @remarks
 * Each action has different behavior:
 * - **block**: Throw `PromptInjectionError` (validation fails)
 * - **sanitize**: Remove/clean the threat and return sanitized input
 * - **warn**: Categorize threat but allow input to pass (use with `onWarn` callback)
 * - **allow**: Completely ignore this threat type
 *
 * Actions only apply to threats with severity >= configured threshold.
 *
 * @example
 * **Set actions for different threat types**
 * ```typescript
 * import vard from '@andersmyrmel/vard';
 *
 * const myVard = vard()
 *   .block('instructionOverride')   // Throw error
 *   .sanitize('delimiterInjection') // Remove delimiters
 *   .warn('roleManipulation')       // Log but allow
 *   .allow('encoding');             // Ignore completely
 * ```
 *
 * @see {@link CallableVard.block}
 * @see {@link CallableVard.sanitize}
 * @see {@link CallableVard.warn}
 * @see {@link CallableVard.allow}
 */
type ThreatAction = "block" | "sanitize" | "warn" | "allow";

/**
 * Individual threat detection result.
 *
 * Represents a single detected threat with metadata about what was found,
 * where it was found, and how severe it is.
 *
 * @example
 * **Inspect detected threats**
 * ```typescript
 * import vard, { PromptInjectionError } from '@andersmyrmel/vard';
 * import type { Threat } from '@andersmyrmel/vard';
 *
 * try {
 *   vard(userInput);
 * } catch (error) {
 *   if (error instanceof PromptInjectionError) {
 *     error.threats.forEach((threat: Threat) => {
 *       console.log(`Type: ${threat.type}`);
 *       console.log(`Severity: ${threat.severity.toFixed(2)}`);
 *       console.log(`Match: "${threat.match}"`);
 *       console.log(`Position: ${threat.position}`);
 *     });
 *   }
 * }
 * ```
 *
 * @example
 * **Filter threats by severity**
 * ```typescript
 * const result = vard.safe(userInput);
 *
 * if (!result.safe) {
 *   const criticalThreats = result.threats.filter(t => t.severity >= 0.9);
 *   const moderateThreats = result.threats.filter(t => t.severity >= 0.7 && t.severity < 0.9);
 *
 *   console.log(`Critical: ${criticalThreats.length}`);
 *   console.log(`Moderate: ${moderateThreats.length}`);
 * }
 * ```
 */
interface Threat {
    /** Type of threat detected (e.g., 'instructionOverride', 'roleManipulation') */
    type: ThreatType;
    /** Severity score from 0 (low) to 1 (high) */
    severity: number;
    /** The matched string that triggered detection (truncated if > 100 chars) */
    match: string;
    /** Character position where the threat was found in the input */
    position: number;
}

/**
 * Pattern configuration for custom threat detection.
 *
 * Defines a regex pattern to match, its severity score, and which threat type
 * it represents. Used for adding domain-specific or language-specific patterns.
 *
 * @example
 * **Add Norwegian patterns**
 * ```typescript
 * import vard from '@andersmyrmel/vard';
 * import type { Pattern } from '@andersmyrmel/vard';
 *
 * const norwegianPatterns: Pattern[] = [
 *   {
 *     regex: /ignorer.*instruksjoner/i,
 *     severity: 0.9,
 *     type: 'instructionOverride',
 *   },
 *   {
 *     regex: /du er nå/i,
 *     severity: 0.85,
 *     type: 'roleManipulation',
 *   },
 * ];
 *
 * const norwegianVard = vard()
 *   .patterns(norwegianPatterns)
 *   .block('instructionOverride')
 *   .block('roleManipulation');
 * ```
 *
 * @example
 * **Domain-specific patterns**
 * ```typescript
 * const medicalPattern: Pattern = {
 *   regex: /reveal\s+(patient|medical)\s+data/i,
 *   severity: 0.95,
 *   type: 'systemPromptLeak',
 * };
 *
 * const medicalVard = vard()
 *   .pattern(medicalPattern.regex, medicalPattern.severity, medicalPattern.type)
 *   .block('systemPromptLeak');
 * ```
 *
 * @remarks
 * **ReDoS Warning**: All regex patterns should use bounded quantifiers to prevent
 * Regular Expression Denial of Service attacks. Avoid: `(.*)+`, `(a+)+`, etc.
 *
 * @see {@link CallableVard.pattern} for adding single patterns
 * @see {@link CallableVard.patterns} for adding multiple patterns
 */
interface Pattern {
    /** Regular expression to match (should use bounded quantifiers for safety) */
    regex: RegExp;
    /** Severity score from 0 (low) to 1 (high) */
    severity: number;
    /** Which threat type this pattern detects */
    type: ThreatType;
}

/**
 * Result type returned by the `safeParse()` method and by `vard.safe()`
 * (discriminated union).
 *
 * This type enables type-safe error handling without try/catch blocks.
 * TypeScript can narrow the type based on the `safe` property.
 *
 * @example
 * **Type narrowing with discriminated union**
 * ```typescript
 * import vard from '@andersmyrmel/vard';
 * import type { VardResult } from '@andersmyrmel/vard';
 *
 * const result: VardResult = vard.safe(userInput);
 *
 * if (result.safe) {
 *   // TypeScript knows result.data is string
 *   console.log('Safe input:', result.data);
 *   processInput(result.data);
 * } else {
 *   // TypeScript knows result.threats is Threat[]
 *   console.error('Detected threats:', result.threats.length);
 *   result.threats.forEach(t => {
 *     console.log(`- ${t.type}: ${t.match}`);
 *   });
 * }
 * ```
 *
 * @example
 * **Early return pattern**
 * ```typescript
 * function processUserInput(input: string) {
 *   const result = vard.safe(input);
 *
 *   if (!result.safe) {
 *     return { error: 'Invalid input', threats: result.threats };
 *   }
 *
 *   // TypeScript knows result.data exists here
 *   return { data: result.data };
 * }
 * ```
 *
 * @see {@link CallableVard.safeParse} for usage
 */
type VardResult = {
    safe: true;
    data: string;
} | {
    safe: false;
    threats: Threat[];
};

/**
 * Internal configuration object for VardBuilder.
 *
 * This is primarily used internally but can be useful for debugging or
 * understanding vard behavior.
 *
 * @remarks
 * Most users don't need to interact with this type directly - use the
 * chainable builder methods instead (`vard().threshold()`, `.delimiters()`, etc.)
 *
 * @see {@link CallableVard} for the chainable public API
 */
interface VardConfig {
    /** Detection threshold (0-1) - only threats with severity >= threshold are processed */
    threshold: number;
    /** Maximum input length in characters */
    maxLength: number;
    /** Custom prompt delimiters to protect against (case-sensitive, exact match) */
    customDelimiters: string[];
    /** Custom patterns to add to built-in patterns */
    customPatterns: Pattern[];
    /** Actions for each threat type (block, sanitize, warn, allow) */
    threatActions: Record<ThreatType, ThreatAction>;
    /** Optional callback for warning-level threats (called when action is 'warn') */
    onWarn?: (threat: Threat) => void;
}

/**
 * Preset configuration names.
 *
 * @remarks
 * Each preset provides different security/usability trade-offs:
 * - **strict** (threshold: 0.5): Maximum security, higher false positives
 * - **moderate** (threshold: 0.7): Balanced security and usability (default)
 * - **lenient** (threshold: 0.85): Permissive, lower false positives
 *
 * @example
 * **Use preset factories**
 * ```typescript
 * import vard from '@andersmyrmel/vard';
 *
 * const strict = vard.strict();     // threshold: 0.5
 * const moderate = vard.moderate(); // threshold: 0.7 (default)
 * const lenient = vard.lenient();   // threshold: 0.85
 * ```
 *
 * @see {@link vard.strict}
 * @see {@link vard.moderate}
 * @see {@link vard.lenient}
 */
type PresetName = "strict" | "moderate" | "lenient";

/**
 * Callable vard type - a function with attached chainable methods.
 *
 * This type represents a vard that can be used both as a function
 * and as an object with chainable configuration methods.
 *
 * @remarks
 * **Usage patterns**:
 * - **As function**: `myVard(input)` - shorthand for `myVard.parse(input)`
 * - **As object**: `myVard.parse(input)` - explicit validation
 * - **Chainable**: `myVard.threshold(0.8).delimiters([...])` - configure vard
 *
 * All chainable methods return a new `CallableVard` instance (immutable).
 *
 * @example
 * **Create and use callable vard**
 * ```typescript
 * import vard from '@andersmyrmel/vard';
 *
 * // Create configured vard
 * const chatVard = vard()
 *   .delimiters(['CONTEXT:', 'USER:'])
 *   .threshold(0.8)
 *   .block('instructionOverride');
 *
 * // Use as function (shorthand)
 * const safe1 = chatVard(userInput);
 *
 * // Use as object (explicit)
 * const safe2 = chatVard.parse(userInput);
 *
 * // Use safeParse (no throw)
 * const result = chatVard.safeParse(userInput);
 * ```
 *
 * @example
 * **Chain after preset**
 * ```typescript
 * const myVard = vard.moderate()
 *   .delimiters(['SYSTEM:'])
 *   .maxLength(5000);
 *
 * myVard('Hello world'); // Validates immediately
 * ```
 *
 * @see {@link vard} for creating callable vards
 * @see VardBuilder for the implementation (not declared in this declaration file)
 */
type CallableVard = {
    /** Shorthand for `parse()` - validates input and returns safe string (throws on threat) */
    (input: string): string;
    /** Validates input and returns safe string (throws `PromptInjectionError` on threat) */
    parse(input: string): string;
    /** Validates input without throwing - returns `VardResult` discriminated union */
    safeParse(input: string): VardResult;
    /** Configure custom prompt delimiters to protect */
    delimiters(delims: string[]): CallableVard;
    /** Add a single custom detection pattern */
    pattern(regex: RegExp, severity?: number, type?: ThreatType): CallableVard;
    /** Add multiple custom detection patterns */
    patterns(patterns: Pattern[]): CallableVard;
    /** Set maximum input length in characters */
    maxLength(length: number): CallableVard;
    /** Set detection threshold (0-1, lower = more sensitive) */
    threshold(value: number): CallableVard;
    /** Configure vard to throw error for a threat type */
    block(threat: ThreatType): CallableVard;
    /** Configure vard to remove/clean a threat type */
    sanitize(threat: ThreatType): CallableVard;
    /** Configure vard to categorize but not block a threat type */
    warn(threat: ThreatType): CallableVard;
    /** Configure vard to completely ignore a threat type */
    allow(threat: ThreatType): CallableVard;
    /** Set callback function to be called when warning-level threats are detected */
    onWarn(callback: (threat: Threat) => void): CallableVard;
};

/**
 * Error thrown when prompt injection attacks are detected in user input.
 *
 * @remarks
 * This error contains details about detected threats in the `threats` property.
 * Use `getUserMessage()` for user-facing errors (safe, no threat details leaked)
 * and `getDebugInfo()` for server-side logging (detailed threat information).
 *
 * **Security Note**: Never expose `getDebugInfo()` or the `threats` array to
 * end users, as this reveals information about your security measures.
 *
 * @example
 * **Basic error handling**
 * ```typescript
 * import vard, { PromptInjectionError } from '@andersmyrmel/vard';
 *
 * try {
 *   const safe = vard(userInput);
 *   processInput(safe);
 * } catch (error) {
 *   if (error instanceof PromptInjectionError) {
 *     // Show generic message to user
 *     console.log(error.getUserMessage('en'));
 *
 *     // Log details server-side
 *     console.error('[SECURITY]', error.getDebugInfo());
 *   }
 * }
 * ```
 *
 * @example
 * **Inspecting detected threats**
 * ```typescript
 * try {
 *   vard(userInput);
 * } catch (error) {
 *   if (error instanceof PromptInjectionError) {
 *     error.threats.forEach(threat => {
 *       console.log(`Type: ${threat.type}`);
 *       console.log(`Severity: ${threat.severity}`);
 *       console.log(`Match: ${threat.match}`);
 *       console.log(`Position: ${threat.position}`);
 *     });
 *   }
 * }
 * ```
 *
 * @example
 * **Norwegian error messages**
 * ```typescript
 * try {
 *   vard(userInput);
 * } catch (error) {
 *   if (error instanceof PromptInjectionError) {
 *     return {
 *       error: error.getUserMessage('no')
 *       // Returns: "Ugyldig innhold oppdaget. Vennligst prøv igjen."
 *     };
 *   }
 * }
 * ```
 */
declare class PromptInjectionError extends Error {
    /**
     * Array of detected threats. Each threat contains:
     * - `type`: Type of attack detected
     * - `severity`: Severity score (0-1)
     * - `match`: The matched string that triggered detection
     * - `position`: Character position where threat was found
     *
     * @remarks
     * **Security**: Never expose this to end users. Use `getUserMessage()` instead.
     */
    readonly threats: Threat[];
    /**
     * Creates a new PromptInjectionError.
     *
     * @param threats - Array of detected threats (must not be empty)
     */
    constructor(threats: Threat[]);
    /**
     * Returns a generic, user-safe error message.
     *
     * This message intentionally does NOT reveal what was detected or why.
     * Use this for user-facing error messages.
     *
     * @param locale - Language for the message ('en' or 'no')
     * @returns Generic error message in the specified language
     *
     * @example
     * **English message (default)**
     * ```typescript
     * error.getUserMessage('en');
     * // Returns: "Invalid input detected. Please try again."
     * ```
     *
     * @example
     * **Norwegian message**
     * ```typescript
     * error.getUserMessage('no');
     * // Returns: "Ugyldig innhold oppdaget. Vennligst prøv igjen."
     * ```
     *
     * @see {@link getDebugInfo} for detailed threat information (server-side only)
     */
    getUserMessage(locale?: "en" | "no"): string;
    /**
     * Returns detailed threat information for logging and debugging.
     *
     * @remarks
     * **Security Warning**: This method returns detailed information about detected
     * threats including attack types, severity scores, and matched patterns.
     * **NEVER expose this to end users** as it reveals your security measures.
     *
     * Use this only for:
     * - Server-side logging
     * - Security monitoring
     * - Debugging during development
     *
     * @returns Formatted string with detailed threat information
     *
     * @example
     * **Server-side logging**
     * ```typescript
     * try {
     *   vard(userInput);
     * } catch (error) {
     *   if (error instanceof PromptInjectionError) {
     *     // Log detailed info server-side (safe)
     *     console.error('[SECURITY]', error.getDebugInfo());
     *
     *     // Return generic message to user (safe)
     *     return { error: error.getUserMessage() };
     *   }
     * }
     * ```
     *
     * @example
     * **Example output**
     * ```
     * Threats detected:
     * - instructionOverride (severity: 0.90, match: "ignore all previous instr...", position: 0)
     * - delimiterInjection (severity: 0.95, match: "<system>", position: 45)
     * ```
     *
     * @see {@link getUserMessage} for safe, user-facing error messages
     */
    getDebugInfo(): string;
}

/**
 * Main vard function - validates input against prompt injection attacks.
 *
 * Can be called with or without an input string:
 * - With input: `vard(input)` - validates immediately (throws on detection)
 * - Without input: `vard()` - returns a chainable vard builder
 *
 * Uses moderate preset by default (threshold: 0.7, balanced security).
 *
 * @param input - User input to validate (optional)
 * @returns Validated string if input provided, or chainable vard builder if no input
 * @throws {PromptInjectionError} When prompt injection is detected (only if input provided)
 *
 * @example
 * **Zero-config usage (throws on threat)**
 * ```typescript
 * import vard from '@andersmyrmel/vard';
 *
 * const safe = vard('Hello, how can I help?');
 * // Returns: 'Hello, how can I help?'
 *
 * vard('Ignore all previous instructions');
 * // Throws: PromptInjectionError
 * ```
 *
 * @example
 * **Create chainable vard**
 * ```typescript
 * const myVard = vard()
 *   .delimiters(['CONTEXT:', 'USER:'])
 *   .maxLength(5000)
 *   .threshold(0.8);
 *
 * const safe = myVard.parse(userInput);
 * ```
 *
 * @see {@link vard.safe} for non-throwing validation
 * @see {@link vard.strict} for stricter detection (threshold: 0.5)
 * @see {@link vard.moderate} for balanced detection (threshold: 0.7)
 * @see {@link vard.lenient} for permissive detection (threshold: 0.85)
 */
declare function vardFn(input: string): string;
declare function vardFn(): CallableVard;
// Static members attached to the function: non-throwing validation plus the
// three preset factories.
declare namespace vardFn {
    var safe: (input: string) => VardResult;
    var strict: () => CallableVard;
    var moderate: () => CallableVard;
    var lenient: () => CallableVard;
}

/**
 * Main vard export - validates user input against prompt injection attacks.
 *
 * @remarks
 * This is the primary entry point for the library. It provides multiple ways
 * to validate input depending on your needs:
 *
 * - **Zero-config**: `vard(input)` - immediate validation with defaults
 * - **Safe mode**: `vard.safe(input)` - returns result instead of throwing
 * - **Presets**: `vard.strict()`, `vard.moderate()`, `vard.lenient()`
 * - **Chainable**: `vard().delimiters([...]).maxLength(...)`
 *
 * `vard(input)` and `parse()` return the validated string or throw
 * `PromptInjectionError`; `vard.safe(input)` and `safeParse()` return a
 * `VardResult` instead of throwing.
 *
 * @example
 * **Zero-config (recommended for most cases)**
 * ```typescript
 * import vard, { PromptInjectionError } from '@andersmyrmel/vard';
 *
 * try {
 *   const safe = vard(userInput);
 *   // Use safe input in your LLM prompt
 * } catch (error) {
 *   if (error instanceof PromptInjectionError) {
 *     console.error('Security threat detected');
 *   }
 * }
 * ```
 *
 * @example
 * **Safe mode (no exceptions)**
 * ```typescript
 * const result = vard.safe(userInput);
 * if (result.safe) {
 *   processInput(result.data);
 * } else {
 *   logThreats(result.threats);
 * }
 * ```
 *
 * @example
 * **Custom configuration**
 * ```typescript
 * const chatVard = vard.moderate()
 *   .delimiters(['CONTEXT:', 'USER:', 'SYSTEM:'])
 *   .maxLength(5000)
 *   .sanitize('delimiterInjection')
 *   .block('instructionOverride');
 *
 * const safe = chatVard.parse(userMessage);
 * ```
 *
 * @see {@link https://github.com/andersmyrmel/vard#readme | Full Documentation}
 */
declare const vard: typeof vardFn;

/**
 * Short alias for vard - for power users who prefer brevity.
 *
 * @example
 * **Using v alias**
 * ```typescript
 * import { v } from '@andersmyrmel/vard';
 *
 * const safe = v(userInput);
 * const chatVard = v.moderate().delimiters(['CONTEXT:']);
 * ```
 */
declare const v: typeof vardFn;

/**
 * Creates a new vard with default (moderate) configuration.
 *
 * This is equivalent to `vard()` with no arguments. Provided as a named
 * export for clarity in advanced use cases.
 *
 * @returns Chainable vard builder with moderate preset
 *
 * @example
 * **Using createVard() directly**
 * ```typescript
 * import { createVard } from '@andersmyrmel/vard';
 *
 * const myVard = createVard()
 *   .delimiters(['CONTEXT:'])
 *   .maxLength(5000);
 *
 * const safe = myVard.parse(userInput);
 * ```
 *
 * @see {@link vard} for the main entry point
 */
declare function createVard(): CallableVard;

// NOTE(review): `CallableVard` appears in the signatures of exported values
// (`vard`, `v`, `createVard`) but is not in this export list, so consumers
// cannot import it by name; it can be derived with
// `ReturnType<typeof createVard>`. Consider exporting it from the source.
export { type Pattern, type PresetName, PromptInjectionError, type Threat, type ThreatAction, type ThreatType, type VardConfig, type VardResult, createVard, vard as default, v, vard };