
@dooor-ai/toolkit

Guards, Evals & Observability for AI applications - works seamlessly with LangChain/LangGraph

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.ToxicityGuard = void 0; const base_1 = require("./base"); const cortexdb_client_1 = require("../observability/cortexdb-client"); /** * Guard that detects toxic content using AI (Gemini via CortexDB proxy) * * @example * ```typescript * // Provider from toolkitConfig (recommended) * const guard = new ToxicityGuard({ * threshold: 0.7, * categories: ["hate", "violence", "harassment"] * }); * * // Or specify provider explicitly * const guard = new ToxicityGuard({ * threshold: 0.7, * providerName: "gemini", // Override toolkitConfig.providerName * }); * ``` */ class ToxicityGuard extends base_1.Guard { constructor(config = {}) { super({ threshold: 0.7, blockOnDetection: true, ...config, }); this.providerName = config.providerName; this.categories = config.categories || ["hate", "violence", "sexual", "harassment"]; } get name() { return "ToxicityGuard"; } async validate(input, metadata) { try { const client = (0, cortexdb_client_1.getCortexDBClient)(); // Use explicit providerName or fallback to global from toolkitConfig const providerName = this.providerName || (0, cortexdb_client_1.getGlobalProviderName)() || undefined; const prompt = this.buildToxicityPrompt(input); const response = await client.invokeAI({ prompt, usage: "guard", maxTokens: 200, temperature: 0, providerName, }); const result = this.parseToxicityResponse(response.text); const passed = result.score < this.getThreshold(); return { passed, reason: passed ? undefined : `Toxic content detected: ${result.categories.join(", ")}`, severity: result.score > 0.9 ? "critical" : result.score > 0.7 ? "high" : "medium", suggestedAction: this.shouldBlock() ? "block" : "warn", confidence: result.score, metadata: { categories: result.categories, details: result.details, provider: this.providerName, }, }; } catch (error) { console.error("ToxicityGuard failed:", error.message); // Fallback: if AI call fails, don't block (fail open) return { passed: true, reason: undefined, confidence: 0, metadata: { error: error.message, fallback: true, }, }; } } /** * Build prompt for toxicity detection */ buildToxicityPrompt(input) { return `You are a content moderation AI. Analyze the following text for toxic content. Categories to check: ${this.categories.join(", ")} Text to analyze: """ ${input} """ Provide your analysis in this exact format: TOXICITY_SCORE: [0.0-1.0] CATEGORIES: [comma-separated list of detected categories, or "none"] DETAILS: [brief explanation] Examples: TOXICITY_SCORE: 0.9 CATEGORIES: hate, harassment DETAILS: Contains hate speech targeting a group TOXICITY_SCORE: 0.1 CATEGORIES: none DETAILS: Text is neutral and non-toxic Now analyze the text:`; } /** * Parse AI response for toxicity score and categories */ parseToxicityResponse(text) { const scoreMatch = text.match(/TOXICITY_SCORE:\s*([\d.]+)/i); const categoriesMatch = text.match(/CATEGORIES:\s*(.+?)(?:\n|$)/i); const detailsMatch = text.match(/DETAILS:\s*(.+?)(?:\n|$)/i); const score = scoreMatch ? parseFloat(scoreMatch[1]) : 0.5; const categoriesText = categoriesMatch ? categoriesMatch[1].trim() : "none"; const categories = categoriesText.toLowerCase() === "none" ? [] : categoriesText.split(",").map(c => c.trim()); const details = detailsMatch ? detailsMatch[1].trim() : "No details provided"; return { score: Math.min(1, Math.max(0, score)), // Clamp to 0-1 categories, details, }; } } exports.ToxicityGuard = ToxicityGuard;