@dooor-ai/toolkit
Guards, Evals & Observability for AI applications - works seamlessly with LangChain/LangGraph
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.ToxicityGuard = void 0;
const base_1 = require("./base");
const cortexdb_client_1 = require("../observability/cortexdb-client");
/**
 * Guard that detects toxic content using AI (Gemini via CortexDB proxy)
 *
 * @example
 * ```typescript
 * // Provider from toolkitConfig (recommended)
 * const guard = new ToxicityGuard({
 *   threshold: 0.7,
 *   categories: ["hate", "violence", "harassment"]
 * });
 *
 * // Or specify provider explicitly
 * const guard = new ToxicityGuard({
 *   threshold: 0.7,
 *   providerName: "gemini", // Override toolkitConfig.providerName
 * });
 * ```
 */
class ToxicityGuard extends base_1.Guard {
    constructor(config = {}) {
        super({
            threshold: 0.7,
            blockOnDetection: true,
            ...config,
        });
        this.providerName = config.providerName;
        this.categories = config.categories || ["hate", "violence", "sexual", "harassment"];
    }
    get name() {
        return "ToxicityGuard";
    }
    async validate(input, metadata) {
        try {
            const client = (0, cortexdb_client_1.getCortexDBClient)();
            // Use the explicit providerName, or fall back to the global provider from toolkitConfig
            const providerName = this.providerName || (0, cortexdb_client_1.getGlobalProviderName)() || undefined;
            const prompt = this.buildToxicityPrompt(input);
            const response = await client.invokeAI({
                prompt,
                usage: "guard",
                maxTokens: 200,
                temperature: 0,
                providerName,
            });
            const result = this.parseToxicityResponse(response.text);
            const passed = result.score < this.getThreshold();
            return {
                passed,
                reason: passed ? undefined : `Toxic content detected: ${result.categories.join(", ")}`,
                severity: result.score > 0.9 ? "critical" : result.score > 0.7 ? "high" : "medium",
                suggestedAction: this.shouldBlock() ? "block" : "warn",
                confidence: result.score,
                metadata: {
                    categories: result.categories,
                    details: result.details,
                    provider: this.providerName,
                },
            };
        }
        catch (error) {
            console.error("ToxicityGuard failed:", error.message);
            // Fallback: if the AI call fails, don't block (fail open)
            return {
                passed: true,
                reason: undefined,
                confidence: 0,
                metadata: {
                    error: error.message,
                    fallback: true,
                },
            };
        }
    }
    /**
     * Build prompt for toxicity detection
     */
    buildToxicityPrompt(input) {
        return `You are a content moderation AI. Analyze the following text for toxic content.
Categories to check: ${this.categories.join(", ")}
Text to analyze:
"""
${input}
"""
Provide your analysis in this exact format:
TOXICITY_SCORE: [0.0-1.0]
CATEGORIES: [comma-separated list of detected categories, or "none"]
DETAILS: [brief explanation]
Examples:
TOXICITY_SCORE: 0.9
CATEGORIES: hate, harassment
DETAILS: Contains hate speech targeting a group
TOXICITY_SCORE: 0.1
CATEGORIES: none
DETAILS: Text is neutral and non-toxic
Now analyze the text:`;
    }
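    // Illustrative example (values assumed, not from the source): given a model reply like
    //   TOXICITY_SCORE: 0.85
    //   CATEGORIES: harassment
    //   DETAILS: Directly insults the recipient
    // parseToxicityResponse() below returns
    //   { score: 0.85, categories: ["harassment"], details: "Directly insults the recipient" }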
    /**
     * Parse AI response for toxicity score and categories
     */
    parseToxicityResponse(text) {
        const scoreMatch = text.match(/TOXICITY_SCORE:\s*([\d.]+)/i);
        const categoriesMatch = text.match(/CATEGORIES:\s*(.+?)(?:\n|$)/i);
        const detailsMatch = text.match(/DETAILS:\s*(.+?)(?:\n|$)/i);
        const score = scoreMatch ? parseFloat(scoreMatch[1]) : 0.5;
        const categoriesText = categoriesMatch ? categoriesMatch[1].trim() : "none";
        const categories = categoriesText.toLowerCase() === "none"
            ? []
            : categoriesText.split(",").map(c => c.trim());
        const details = detailsMatch ? detailsMatch[1].trim() : "No details provided";
        return {
            score: Math.min(1, Math.max(0, score)), // Clamp to 0-1
            categories,
            details,
        };
    }
}
exports.ToxicityGuard = ToxicityGuard;
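For orientation, here is a minimal usage sketch. It assumes the guard is re-exported from the package root and that the toolkit's CortexDB client and provider have already been configured elsewhere in the application; the import path and the wrapper function are illustrative, not taken from this file.

```typescript
import { ToxicityGuard } from "@dooor-ai/toolkit"; // import path assumed

const guard = new ToxicityGuard({
  threshold: 0.7,
  categories: ["hate", "violence", "harassment"],
});

async function screenInput(text: string): Promise<boolean> {
  // validate() calls the configured AI provider; if that call fails,
  // the guard fails open (passed: true, confidence: 0).
  const result = await guard.validate(text);
  if (!result.passed) {
    console.warn(result.reason, result.metadata?.categories);
    return result.suggestedAction !== "block";
  }
  return true;
}
```

Note that the fail-open behavior on provider errors comes from the guard itself (see the catch block in validate), so callers that need fail-closed semantics have to add their own handling around it.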