universal-ai-brain
Version:
🧠UNIVERSAL AI BRAIN 3.3 - The world's most advanced cognitive architecture with 24 specialized systems, MongoDB 8.1 $rankFusion hybrid search, latest Voyage 3.5 embeddings, and framework-agnostic design. Works with Mastra, Vercel AI, LangChain, OpenAI A
628 lines (568 loc) • 19.3 kB
text/typescript
/**
* @file SafetyGuardrailsEngine - Comprehensive safety and content filtering system
*
* This engine provides multi-layered safety guardrails for the Universal AI Brain,
* including content filtering, prompt injection detection, output validation,
* and compliance monitoring using MongoDB for safety analytics and logging.
*
* Features:
* - Multi-layered content filtering (input/output)
* - Prompt injection attack detection
* - Harmful content classification
* - Compliance monitoring and reporting
* - Real-time safety analytics with MongoDB
* - Framework-agnostic safety enforcement
* - Configurable safety policies
*/
import { TracingCollection } from '../collections/TracingCollection';
import { MemoryCollection } from '../collections/MemoryCollection';
export interface SafetyPolicy {
id: string;
name: string;
description: string;
enabled: boolean;
severity: 'low' | 'medium' | 'high' | 'critical';
rules: SafetyRule[];
frameworks: string[]; // Which frameworks this applies to
createdAt: Date;
updatedAt: Date;
}
export interface SafetyRule {
id: string;
type: 'content_filter' | 'prompt_injection' | 'output_validation' | 'rate_limit' | 'compliance';
pattern: string | RegExp;
action: 'block' | 'warn' | 'log' | 'modify';
threshold?: number;
description: string;
enabled: boolean;
}
export interface SafetyViolation {
id: string;
timestamp: Date;
traceId?: string;
sessionId: string;
userId?: string;
violationType: 'harmful_content' | 'prompt_injection' | 'policy_violation' | 'rate_limit' | 'compliance';
severity: 'low' | 'medium' | 'high' | 'critical';
content: {
input?: string;
output?: string;
context?: string;
};
policyId: string;
ruleId: string;
action: 'blocked' | 'warned' | 'logged' | 'modified';
framework: string;
metadata: Record<string, any>;
}
export interface SafetyAnalytics {
timeRange: {
start: Date;
end: Date;
};
totalViolations: number;
violationsByType: Record<string, number>;
violationsBySeverity: Record<string, number>;
violationsByFramework: Record<string, number>;
topViolatedPolicies: {
policyId: string;
policyName: string;
violationCount: number;
}[];
safetyTrends: {
date: Date;
violationCount: number;
blockedCount: number;
}[];
complianceScore: number; // 0-100
}
export interface ContentAnalysisResult {
isSafe: boolean;
confidence: number;
violations: {
type: string;
severity: string;
description: string;
confidence: number;
}[];
suggestedAction: 'allow' | 'block' | 'modify' | 'review';
modifiedContent?: string;
}
/**
* SafetyGuardrailsEngine - Comprehensive safety and content filtering
*
* Provides multi-layered safety protection for the Universal AI Brain
* with real-time monitoring and analytics using MongoDB.
*/
export class SafetyGuardrailsEngine {
private tracingCollection: TracingCollection;
private memoryCollection: MemoryCollection;
private policies: Map<string, SafetyPolicy> = new Map();
private violationCache: Map<string, SafetyViolation[]> = new Map();
constructor(tracingCollection: TracingCollection, memoryCollection: MemoryCollection) {
this.tracingCollection = tracingCollection;
this.memoryCollection = memoryCollection;
this.initializeDefaultPolicies();
}
/**
* Analyze input content for safety violations
*/
async analyzeInputSafety(
input: string,
context: {
sessionId: string;
userId?: string;
framework: string;
traceId?: string;
}
): Promise<ContentAnalysisResult> {
const violations: any[] = [];
let confidence = 1.0;
// Check all enabled policies
for (const policy of this.policies.values()) {
if (!policy.enabled || !policy.frameworks.includes(context.framework)) {
continue;
}
for (const rule of policy.rules) {
if (!rule.enabled) continue;
const violation = await this.checkRule(input, rule, 'input');
if (violation) {
violations.push({
type: violation.violationType,
severity: violation.severity,
description: `Policy: ${policy.name}, Rule: ${rule.description}`,
confidence: violation.metadata.confidence || 0.9
});
// Log violation
await this.logViolation({
...violation,
sessionId: context.sessionId,
userId: context.userId,
framework: context.framework,
traceId: context.traceId,
content: { input }
});
}
}
}
// Determine overall safety
const criticalViolations = violations.filter(v => v.severity === 'critical');
const highViolations = violations.filter(v => v.severity === 'high');
const isSafe = criticalViolations.length === 0 && highViolations.length === 0;
const suggestedAction = this.determineSuggestedAction(violations);
return {
isSafe,
confidence: violations.length > 0 ? Math.min(...violations.map(v => v.confidence)) : 1.0,
violations,
suggestedAction,
modifiedContent: suggestedAction === 'modify' ? this.sanitizeContent(input) : undefined
};
}
/**
* Analyze output content for safety violations
*/
async analyzeOutputSafety(
output: string,
context: {
sessionId: string;
userId?: string;
framework: string;
traceId?: string;
originalInput?: string;
}
): Promise<ContentAnalysisResult> {
const violations: any[] = [];
// Check for harmful content in output
const harmfulContentCheck = await this.detectHarmfulContent(output);
if (!harmfulContentCheck.isSafe) {
violations.push(...harmfulContentCheck.violations);
}
// Check for data leakage
const dataLeakageCheck = await this.detectDataLeakage(output, context.originalInput);
if (!dataLeakageCheck.isSafe) {
violations.push(...dataLeakageCheck.violations);
}
// Check compliance requirements
const complianceCheck = await this.checkCompliance(output, context.framework);
if (!complianceCheck.isSafe) {
violations.push(...complianceCheck.violations);
}
// Log violations
for (const violation of violations) {
await this.logViolation({
id: `violation_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`,
timestamp: new Date(),
sessionId: context.sessionId,
userId: context.userId,
framework: context.framework,
traceId: context.traceId,
violationType: violation.type,
severity: violation.severity,
content: { output, input: context.originalInput },
policyId: 'output_safety',
ruleId: violation.ruleId || 'general',
action: 'logged',
metadata: { confidence: violation.confidence }
});
}
const isSafe = violations.filter(v => v.severity === 'critical' || v.severity === 'high').length === 0;
const suggestedAction = this.determineSuggestedAction(violations);
return {
isSafe,
confidence: violations.length > 0 ? Math.min(...violations.map(v => v.confidence)) : 1.0,
violations,
suggestedAction,
modifiedContent: suggestedAction === 'modify' ? this.sanitizeContent(output) : undefined
};
}
/**
* Generate safety analytics using MongoDB aggregation
*/
async generateSafetyAnalytics(timeRange: { start: Date; end: Date }): Promise<SafetyAnalytics> {
// Use MongoDB aggregation for comprehensive safety analytics
const analyticsPipeline = [
{
$match: {
timestamp: { $gte: timeRange.start, $lte: timeRange.end },
'metadata.type': 'safety_violation'
}
},
{
$facet: {
// Total violations
totalCount: [
{ $count: 'total' }
],
// Violations by type
byType: [
{
$group: {
_id: '$metadata.violationType',
count: { $sum: 1 }
}
}
],
// Violations by severity
bySeverity: [
{
$group: {
_id: '$metadata.severity',
count: { $sum: 1 }
}
}
],
// Violations by framework
byFramework: [
{
$group: {
_id: '$metadata.framework',
count: { $sum: 1 }
}
}
],
// Top violated policies
topPolicies: [
{
$group: {
_id: '$metadata.policyId',
count: { $sum: 1 },
policyName: { $first: '$metadata.policyName' }
}
},
{ $sort: { count: -1 } },
{ $limit: 10 }
],
// Daily trends
dailyTrends: [
{
$group: {
_id: {
$dateToString: {
format: '%Y-%m-%d',
date: '$timestamp'
}
},
violationCount: { $sum: 1 },
blockedCount: {
$sum: {
$cond: [{ $eq: ['$metadata.action', 'blocked'] }, 1, 0]
}
}
}
},
{ $sort: { '_id': 1 } }
]
}
}
];
const results = await this.memoryCollection.aggregate(analyticsPipeline);
const facetResults = results[0];
// Calculate compliance score
const totalViolations = facetResults.totalCount[0]?.total || 0;
const criticalViolations = facetResults.bySeverity.find((s: any) => s._id === 'critical')?.count || 0;
const complianceScore = Math.max(0, 100 - (criticalViolations * 10) - (totalViolations * 0.1));
return {
timeRange,
totalViolations,
violationsByType: this.arrayToRecord(facetResults.byType),
violationsBySeverity: this.arrayToRecord(facetResults.bySeverity),
violationsByFramework: this.arrayToRecord(facetResults.byFramework),
topViolatedPolicies: facetResults.topPolicies.map((p: any) => ({
policyId: p._id,
policyName: p.policyName || 'Unknown',
violationCount: p.count
})),
safetyTrends: facetResults.dailyTrends.map((t: any) => ({
date: new Date(t._id),
violationCount: t.violationCount,
blockedCount: t.blockedCount
})),
complianceScore: Math.round(complianceScore)
};
}
/**
* Add or update safety policy
*/
async updateSafetyPolicy(policy: SafetyPolicy): Promise<void> {
policy.updatedAt = new Date();
this.policies.set(policy.id, policy);
// Store in MongoDB for persistence
await this.memoryCollection.storeDocument(
JSON.stringify(policy),
{
type: 'safety_policy',
policyId: policy.id,
enabled: policy.enabled,
severity: policy.severity
}
);
}
/**
* Get safety violations for a session
*/
async getSessionViolations(sessionId: string): Promise<SafetyViolation[]> {
const pipeline = [
{
$match: {
'metadata.type': 'safety_violation',
'metadata.sessionId': sessionId
}
},
{ $sort: { timestamp: -1 } }
];
const results = await this.memoryCollection.aggregate(pipeline);
return results.map(this.parseViolationFromDocument);
}
// Private helper methods
private initializeDefaultPolicies(): void {
// Initialize with common safety policies
const defaultPolicies: SafetyPolicy[] = [
{
id: 'harmful_content',
name: 'Harmful Content Filter',
description: 'Prevents generation of harmful, toxic, or inappropriate content',
enabled: true,
severity: 'critical',
frameworks: ['vercel-ai', 'mastra', 'langchain', 'openai-agents'],
rules: [
{
id: 'violence',
type: 'content_filter',
pattern: /\b(kill|murder|violence|harm|hurt|attack)\b/i,
action: 'block',
description: 'Violence-related content',
enabled: true
},
{
id: 'hate_speech',
type: 'content_filter',
pattern: /\b(hate|racist|discrimination)\b/i,
action: 'block',
description: 'Hate speech detection',
enabled: true
}
],
createdAt: new Date(),
updatedAt: new Date()
},
{
id: 'prompt_injection',
name: 'Prompt Injection Protection',
description: 'Detects and prevents prompt injection attacks',
enabled: true,
severity: 'high',
frameworks: ['vercel-ai', 'mastra', 'langchain', 'openai-agents'],
rules: [
{
id: 'ignore_instructions',
type: 'prompt_injection',
pattern: /ignore\s+(previous|above|all)\s+instructions/i,
action: 'block',
description: 'Ignore instructions pattern',
enabled: true
},
{
id: 'system_override',
type: 'prompt_injection',
pattern: /system\s*:\s*|assistant\s*:\s*|user\s*:\s*/i,
action: 'warn',
description: 'System role override attempt',
enabled: true
}
],
createdAt: new Date(),
updatedAt: new Date()
}
];
defaultPolicies.forEach(policy => {
this.policies.set(policy.id, policy);
});
}
private async checkRule(content: string, rule: SafetyRule, contentType: 'input' | 'output'): Promise<SafetyViolation | null> {
let matches = false;
let confidence = 0.9;
if (typeof rule.pattern === 'string') {
matches = content.toLowerCase().includes(rule.pattern.toLowerCase());
} else if (rule.pattern instanceof RegExp) {
matches = rule.pattern.test(content);
}
if (matches) {
return {
id: `violation_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`,
timestamp: new Date(),
sessionId: '', // Will be filled by caller
violationType: this.mapRuleTypeToViolationType(rule.type),
severity: 'medium', // Will be overridden by policy severity
content: contentType === 'input' ? { input: content } : { output: content },
policyId: '', // Will be filled by caller
ruleId: rule.id,
action: rule.action === 'block' ? 'blocked' : 'logged',
framework: '', // Will be filled by caller
metadata: { confidence, ruleType: rule.type }
};
}
return null;
}
private async detectHarmfulContent(content: string): Promise<ContentAnalysisResult> {
// Implement harmful content detection logic
const violations: any[] = [];
// Simple keyword-based detection (would be enhanced with ML models)
const harmfulPatterns = [
{ pattern: /\b(suicide|self-harm|kill yourself)\b/i, severity: 'critical', type: 'self_harm' },
{ pattern: /\b(bomb|explosive|terrorism)\b/i, severity: 'critical', type: 'violence' },
{ pattern: /\b(hack|exploit|vulnerability)\b/i, severity: 'medium', type: 'security' }
];
for (const { pattern, severity, type } of harmfulPatterns) {
if (pattern.test(content)) {
violations.push({
type,
severity,
confidence: 0.8,
ruleId: `harmful_${type}`
});
}
}
return {
isSafe: violations.length === 0,
confidence: 0.8,
violations,
suggestedAction: violations.length > 0 ? 'block' : 'allow'
};
}
private async detectDataLeakage(output: string, input?: string): Promise<ContentAnalysisResult> {
// Detect potential data leakage in output
const violations: any[] = [];
// Check for common data patterns
const dataPatterns = [
{ pattern: /\b\d{3}-\d{2}-\d{4}\b/, type: 'ssn', severity: 'critical' },
{ pattern: /\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b/, type: 'credit_card', severity: 'critical' },
{ pattern: /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/, type: 'email', severity: 'medium' }
];
for (const { pattern, type, severity } of dataPatterns) {
if (pattern.test(output)) {
violations.push({
type: `data_leakage_${type}`,
severity,
confidence: 0.9,
ruleId: `leakage_${type}`
});
}
}
return {
isSafe: violations.length === 0,
confidence: 0.9,
violations,
suggestedAction: violations.length > 0 ? 'modify' : 'allow'
};
}
private async checkCompliance(content: string, framework: string): Promise<ContentAnalysisResult> {
// Check framework-specific compliance requirements
const violations: any[] = [];
// Example compliance checks
if (content.length > 10000) {
violations.push({
type: 'content_length',
severity: 'low',
confidence: 1.0,
ruleId: 'max_length'
});
}
return {
isSafe: violations.length === 0,
confidence: 1.0,
violations,
suggestedAction: violations.length > 0 ? 'modify' : 'allow'
};
}
private determineSuggestedAction(violations: any[]): 'allow' | 'block' | 'modify' | 'review' {
if (violations.some(v => v.severity === 'critical')) return 'block';
if (violations.some(v => v.severity === 'high')) return 'review';
if (violations.some(v => v.severity === 'medium')) return 'modify';
return 'allow';
}
private sanitizeContent(content: string): string {
// Basic content sanitization
return content
.replace(/\b(kill|murder|harm)\b/gi, '[REDACTED]')
.replace(/\b\d{3}-\d{2}-\d{4}\b/g, '[SSN-REDACTED]')
.replace(/\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b/g, '[CARD-REDACTED]');
}
private async logViolation(violation: SafetyViolation): Promise<void> {
await this.memoryCollection.storeDocument(
JSON.stringify(violation),
{
type: 'safety_violation',
violationType: violation.violationType,
severity: violation.severity,
sessionId: violation.sessionId,
framework: violation.framework,
policyId: violation.policyId,
action: violation.action
}
);
}
private mapRuleTypeToViolationType(ruleType: string): SafetyViolation['violationType'] {
const mapping: Record<string, SafetyViolation['violationType']> = {
'content_filter': 'harmful_content',
'prompt_injection': 'prompt_injection',
'output_validation': 'policy_violation',
'rate_limit': 'rate_limit',
'compliance': 'compliance'
};
return mapping[ruleType] || 'policy_violation';
}
private arrayToRecord(array: any[]): Record<string, number> {
const record: Record<string, number> = {};
array.forEach(item => {
record[item._id] = item.count;
});
return record;
}
private parseViolationFromDocument(doc: any): SafetyViolation {
const violation = JSON.parse(doc.content);
return {
...violation,
timestamp: new Date(violation.timestamp)
};
}
}