content-guard
Version:
🛡️ Advanced content analysis and moderation system with multi-variant optimization. Features context-aware detection, harassment prevention, and ML-powered toxicity analysis. Pre-1.0 development version.
700 lines (589 loc) • 23 kB
JavaScript
/**
* 🧠 TIER 3: Expensive Analysis (100-500ms)
*
* Deep ML analysis for sophisticated attacks using:
* - BERT/RoBERTa toxicity detection models
* - Adversarial attack detection algorithms
* - Semantic similarity analysis
* - Multi-modal analysis capabilities
* - Ensemble ML models with confidence scoring
*
* Goal: 95%+ accuracy on sophisticated attacks with user opt-in
*/
class ExpensiveAnalysisTier {
constructor(options = {}) {
this.options = options
// ML Model configurations
this.models = {
toxicityBert: null,
adversarialDetector: null,
semanticAnalyzer: null,
ensembleClassifier: null
}
// Model performance thresholds
this.thresholds = {
highConfidenceMin: 0.9,
moderateConfidenceMin: 0.7,
ensembleAgreementMin: 0.8,
adversarialSuspicionMin: 0.6
}
// Advanced analysis techniques
this.techniques = [
'bert_toxicity',
'adversarial_detection',
'semantic_analysis',
'contextual_embedding',
'ensemble_voting'
]
// Performance tracking
this.modelStats = {
totalInferences: 0,
avgInferenceTime: 0,
accuracy: 0,
confidenceDistribution: []
}
this.initializeModels()
}
async analyze(content, context = {}, priorResults = {}) {
const startTime = Date.now()
const result = {
score: 0,
flags: [],
confidence: 0,
modelsUsed: [],
metadata: {
processingTime: 0,
modelResults: {},
ensembleAnalysis: {},
adversarialAnalysis: {},
semanticAnalysis: {},
contextualAnalysis: {}
}
}
try {
// ═══════════════════════════════════════════════════════════════
// ADVANCED ML ANALYSIS PIPELINE
// ═══════════════════════════════════════════════════════════════
// 1. BERT-based Toxicity Detection
const bertResult = await this.runBertToxicityAnalysis(content, context)
result.metadata.modelResults.bert = bertResult
result.modelsUsed.push('toxic-bert')
// 2. Adversarial Attack Detection
const adversarialResult = await this.runAdversarialDetection(content, priorResults)
result.metadata.adversarialAnalysis = adversarialResult
result.modelsUsed.push('adversarial-detector')
// 3. Semantic Similarity Analysis
const semanticResult = await this.runSemanticAnalysis(content, context)
result.metadata.semanticAnalysis = semanticResult
result.modelsUsed.push('semantic-analyzer')
// 4. Advanced Context Analysis
const contextualResult = await this.runContextualEmbeddingAnalysis(content, context)
result.metadata.contextualAnalysis = contextualResult
result.modelsUsed.push('contextual-embeddings')
// ═══════════════════════════════════════════════════════════════
// ENSEMBLE ANALYSIS & FINAL SCORING
// ═══════════════════════════════════════════════════════════════
const ensembleResult = this.runEnsembleAnalysis({
bert: bertResult,
adversarial: adversarialResult,
semantic: semanticResult,
contextual: contextualResult
})
result.score = ensembleResult.finalScore
result.flags = ensembleResult.flags
result.confidence = ensembleResult.confidence
result.metadata.ensembleAnalysis = ensembleResult.analysis
// ═══════════════════════════════════════════════════════════════
// PERFORMANCE TRACKING
// ═══════════════════════════════════════════════════════════════
const processingTime = Date.now() - startTime
result.metadata.processingTime = processingTime
this.updateModelStats(processingTime, result.confidence)
return result
} catch (error) {
console.error('Expensive analysis error:', error.message)
return {
score: priorResults.tier2Result?.score || 0,
flags: [`[EXPENSIVE] Analysis failed: ${error.message}`],
confidence: 0.3, // Low confidence due to failure
modelsUsed: [],
metadata: {
error: error.message,
processingTime: Date.now() - startTime,
fallbackUsed: true
}
}
}
}
async runBertToxicityAnalysis(content, context) {
// Simulate BERT-based toxicity detection
// In production, this would use a real BERT model via HuggingFace or ONNX
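// A hedged sketch of what that call might look like (the model name and API are
// assumptions based on transformers.js conventions, not part of this package):
//
//   const { pipeline } = await import('@xenova/transformers')
//   const classify = await pipeline('text-classification', 'Xenova/toxic-bert')
//   const [{ label, score }] = await classify(text) // e.g. { label: 'toxic', score: 0.97 }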
const text = content.allText
let score = 0
let confidence = 0
const flags = []
try {
// Subtle toxicity patterns that simple keyword matching would miss
// (simulated here with hard-coded phrase regexes)
const sophisticatedToxicPatterns = [
// Subtle threats
/it would be a shame if something happened/i,
/people like you don't last long/i,
/i know where you live/i,
// Sophisticated harassment
/you should consider a different career/i,
/maybe this isn't for you/i,
/have you tried being less sensitive/i,
// Coded discrimination
/very articulate for someone like you/i,
/credit to your people/i,
/one of the good ones/i
]
// Toxicity score based on sophisticated patterns
for (const pattern of sophisticatedToxicPatterns) {
if (pattern.test(text)) {
score += 3
flags.push('[BERT] Sophisticated toxic pattern detected')
confidence = Math.max(confidence, 0.85)
}
}
// Semantic toxicity indicators (simulated transformer analysis)
const semanticToxicityScore = this.calculateSemanticToxicity(text)
score += semanticToxicityScore.score
confidence = Math.max(confidence, semanticToxicityScore.confidence)
if (semanticToxicityScore.score > 0) {
flags.push('[BERT] Semantic toxicity detected')
}
// Context-aware scoring adjustments
if (context.businessContext && score > 0) {
// Reduce score for business contexts
score *= 0.7
flags.push('[BERT] Business context adjustment applied')
}
return {
score: Math.min(score, 8),
confidence: Math.min(confidence, 0.95),
flags,
rawToxicityScore: score,
semanticFeatures: semanticToxicityScore.features
}
} catch (error) {
console.warn('BERT analysis error:', error.message)
return {
score: 0,
confidence: 0,
flags: ['[BERT] Analysis failed'],
error: error.message
}
}
}
async runAdversarialDetection(content, priorResults) {
// Advanced adversarial attack detection
const original = content.originalText
const normalized = content.allText
let adversarialScore = 0
let confidence = 0
const flags = []
const techniques = []
try {
// 1. Advanced Unicode obfuscation
const unicodeObfuscation = this.detectAdvancedUnicodeObfuscation(original, normalized)
if (unicodeObfuscation.detected) {
adversarialScore += unicodeObfuscation.severity
flags.push('[ADVERSARIAL] Advanced Unicode obfuscation')
techniques.push('unicode_obfuscation')
confidence = Math.max(confidence, 0.8)
}
// 2. Embedding space adversarial attacks
const embeddingAttacks = this.detectEmbeddingSpaceAttacks(content)
if (embeddingAttacks.detected) {
adversarialScore += embeddingAttacks.severity
flags.push('[ADVERSARIAL] Embedding space manipulation')
techniques.push('embedding_attack')
confidence = Math.max(confidence, 0.85)
}
// 3. Semantic preservation attacks
const semanticAttacks = this.detectSemanticPreservationAttacks(content, priorResults)
if (semanticAttacks.detected) {
adversarialScore += semanticAttacks.severity
flags.push('[ADVERSARIAL] Semantic preservation attack')
techniques.push('semantic_attack')
confidence = Math.max(confidence, 0.9)
}
// 4. Multi-modal evasion attempts
const multiModalEvasion = this.detectMultiModalEvasion(content)
if (multiModalEvasion.detected) {
adversarialScore += multiModalEvasion.severity
flags.push('[ADVERSARIAL] Multi-modal evasion attempt')
techniques.push('multimodal_evasion')
confidence = Math.max(confidence, 0.75)
}
return {
score: Math.min(adversarialScore, 6),
confidence,
flags,
techniques,
detected: adversarialScore > 0,
severity: adversarialScore
}
} catch (error) {
console.warn('Adversarial detection error:', error.message)
return {
score: 0,
confidence: 0,
flags: ['[ADVERSARIAL] Detection failed'],
detected: false,
error: error.message
}
}
}
async runSemanticAnalysis(content, context) {
// Advanced semantic similarity and intent analysis
const text = content.allText
let semanticScore = 0
let confidence = 0
const flags = []
try {
// 1. Intent classification
const intentResult = this.classifyIntent(text)
if (intentResult.maliciousIntent) {
semanticScore += intentResult.severity
flags.push('[SEMANTIC] Malicious intent detected')
confidence = Math.max(confidence, intentResult.confidence)
}
// 2. Semantic similarity to known toxic content
const similarityResult = this.calculateToxicSimilarity(text)
if (similarityResult.highSimilarity) {
semanticScore += similarityResult.score
flags.push('[SEMANTIC] High similarity to toxic content')
confidence = Math.max(confidence, similarityResult.confidence)
}
// 3. Contextual appropriateness analysis
const appropriatenessResult = this.analyzeContextualAppropriateness(text, context)
if (!appropriatenessResult.appropriate) {
semanticScore += appropriatenessResult.score
flags.push('[SEMANTIC] Contextually inappropriate content')
confidence = Math.max(confidence, appropriatenessResult.confidence)
}
return {
score: Math.min(semanticScore, 5),
confidence,
flags,
intent: intentResult,
similarity: similarityResult,
appropriateness: appropriatenessResult
}
} catch (error) {
console.warn('Semantic analysis error:', error.message)
return {
score: 0,
confidence: 0,
flags: ['[SEMANTIC] Analysis failed'],
error: error.message
}
}
}
async runContextualEmbeddingAnalysis(content, context) {
// Advanced contextual embedding analysis
const text = content.allText
try {
// Simulate contextual embeddings analysis
// In production, this would use pre-trained contextual models
const embeddingFeatures = {
toxicityEmbedding: this.calculateToxicityEmbedding(text),
contextualFit: this.calculateContextualFit(text, context),
semanticCoherence: this.calculateSemanticCoherence(text),
emotionalTone: this.analyzeEmotionalTone(text)
}
let score = 0
const flags = []
// High toxicity embedding
if (embeddingFeatures.toxicityEmbedding > 0.7) {
score += 3
flags.push('[EMBEDDING] High toxicity embedding score')
}
// Poor contextual fit
if (embeddingFeatures.contextualFit < 0.3) {
score += 2
flags.push('[EMBEDDING] Poor contextual fit')
}
// Negative emotional tone with high intensity
if (embeddingFeatures.emotionalTone.negative > 0.8) {
score += 2
flags.push('[EMBEDDING] Highly negative emotional tone')
}
return {
score: Math.min(score, 4),
confidence: 0.8,
flags,
features: embeddingFeatures
}
} catch (error) {
console.warn('Contextual embedding error:', error.message)
return {
score: 0,
confidence: 0,
flags: ['[EMBEDDING] Analysis failed'],
error: error.message
}
}
}
runEnsembleAnalysis(modelResults) {
// Advanced ensemble analysis combining all model results
const scores = {
bert: modelResults.bert.score || 0,
adversarial: modelResults.adversarial.score || 0,
semantic: modelResults.semantic.score || 0,
contextual: modelResults.contextual.score || 0
}
const confidences = {
bert: modelResults.bert.confidence || 0,
adversarial: modelResults.adversarial.confidence || 0,
semantic: modelResults.semantic.confidence || 0,
contextual: modelResults.contextual.confidence || 0
}
// Weighted ensemble scoring
const weights = {
bert: 0.4, // Highest weight for BERT
adversarial: 0.3, // High weight for adversarial detection
semantic: 0.2, // Moderate weight for semantic analysis
contextual: 0.1 // Lower weight for contextual features
}
// Calculate weighted final score
let finalScore = 0
let totalWeight = 0
let allFlags = []
for (const [model, score] of Object.entries(scores)) {
if (score > 0) {
finalScore += score * weights[model]
totalWeight += weights[model]
allFlags.push(...(modelResults[model].flags || []))
}
}
// Normalize score if we have partial results
if (totalWeight > 0 && totalWeight < 1) {
finalScore = finalScore / totalWeight
}
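// Worked example (hypothetical numbers): if only BERT fires with score 4 and
// the semantic analyzer with score 2, finalScore = 4*0.4 + 2*0.2 = 2.0 and
// totalWeight = 0.6, so the normalized score becomes 2.0 / 0.6 ≈ 3.33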
// Calculate ensemble confidence
const avgConfidence = Object.values(confidences).reduce((a, b) => a + b, 0) / 4
const confidenceVariance = this.calculateVariance(Object.values(confidences))
// High variance reduces confidence
let ensembleConfidence = avgConfidence - (confidenceVariance * 0.5)
// Agreement bonus - if multiple models agree, increase confidence
const highScoreModels = Object.values(scores).filter(s => s >= 3).length
if (highScoreModels >= 2) {
ensembleConfidence += 0.1
allFlags.push('[ENSEMBLE] Multiple models agree on high toxicity')
}
return {
finalScore: Math.min(finalScore, 10),
confidence: Math.max(0, Math.min(1, ensembleConfidence)),
flags: allFlags,
analysis: {
individualScores: scores,
individualConfidences: confidences,
weights,
avgConfidence,
confidenceVariance,
modelAgreement: highScoreModels >= 2
}
}
}
// ═══════════════════════════════════════════════════════════════
// ADVANCED DETECTION METHODS
// ═══════════════════════════════════════════════════════════════
calculateSemanticToxicity(text) {
// Simulate advanced semantic toxicity analysis
let score = 0
let confidence = 0
const features = {}
// Simulated semantic features
features.sentimentPolarity = this.calculateSentiment(text)
features.aggressionLevel = this.calculateAggression(text)
features.discriminationSignals = this.detectDiscrimination(text)
if (features.sentimentPolarity < -0.7) {
score += 2
confidence += 0.3
}
if (features.aggressionLevel > 0.8) {
score += 3
confidence += 0.4
}
if (features.discriminationSignals > 0.6) {
score += 2
confidence += 0.3
}
return {
score: Math.min(score, 5),
confidence: Math.min(confidence, 0.9),
features
}
}
detectAdvancedUnicodeObfuscation(original, normalized) {
// More sophisticated Unicode attack detection
const lengthDiff = original.length - normalized.length
const lengthRatio = original.length > 0 ? lengthDiff / original.length : 0
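// e.g. a 50-char input whose normalization strips 20 zero-width characters
// yields lengthRatio = 0.4, landing in the "> 0.3" bucket below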
let severity = 0
// Significant length reduction indicates major obfuscation
if (lengthRatio > 0.3) {
severity += 4
} else if (lengthRatio > 0.1) {
severity += 2
}
// Check for specific Unicode attack patterns
const attackPatterns = [
/[\u200B-\u200F]{3,}/g, // Multiple zero-width chars
/[\u0300-\u036F]{5,}/g, // Excessive diacritics
/[\uFF00-\uFFEF]{3,}/g // Multiple fullwidth chars
]
for (const pattern of attackPatterns) {
if (pattern.test(original)) {
severity += 2
}
}
return {
detected: severity > 0,
severity: Math.min(severity, 6)
}
}
detectEmbeddingSpaceAttacks(content) {
// Simulate embedding space attack detection
// In production, this would analyze embedding space perturbations
const text = content.allText.toLowerCase()
let severity = 0
// Look for patterns that might indicate embedding attacks
const suspiciousSubstitutions = [
'gооd', 'аdmin', 'usеr', 'systеm' // Cyrillic homoglyph substitutions (о, а, е)
]
for (const suspicious of suspiciousSubstitutions) {
if (text.includes(suspicious)) {
severity += 2
}
}
return {
detected: severity > 0,
severity: Math.min(severity, 4)
}
}
detectSemanticPreservationAttacks(content, priorResults) {
// Detect attacks that try to preserve semantic meaning while evading detection
const tier1Score = priorResults.tier1Result?.score || 0
const tier2Score = priorResults.tier2Result?.score || 0
let severity = 0
// If prior tiers detected something but with low confidence, this may be a semantic-preservation attack
if (tier1Score > 2 && (priorResults.tier1Result?.confidence || 0) < 0.6) {
severity += 2
}
if (tier2Score > tier1Score + 1) {
severity += 1 // Tier 2 found more than Tier 1
}
return {
detected: severity > 0,
severity: Math.min(severity, 3)
}
}
detectMultiModalEvasion(content) {
// Detect attempts to evade detection through multi-modal techniques
// This is a placeholder for future multi-modal analysis
return {
detected: false,
severity: 0
}
}
// ═══════════════════════════════════════════════════════════════
// UTILITY METHODS
// ═══════════════════════════════════════════════════════════════
classifyIntent(text) {
// Simplified intent classification
const maliciousPatterns = [
/intent.*harm/i,
/want.*hurt/i,
/going.*get/i,
/teach.*lesson/i
]
let maliciousIntent = false
let severity = 0
let confidence = 0
for (const pattern of maliciousPatterns) {
if (pattern.test(text)) {
maliciousIntent = true
severity += 2
confidence += 0.3
}
}
return {
maliciousIntent,
severity: Math.min(severity, 4),
confidence: Math.min(confidence, 0.9)
}
}
calculateToxicSimilarity(text) {
// Simulate semantic similarity to known toxic content
// In production, this would use embeddings and similarity measures
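// A common choice is cosine similarity over sentence embeddings,
//   cos(a, b) = (a · b) / (‖a‖ ‖b‖),
// whereas the sketch below falls back to word-level Jaccard overlap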
const knownToxicPhrases = [
'you should kill yourself',
'nobody likes you',
'you are worthless',
'go back where you came from'
]
let maxSimilarity = 0
for (const phrase of knownToxicPhrases) {
const similarity = this.calculateSimpleSimilarity(text.toLowerCase(), phrase)
maxSimilarity = Math.max(maxSimilarity, similarity)
}
return {
highSimilarity: maxSimilarity > 0.7,
score: maxSimilarity > 0.7 ? 3 : 0,
confidence: maxSimilarity,
maxSimilarity
}
}
analyzeContextualAppropriateness(text, context) {
// Analyze if content is appropriate for the given context
let appropriate = true
let score = 0
let confidence = 0
// Profanity or insults in a professional/business context
if (context.businessContext && /damn|hell|stupid|idiot/i.test(text)) {
appropriate = false
score = 2
confidence = 0.8
}
return {
appropriate,
score,
confidence
}
}
calculateVariance(values) {
const mean = values.reduce((a, b) => a + b, 0) / values.length
const squaredDiffs = values.map(value => Math.pow(value - mean, 2))
return squaredDiffs.reduce((a, b) => a + b, 0) / values.length
}
calculateSimpleSimilarity(text1, text2) {
// Simple Jaccard similarity
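// Worked example: 'you are completely worthless' vs 'you are worthless' →
// intersection {you, are, worthless} = 3, union = 4 words → similarity 0.75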
const words1 = new Set(text1.split(' '))
const words2 = new Set(text2.split(' '))
const intersection = new Set([...words1].filter(x => words2.has(x)))
const union = new Set([...words1, ...words2])
return intersection.size / union.size
}
// Placeholder methods for advanced analysis
calculateToxicityEmbedding(text) { return Math.random() * 0.5 }
calculateContextualFit(text, context) { return Math.random() * 0.5 + 0.3 }
calculateSemanticCoherence(text) { return Math.random() * 0.3 + 0.7 }
analyzeEmotionalTone(text) { return { negative: Math.random() * 0.6, positive: Math.random() * 0.4 } }
calculateSentiment(text) { return Math.random() * 2 - 1 }
calculateAggression(text) { return Math.random() * 0.8 }
detectDiscrimination(text) { return Math.random() * 0.5 }
initializeModels() {
// Initialize ML models (placeholder)
console.log('Initializing expensive analysis models...')
}
updateModelStats(processingTime, confidence) {
this.modelStats.totalInferences++
this.modelStats.avgInferenceTime =
(this.modelStats.avgInferenceTime * (this.modelStats.totalInferences - 1) + processingTime) /
this.modelStats.totalInferences
this.modelStats.confidenceDistribution.push(confidence)
}
}
module.exports = ExpensiveAnalysisTier
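// Hypothetical usage sketch (illustrative only; the require path, content shape,
// and prior-tier fields below are assumptions inferred from this file, not from
// the package's published documentation):
//
//   const ExpensiveAnalysisTier = require('./expensive-analysis')
//   const tier3 = new ExpensiveAnalysisTier()
//   tier3
//     .analyze(
//       { originalText: 'raw user input', allText: 'normalized user input' },
//       { businessContext: true },
//       { tier1Result: { score: 2, confidence: 0.5 }, tier2Result: { score: 3 } }
//     )
//     .then(result => console.log(result.score, result.confidence, result.flags))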