content-guard
Version:
🛡️ Advanced content analysis and moderation system with multi-variant optimization. Features context-aware detection, harassment prevention, and ML-powered toxicity analysis. Pre-1.0 development version.
413 lines (352 loc) • 13.9 kB
JavaScript
/**
 * Enhanced ML Toxicity Detection Plugin v5.0
 *
 * Designed around state-of-the-art RoBERTa models for toxicity detection:
 * - s-nlp/roberta_toxicity_classifier: AUC-ROC 0.98, F1 0.76 on Jigsaw data
 * - cardiffnlp/twitter-roberta-base-hate: specialized hate speech detection
 *
 * At runtime the plugin currently loads ONNX-compatible substitutes via
 * @xenova/transformers (Xenova/toxic-bert for toxicity plus a DistilBERT
 * SST-2 sentiment model as a secondary signal), combines their scores in an
 * ensemble, and falls back to semantic pattern analysis when neither model
 * loads. A brief usage sketch follows the module export at the end of this file.
 */
class EnhancedMLToxicityPlugin {
  constructor() {
    this.name = 'enhanced-ml-toxicity'
    this.description = 'State-of-the-art transformer ensemble for toxicity detection'
    this.version = '5.0.0'
    this.initialized = false

    // Model instances
    this.roberta_toxicity = null
    this.twitter_hate = null
    this.tokenizer_toxicity = null
    this.tokenizer_hate = null

    // Model paths (local checkpoints; not used by initialize(), which loads hosted Xenova models)
    this.toxicityModelPath = './models/roberta_toxicity_classifier'
    this.hateModelPath = './models/twitter-roberta-base-hate'

    // Enhanced semantic patterns for fallback and supplementation
    this.advancedPatterns = [
      // Sophisticated harassment patterns
      {
        pattern: /(?:not surprised|clearly|obviously)\s+(?:you|your)\s+(?:kind|type|people)/i,
        score: 6,
        type: 'identity_attack',
        description: 'Identity-based condescension'
      },
      // Microaggressions
      {
        pattern: /(?:you'?re (?:articulate|well-spoken)|where are you (?:really )?from|you don'?t look)/i,
        score: 4,
        type: 'microaggression',
        description: 'Microaggressive language'
      },
      // Advanced gaslighting
      {
        pattern: /(?:you'?re (?:being|over)dramatic|that never happened|you'?re (?:misremembering|confused))/i,
        score: 5,
        type: 'gaslighting',
        description: 'Advanced gaslighting patterns'
      },
      // Workplace harassment
      {
        pattern: /(?:not a good (?:fit|match)|maybe (?:consider|think about)|this isn'?t working out)/i,
        score: 3,
        type: 'workplace_harassment',
        description: 'Workplace exclusion patterns'
      },
      // Coded threats and intimidation
      {
        pattern: /(?:be (?:careful|smart)|wouldn'?t want|things could get|accidents happen)/i,
        score: 7,
        type: 'coded_threat',
        description: 'Veiled threats or intimidation'
      },
      // Cross-cultural harassment
      {
        pattern: /(?:go back to|not from here|your (?:kind|people)|cultural differences)/i,
        score: 8,
        type: 'cultural_harassment',
        description: 'Cross-cultural harassment'
      },
      // AI-generated harassment patterns
      {
        pattern: /(?:as an ai|i understand your|based on (?:your|the))\s+.*(?:concern|feedback|perspective)/i,
        score: 2,
        type: 'ai_generated',
        description: 'AI-generated dismissive patterns'
      }
    ]

    // Context modifiers
    this.positiveContexts = [
      /(?:thank you|please|appreciate|grateful|respect|constructive)/i,
      /(?:feedback|suggestion|recommendation|advice|guidance|help)/i,
      /(?:learn|improve|develop|grow|support|collaborate)/i,
      /(?:professional|business|meeting|project|team)/i
    ]

    this.negativeAmplifiers = [
      /(?:always|never|constantly|typical|classic|obviously)/i,
      /(?:everyone knows|it'?s clear|face it|admit it)/i,
      /(?:get over it|deal with it|grow up|move on)/i
    ]
  }
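
  /**
   * Lazily loads the transformer pipelines. Safe to call repeatedly; load
   * failures are swallowed so analyze() can fall back to pattern-only analysis.
   */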
  async initialize() {
    if (this.initialized) return
    try {
      // Completely silent initialization - no console output unless explicitly enabled
      // Initialize transformers pipeline with compatible models
      const { pipeline } = await import('@xenova/transformers')

      // Load Xenova/toxic-bert (ONNX-compatible version of unitary/toxic-bert)
      try {
        this.roberta_toxicity = await pipeline(
          'text-classification',
          'Xenova/toxic-bert',
          {
            quantized: true,
            progress_callback: null,
            device: 'cpu'
          }
        )
      } catch (error) {
        // Silent failure
      }

      // Load a lightweight sentiment model used as a secondary (negativity) signal
      try {
        this.twitter_hate = await pipeline(
          'text-classification',
          'Xenova/distilbert-base-uncased-finetuned-sst-2-english',
          {
            quantized: true,
            progress_callback: null,
            device: 'cpu'
          }
        )
      } catch (error) {
        // Silent failure
      }

      // If at least one model loaded, consider initialized
      this.initialized = Boolean(this.roberta_toxicity || this.twitter_hate)
    } catch (error) {
      // Silent failure
      this.initialized = false
    }
  }
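
  /**
   * Analyzes text and returns { score, flags, details }. Combines transformer
   * scores (when models are loaded) with semantic patterns and context adjustments.
   */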
  async analyze(text, metadata = {}) {
    if (!text || typeof text !== 'string') {
      return { score: 0, flags: [], details: {} }
    }

    const result = {
      score: 0,
      flags: [],
      details: {
        modelsEnabled: this.initialized,
        roberta_toxicity: null,
        twitter_hate: null,
        ensembleScore: 0,
        semanticPatterns: [],
        confidence: 0,
        contextAdjustments: []
      }
    }

    try {
      // 1. Run ensemble ML analysis if models are available
      if (this.initialized) {
        await this.performEnsembleAnalysis(text, result)
      }

      // 2. Always run enhanced semantic pattern analysis
      this.performAdvancedSemanticAnalysis(text, result)

      // 3. Apply context-aware adjustments
      this.applyContextualAdjustments(text, result)

      // 4. Calculate final confidence and ensemble score
      this.calculateEnsembleScore(result)
    } catch (error) {
      console.error('Enhanced ML Toxicity analysis error:', error)
      result.flags.push('[ENHANCED-ML] Analysis error, using pattern fallback')
      // Only run the pattern fallback if it has not already contributed,
      // so pattern scores are not double-counted on late failures
      if (result.details.semanticPatterns.length === 0) {
        this.performAdvancedSemanticAnalysis(text, result)
      }
    }

    return result
  }
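
  /**
   * Runs the loaded pipelines over ~500-character chunks, adds tiered score
   * contributions per chunk, and records per-model averages in result.details.
   */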
  async performEnsembleAnalysis(text, result) {
    try {
      // Split long text for better analysis
      const chunks = this.splitIntoChunks(text, 500)
      let totalToxicityScore = 0
      let totalSentimentScore = 0
      let chunkCount = 0

      for (const chunk of chunks) {
        chunkCount++

        // Run toxic-bert analysis
        if (this.roberta_toxicity) {
          const toxicityResult = await this.roberta_toxicity(chunk)
          if (toxicityResult && toxicityResult[0]) {
            // Expected shape: [{ label, score }, ...]; label casing varies across
            // model releases, so match the toxic label case-insensitively
            const toxicResult = toxicityResult.find(r => r.label.toUpperCase() === 'TOXIC') || toxicityResult[0]
            const toxicScore = toxicResult.label.toUpperCase() === 'TOXIC' ? toxicResult.score : (1 - toxicResult.score)
            totalToxicityScore += toxicScore

            // High-confidence toxicity detection
            if (toxicScore > 0.8) {
              result.score += 8
              result.flags.push(`[TOXIC-BERT] High confidence toxic (${(toxicScore * 100).toFixed(1)}%)`)
            } else if (toxicScore > 0.6) {
              result.score += 5
              result.flags.push(`[TOXIC-BERT] Moderate confidence toxic (${(toxicScore * 100).toFixed(1)}%)`)
            } else if (toxicScore > 0.4) {
              result.score += 2
              result.flags.push(`[TOXIC-BERT] Low confidence toxic (${(toxicScore * 100).toFixed(1)}%)`)
            }
          }
        }

        // Run sentiment analysis (as secondary signal)
        if (this.twitter_hate) {
          const sentimentResult = await this.twitter_hate(chunk)
          if (sentimentResult && sentimentResult[0]) {
            // DistilBERT SST-2 outputs NEGATIVE/POSITIVE labels
            const negativeResult = sentimentResult.find(r => r.label === 'NEGATIVE') || sentimentResult[0]
            const negativeScore = negativeResult.label === 'NEGATIVE' ? negativeResult.score : (1 - negativeResult.score)
            totalSentimentScore += negativeScore

            // Use negative sentiment as supporting evidence for toxicity
            if (negativeScore > 0.8) {
              result.score += 3
              result.flags.push(`[SENTIMENT] Strong negative sentiment (${(negativeScore * 100).toFixed(1)}%)`)
            } else if (negativeScore > 0.6) {
              result.score += 2
              result.flags.push(`[SENTIMENT] Moderate negative sentiment (${(negativeScore * 100).toFixed(1)}%)`)
            } else if (negativeScore > 0.5) {
              result.score += 1
              result.flags.push(`[SENTIMENT] Mild negative sentiment (${(negativeScore * 100).toFixed(1)}%)`)
            }
          }
        }
      }

      // Record average scores only for models that actually ran
      if (this.roberta_toxicity) {
        result.details.roberta_toxicity = totalToxicityScore / chunkCount
      }
      if (this.twitter_hate) {
        result.details.twitter_hate = totalSentimentScore / chunkCount
      }

      // Ensemble bonus for agreement between models
      if (result.details.roberta_toxicity > 0.5 && result.details.twitter_hate > 0.6) {
        const ensembleBonus = Math.min(3, (result.details.roberta_toxicity + result.details.twitter_hate) * 1.5)
        result.score += ensembleBonus
        result.flags.push(`[ENSEMBLE] Model agreement bonus (+${ensembleBonus.toFixed(1)})`)
      }
    } catch (error) {
      console.warn('Ensemble ML analysis failed:', error.message)
      result.details.modelsEnabled = false
    }
  }
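
  /** Applies the regex pattern library and records matches in result.details.semanticPatterns. */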
  performAdvancedSemanticAnalysis(text, result) {
    this.advancedPatterns.forEach(pattern => {
      const matches = text.match(pattern.pattern)
      if (matches) {
        result.score += pattern.score
        result.details.semanticPatterns.push({
          type: pattern.type,
          description: pattern.description,
          matches: matches.length,
          score: pattern.score
        })
        result.flags.push(`[ENHANCED-SEMANTIC] ${pattern.description}`)
      }
    })
  }
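
  /** Lowers the score when positive-context cues appear and raises it when negative amplifiers appear. */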
  applyContextualAdjustments(text, result) {
    const textLower = text.toLowerCase()
    let positiveAdjustment = 0
    let negativeAmplification = 0

    // Check for positive context indicators
    this.positiveContexts.forEach(pattern => {
      if (pattern.test(textLower)) {
        positiveAdjustment += 0.5
        result.details.contextAdjustments.push('Positive context detected')
      }
    })

    // Check for negative amplifiers
    this.negativeAmplifiers.forEach(pattern => {
      if (pattern.test(textLower)) {
        negativeAmplification += 0.3
        result.details.contextAdjustments.push('Negative amplifier detected')
      }
    })

    // Apply adjustments
    if (positiveAdjustment > 0 && result.score > 0) {
      const reduction = Math.min(result.score * 0.3, positiveAdjustment * 2)
      result.score = Math.max(0, result.score - reduction)
      result.flags.push(`[CONTEXT] Positive context reduction (-${reduction.toFixed(1)})`)
    }
    if (negativeAmplification > 0 && result.score > 2) {
      const amplification = negativeAmplification * 1.5
      result.score += amplification
      result.flags.push(`[CONTEXT] Negative amplification (+${amplification.toFixed(1)})`)
    }
  }
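
  /** Derives a 0-1 confidence value from model averages and pattern hits; mirrors the final score into details. */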
  calculateEnsembleScore(result) {
    // Calculate ensemble confidence based on multiple signals
    let confidence = 0
    if (result.details.roberta_toxicity !== null) {
      confidence += result.details.roberta_toxicity * 0.4
    }
    if (result.details.twitter_hate !== null) {
      confidence += result.details.twitter_hate * 0.4
    }
    if (result.details.semanticPatterns.length > 0) {
      confidence += Math.min(0.2, result.details.semanticPatterns.length * 0.1)
    }
    result.details.confidence = Math.min(1.0, confidence)
    result.details.ensembleScore = result.score
  }
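
  /** Splits text into chunks of roughly maxLength characters, preferring sentence boundaries and falling back to word boundaries. */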
  splitIntoChunks(text, maxLength) {
    if (text.length <= maxLength) {
      return [text]
    }

    const chunks = []
    const sentences = text.split(/[.!?]+/)
    let currentChunk = ''

    for (const sentence of sentences) {
      if (currentChunk.length + sentence.length > maxLength) {
        if (currentChunk) {
          chunks.push(currentChunk.trim())
          currentChunk = sentence
        } else {
          // Sentence too long, split by words
          const words = sentence.split(/\s+/)
          for (const word of words) {
            if (currentChunk.length + word.length > maxLength) {
              if (currentChunk) {
                chunks.push(currentChunk.trim())
                currentChunk = word
              } else {
                chunks.push(word)
              }
            } else {
              currentChunk += (currentChunk ? ' ' : '') + word
            }
          }
        }
      } else {
        currentChunk += (currentChunk ? '. ' : '') + sentence
      }
    }

    if (currentChunk) {
      chunks.push(currentChunk.trim())
    }

    return chunks.filter(chunk => chunk.length > 0)
  }

  // Test methods for development
  async testModels(text) {
    if (!this.initialized) {
      await this.initialize()
    }
    console.log('🧪 Testing enhanced ML models with text:', text.substring(0, 100) + '...')
    const result = await this.analyze(text)
    console.log('📊 Test results:', {
      score: result.score,
      confidence: result.details.confidence,
      flags: result.flags,
      models: {
        roberta_toxicity: result.details.roberta_toxicity,
        twitter_hate: result.details.twitter_hate
      }
    })
    return result
  }
}

module.exports = { EnhancedMLToxicityPlugin }
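
A minimal usage sketch (assumptions: @xenova/transformers is installed, and the file above is saved locally as enhanced-ml-toxicity.js; the require path and the sample text are illustrative):

const { EnhancedMLToxicityPlugin } = require('./enhanced-ml-toxicity')

async function main() {
  const plugin = new EnhancedMLToxicityPlugin()
  // Downloads/loads the ONNX models on first use; stays silent if they are unavailable
  await plugin.initialize()
  const result = await plugin.analyze('Obviously your kind never listens. Be careful what you say next.')
  console.log('score:', result.score)
  console.log('confidence:', result.details.confidence)
  console.log('flags:', result.flags)
}

main()

Because analyze() always runs the semantic pattern pass, this sketch still produces pattern-based scores when neither model loads (for example, when offline).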