content-guard
Version:
🛡️ Advanced content analysis and moderation system with multi-variant optimization. Features context-aware detection, harassment prevention, and ML-powered toxicity analysis. Pre-1.0 development version.
413 lines (352 loc) • 13.9 kB
JavaScript
/**
 * Enhanced ML Toxicity Detection Plugin v5.0
 *
 * Designed around state-of-the-art RoBERTa models for toxicity detection:
 * - s-nlp/roberta_toxicity_classifier: AUC-ROC 0.98, F1 0.76 on Jigsaw data
 * - cardiffnlp/twitter-roberta-base-hate: specialized hate speech detection
 *
 * At runtime the plugin currently loads ONNX-compatible substitutes via
 * @xenova/transformers (Xenova/toxic-bert for toxicity plus a DistilBERT
 * SST-2 sentiment model as a secondary signal), combines their scores in an
 * ensemble, and falls back to semantic pattern analysis when neither model
 * loads. A brief usage sketch follows the module export at the end of this file.
 */
class EnhancedMLToxicityPlugin {
  constructor() {
    this.name = 'enhanced-ml-toxicity'
    this.description = 'State-of-the-art transformer ensemble for toxicity detection'
    this.version = '5.0.0'
    this.initialized = false

    // Model instances
    this.roberta_toxicity = null
    this.twitter_hate = null
    this.tokenizer_toxicity = null
    this.tokenizer_hate = null

    // Model paths (local checkpoints; not used by initialize(), which loads hosted Xenova models)
    this.toxicityModelPath = './models/roberta_toxicity_classifier'
    this.hateModelPath = './models/twitter-roberta-base-hate'

    // Enhanced semantic patterns for fallback and supplementation
    this.advancedPatterns = [
      // Sophisticated harassment patterns
      {
        pattern: /(?:not surprised|clearly|obviously)\s+(?:you|your)\s+(?:kind|type|people)/i,
        score: 6,
        type: 'identity_attack',
        description: 'Identity-based condescension'
      },
      // Microaggressions
      {
        pattern: /(?:you'?re (?:articulate|well-spoken)|where are you (?:really )?from|you don'?t look)/i,
        score: 4,
        type: 'microaggression',
        description: 'Microaggressive language'
      },
      // Advanced gaslighting
      {
        pattern: /(?:you'?re (?:being|over)dramatic|that never happened|you'?re (?:misremembering|confused))/i,
        score: 5,
        type: 'gaslighting',
        description: 'Advanced gaslighting patterns'
      },
      // Workplace harassment
      {
        pattern: /(?:not a good (?:fit|match)|maybe (?:consider|think about)|this isn'?t working out)/i,
        score: 3,
        type: 'workplace_harassment',
        description: 'Workplace exclusion patterns'
      },
      // Coded threats and intimidation
      {
        pattern: /(?:be (?:careful|smart)|wouldn'?t want|things could get|accidents happen)/i,
        score: 7,
        type: 'coded_threat',
        description: 'Veiled threats or intimidation'
      },
      // Cross-cultural harassment
      {
        pattern: /(?:go back to|not from here|your (?:kind|people)|cultural differences)/i,
        score: 8,
        type: 'cultural_harassment',
        description: 'Cross-cultural harassment'
      },
      // AI-generated harassment patterns
      {
        pattern: /(?:as an ai|i understand your|based on (?:your|the))\s+.*(?:concern|feedback|perspective)/i,
        score: 2,
        type: 'ai_generated',
        description: 'AI-generated dismissive patterns'
      }
    ]

    // Context modifiers
    this.positiveContexts = [
      /(?:thank you|please|appreciate|grateful|respect|constructive)/i,
      /(?:feedback|suggestion|recommendation|advice|guidance|help)/i,
      /(?:learn|improve|develop|grow|support|collaborate)/i,
      /(?:professional|business|meeting|project|team)/i
    ]

    this.negativeAmplifiers = [
      /(?:always|never|constantly|typical|classic|obviously)/i,
      /(?:everyone knows|it'?s clear|face it|admit it)/i,
      /(?:get over it|deal with it|grow up|move on)/i
    ]
  }
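
  /**
   * Lazily loads the transformer pipelines. Safe to call repeatedly; load
   * failures are swallowed so analyze() can fall back to pattern-only analysis.
   */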
  async initialize() {
    if (this.initialized) return
    try {
      // Completely silent initialization - no console output unless explicitly enabled
      // Initialize transformers pipeline with compatible models
      const { pipeline } = await import('@xenova/transformers')

      // Load Xenova/toxic-bert (ONNX-compatible version of unitary/toxic-bert)
      try {
        this.roberta_toxicity = await pipeline(
          'text-classification',
          'Xenova/toxic-bert',
          {
            quantized: true,
            progress_callback: null,
            device: 'cpu'
          }
        )
      } catch (error) {
        // Silent failure
      }

      // Load a lightweight sentiment model used as a secondary (negativity) signal
      try {
        this.twitter_hate = await pipeline(
          'text-classification',
          'Xenova/distilbert-base-uncased-finetuned-sst-2-english',
          {
            quantized: true,
            progress_callback: null,
            device: 'cpu'
          }
        )
      } catch (error) {
        // Silent failure
      }

      // If at least one model loaded, consider initialized
      this.initialized = Boolean(this.roberta_toxicity || this.twitter_hate)
    } catch (error) {
      // Silent failure
      this.initialized = false
    }
  }
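
  /**
   * Analyzes text and returns { score, flags, details }. Combines transformer
   * scores (when models are loaded) with semantic patterns and context adjustments.
   */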
  async analyze(text, metadata = {}) {
    if (!text || typeof text !== 'string') {
      return { score: 0, flags: [], details: {} }
    }

    const result = {
      score: 0,
      flags: [],
      details: {
        modelsEnabled: this.initialized,
        roberta_toxicity: null,
        twitter_hate: null,
        ensembleScore: 0,
        semanticPatterns: [],
        confidence: 0,
        contextAdjustments: []
      }
    }

    try {
      // 1. Run ensemble ML analysis if models are available
      if (this.initialized) {
        await this.performEnsembleAnalysis(text, result)
      }

      // 2. Always run enhanced semantic pattern analysis
      this.performAdvancedSemanticAnalysis(text, result)

      // 3. Apply context-aware adjustments
      this.applyContextualAdjustments(text, result)

      // 4. Calculate final confidence and ensemble score
      this.calculateEnsembleScore(result)
    } catch (error) {
      console.error('Enhanced ML Toxicity analysis error:', error)
      result.flags.push('[ENHANCED-ML] Analysis error, using pattern fallback')
      // Only run the pattern fallback if it has not already contributed,
      // so pattern scores are not double-counted on late failures
      if (result.details.semanticPatterns.length === 0) {
        this.performAdvancedSemanticAnalysis(text, result)
      }
    }

    return result
  }
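
  /**
   * Runs the loaded pipelines over ~500-character chunks, adds tiered score
   * contributions per chunk, and records per-model averages in result.details.
   */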
  async performEnsembleAnalysis(text, result) {
    try {
      // Split long text for better analysis
      const chunks = this.splitIntoChunks(text, 500)
      let totalToxicityScore = 0
      let totalSentimentScore = 0
      let chunkCount = 0

      for (const chunk of chunks) {
        chunkCount++

        // Run toxic-bert analysis
        if (this.roberta_toxicity) {
          const toxicityResult = await this.roberta_toxicity(chunk)
          if (toxicityResult && toxicityResult[0]) {
            // Expected shape: [{ label, score }, ...]; label casing varies across
            // model releases, so match the toxic label case-insensitively
            const toxicResult = toxicityResult.find(r => r.label.toUpperCase() === 'TOXIC') || toxicityResult[0]
            const toxicScore = toxicResult.label.toUpperCase() === 'TOXIC' ? toxicResult.score : (1 - toxicResult.score)
            totalToxicityScore += toxicScore

            // High-confidence toxicity detection
            if (toxicScore > 0.8) {
              result.score += 8
              result.flags.push(`[TOXIC-BERT] High confidence toxic (${(toxicScore * 100).toFixed(1)}%)`)
            } else if (toxicScore > 0.6) {
              result.score += 5
              result.flags.push(`[TOXIC-BERT] Moderate confidence toxic (${(toxicScore * 100).toFixed(1)}%)`)
            } else if (toxicScore > 0.4) {
              result.score += 2
              result.flags.push(`[TOXIC-BERT] Low confidence toxic (${(toxicScore * 100).toFixed(1)}%)`)
            }
          }
        }

        // Run sentiment analysis (as secondary signal)
        if (this.twitter_hate) {
          const sentimentResult = await this.twitter_hate(chunk)
          if (sentimentResult && sentimentResult[0]) {
            // DistilBERT SST-2 outputs NEGATIVE/POSITIVE labels
            const negativeResult = sentimentResult.find(r => r.label === 'NEGATIVE') || sentimentResult[0]
            const negativeScore = negativeResult.label === 'NEGATIVE' ? negativeResult.score : (1 - negativeResult.score)
            totalSentimentScore += negativeScore

            // Use negative sentiment as supporting evidence for toxicity
            if (negativeScore > 0.8) {
              result.score += 3
              result.flags.push(`[SENTIMENT] Strong negative sentiment (${(negativeScore * 100).toFixed(1)}%)`)
            } else if (negativeScore > 0.6) {
              result.score += 2
              result.flags.push(`[SENTIMENT] Moderate negative sentiment (${(negativeScore * 100).toFixed(1)}%)`)
            } else if (negativeScore > 0.5) {
              result.score += 1
              result.flags.push(`[SENTIMENT] Mild negative sentiment (${(negativeScore * 100).toFixed(1)}%)`)
            }
          }
        }
      }

      // Record average scores only for models that actually ran
      if (this.roberta_toxicity) {
        result.details.roberta_toxicity = totalToxicityScore / chunkCount
      }
      if (this.twitter_hate) {
        result.details.twitter_hate = totalSentimentScore / chunkCount
      }

      // Ensemble bonus for agreement between models
      if (result.details.roberta_toxicity > 0.5 && result.details.twitter_hate > 0.6) {
        const ensembleBonus = Math.min(3, (result.details.roberta_toxicity + result.details.twitter_hate) * 1.5)
        result.score += ensembleBonus
        result.flags.push(`[ENSEMBLE] Model agreement bonus (+${ensembleBonus.toFixed(1)})`)
      }
    } catch (error) {
      console.warn('Ensemble ML analysis failed:', error.message)
      result.details.modelsEnabled = false
    }
  }
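
  /** Applies the regex pattern library and records matches in result.details.semanticPatterns. */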
  performAdvancedSemanticAnalysis(text, result) {
    this.advancedPatterns.forEach(pattern => {
      const matches = text.match(pattern.pattern)
      if (matches) {
        result.score += pattern.score
        result.details.semanticPatterns.push({
          type: pattern.type,
          description: pattern.description,
          matches: matches.length,
          score: pattern.score
        })
        result.flags.push(`[ENHANCED-SEMANTIC] ${pattern.description}`)
      }
    })
  }
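
  /** Lowers the score when positive-context cues appear and raises it when negative amplifiers appear. */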
  applyContextualAdjustments(text, result) {
    const textLower = text.toLowerCase()
    let positiveAdjustment = 0
    let negativeAmplification = 0

    // Check for positive context indicators
    this.positiveContexts.forEach(pattern => {
      if (pattern.test(textLower)) {
        positiveAdjustment += 0.5
        result.details.contextAdjustments.push('Positive context detected')
      }
    })

    // Check for negative amplifiers
    this.negativeAmplifiers.forEach(pattern => {
      if (pattern.test(textLower)) {
        negativeAmplification += 0.3
        result.details.contextAdjustments.push('Negative amplifier detected')
      }
    })

    // Apply adjustments
    if (positiveAdjustment > 0 && result.score > 0) {
      const reduction = Math.min(result.score * 0.3, positiveAdjustment * 2)
      result.score = Math.max(0, result.score - reduction)
      result.flags.push(`[CONTEXT] Positive context reduction (-${reduction.toFixed(1)})`)
    }
    if (negativeAmplification > 0 && result.score > 2) {
      const amplification = negativeAmplification * 1.5
      result.score += amplification
      result.flags.push(`[CONTEXT] Negative amplification (+${amplification.toFixed(1)})`)
    }
  }
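
  /** Derives a 0-1 confidence value from model averages and pattern hits; mirrors the final score into details. */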
  calculateEnsembleScore(result) {
    // Calculate ensemble confidence based on multiple signals
    let confidence = 0
    if (result.details.roberta_toxicity !== null) {
      confidence += result.details.roberta_toxicity * 0.4
    }
    if (result.details.twitter_hate !== null) {
      confidence += result.details.twitter_hate * 0.4
    }
    if (result.details.semanticPatterns.length > 0) {
      confidence += Math.min(0.2, result.details.semanticPatterns.length * 0.1)
    }
    result.details.confidence = Math.min(1.0, confidence)
    result.details.ensembleScore = result.score
  }
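
  /** Splits text into chunks of roughly maxLength characters, preferring sentence boundaries and falling back to word boundaries. */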
  splitIntoChunks(text, maxLength) {
    if (text.length <= maxLength) {
      return [text]
    }

    const chunks = []
    const sentences = text.split(/[.!?]+/)
    let currentChunk = ''

    for (const sentence of sentences) {
      if (currentChunk.length + sentence.length > maxLength) {
        if (currentChunk) {
          chunks.push(currentChunk.trim())
          currentChunk = sentence
        } else {
          // Sentence too long, split by words
          const words = sentence.split(/\s+/)
          for (const word of words) {
            if (currentChunk.length + word.length > maxLength) {
              if (currentChunk) {
                chunks.push(currentChunk.trim())
                currentChunk = word
              } else {
                chunks.push(word)
              }
            } else {
              currentChunk += (currentChunk ? ' ' : '') + word
            }
          }
        }
      } else {
        currentChunk += (currentChunk ? '. ' : '') + sentence
      }
    }

    if (currentChunk) {
      chunks.push(currentChunk.trim())
    }

    return chunks.filter(chunk => chunk.length > 0)
  }

  // Test methods for development
  async testModels(text) {
    if (!this.initialized) {
      await this.initialize()
    }
    console.log('🧪 Testing enhanced ML models with text:', text.substring(0, 100) + '...')
    const result = await this.analyze(text)
    console.log('📊 Test results:', {
      score: result.score,
      confidence: result.details.confidence,
      flags: result.flags,
      models: {
        roberta_toxicity: result.details.roberta_toxicity,
        twitter_hate: result.details.twitter_hate
      }
    })
    return result
  }
}

module.exports = { EnhancedMLToxicityPlugin }
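
A minimal usage sketch (assumptions: @xenova/transformers is installed, and the file above is saved locally as enhanced-ml-toxicity.js; the require path and the sample text are illustrative):

const { EnhancedMLToxicityPlugin } = require('./enhanced-ml-toxicity')

async function main() {
  const plugin = new EnhancedMLToxicityPlugin()
  // Downloads/loads the ONNX models on first use; stays silent if they are unavailable
  await plugin.initialize()
  const result = await plugin.analyze('Obviously your kind never listens. Be careful what you say next.')
  console.log('score:', result.score)
  console.log('confidence:', result.details.confidence)
  console.log('flags:', result.flags)
}

main()

Because analyze() always runs the semantic pattern pass, this sketch still produces pattern-based scores when neither model loads (for example, when offline).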