content-guard
Version:
🛡️ Advanced content analysis and moderation system with multi-variant optimization. Features context-aware detection, harassment prevention, and ML-powered toxicity analysis. Pre-1.0 development version.
340 lines (286 loc) • 10.8 kB
JavaScript
/**
* 🤖 ML Toxicity Detection Plugin v4.0
*
* Lightweight transformer-based toxicity detection using Transformers.js.
* Provides semantic understanding to catch subtle harassment patterns
* that rule-based systems miss. Edge-compatible and fast.
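 *
 * @example
 * // Minimal usage sketch (assumes a local require path; model weights are
 * // downloaded on first use, so network access is required; run inside an
 * // async function)
 * const { MLToxicityPlugin } = require('./ml-toxicity-plugin')
 * const plugin = new MLToxicityPlugin({ debug: true })
 * await plugin.initialize(true)
 * const result = await plugin.analyze('People like you are not cut out for this.')
 * console.log(result.score, result.flags)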
*/
class MLToxicityPlugin {
constructor(options = {}) {
this.options = {
silent: options.silent || false,
debug: options.debug || false,
modelName: options.modelName || 'Xenova/toxic-bert',
fallbackModel: options.fallbackModel || 'Xenova/bert-base-uncased',
      threshold: options.threshold ?? 0.7, // ?? so an explicit 0 is honored; not yet consulted by analyze()
useEnsemble: options.useEnsemble !== false
}
this.models = {}
this.initialized = false
this.stats = {
totalPredictions: 0,
ensemblePredictions: 0,
modelLoadTime: 0
}
this.name = 'ml-toxicity'
this.description = 'Transformer-based semantic toxicity detection'
this.version = '4.0.0'
// Fallback patterns for when ML is unavailable
this.semanticPatterns = [
// Subtle condescension
{
pattern: /(?:not surprised|clearly|obviously|simple(?:ly)?)\s+(?:you|your)/i,
score: 3,
type: 'condescension',
description: 'Condescending language patterns'
},
// Capability questioning
{
pattern: /(?:are you (?:sure|capable)|can you (?:handle|manage)|(?:maybe|perhaps) you should)/i,
score: 2,
type: 'capability_questioning',
description: 'Questioning competence patterns'
},
// Exclusionary language
{
pattern: /(?:people like you|not (?:really )?(?:cut out|designed|meant) for|doesn't fit|not (?:the )?right (?:fit|type))/i,
score: 4,
type: 'exclusionary',
description: 'Exclusionary language patterns'
},
// Gaslighting patterns
{
pattern: /(?:you'?re (?:being|over)reacting|you'?re (?:too )?sensitive|that'?s not what|you'?re (?:imagining|projecting))/i,
score: 3,
type: 'gaslighting',
description: 'Gaslighting patterns'
},
// Power dynamics abuse
{
pattern: /(?:remember who|know your place|work for me|signs? your paycheck|easily replaceable)/i,
score: 5,
type: 'power_abuse',
description: 'Power dynamics abuse'
},
// Coded threats
{
pattern: /(?:would be (?:unfortunate|interesting)|accidents happen|careful about|watch (?:your|yourself))/i,
score: 4,
type: 'coded_threat',
description: 'Veiled threat patterns'
}
]
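    // Illustrative match (example only): "Obviously you missed the point"
    // triggers the condescension pattern above and contributes 3 points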
// Positive context indicators that reduce scores
this.positivePatterns = [
/(?:thank you|please|appreciate|grateful|respect|understand|help)/i,
/(?:feedback|suggestion|recommendation|advice|guidance)/i,
/(?:learn|improve|develop|grow|support|assist)/i
]
}
  async initialize(debug = this.options.debug) {
if (this.initialized) return true
try {
const startTime = Date.now()
      if (!this.options.silent && debug) {
        console.log(`📥 Loading ${this.options.modelName} (ONNX-compatible)...`)
      }
// Load primary model
try {
const { pipeline } = await import('@xenova/transformers')
this.models.primary = await pipeline('text-classification', this.options.modelName)
if (!this.options.silent && debug) {
console.log('✅ ML Toxicity Plugin: Transformer model loaded successfully')
}
} catch (primaryError) {
if (!this.options.silent && debug) {
console.log('⚠️ Primary model failed, trying fallback...')
}
        // Fallback model (note: the default fallback is a base checkpoint with
        // no toxicity fine-tune, so its classification output should be
        // treated with caution)
        const { pipeline } = await import('@xenova/transformers')
        this.models.primary = await pipeline('text-classification', this.options.fallbackModel)
if (!this.options.silent && debug) {
console.log('✅ Fallback toxicity model loaded successfully')
}
}
// Load secondary model for ensemble if enabled
if (this.options.useEnsemble) {
try {
if (!this.options.silent && debug) {
console.log('📥 Loading alternative toxicity classifier...')
}
          // A different model could be loaded here for ensemble voting. Note
          // that analyze() currently consults only the primary model, so the
          // secondary model is loaded but not yet used.
          const { pipeline } = await import('@xenova/transformers')
          this.models.secondary = await pipeline('text-classification', 'Xenova/distilbert-base-uncased')
if (!this.options.silent && debug) {
console.log('✅ Alternative toxicity classifier loaded successfully!')
}
} catch (secondaryError) {
if (!this.options.silent && debug) {
console.log('⚠️ Secondary model failed to load, using single model')
}
}
}
this.stats.modelLoadTime = Date.now() - startTime
this.initialized = true
      if (!this.options.silent && debug) {
        console.log(`✅ ML Toxicity Plugin ready (model load took ${this.stats.modelLoadTime}ms)`)
      }
return true
} catch (error) {
if (!this.options.silent && debug) {
console.error('❌ ML Toxicity Plugin failed to initialize:', error.message)
}
return false
}
}
async analyze(text, metadata = {}) {
if (!text || typeof text !== 'string') {
return { score: 0, flags: [], details: {} }
}
const result = {
score: 0,
flags: [],
details: {
mlEnabled: this.initialized,
sentimentScore: null,
semanticPatterns: [],
confidence: 0,
positiveSignals: 0
}
}
try {
// 1. Try ML analysis if available
if (this.initialized && this.models.primary) {
await this.performMLAnalysis(text, result)
}
// 2. Always run semantic pattern analysis (backup + supplement)
this.performSemanticAnalysis(text, result)
// 3. Apply positive context adjustments
this.applyPositiveAdjustments(text, result)
// 4. Calculate confidence and final adjustments
this.calculateConfidence(result)
    } catch (error) {
      console.error('ML Toxicity analysis error:', error)
      result.flags.push('[ML] Analysis error, using pattern fallback')
      // Continue with semantic patterns only, guarding against double-counting
      // when the pattern pass already ran before the error
      if (result.details.semanticPatterns.length === 0) {
        this.performSemanticAnalysis(text, result)
      }
    }
return result
}
  async performMLAnalysis(text, result) {
    try {
      // Split long text into chunks so each fits the model's input limit
      const chunks = this.splitIntoChunks(text, 512)
      let totalNegative = 0
      let negativeCount = 0
      let lastConfidence = 0
      for (const chunk of chunks) {
        const prediction = await this.models.primary(chunk)
        if (prediction && prediction[0]) {
          const { label, score } = prediction[0]
          lastConfidence = score
          // toxic-bert reports labels like 'toxic'; sentiment fallbacks report 'NEGATIVE'
          if (/toxic/i.test(label) || label === 'NEGATIVE') {
            totalNegative += score
            negativeCount++
            // High-confidence negative output indicates toxicity
            if (score > 0.8) {
              result.score += 3
              result.flags.push(`[ML] High-confidence negative sentiment (${(score * 100).toFixed(1)}%)`)
            } else if (score > 0.6) {
              result.score += 2
              result.flags.push(`[ML] Moderate negative sentiment (${(score * 100).toFixed(1)}%)`)
            } else if (score > 0.5) {
              result.score += 1
              result.flags.push(`[ML] Mild negative sentiment (${(score * 100).toFixed(1)}%)`)
            }
          }
        }
      }
      // Record aggregates once, after all chunks are scored
      this.stats.totalPredictions += chunks.length
      result.details.sentimentScore = totalNegative / chunks.length
      result.details.confidence = lastConfidence
      // Multiple negative chunks indicate persistent toxicity
      if (negativeCount > 1) {
        result.score += 1
        result.flags.push('[ML] Persistent negative sentiment across text')
      }
    } catch (error) {
      console.warn('ML sentiment analysis failed:', error.message)
      result.details.mlEnabled = false
    }
  }
performSemanticAnalysis(text, result) {
this.semanticPatterns.forEach(pattern => {
const matches = text.match(pattern.pattern)
if (matches) {
result.score += pattern.score
result.details.semanticPatterns.push({
type: pattern.type,
description: pattern.description,
matches: matches.length,
score: pattern.score
})
result.flags.push(`[ML] ${pattern.description}`)
}
})
}
applyPositiveAdjustments(text, result) {
let positiveCount = 0
this.positivePatterns.forEach(pattern => {
const matches = text.match(pattern)
if (matches) {
positiveCount += matches.length
}
})
if (positiveCount > 0) {
const reduction = Math.min(positiveCount * 0.5, 3) // Max 3 point reduction
result.score = Math.max(0, result.score - reduction)
result.details.positiveSignals = positiveCount
result.flags.push(`[ML] Positive context detected (-${reduction.toFixed(1)} points)`)
}
}
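  // Worked example (illustrative): four positive matches give a reduction of
  // Math.min(4 * 0.5, 3) = 2, so a raw score of 5 becomes 3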
calculateConfidence(result) {
// Higher confidence when both ML and patterns agree
if (result.details.mlEnabled && result.details.semanticPatterns.length > 0) {
result.details.confidence = Math.min(result.details.confidence + 0.2, 1.0)
result.flags.push('[ML] High confidence: ML + pattern agreement')
}
    // Adjust score based on confidence; results with no ML confidence
    // (pattern-only analysis) keep the default of 0 and are scaled down
if (result.details.confidence > 0.8) {
result.score = Math.round(result.score * 1.1) // Boost high-confidence scores
} else if (result.details.confidence < 0.3) {
result.score = Math.round(result.score * 0.8) // Reduce low-confidence scores
}
}
  splitIntoChunks(text, maxLength) {
    if (text.length <= maxLength) {
      return [text]
    }
    const chunks = []
    // Filter out the empty fragments the trailing delimiter leaves behind
    const sentences = text.split(/[.!?]+/).filter(s => s.trim())
    let currentChunk = ''
    for (const sentence of sentences) {
      if ((currentChunk + sentence).length <= maxLength) {
        currentChunk += sentence + '. '
      } else {
        if (currentChunk) {
          chunks.push(currentChunk.trim())
        }
        // Truncate a single sentence that is itself longer than maxLength so
        // no chunk exceeds the model's input limit
        currentChunk = sentence.substring(0, maxLength - 2) + '. '
      }
    }
    if (currentChunk) {
      chunks.push(currentChunk.trim())
    }
    return chunks.length > 0 ? chunks : [text.substring(0, maxLength)]
  }
// Helper method for debugging
async testSentiment(text) {
if (!this.initialized) {
await this.initialize()
}
if (this.models.primary) {
return await this.models.primary(text)
} else {
return { error: 'Transformer not available' }
}
}
}
module.exports = { MLToxicityPlugin }
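
// Quick standalone check (a sketch, not part of the published API): running
// this file directly exercises one of the documented patterns. The first run
// downloads model weights, so network access is required.
if (require.main === module) {
  const plugin = new MLToxicityPlugin({ debug: true })
  plugin
    .initialize(true)
    .then(() => plugin.analyze('Maybe you should let someone capable handle this.'))
    .then(result => console.log(JSON.stringify(result, null, 2)))
    .catch(error => console.error('Standalone check failed:', error))
}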