UNPKG

content-guard

Version:

🛡️ Advanced content analysis and moderation system with multi-variant optimization. Features context-aware detection, harassment prevention, and ML-powered toxicity analysis. Pre-1.0 development version.

663 lines (571 loc) 22.2 kB
/** * 🔧 Advanced Text Preprocessing v4.0 * * Professional text normalization and preparation for content analysis. * Now includes advanced Unicode confusables handling and multi-language support. */ const { ConfusablesAdvancedPlugin } = require('../plugins/confusables-advanced-plugin') class TextPreprocessor { constructor() { this.confusablesPlugin = new ConfusablesAdvancedPlugin() } preprocess(text, options = {}) { if (!text || typeof text !== 'string') return text let processed = text const metadata = { originalLength: text.length, transformations: [], leetSpeakVariations: null, hasLeetSpeak: false } // Normalize Unicode first if (options.normalizeUnicode !== false) { const normalized = processed.normalize('NFKC') if (normalized !== processed) { metadata.transformations.push('unicode_normalization') processed = normalized } } // Enhanced l33tspeak processing - generate all variations if detected if (options.normalizeLeetSpeak !== false) { const leetVariations = this.normalizeLeetSpeak(processed) if (leetVariations.length > 1) { metadata.hasLeetSpeak = true metadata.leetSpeakVariations = leetVariations metadata.transformations.push('leetspeak_detection') // Use the most likely decoded variation as the primary processed text processed = leetVariations[1] || leetVariations[0] // First variation is original, second is most likely decode } } // Remove excessive spacing if (options.removeExcessiveSpacing !== false) { const deSpaced = processed.replace(/\s{3,}/g, ' ').trim() if (deSpaced !== processed) { metadata.transformations.push('spacing_normalization') processed = deSpaced } } // Expand common slang if enabled if (options.expandSlang) { const expanded = this.expandSlang(processed) if (expanded !== processed) { metadata.transformations.push('slang_expansion') processed = expanded } } // Enhanced normalization for adversarial attacks if (options.enhancedNormalization) { const enhanced = this.enhancedNormalization(processed) if (enhanced !== processed) { metadata.transformations.push('enhanced_normalization') processed = enhanced } } metadata.finalLength = processed.length metadata.processingComplete = true // Return enhanced result with l33tspeak variations return { text: processed, metadata } } detectEvasionAttempts(text, result) { const evasionPatterns = [ { pattern: /(.)\1{3,}/g, // Character repetition (aaaa) name: 'character_repetition', severity: 'medium' }, { pattern: /[a-z]+[-_\.]+[a-z]+/gi, // Word separation (w-o-r-d) name: 'word_separation', severity: 'high' }, { pattern: /\b\w*[\d@#$%&*]+\w*\b/g, // Number/symbol injection name: 'symbol_injection', severity: 'medium' }, { pattern: /(\w)\s+(\w)/g, // Spaced letters (w o r d) name: 'letter_spacing', severity: 'high' } ] evasionPatterns.forEach(({ pattern, name, severity }) => { const matches = text.match(pattern) if (matches && matches.length > 0) { result.metadata.suspiciousPatterns.push({ type: name, severity: severity, matches: matches.length, examples: matches.slice(0, 3) // Keep first 3 examples }) result.metadata.transformations.push(`evasion_${name}`) } }) } // Enhanced method for emoji extraction and normalization extractEmojis(text) { const emojiRegex = /[\u{1F600}-\u{1F64F}]|[\u{1F300}-\u{1F5FF}]|[\u{1F680}-\u{1F6FF}]|[\u{1F1E0}-\u{1F1FF}]|[\u{2600}-\u{26FF}]|[\u{2700}-\u{27BF}]/gu return text.match(emojiRegex) || [] } // Method to get detailed preprocessing analysis getPreprocessingAnalysis(text) { const analysis = { originalLength: text.length, estimatedLanguage: this.detectLanguage(text), unicodeComplexity: this.analyzeUnicodeComplexity(text), evasionRisk: 'low', recommendedProcessing: [] } // Analyze Unicode complexity const unicodeBlocks = this.confusablesPlugin.getUnicodeAnalysis(text) analysis.unicodeComplexity = { blocks: unicodeBlocks.codePoints.length, scripts: [...new Set(unicodeBlocks.codePoints.map(cp => cp.unicodeBlock))], suspicious: unicodeBlocks.suspiciousPatterns.length > 0 } // Assess evasion risk const result = { metadata: { suspiciousPatterns: [] } } this.detectEvasionAttempts(text, result) if (result.metadata.suspiciousPatterns.length > 2) { analysis.evasionRisk = 'high' } else if (result.metadata.suspiciousPatterns.length > 0) { analysis.evasionRisk = 'medium' } // Provide recommendations if (analysis.unicodeComplexity.suspicious) { analysis.recommendedProcessing.push('advanced_unicode_normalization') } if (analysis.evasionRisk === 'high') { analysis.recommendedProcessing.push('aggressive_evasion_detection') } if (text.length > 1000) { analysis.recommendedProcessing.push('chunked_analysis') } return analysis } detectLanguage(text) { // Simple language detection based on character patterns const patterns = { english: /^[a-zA-Z0-9\s\.,!?;:'"()\-]+$/, spanish: /[ñáéíóúü]/, french: /[àâäéèêëïîôöùûüÿç]/, german: /[äöüß]/, portuguese: /[ãõç]/, mixed: /[\u0400-\u04FF\u0370-\u03FF\u0590-\u05FF\u0600-\u06FF]/ } for (const [lang, pattern] of Object.entries(patterns)) { if (pattern.test(text)) { return lang } } return 'unknown' } analyzeUnicodeComplexity(text) { const codePoints = [...text].map(char => char.codePointAt(0)) const basicLatin = codePoints.filter(cp => cp <= 0x007F).length const extendedLatin = codePoints.filter(cp => cp > 0x007F && cp <= 0x024F).length const otherScripts = codePoints.filter(cp => cp > 0x024F).length return { total: codePoints.length, basicLatin: basicLatin, extendedLatin: extendedLatin, otherScripts: otherScripts, complexity: otherScripts > 0 ? 'high' : extendedLatin > 0 ? 'medium' : 'low' } } /** * Enhanced l33tspeak normalization with comprehensive decoding * Detects clear l33tspeak and generates all possible interpretations */ normalizeLeetSpeak(text) { // Comprehensive l33tspeak mapping including letter substitutions const leetMap = { // Numbers to letters '0': ['o', 'O'], '1': ['i', 'I', 'l', 'L'], '3': ['e', 'E'], '4': ['a', 'A'], '5': ['s', 'S'], '7': ['t', 'T'], '8': ['b', 'B'], '9': ['g', 'G'], // Symbols to letters '@': ['a', 'A'], '!': ['i', 'I'], '$': ['s', 'S'], '|': ['l', 'L', 'i', 'I'], '+': ['t', 'T'], '€': ['e', 'E'], '£': ['l', 'L'], '%': ['x', 'X'], '&': ['and'], '#': ['h', 'H'], '*': ['a', 'A'], // Letter substitutions (common evasions) 'v': ['u'], // fvck -> fuck 'u': ['v'], // uuck -> vuck (reverse) 'x': ['ck'], // fuxx -> fuck 'kk': ['ck'], // fukk -> fuck 'cc': ['ck'], // fucc -> fuck 'ph': ['f'], // phuck -> fuck 'z': ['s'], // azz -> ass 'gg': ['g'], // faggot variations 'ii': ['i'], // shiit -> shit 'oo': ['o'], // stoopid -> stupid 'ee': ['e'], // freee -> free 'w': ['u'], // wuck -> luck patterns (reversed w and u confusion) 'y': ['i'], // shyt -> shit 'ay': ['ai'], // mayn -> main 'ur': ['your', 'you are'], 'u': ['you'], // when used as standalone 'r': ['are'], // when used as standalone 'n': ['and'], // when used as standalone '2': ['to', 'too'], '4': ['for'], // already covered above but adding context 'b4': ['before'], '8': ['ate'], // already covered above but adding context // Advanced evasions 'ckk': ['ck'], // fuckk -> fuck 'xxx': ['x'], // fuxxxer -> fuxxer -> fucker 'zz': ['z'], // pizzza -> pizza 'ff': ['f'], // stuffff -> stuff 'ss': ['s'], // asss -> ass 'tt': ['t'], // buttt -> butt 'bb': ['b'], // stubb -> stub 'pp': ['p'], // nippp -> nip 'dd': ['d'], // studd -> stud 'll': ['l'], // killl -> kill 'mm': ['m'], // dummm -> dum 'nn': ['n'], // thinn -> thin 'rr': ['r'], // starrr -> star // Invisible/zero-width characters (often used in evasion) '\u200B': [''], // zero-width space '\u200C': [''], // zero-width non-joiner '\u200D': [''], // zero-width joiner '\uFEFF': [''], // zero-width no-break space '\u2060': [''], // word joiner // Diacritics removal patterns 'á': ['a'], 'à': ['a'], 'â': ['a'], 'ã': ['a'], 'ä': ['a'], 'å': ['a'], 'é': ['e'], 'è': ['e'], 'ê': ['e'], 'ë': ['e'], 'í': ['i'], 'ì': ['i'], 'î': ['i'], 'ï': ['i'], 'ó': ['o'], 'ò': ['o'], 'ô': ['o'], 'õ': ['o'], 'ö': ['o'], 'ú': ['u'], 'ù': ['u'], 'û': ['u'], 'ü': ['u'], 'ý': ['y'], 'ÿ': ['y'], 'ñ': ['n'], 'ç': ['c'] } // Detect if this text contains clear l33tspeak patterns const leetIndicators = /[0-9@!$|+€£%&#*]{2,}|[4@]ss|[3e][4@]t|k[1i!|]ll|d[1i!|][3e]|f[uvw]ck|sh[1i!|]t|b[1i!|]tch|[4a]ssh[0o]le|st[uvw]p[1i!|]d|n[1i!|]gg?[3e]r|n[1i!|]gg?[4a@]|f[4a@]gg?[0o]t|r[3e]t[4a@]rd|ch[1i!|]nk|sp[1i!|]c|k[1i!|]k[3e]/gi const hasLeetSpeak = leetIndicators.test(text) if (!hasLeetSpeak) { return [text] // No l33tspeak detected, return original } // Generate all possible interpretations const variations = this.generateLeetSpeakVariations(text, leetMap) // Return up to 10 most likely variations to prevent explosion return variations.slice(0, 10) } /** * Generate all reasonable l33tspeak variations */ generateLeetSpeakVariations(text, leetMap) { const variations = new Set() variations.add(text) // Always include original // Start with basic single-character substitutions let currentVariations = [text] // Apply transformations in order of likelihood Object.entries(leetMap).forEach(([leetChar, replacements]) => { const newVariations = [] for (const variation of currentVariations) { // Skip if this variation is getting too long (prevent explosion) if (newVariations.length > 50) break const regex = new RegExp(leetChar.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'gi') if (regex.test(variation)) { for (const replacement of replacements) { const newVariation = variation.replace(regex, replacement) if (newVariation !== variation && newVariation.length <= text.length + 10) { newVariations.push(newVariation) variations.add(newVariation) } } } } // Add new variations to current set for next iteration currentVariations.push(...newVariations) // Limit total variations to prevent memory issues if (currentVariations.length > 100) { currentVariations = currentVariations.slice(0, 100) } }) // Add specialized decoding patterns const specialPatterns = this.applySpecialLeetPatterns(text) specialPatterns.forEach(pattern => variations.add(pattern)) // Filter and rank variations by likelihood return this.rankLeetSpeakVariations(Array.from(variations), text) } /** * Apply specialized l33tspeak patterns for known evasions */ applySpecialLeetPatterns(text) { const variations = [] const textLower = text.toLowerCase() // Common harassment word patterns const patterns = [ // fuck variations { pattern: /f[uvw@*]c?k+/gi, replacements: ['fuck'] }, { pattern: /ph[uvw]c?k+/gi, replacements: ['fuck'] }, { pattern: /f[0o]{2,}k/gi, replacements: ['fuck'] }, { pattern: /f[\*@#$%]ck/gi, replacements: ['fuck'] }, // shit variations { pattern: /sh[1i!|]t+/gi, replacements: ['shit'] }, { pattern: /sh[1i!|][7t]+/gi, replacements: ['shit'] }, { pattern: /5h[1i!|]t/gi, replacements: ['shit'] }, // bitch variations { pattern: /b[1i!|]tch/gi, replacements: ['bitch'] }, { pattern: /b[1i!|][7t]ch/gi, replacements: ['bitch'] }, { pattern: /b[1i!|][7t][cç]h/gi, replacements: ['bitch'] }, // ass variations { pattern: /[4a@]ss+/gi, replacements: ['ass'] }, { pattern: /[4a@]zz+/gi, replacements: ['ass'] }, { pattern: /[4a@]55/gi, replacements: ['ass'] }, // asshole variations { pattern: /[4a@]ssh[0o]le/gi, replacements: ['asshole'] }, { pattern: /[4a@]55h[0o]le/gi, replacements: ['asshole'] }, { pattern: /[4a@]zzh[0o]le/gi, replacements: ['asshole'] }, // kill variations { pattern: /k[1i!|]ll/gi, replacements: ['kill'] }, { pattern: /k[1i!|][l|]+/gi, replacements: ['kill'] }, // die variations { pattern: /d[1i!|][3e]/gi, replacements: ['die'] }, { pattern: /dy[3e]/gi, replacements: ['die'] }, // stupid variations { pattern: /st[uvw]p[1i!|]d/gi, replacements: ['stupid'] }, { pattern: /5t[uvw]p[1i!|]d/gi, replacements: ['stupid'] }, { pattern: /stu?p[1i!|]d/gi, replacements: ['stupid'] }, // hate variations { pattern: /h[4a@][7t][3e]/gi, replacements: ['hate'] }, { pattern: /h8/gi, replacements: ['hate'] }, // yourself variations { pattern: /y[0o]ur?s[3e]lf/gi, replacements: ['yourself'] }, { pattern: /ur?s[3e]lf/gi, replacements: ['yourself'] }, // CRITICAL: Racial slur variations - ADDED { pattern: /n[1i!|]gg?[3e]r/gi, replacements: ['nigger'] }, { pattern: /n[1i!|]gg?[4a@]/gi, replacements: ['nigga'] }, { pattern: /n[3e]gr[0o]/gi, replacements: ['negro'] }, { pattern: /c[0o]{2,}n/gi, replacements: ['coon'] }, // Homophobic slur variations { pattern: /f[4a@]gg?[0o]t/gi, replacements: ['faggot'] }, { pattern: /f[4a@]g/gi, replacements: ['fag'] }, // Other identity-based slur variations { pattern: /ch[1i!|]nk/gi, replacements: ['chink'] }, { pattern: /g[0o]{2,}k/gi, replacements: ['gook'] }, { pattern: /sp[1i!|]c/gi, replacements: ['spic'] }, { pattern: /k[1i!|]k[3e]/gi, replacements: ['kike'] }, { pattern: /r[4a@]gh[3e][4a@]d/gi, replacements: ['raghead'] }, // Ableist slur variations { pattern: /r[3e]t[4a@]rd/gi, replacements: ['retard'] }, { pattern: /r[3e]t[4a@]rd[3e]d/gi, replacements: ['retarded'] }, // Spaced-out evasions { pattern: /k-[1i!|]-l-l/gi, replacements: ['kill'] }, { pattern: /d-[1i!|]-[3e]/gi, replacements: ['die'] }, { pattern: /f-u-c-k/gi, replacements: ['fuck'] }, { pattern: /s-h-[1i!|]-t/gi, replacements: ['shit'] }, { pattern: /n-[1i!|]-g-g-[3e]-r/gi, replacements: ['nigger'] }, { pattern: /f-[4a@]-g-g-[0o]-t/gi, replacements: ['faggot'] }, // Advanced Unicode evasions { pattern: /к[1i!|]ѕѕ/gi, replacements: ['kiss'] }, // Cyrillic lookalikes { pattern: /f[ψυν]ck/gi, replacements: ['fuck'] }, // Greek lookalikes ] let modifiedText = text patterns.forEach(({ pattern, replacements }) => { if (pattern.test(modifiedText)) { replacements.forEach(replacement => { const newText = modifiedText.replace(pattern, replacement) if (newText !== modifiedText) { variations.push(newText) } }) } }) return variations } /** * Rank l33tspeak variations by likelihood of being the intended message */ rankLeetSpeakVariations(variations, originalText) { return variations .map(variation => ({ text: variation, score: this.calculateLeetSpeakScore(variation, originalText) })) .sort((a, b) => b.score - a.score) // Higher score = more likely .map(item => item.text) } /** * Calculate likelihood score for l33tspeak variation */ calculateLeetSpeakScore(variation, original) { let score = 0 // Prefer variations that form real words const realWords = ['fuck', 'shit', 'bitch', 'ass', 'asshole', 'kill', 'die', 'hate', 'stupid', 'idiot', 'damn', 'hell'] const variationLower = variation.toLowerCase() realWords.forEach(word => { if (variationLower.includes(word)) { score += 10 // High score for real toxic words } }) // Prefer variations that are similar length to original const lengthDiff = Math.abs(variation.length - original.length) score -= lengthDiff * 0.5 // Prefer variations with fewer numbers/symbols (more "decoded") const numberSymbolCount = (variation.match(/[0-9@!$|+€£%&#*]/g) || []).length score -= numberSymbolCount * 2 // Prefer variations that look like English const vowelRatio = (variation.match(/[aeiou]/gi) || []).length / variation.length if (vowelRatio > 0.1 && vowelRatio < 0.6) { score += 5 // Good vowel ratio } // Prefer variations without excessive repeated characters const repeatedChars = (variation.match(/(.)\1{2,}/g) || []).length score -= repeatedChars * 3 return score } /** * Expand common slang and abbreviations */ expandSlang(text) { const slangMap = { // Common toxic slang 'ur': 'your', 'u': 'you', 'r': 'are', 'n': 'and', 'ur mom': 'your mother', 'ur dad': 'your father', 'gtfo': 'get the fuck out', 'stfu': 'shut the fuck up', 'fml': 'fuck my life', 'wtf': 'what the fuck', 'omfg': 'oh my fucking god', 'lmfao': 'laughing my fucking ass off', 'rotfl': 'rolling on the floor laughing', 'smh': 'shaking my head', 'imo': 'in my opinion', 'imho': 'in my honest opinion', 'tbh': 'to be honest', 'ngl': 'not gonna lie', 'fr': 'for real', 'deadass': 'seriously', 'periodt': 'period', 'no cap': 'no lie', 'cap': 'lie', // Internet speak 'ppl': 'people', 'plz': 'please', 'thx': 'thanks', 'ty': 'thank you', 'np': 'no problem', 'yw': 'you welcome', 'irl': 'in real life', 'afk': 'away from keyboard', 'brb': 'be right back', 'ttyl': 'talk to you later', 'jk': 'just kidding', 'lol': 'laughing out loud', 'rofl': 'rolling on floor laughing', 'omg': 'oh my god', 'fyi': 'for your information', 'tmi': 'too much information', // Gaming/online harassment 'noob': 'newbie', 'pwned': 'owned', 'rekt': 'wrecked', 'gg': 'good game', 'ez': 'easy', 'git gud': 'get good', 'trash': 'garbage', 'scrub': 'bad player', // Social media 'dm': 'direct message', 'rt': 'retweet', 'avi': 'avatar', 'bio': 'biography', 'fam': 'family', 'squad': 'group', 'stan': 'obsessive fan', 'salty': 'bitter', 'shade': 'disrespect', 'tea': 'gossip', 'spill': 'reveal', 'flex': 'show off', 'vibe': 'feeling', 'mood': 'feeling', 'based': 'good', 'cringe': 'embarrassing', 'sus': 'suspicious', 'bet': 'yes', 'facts': 'true', 'slaps': 'excellent', 'bussin': 'excellent', 'fire': 'excellent', 'lit': 'excellent', 'mid': 'mediocre', 'ratio': 'get more likes', 'L': 'loss', 'W': 'win' } let expanded = text // Replace whole words only to avoid false replacements Object.entries(slangMap).forEach(([slang, expansion]) => { const regex = new RegExp(`\\b${slang.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}\\b`, 'gi') expanded = expanded.replace(regex, expansion) }) return expanded } /** * Enhanced normalization for adversarial attacks */ enhancedNormalization(text) { let normalized = text // Remove excessive punctuation normalized = normalized.replace(/[!]{3,}/g, '!') normalized = normalized.replace(/[?]{3,}/g, '?') normalized = normalized.replace(/[.]{3,}/g, '...') // Normalize excessive spacing normalized = normalized.replace(/\s{2,}/g, ' ') // Remove zero-width characters used for evasion normalized = normalized.replace(/[\u200B-\u200D\uFEFF\u2060-\u2064]/g, '') // Normalize case variations (like sMaRt) const words = normalized.split(/\s+/) normalized = words.map(word => { // If word has mixed case pattern (likely evasion), normalize it const hasUpperLower = /[a-z]/.test(word) && /[A-Z]/.test(word) const isAlternating = /^([a-z][A-Z]|[A-Z][a-z])+/.test(word) if (hasUpperLower && (isAlternating || word.length > 6)) { return word.toLowerCase() } return word }).join(' ') // Remove excessive character repetition (but preserve intent) normalized = normalized.replace(/(.)\1{3,}/g, '$1$1$1') // Max 3 repeated chars return normalized.trim() } } // Legacy compatibility functions function normalizeText(text) { const preprocessor = new TextPreprocessor() const result = preprocessor.preprocess(text) return result.processedText } function confusablesNormalize(text) { const preprocessor = new TextPreprocessor() const confusablesResult = preprocessor.confusablesPlugin.analyze(text, {}) return confusablesResult.normalizedText } module.exports = { TextPreprocessor, normalizeText, confusablesNormalize }