UNPKG

content-guard

Version:

๐Ÿ›ก๏ธ Advanced content analysis and moderation system with multi-variant optimization. Features context-aware detection, harassment prevention, and ML-powered toxicity analysis. Pre-1.0 development version.

331 lines (282 loc) โ€ข 12.2 kB
/** * ๐Ÿ”„ Advanced Confusables Normalization Plugin v4.0 * * Professional Unicode confusables detection and normalization using * the standardized confusables library. Handles complex homograph attacks, * mixed-script obfuscation, and sophisticated Unicode evasion techniques. */ const confusables = require('confusables') class ConfusablesAdvancedPlugin { constructor() { this.name = 'confusables-advanced' this.description = 'Professional Unicode confusables normalization' this.version = '4.0.0' // Test confusables library on initialization try { // Test the confusables library methods if (typeof confusables.remove === 'function') { this.confusablesMethod = confusables.remove } else if (typeof confusables === 'function') { this.confusablesMethod = confusables } else { console.warn('Confusables library not found, using fallback') this.confusablesMethod = null } } catch (error) { console.warn('Confusables library initialization failed:', error.message) this.confusablesMethod = null } // Extended confusables mapping for edge cases this.customMappings = { // CRITICAL: Cyrillic confusables (common in attacks) 'ะฐ': 'a', 'ะต': 'e', 'ะพ': 'o', 'ั€': 'p', 'ั': 'c', 'ัƒ': 'y', 'ั…': 'x', 'ะ': 'A', 'ะ’': 'B', 'ะ•': 'E', 'ะš': 'K', 'ะœ': 'M', 'ะ': 'H', 'ะž': 'O', 'ะ ': 'P', 'ะก': 'C', 'ะข': 'T', 'ะฃ': 'Y', 'ะฅ': 'X', // CRITICAL: Mathematical/styled I and l characters 'ะ†': 'I', 'ั–': 'i', 'ำ€': 'I', 'ำ': 'l', 'ว€': 'l', 'ว': 'll', 'ว‚': 'l', 'โ„': 'I', 'โ„‘': 'I', 'โ„“': 'l', '๐ˆ': 'I', '๐ฅ': 'l', '๐‘ฐ': 'I', '๐’': 'l', '๐“˜': 'I', '๐“ต': 'l', '๐”ฆ': 'i', '๐”ฉ': 'l', '๐•€': 'I', '๐•': 'l', // Mathematical and styled characters '๐“ช': 'a', '๐“ซ': 'b', '๐“ฌ': 'c', '๐“ญ': 'd', '๐“ฎ': 'e', '๐“ฏ': 'f', '๐“ฐ': 'g', '๐“ฑ': 'h', '๐“ฒ': 'i', '๐“ณ': 'j', '๐“ด': 'k', '๐“ต': 'l', '๐“ถ': 'm', '๐“ท': 'n', '๐“ธ': 'o', '๐“น': 'p', '๐“บ': 'q', '๐“ป': 'r', '๐“ผ': 's', '๐“ฝ': 't', '๐“พ': 'u', '๐“ฟ': 'v', '๐”€': 'w', '๐”': 'x', '๐”‚': 'y', '๐”ƒ': 'z', // Double-struck (blackboard bold) '๐•’': 'a', '๐•“': 'b', '๐•”': 'c', '๐••': 'd', '๐•–': 'e', '๐•—': 'f', '๐•˜': 'g', '๐•™': 'h', '๐•š': 'i', '๐•›': 'j', '๐•œ': 'k', '๐•': 'l', '๐•ž': 'm', '๐•Ÿ': 'n', '๐• ': 'o', '๐•ก': 'p', '๐•ข': 'q', '๐•ฃ': 'r', '๐•ค': 's', '๐•ฅ': 't', '๐•ฆ': 'u', '๐•ง': 'v', '๐•จ': 'w', '๐•ฉ': 'x', '๐•ช': 'y', '๐•ซ': 'z', // Fraktur '๐”ž': 'a', '๐”Ÿ': 'b', '๐” ': 'c', '๐”ก': 'd', '๐”ข': 'e', '๐”ฃ': 'f', '๐”ค': 'g', '๐”ฅ': 'h', '๐”ฆ': 'i', '๐”ง': 'j', '๐”จ': 'k', '๐”ฉ': 'l', '๐”ช': 'm', '๐”ซ': 'n', '๐”ฌ': 'o', '๐”ญ': 'p', '๐”ฎ': 'q', '๐”ฏ': 'r', '๐”ฐ': 's', '๐”ฑ': 't', '๐”ฒ': 'u', '๐”ณ': 'v', '๐”ด': 'w', '๐”ต': 'x', '๐”ถ': 'y', '๐”ท': 'z', // Small caps 'แด€': 'a', 'ส™': 'b', 'แด„': 'c', 'แด…': 'd', 'แด‡': 'e', 'า“': 'f', 'ษข': 'g', 'สœ': 'h', 'ษช': 'i', 'แดŠ': 'j', 'แด‹': 'k', 'สŸ': 'l', 'แด': 'm', 'ษด': 'n', 'แด': 'o', 'แด˜': 'p', 'วซ': 'q', 'ส€': 'r', 's': 's', 'แด›': 't', 'แดœ': 'u', 'แด ': 'v', 'แดก': 'w', 'x': 'x', 'ส': 'y', 'แดข': 'z', // Regional indicators (used in some attacks) '๐Ÿ…ฐ': 'a', '๐Ÿ…ฑ': 'b', '๐Ÿ…พ': 'o' } // Common Unicode attack patterns this.suspiciousPatterns = [ { pattern: /[\u0400-\u04FF]/, name: 'cyrillic_mixing', severity: 'medium' }, { pattern: /[\u0370-\u03FF]/, name: 'greek_mixing', severity: 'medium' }, { pattern: /[\uFF00-\uFFEF]/, name: 'fullwidth_chars', severity: 'medium' }, { pattern: /[\u1D400-\u1D7FF]/, name: 'mathematical_chars', severity: 'high' }, { pattern: /[\u200B-\u200F\u2028-\u202F\u205F-\u206F]/, name: 'invisible_chars', severity: 'high' }, { pattern: /[\u1F100-\u1F1FF]/, name: 'regional_indicators', severity: 'medium' } ] } analyze(text, metadata = {}) { if (!text || typeof text !== 'string') { return { normalizedText: text, score: 0, flags: [], details: { transformations: 0, suspiciousPatterns: [] } } } const result = { normalizedText: text, score: 0, flags: [], details: { originalLength: text.length, transformations: 0, suspiciousPatterns: [], unicodeBlocks: new Set(), confusablesDetected: [] } } try { // 1. Detect suspicious Unicode patterns before normalization this.detectSuspiciousPatterns(text, result) // 2. Apply professional confusables normalization let normalized = text // First try the standard confusables library try { if (this.confusablesMethod) { normalized = this.confusablesMethod(text) if (normalized !== text) { result.details.transformations++ result.flags.push('[CONFUSABLES] Standard Unicode normalization applied') } } else { // Fallback to custom mappings only normalized = this.applyCustomMappings(text, result) } } catch (error) { console.warn('Confusables normalization failed, using fallback:', error.message) // Fallback to custom mappings normalized = this.applyCustomMappings(text, result) } // 3. Apply additional custom mappings for edge cases const finalNormalized = this.applyCustomMappings(normalized, result) // 4. Post-process common attack patterns const postProcessed = this.postProcessAttackPatterns(finalNormalized, result) // 5. Detect mixed script attacks this.detectMixedScriptAttacks(postProcessed, result) // 6. Detect invisible character attacks this.detectInvisibleChars(text, result) result.normalizedText = postProcessed // Calculate final score based on detected patterns this.calculateFinalScore(result) } catch (error) { console.error('Confusables analysis error:', error) result.flags.push('[CONFUSABLES] Analysis error, using original text') } return result } applyCustomMappings(text, result) { let normalized = text let transformationCount = 0 for (const [confusable, replacement] of Object.entries(this.customMappings)) { if (normalized.includes(confusable)) { normalized = normalized.replace(new RegExp(confusable, 'g'), replacement) transformationCount++ result.details.confusablesDetected.push({ character: confusable, replacement: replacement }) } } if (transformationCount > 0) { result.details.transformations += transformationCount result.flags.push(`[CONFUSABLES] Custom mappings applied (${transformationCount} chars)`) } return normalized } postProcessAttackPatterns(text, result) { let processed = text // Common attack pattern fixes const attackPatterns = [ // Fix mixed case I/l patterns (kiII -> kill, yourseIf -> yourself) { pattern: /k[iI][lI][lI]/gi, replacement: 'kill', name: 'kill_obfuscation' }, { pattern: /[yY]ourse[lI]f/gi, replacement: 'yourself', name: 'yourself_obfuscation' }, { pattern: /d[iI]e/gi, replacement: 'die', name: 'die_obfuscation' }, { pattern: /k[yY]s/gi, replacement: 'kys', name: 'kys_obfuscation' }, // Fix common letter substitutions { pattern: /[iI]{2,}/g, replacement: 'll', name: 'double_i_to_ll' }, { pattern: /[oO]0/g, replacement: 'oo', name: 'zero_to_o' }, { pattern: /3[eE]/g, replacement: 'ee', name: 'three_to_e' }, { pattern: /5[sS]/g, replacement: 'ss', name: 'five_to_s' } ] attackPatterns.forEach(({ pattern, replacement, name }) => { const matches = processed.match(pattern) if (matches) { processed = processed.replace(pattern, replacement) result.details.transformations++ result.flags.push(`[CONFUSABLES] Fixed attack pattern: ${name}`) } }) return processed } detectSuspiciousPatterns(text, result) { this.suspiciousPatterns.forEach(({ pattern, name, severity }) => { const matches = text.match(pattern) if (matches) { result.details.suspiciousPatterns.push({ type: name, severity: severity, matches: matches.length }) const severityScore = { 'high': 3, 'medium': 2, 'low': 1 }[severity] || 1 result.score += severityScore result.flags.push(`[CONFUSABLES] Suspicious pattern: ${name}`) } }) } detectMixedScriptAttacks(text, result) { const scriptBlocks = { latin: /[\u0020-\u007F\u00A0-\u00FF\u0100-\u017F\u0180-\u024F]/, cyrillic: /[\u0400-\u04FF]/, greek: /[\u0370-\u03FF]/, arabic: /[\u0600-\u06FF]/, hebrew: /[\u0590-\u05FF]/, cjk: /[\u4E00-\u9FFF\u3400-\u4DBF]/, mathematical: /[\u1D400-\u1D7FF]/ } const detectedScripts = [] for (const [script, pattern] of Object.entries(scriptBlocks)) { if (pattern.test(text)) { detectedScripts.push(script) result.details.unicodeBlocks.add(script) } } // Mixed script in short text is suspicious if (detectedScripts.length > 2 && text.length < 200) { result.score += 2 result.flags.push(`[CONFUSABLES] Mixed script attack detected: ${detectedScripts.join(', ')}`) } else if (detectedScripts.length > 1) { result.score += 1 result.flags.push(`[CONFUSABLES] Mixed scripts detected: ${detectedScripts.join(', ')}`) } } detectInvisibleChars(text, result) { const invisibleChars = [ '\u200B', // Zero Width Space '\u200C', // Zero Width Non-Joiner '\u200D', // Zero Width Joiner '\u2060', // Word Joiner '\uFEFF', // Zero Width No-Break Space '\u00AD' // Soft Hyphen ] let invisibleCount = 0 invisibleChars.forEach(char => { const matches = (text.match(new RegExp(char, 'g')) || []).length invisibleCount += matches }) if (invisibleCount > 0) { result.score += Math.min(invisibleCount, 5) // Cap at 5 points result.flags.push(`[CONFUSABLES] Invisible characters detected (${invisibleCount})`) } } calculateFinalScore(result) { // Bonus points for multiple confusables techniques if (result.details.suspiciousPatterns.length > 2) { result.score += 2 result.flags.push('[CONFUSABLES] Multiple obfuscation techniques detected') } // High transformation count indicates sophisticated attack if (result.details.transformations > 5) { result.score += 2 result.flags.push('[CONFUSABLES] High character transformation count') } } // Utility method to get detailed Unicode analysis getUnicodeAnalysis(text) { const analysis = { length: text.length, codePoints: [...text].map(char => ({ char, codePoint: char.codePointAt(0), unicodeBlock: this.getUnicodeBlock(char.codePointAt(0)) })), scripts: [], suspiciousPatterns: [] } // Add script detection this.suspiciousPatterns.forEach(({ pattern, name }) => { if (pattern.test(text)) { analysis.suspiciousPatterns.push(name) } }) return analysis } getUnicodeBlock(codePoint) { if (codePoint <= 0x007F) return 'Basic Latin' if (codePoint <= 0x00FF) return 'Latin-1 Supplement' if (codePoint >= 0x0400 && codePoint <= 0x04FF) return 'Cyrillic' if (codePoint >= 0x0370 && codePoint <= 0x03FF) return 'Greek' if (codePoint >= 0x1D400 && codePoint <= 0x1D7FF) return 'Mathematical' if (codePoint >= 0x1F600 && codePoint <= 0x1F64F) return 'Emoticons' return 'Other' } } module.exports = { ConfusablesAdvancedPlugin }