// content-guard
// Version:
// 🛡️ Advanced content analysis and moderation system with multi-variant optimization. Features context-aware detection, harassment prevention, and ML-powered toxicity analysis. Pre-1.0 development version.
// 331 lines (282 loc) • 12.2 kB
// JavaScript
/**
* Advanced Confusables Normalization Plugin v4.0
*
* Professional Unicode confusables detection and normalization using
* the standardized confusables library. Handles complex homograph attacks,
* mixed-script obfuscation, and sophisticated Unicode evasion techniques.
*/
const confusables = require('confusables')
/**
 * Plugin that folds Unicode look-alike ("confusable") characters to ASCII and
 * scores the text for signs of Unicode-based obfuscation.
 *
 * Normalization uses the `confusables` library when it resolved at
 * construction time, and always applies the plugin's own mapping table
 * afterwards. Detection covers suspicious Unicode ranges, mixed scripts and
 * invisible characters.
 */
class ConfusablesAdvancedPlugin {
  /**
   * Sets plugin metadata, resolves the library normalizer (or `null`), and
   * builds the custom mapping table and the suspicious-range patterns.
   */
  constructor() {
    this.name = 'confusables-advanced'
    this.description = 'Professional Unicode confusables normalization'
    this.version = '4.0.0'

    // Resolve the library entry point defensively: the module may export a
    // `remove` function, be a function itself, or be missing entirely.
    this.confusablesMethod = null
    try {
      if (typeof confusables.remove === 'function') {
        // Wrapped so the call never depends on what `this` is bound to.
        this.confusablesMethod = (input) => confusables.remove(input)
      } else if (typeof confusables === 'function') {
        this.confusablesMethod = confusables
      } else {
        console.warn('Confusables library not found, using fallback')
      }
    } catch (error) {
      console.warn('Confusables library initialization failed:', error.message)
    }

    // The mapping table is built from code points rather than literal glyphs
    // so that it cannot be corrupted by a wrong file encoding.
    const mappings = {}
    const addPairs = (pairs) => {
      for (const [codePoint, replacement] of pairs) {
        mappings[String.fromCodePoint(codePoint)] = replacement
      }
    }
    // Maps 26 consecutive code points starting at `firstCodePoint` to a-z.
    const addLowercaseAlphabet = (firstCodePoint) => {
      for (let offset = 0; offset < 26; offset++) {
        mappings[String.fromCodePoint(firstCodePoint + offset)] = String.fromCharCode(97 + offset)
      }
    }

    // CRITICAL: Cyrillic confusables (common in attacks)
    addPairs([
      [0x0430, 'a'], [0x0435, 'e'], [0x043E, 'o'], [0x0440, 'p'],
      [0x0441, 'c'], [0x0443, 'y'], [0x0445, 'x'],
      [0x0410, 'A'], [0x0412, 'B'], [0x0415, 'E'], [0x041A, 'K'],
      [0x041C, 'M'], [0x041D, 'H'], [0x041E, 'O'], [0x0420, 'P'],
      [0x0421, 'C'], [0x0422, 'T'], [0x0423, 'Y'], [0x0425, 'X']
    ])

    // CRITICAL: Mathematical/styled I and l characters
    addPairs([
      [0x0406, 'I'], [0x0456, 'i'], [0x04C0, 'I'], [0x04CF, 'l'],
      // NOTE(review): click letters U+01C0..U+01C2 were reconstructed from a
      // mis-encoded table - confirm against the upstream file.
      [0x01C0, 'l'], [0x01C1, 'll'], [0x01C2, 'l'],
      [0x2110, 'I'], [0x2111, 'I'], [0x2113, 'l'],
      [0x1D408, 'I'], [0x1D425, 'l'], [0x1D470, 'I'], [0x1D48D, 'l'],
      [0x1D4D8, 'I'], [0x1D4F5, 'l'], [0x1D526, 'i'], [0x1D529, 'l'],
      [0x1D540, 'I'], [0x1D55D, 'l']
    ])

    // Mathematical and styled characters (bold script a-z)
    addLowercaseAlphabet(0x1D4EA)
    // Double-struck (blackboard bold) a-z
    addLowercaseAlphabet(0x1D552)
    // Fraktur a-z
    addLowercaseAlphabet(0x1D51E)

    // Small caps. Identity entries (plain 's' -> 's', 'x' -> 'x') are left out
    // on purpose: they made every text containing s or x look transformed.
    addPairs([
      [0x1D00, 'a'], [0x0299, 'b'], [0x1D04, 'c'], [0x1D05, 'd'],
      [0x1D07, 'e'], [0x0493, 'f'], [0x0262, 'g'], [0x029C, 'h'],
      [0x026A, 'i'], [0x1D0A, 'j'], [0x1D0B, 'k'], [0x029F, 'l'],
      [0x1D0D, 'm'], [0x0274, 'n'], [0x1D0F, 'o'], [0x1D18, 'p'],
      [0x01EB, 'q'], [0x0280, 'r'], [0xA731, 's'], [0x1D1B, 't'],
      [0x1D1C, 'u'], [0x1D20, 'v'], [0x1D21, 'w'], [0x028F, 'y'],
      [0x1D22, 'z']
    ])

    // Enclosed / squared letters (used in some attacks)
    addPairs([[0x1F170, 'a'], [0x1F171, 'b'], [0x1F17E, 'o']])

    // Extended confusables mapping for edge cases
    this.customMappings = mappings

    // Common Unicode attack patterns. Ranges above U+FFFF need \u{...} and
    // the `u` flag; a bare \u1D400 is read as \u1D40 followed by a literal 0.
    // The patterns are deliberately non-global so `.test()` stays stateless.
    this.suspiciousPatterns = [
      { pattern: /[\u0400-\u04FF]/, name: 'cyrillic_mixing', severity: 'medium' },
      { pattern: /[\u0370-\u03FF]/, name: 'greek_mixing', severity: 'medium' },
      { pattern: /[\uFF00-\uFFEF]/, name: 'fullwidth_chars', severity: 'medium' },
      { pattern: /[\u{1D400}-\u{1D7FF}]/u, name: 'mathematical_chars', severity: 'high' },
      { pattern: /[\u200B-\u200F\u2028-\u202F\u205F-\u206F]/, name: 'invisible_chars', severity: 'high' },
      { pattern: /[\u{1F100}-\u{1F1FF}]/u, name: 'regional_indicators', severity: 'medium' }
    ]
  }

  /**
   * Normalizes `text` and scores it for Unicode obfuscation.
   *
   * @param {string} text - Input to analyze. Empty or non-string input is
   *   returned unchanged with a zero score.
   * @param {object} [metadata] - Accepted for interface compatibility; unused.
   * @returns {{normalizedText: *, score: number, flags: string[], details: object}}
   */
  analyze(text, metadata = {}) {
    if (!text || typeof text !== 'string') {
      return {
        normalizedText: text,
        score: 0,
        flags: [],
        details: { transformations: 0, suspiciousPatterns: [] }
      }
    }

    const result = {
      normalizedText: text,
      score: 0,
      flags: [],
      details: {
        originalLength: text.length,
        transformations: 0,
        suspiciousPatterns: [],
        unicodeBlocks: new Set(),
        confusablesDetected: []
      }
    }

    try {
      // 1. Detect suspicious Unicode patterns before normalization
      this.detectSuspiciousPatterns(text, result)

      // 2. Library normalization, when a normalizer was resolved
      let normalized = text
      if (this.confusablesMethod) {
        try {
          normalized = this.confusablesMethod(text)
          if (normalized !== text) {
            result.details.transformations++
            result.flags.push('[CONFUSABLES] Standard Unicode normalization applied')
          }
        } catch (error) {
          console.warn('Confusables normalization failed, using fallback:', error.message)
          normalized = text
        }
      }

      // 3. Custom mappings run exactly once: they are the fallback when the
      //    library is unavailable and the edge-case pass when it is not.
      const finalNormalized = this.applyCustomMappings(normalized, result)

      // 4. Post-process common attack patterns
      const postProcessed = this.postProcessAttackPatterns(finalNormalized, result)

      // 5. Detect mixed script attacks (on the normalized text)
      this.detectMixedScriptAttacks(postProcessed, result)

      // 6. Detect invisible character attacks (on the raw input)
      this.detectInvisibleChars(text, result)

      result.normalizedText = postProcessed

      // Calculate final score based on detected patterns
      this.calculateFinalScore(result)
    } catch (error) {
      console.error('Confusables analysis error:', error)
      result.flags.push('[CONFUSABLES] Analysis error, using original text')
    }

    return result
  }

  /**
   * Replaces every character found in `customMappings` with its ASCII form.
   *
   * @param {string} text - Text to normalize.
   * @param {object} result - Analysis result; transformations, flags and
   *   `details.confusablesDetected` are updated in place.
   * @returns {string} The text after replacement.
   */
  applyCustomMappings(text, result) {
    let normalized = text
    let transformationCount = 0

    for (const [confusable, replacement] of Object.entries(this.customMappings)) {
      // An identity entry changes nothing and must not count as a detection.
      if (confusable === replacement || !normalized.includes(confusable)) {
        continue
      }
      // split/join is a literal replace-all; no regex escaping is involved.
      normalized = normalized.split(confusable).join(replacement)
      transformationCount++
      result.details.confusablesDetected.push({
        character: confusable,
        replacement: replacement
      })
    }

    if (transformationCount > 0) {
      result.details.transformations += transformationCount
      result.flags.push(`[CONFUSABLES] Custom mappings applied (${transformationCount} chars)`)
    }
    return normalized
  }

  /**
   * Rewrites common I/l and digit substitutions (kiII -> kill, o0 -> oo).
   *
   * The rewrite is always applied, but a transformation is only counted and
   * flagged when it changes more than letter case, so clean words such as
   * "diet" are not reported as obfuscation.
   *
   * @param {string} text - Text after confusable normalization.
   * @param {object} result - Analysis result, updated in place.
   * @returns {string} The rewritten text.
   */
  postProcessAttackPatterns(text, result) {
    // NOTE(review): `double_i_to_ll` also rewrites legitimate words containing
    // "ii" (e.g. "skiing"); kept unchanged for compatibility.
    const attackPatterns = [
      // Fix mixed case I/l patterns (kiII -> kill, yourseIf -> yourself)
      { pattern: /k[iI][lI][lI]/gi, replacement: 'kill', name: 'kill_obfuscation' },
      { pattern: /[yY]ourse[lI]f/gi, replacement: 'yourself', name: 'yourself_obfuscation' },
      { pattern: /d[iI]e/gi, replacement: 'die', name: 'die_obfuscation' },
      { pattern: /k[yY]s/gi, replacement: 'kys', name: 'kys_obfuscation' },
      // Fix common letter substitutions
      { pattern: /[iI]{2,}/g, replacement: 'll', name: 'double_i_to_ll' },
      { pattern: /[oO]0/g, replacement: 'oo', name: 'zero_to_o' },
      { pattern: /3[eE]/g, replacement: 'ee', name: 'three_to_e' },
      { pattern: /5[sS]/g, replacement: 'ss', name: 'five_to_s' }
    ]

    let processed = text
    for (const { pattern, replacement, name } of attackPatterns) {
      const rewritten = processed.replace(pattern, replacement)
      if (rewritten === processed) {
        continue
      }
      const onlyCaseChanged = rewritten.toLowerCase() === processed.toLowerCase()
      processed = rewritten
      if (!onlyCaseChanged) {
        result.details.transformations++
        result.flags.push(`[CONFUSABLES] Fixed attack pattern: ${name}`)
      }
    }
    return processed
  }

  /**
   * Records each suspicious Unicode range present in `text` and adds its
   * severity weight (high 3, medium 2, low 1) to the score.
   *
   * @param {string} text - Raw input text.
   * @param {object} result - Analysis result, updated in place.
   */
  detectSuspiciousPatterns(text, result) {
    const severityWeights = { high: 3, medium: 2, low: 1 }

    for (const { pattern, name, severity } of this.suspiciousPatterns) {
      // Count with a global copy; the stored pattern stays non-global.
      const flags = pattern.flags.includes('g') ? pattern.flags : `${pattern.flags}g`
      const occurrences = (text.match(new RegExp(pattern.source, flags)) || []).length
      if (occurrences === 0) {
        continue
      }
      result.details.suspiciousPatterns.push({
        type: name,
        severity: severity,
        matches: occurrences
      })
      result.score += severityWeights[severity] || 1
      result.flags.push(`[CONFUSABLES] Suspicious pattern: ${name}`)
    }
  }

  /**
   * Detects which scripts occur in `text` and scores mixtures of them.
   * More than two scripts in text shorter than 200 characters adds 2;
   * any other mixture of two or more scripts adds 1.
   *
   * @param {string} text - Text to inspect.
   * @param {object} result - Analysis result, updated in place.
   */
  detectMixedScriptAttacks(text, result) {
    const scriptBlocks = {
      // Letters only: spaces, digits and punctuation are script-neutral and
      // must not make a non-Latin sentence look "mixed".
      latin: /[A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u024F]/,
      cyrillic: /[\u0400-\u04FF]/,
      greek: /[\u0370-\u03FF]/,
      arabic: /[\u0600-\u06FF]/,
      hebrew: /[\u0590-\u05FF]/,
      cjk: /[\u4E00-\u9FFF\u3400-\u4DBF]/,
      mathematical: /[\u{1D400}-\u{1D7FF}]/u
    }

    const detectedScripts = []
    for (const [script, pattern] of Object.entries(scriptBlocks)) {
      if (pattern.test(text)) {
        detectedScripts.push(script)
        result.details.unicodeBlocks.add(script)
      }
    }

    // Mixed script in short text is suspicious
    if (detectedScripts.length > 2 && text.length < 200) {
      result.score += 2
      result.flags.push(`[CONFUSABLES] Mixed script attack detected: ${detectedScripts.join(', ')}`)
    } else if (detectedScripts.length > 1) {
      result.score += 1
      result.flags.push(`[CONFUSABLES] Mixed scripts detected: ${detectedScripts.join(', ')}`)
    }
  }

  /**
   * Counts zero-width and soft-hyphen characters and adds up to 5 points.
   *
   * @param {string} text - Raw input text.
   * @param {object} result - Analysis result, updated in place.
   */
  detectInvisibleChars(text, result) {
    // ZWSP, ZWNJ, ZWJ, word joiner, zero width no-break space, soft hyphen
    const invisibleCount = (text.match(/[\u200B\u200C\u200D\u2060\uFEFF\u00AD]/g) || []).length

    if (invisibleCount > 0) {
      result.score += Math.min(invisibleCount, 5) // Cap at 5 points
      result.flags.push(`[CONFUSABLES] Invisible characters detected (${invisibleCount})`)
    }
  }

  /**
   * Adds bonus points when several techniques or many transformations were
   * observed.
   *
   * @param {object} result - Analysis result, updated in place.
   */
  calculateFinalScore(result) {
    // Bonus points for multiple confusables techniques
    if (result.details.suspiciousPatterns.length > 2) {
      result.score += 2
      result.flags.push('[CONFUSABLES] Multiple obfuscation techniques detected')
    }
    // High transformation count indicates sophisticated attack
    if (result.details.transformations > 5) {
      result.score += 2
      result.flags.push('[CONFUSABLES] High character transformation count')
    }
  }

  /**
   * Utility method returning a per-code-point breakdown of `text`.
   *
   * @param {string} text - Text to describe; non-string input is treated as
   *   the empty string.
   * @returns {{length: number, codePoints: object[], scripts: string[], suspiciousPatterns: string[]}}
   */
  getUnicodeAnalysis(text) {
    const input = typeof text === 'string' ? text : ''
    return {
      length: input.length,
      // Spreading iterates by code point, so astral characters stay whole.
      codePoints: [...input].map((char) => {
        const codePoint = char.codePointAt(0)
        return { char, codePoint, unicodeBlock: this.getUnicodeBlock(codePoint) }
      }),
      scripts: [],
      suspiciousPatterns: this.suspiciousPatterns
        .filter(({ pattern }) => pattern.test(input))
        .map(({ name }) => name)
    }
  }

  /**
   * Maps a code point to a coarse Unicode block name.
   *
   * @param {number} codePoint - Unicode code point.
   * @returns {string} Block name, or 'Other' when no listed range matches.
   */
  getUnicodeBlock(codePoint) {
    if (codePoint <= 0x007F) return 'Basic Latin'
    if (codePoint <= 0x00FF) return 'Latin-1 Supplement'
    if (codePoint >= 0x0400 && codePoint <= 0x04FF) return 'Cyrillic'
    if (codePoint >= 0x0370 && codePoint <= 0x03FF) return 'Greek'
    if (codePoint >= 0x1D400 && codePoint <= 0x1D7FF) return 'Mathematical'
    if (codePoint >= 0x1F600 && codePoint <= 0x1F64F) return 'Emoticons'
    return 'Other'
  }
}
// Expose the plugin class as a named CommonJS export.
module.exports = { ConfusablesAdvancedPlugin }