UNPKG

allprofanity

Version:

A blazing-fast, multi-language profanity filter with advanced algorithms (Aho-Corasick, Bloom Filters) delivering 664% faster performance on large texts, intelligent leet-speak detection, and pattern-based context analysis

github.com/ayush-jadaun/AllProfanity

ayush-jadaun/AllProfanity

298 lines • 10.2 kB

JavaScript

/**
 * Universal context patterns for multi-language profanity detection.
 *
 * Every pattern is a template: the literal token `PROFANE_WORD` inside the
 * regular expression is substituted with the (regex-escaped) word under
 * inspection by `ContextPatternMatcher.generateRules`.
 */

/** Placeholder token that pattern templates use for the inspected word. */
const WORD_PLACEHOLDER = "PROFANE_WORD";

/** Priority returned by `getPriority` for pattern types not listed below. */
const DEFAULT_PRIORITY = 9;

/**
 * Evaluation order per pattern type (lower number = applied first).
 * A Map is used instead of an object literal so that type names such as
 * "constructor" or "toString" cannot resolve to inherited prototype members.
 */
const PATTERN_PRIORITIES = new Map([
  ["medical", 1],
  ["anatomical", 2],
  ["negation", 3],
  ["quotation", 4],
  ["proper_noun", 5],
  ["possessive", 6],
  ["article", 7],
  ["compound", 8],
]);

/**
 * Universal context patterns that work across multiple languages.
 *
 * NOTE: JavaScript's `\b` is defined in terms of ASCII `\w`, so the
 * non-Latin alternatives below (e.g. नहीं, 不, нет) only satisfy `\b` when
 * they sit next to an ASCII word character.
 */
export const UNIVERSAL_CONTEXT_PATTERNS = [
  // Negation patterns
  {
    type: "negation",
    pattern: /\b(not|don't|won't|can't|never|ne|pas|nicht|no|नहीं|不|non|niente|нет|nie)\b.{0,30}PROFANE_WORD/i,
    weight: 0.2,
    languages: ["*"],
    description: "Negation words that reduce profanity likelihood",
    examples: ["not bad", "don't call me that", "never say that"],
  },
  // Possessive patterns
  {
    type: "possessive",
    // `'s` is a literal two-character suffix. The previous `['s]` character
    // class matched a single `'` OR `s`, so "cat's ass" never matched while
    // "kiss ass" did.
    pattern: /\b\w+('s|du|de|का|की|के|の|del|della|от|od)\s+PROFANE_WORD\b/i,
    weight: 0.4,
    languages: ["*"],
    description: "Possessive constructions that may be innocent",
    examples: ["dog's mouth", "cat's ass", "bird's ass"],
  },
  // Article patterns
  {
    type: "article",
    pattern: /\b(the|a|an|le|la|les|un|une|der|die|das|ein|eine|el|la|los|las|il|lo|gli|le)\s+PROFANE_WORD\b/i,
    weight: 0.6,
    languages: ["*"],
    description: "Articles that may indicate neutral reference",
    examples: ["the ass of the donkey", "a hell of a time"],
  },
  // Compound word patterns
  {
    type: "compound",
    pattern: /\b(smart|silly|cute|funny|little|big|old|new|good|bad|nice|sweet)\s*[-]?\s*PROFANE_WORD\b/i,
    weight: 0.5,
    languages: ["*"],
    description: "Adjective-noun compounds that may be innocent",
    examples: ["smart-ass", "silly ass", "cute little ass"],
  },
  // Proper noun patterns (case-sensitive on purpose: no `i` flag)
  {
    type: "proper_noun",
    pattern: /\b[A-Z][a-z]+\s+PROFANE_WORD\b/,
    weight: 0.3,
    languages: ["en", "fr", "de", "es", "it"],
    description: "Proper nouns followed by potential profanity",
    examples: ["Hell Michigan", "Ass River"],
  },
  // Quotation patterns
  {
    type: "quotation",
    pattern: /["'«»„"‚'].*PROFANE_WORD.*["'«»„"‚']/i,
    weight: 0.7,
    languages: ["*"],
    description: "Quoted text which may be reporting speech",
    examples: ['"Don\'t be an ass"', "'What the hell'"],
  },
  // Medical/anatomical context
  {
    type: "medical",
    pattern: /\b(medical|anatomy|doctor|hospital|clinic|patient|diagnosis|treatment|surgical|clinical)\b.{0,50}PROFANE_WORD/i,
    weight: 0.1,
    languages: ["*"],
    description: "Medical contexts where anatomical terms are appropriate",
    examples: [
      "medical examination of the ass",
      "doctor checked the damn thing",
    ],
  },
  // Anatomical context
  {
    type: "anatomical",
    pattern: /\b(body|part|muscle|bone|skin|tissue|organ|limb|extremity)\b.{0,30}PROFANE_WORD/i,
    weight: 0.3,
    languages: ["*"],
    description: "Anatomical contexts for body parts",
    examples: ["body part called ass", "muscle in the ass"],
  },
];

/**
 * Language-specific context patterns, keyed by language code.
 */
export const LANGUAGE_SPECIFIC_PATTERNS = {
  en: [
    {
      type: "compound",
      pattern: /\b(jack|dumb|smart|bad|kick)\s*[-]?\s*PROFANE_WORD\b/i,
      weight: 0.4,
      languages: ["en"],
      description: "English-specific compound patterns",
      examples: ["jackass", "dumbass", "badass"],
    },
  ],
  fr: [
    {
      type: "negation",
      pattern: /\b(ne|n'|pas|point|jamais|rien|personne)\b.{0,30}PROFANE_WORD/i,
      weight: 0.2,
      languages: ["fr"],
      description: "French negation patterns",
      examples: ["ne pas dire", "jamais ça"],
    },
  ],
  de: [
    {
      type: "compound",
      pattern: /\bPROFANE_WORD(kopf|zeug|ding|sache)\b/i,
      weight: 0.5,
      languages: ["de"],
      description: "German compound word patterns",
      examples: ["Scheißzeug", "Arschloch"],
    },
  ],
  es: [
    {
      type: "possessive",
      pattern: /\b(el|la|los|las)\s+PROFANE_WORD\s+(de|del|de la)\b/i,
      weight: 0.4,
      languages: ["es"],
      description: "Spanish possessive patterns",
      examples: ["el culo de la mesa"],
    },
  ],
};

/**
 * Context rule generator.
 *
 * Holds a private copy of the universal patterns plus the language-specific
 * patterns for the requested languages, and turns them into concrete,
 * word-specific rules.
 */
export class ContextPatternMatcher {
  /**
   * @param {string[]} [languages=["en"]] Language codes whose built-in
   *   language-specific patterns should be loaded.
   */
  constructor(languages = ["en"]) {
    this.patterns = [...UNIVERSAL_CONTEXT_PATTERNS];
    this.languagePatterns = new Map();
    // Load language-specific patterns
    for (const lang of languages) {
      // Own-property check: an inherited key such as "constructor" must not
      // be mistaken for a pattern list.
      if (Object.prototype.hasOwnProperty.call(LANGUAGE_SPECIFIC_PATTERNS, lang)) {
        // Copy the array so addLanguagePattern() never mutates the exported
        // module-level constant (which would leak into other instances).
        this.languagePatterns.set(lang, [...LANGUAGE_SPECIFIC_PATTERNS[lang]]);
      }
    }
  }

  /**
   * Generate context rules for a specific word.
   *
   * @param {string} word The word to substitute for the placeholder.
   * @param {string[]} [languages=["en"]] Languages the text may be in; a
   *   pattern is used when its `languages` contains "*" or one of these.
   * @returns {{pattern: RegExp, action: string, weight: number, priority: number}[]}
   *   Rules sorted by ascending priority.
   */
  generateRules(word, languages = ["en"]) {
    const rules = [];
    const allPatterns = [...this.patterns];
    // Add language-specific patterns
    for (const lang of languages) {
      const langPatterns = this.languagePatterns.get(lang) || [];
      allPatterns.push(...langPatterns);
    }
    const escapedWord = this.escapeRegex(word);
    for (const pattern of allPatterns) {
      // Skip if pattern doesn't apply to any of the specified languages
      const appliesEverywhere = pattern.languages.includes("*");
      if (!appliesEverywhere && !pattern.languages.some((lang) => languages.includes(lang))) {
        continue;
      }
      // Substitute every placeholder with the escaped word. split/join is
      // used rather than String.prototype.replace with a string replacement,
      // because the latter interprets "$&", "$`", "$'" etc. inside the word.
      const regexSource = pattern.pattern.source.split(WORD_PLACEHOLDER).join(escapedWord);
      const regex = new RegExp(regexSource, pattern.pattern.flags);
      // Only weights above 0.8 amplify the score; everything else dampens it.
      const action = pattern.weight > 0.8 ? "increase_score" : "reduce_score";
      rules.push({
        pattern: regex,
        action,
        weight: pattern.weight,
        priority: this.getPriority(pattern.type),
      });
    }
    return rules.sort((a, b) => a.priority - b.priority);
  }

  /**
   * Get priority for pattern type.
   *
   * @param {string} type Pattern type name.
   * @returns {number} The configured priority, or 9 for unknown types.
   */
  getPriority(type) {
    return PATTERN_PRIORITIES.has(type) ? PATTERN_PRIORITIES.get(type) : DEFAULT_PRIORITY;
  }

  /**
   * Escape regex special characters so `str` matches literally.
   *
   * @param {string} str
   * @returns {string}
   */
  escapeRegex(str) {
    return str.replace(/[\\^$.*+?()[\]{}|]/g, "\\$&");
  }

  /**
   * Add custom pattern to this matcher's universal list.
   *
   * @param {object} pattern Pattern descriptor shaped like the entries of
   *   UNIVERSAL_CONTEXT_PATTERNS.
   */
  addPattern(pattern) {
    this.patterns.push(pattern);
  }

  /**
   * Add language-specific pattern to this matcher only.
   *
   * @param {string} language Language code.
   * @param {object} pattern Pattern descriptor.
   */
  addLanguagePattern(language, pattern) {
    if (!this.languagePatterns.has(language)) {
      this.languagePatterns.set(language, []);
    }
    this.languagePatterns.get(language).push(pattern);
  }

  /**
   * Get all patterns for debugging.
   *
   * @returns {{universal: object[], languageSpecific: Map<string, object[]>}}
   *   Copies of the internal lists; mutating them does not affect the matcher.
   */
  getAllPatterns() {
    const languageSpecific = new Map();
    for (const [lang, patterns] of this.languagePatterns) {
      languageSpecific.set(lang, [...patterns]);
    }
    return {
      universal: [...this.patterns],
      languageSpecific,
    };
  }
}

/**
 * Context analyzer for scoring matches.
 */
export class ContextAnalyzer {
  /**
   * @param {string[]} [languages=["en"]] Languages used both to load
   *   language-specific patterns and to select rules during analysis.
   */
  constructor(languages = ["en"]) {
    this.contextWindow = 50; // Characters before and after the match
    // Remembered so analyzeContext() requests rules for the same languages
    // the matcher was loaded with (otherwise only "en" rules are generated).
    this.languages = [...languages];
    this.patternMatcher = new ContextPatternMatcher(languages);
  }

  /**
   * Analyze context around a potential profanity match.
   *
   * @param {string} text Full text containing the match.
   * @param {number} matchStart Start index of the match in `text`.
   * @param {number} matchEnd End index (exclusive) of the match in `text`.
   * @param {string} word The matched word.
   * @returns {{score: number, confidence: string, appliedRules: object[], context: string}}
   *   `score` is clamped to [0, 1]; `confidence` is "high" when no rule
   *   matched, "medium" for one or two matches, "low" otherwise.
   */
  analyzeContext(text, matchStart, matchEnd, word) {
    // Extract context window
    const contextStart = Math.max(0, matchStart - this.contextWindow);
    const contextEnd = Math.min(text.length, matchEnd + this.contextWindow);
    const context = text.substring(contextStart, contextEnd);
    // Get rules for this word, in the analyzer's configured languages
    const rules = this.patternMatcher.generateRules(word, this.languages);
    let score = 1.0; // Start with full profanity score
    const appliedRules = [];
    // Apply context rules
    for (const rule of rules) {
      const matched = rule.pattern.test(context);
      appliedRules.push({ rule, matched });
      if (!matched) {
        continue;
      }
      if (rule.action === "reduce_score") {
        score *= rule.weight;
      } else if (rule.action === "increase_score") {
        score *= 2 - rule.weight; // Increase score
      } else if (rule.action === "whitelist") {
        score = 0; // Complete whitelist
        break;
      }
    }
    // Determine confidence based on number of matching rules
    const matchingRules = appliedRules.filter((ar) => ar.matched).length;
    let confidence;
    if (matchingRules === 0) {
      confidence = "high"; // No context rules matched, likely profanity
    } else if (matchingRules <= 2) {
      confidence = "medium";
    } else {
      confidence = "low"; // Many context rules matched, likely innocent
    }
    return {
      score: Math.max(0, Math.min(1, score)),
      confidence,
      appliedRules,
      context,
    };
  }

  /**
   * Set context window size, clamped to the range [10, 200].
   *
   * @param {number} size Desired number of characters on each side of the
   *   match. Values that do not convert to a finite number are ignored.
   */
  setContextWindow(size) {
    const requested = Number(size);
    if (!Number.isFinite(requested)) {
      // Storing NaN would make every extracted context an empty string.
      return;
    }
    this.contextWindow = Math.max(10, Math.min(200, requested));
  }

  /**
   * Add custom pattern to the analyzer.
   *
   * @param {object} pattern Pattern descriptor forwarded to the matcher.
   */
  addCustomPattern(pattern) {
    this.patternMatcher.addPattern(pattern);
  }
}
//# sourceMappingURL=context-patterns.js.map