UNPKG

content-guard

Version:

🛡️ Advanced content analysis and moderation system with multi-variant optimization. Features context-aware detection, harassment prevention, and ML-powered toxicity analysis. Pre-1.0 development version.

490 lines (418 loc) 17.4 kB
/** * 🎯 ContentGuard v3.0 - Advanced Context Detection * * Sophisticated context analysis that can identify professional domains, * communication styles, and contextual appropriateness to reduce false positives * while maintaining high detection accuracy. */ const { TECHNICAL_TERMS, ACADEMIC_TERMS, MEDICAL_TERMS, BUSINESS_TERMS, LEGAL_TERMS } = require('../constants/context-data') /** * Extended domain vocabulary for better context detection */ const EXTENDED_CONTEXTS = { // Technology contexts DEVOPS: [ 'deploy', 'deployment', 'kubernetes', 'docker', 'container', 'microservice', 'pipeline', 'ci/cd', 'server', 'production', 'staging', 'environment', 'cluster', 'node', 'pod', 'namespace', 'monitoring', 'alerting', 'logging', 'metrics', 'prometheus', 'grafana', 'elk', 'splunk', 'infrastructure', 'terraform', 'ansible', 'chef', 'puppet', 'provisioning', 'automation' ], SECURITY: [ 'vulnerability', 'exploit', 'patch', 'update', 'firewall', 'vpn', 'encryption', 'certificate', 'authentication', 'authorization', 'oauth', 'jwt', 'ssl', 'tls', 'https', 'security audit', 'penetration test', 'threat', 'malware', 'virus', 'ransomware', 'phishing', 'social engineering' ], SOFTWARE_DEV: [ 'repository', 'git', 'commit', 'merge', 'branch', 'pull request', 'code review', 'refactor', 'api', 'endpoint', 'rest', 'graphql', 'database', 'sql', 'query', 'orm', 'migration', 'framework', 'library', 'dependency', 'package', 'npm', 'yarn', 'webpack', 'babel' ], // Medical contexts CLINICAL: [ 'symptom', 'diagnosis', 'prognosis', 'treatment', 'therapy', 'medication', 'dosage', 'prescription', 'patient care', 'clinical trial', 'medical history', 'vital signs', 'blood pressure', 'heart rate', 'laboratory', 'radiology', 'imaging', 'x-ray', 'mri', 'ct scan', 'ultrasound', 'biopsy' ], EMERGENCY_MEDICAL: [ 'emergency', 'trauma', 'icu', 'intensive care', 'life support', 'resuscitation', 'cardiac arrest', 'stroke', 'heart attack', 'respiratory failure', 'sepsis', 'shock', 'hemorrhage', 'fracture', 'ambulance', 'paramedic', 'first aid', 'triage', 'urgent care', 'emergency room' ], // Business contexts FINANCE: [ 'revenue', 'profit', 'loss', 'margin', 'cash flow', 'budget', 'forecast', 'projection', 'investment', 'roi', 'return on investment', 'equity', 'debt', 'liability', 'asset', 'balance sheet', 'income statement', 'financial statement', 'audit', 'compliance', 'regulation', 'market share', 'competitor', 'competition', 'competitive analysis', 'market analysis', 'business performance', 'sales performance', 'quarterly results', 'market position' ], MANAGEMENT: [ 'strategy', 'planning', 'execution', 'performance', 'metrics', 'kpi', 'objective', 'goal', 'milestone', 'deliverable', 'stakeholder', 'resource', 'allocation', 'optimization', 'workflow', 'process', 'procedure', 'standard operating procedure', 'best practice', 'competitive advantage', 'market penetration', 'business strategy', 'strategic planning', 'performance analysis', 'business analysis', 'market research', 'competitive intelligence' ], // Academic contexts RESEARCH: [ 'hypothesis', 'methodology', 'experiment', 'control group', 'variable', 'statistical significance', 'correlation', 'causation', 'regression', 'sample size', 'population', 'bias', 'validity', 'reliability', 'peer review', 'publication', 'journal', 'conference', 'citation', 'reference' ], ENGINEERING: [ 'specification', 'requirement', 'design', 'architecture', 'implementation', 'testing', 'validation', 'verification', 'prototype', 'iteration', 'optimization', 'efficiency', 'performance', 'scalability', 'reliability', 'maintainability', 'documentation', 'standard', 'protocol' ], // NEW: Social Media Context SOCIAL_MEDIA: [ 'friend request', 'new follower', 'mentioned you', 'tagged you', 'direct message', 'dm', 'social media', 'profile', 'feed', 'story', 'post', 'like', 'share', 'comment', 'tweet', 'retweet', 'hashtag', 'influencer', 'online community', 'forum post' ] } /** * Professional communication indicators */ const COMMUNICATION_PATTERNS = { FORMAL: [ /dear (sir|madam|team|colleagues|mr|ms|dr|professor)/i, /yours (sincerely|faithfully|truly)/i, /best regards/i, /kind regards/i, /thank you for your (time|consideration|attention)/i, /please find attached/i, /i am writing to (inform|request|inquire)/i, /we would like to (inform|request|propose)/i ], PROFESSIONAL: [ /meeting (scheduled|request|invitation)/i, /project (status|update|deadline|milestone)/i, /please (review|approve|consider|confirm)/i, /urgent (request|matter|issue|priority)/i, /deadline (approaching|extended|missed)/i, /follow up/i, /action items/i, /next steps/i ], TECHNICAL: [ /technical (issue|problem|solution|specification)/i, /system (failure|outage|maintenance|upgrade)/i, /performance (issue|improvement|optimization)/i, /bug (report|fix|tracking)/i, /feature (request|implementation|deployment)/i, /code (review|deployment|rollback)/i ] } /** * Potentially harmful words that are legitimate in professional contexts */ const CONTEXTUAL_WORDS = { // Words that trigger false positives but are legitimate in context 'kill': { professional_contexts: ['DEVOPS', 'SOFTWARE_DEV', 'TECHNICAL'], legitimate_phrases: [ 'kill process', 'kill task', 'kill command', 'kill signal', 'kill switch', 'kill the process', 'kill runaway process', 'kill stuck process' ], weight_reduction: 0.8 // Reduce impact by 80% in professional context }, 'critical': { professional_contexts: ['CLINICAL', 'EMERGENCY_MEDICAL', 'DEVOPS', 'SECURITY', 'BUSINESS'], legitimate_phrases: [ 'critical care', 'critical condition', 'critical system', 'critical path', 'critical issue', 'critical thinking', 'critical analysis', 'critical success factor' ], weight_reduction: 0.9 }, 'urgent': { professional_contexts: ['CLINICAL', 'EMERGENCY_MEDICAL', 'BUSINESS', 'DEVOPS'], legitimate_phrases: [ 'urgent care', 'urgent matter', 'urgent request', 'urgent priority', 'urgent surgery', 'urgent intervention', 'urgent response' ], weight_reduction: 0.8 }, 'ass': { professional_contexts: ['ACADEMIC', 'BUSINESS', 'RESEARCH'], legitimate_phrases: [ 'assess', 'assessment', 'class', 'classic', 'assist', 'assistance', 'associate', 'association', 'asset', 'passage', 'embassy', 'mass' ], weight_reduction: 0.95 }, 'analyze': { professional_contexts: ['ACADEMIC', 'RESEARCH', 'BUSINESS', 'TECHNICAL'], legitimate_phrases: [ 'analyze data', 'analyze results', 'analyze performance', 'analyze trends', 'analyze patterns', 'analyze behavior', 'analyze metrics' ], weight_reduction: 0.0 // Completely legitimate } } /** * Professional email domain patterns */ const PROFESSIONAL_DOMAINS = { CORPORATE: [ 'company.com', 'corp.com', 'inc.com', 'ltd.com', 'llc.com', 'microsoft.com', 'google.com', 'apple.com', 'amazon.com', 'facebook.com' ], EDUCATIONAL: [ '.edu', '.ac.uk', '.ac.in', '.edu.au', '.edu.sg', 'university.', 'college.' ], GOVERNMENT: [ '.gov', '.gov.uk', '.gov.au', '.gov.ca', '.mil', '.state.' ], HEALTHCARE: [ 'hospital.', 'clinic.', 'medical.', 'health.', '.nhs.', 'mayo.', 'kaiser.' ] } class ContextDetector { constructor(options = {}) { this.options = { enableDomainDetection: true, enablePatternMatching: true, enableVocabularyAnalysis: true, confidenceThreshold: 0.3, ...options } } /** * Analyze content for context and communication style */ analyzeContext(content, input = {}) { const context = { domains: this.detectDomains(content), communicationStyle: this.detectCommunicationStyle(content), professionalIndicators: this.detectProfessionalIndicators(content, input), vocabularyAnalysis: this.analyzeVocabulary(content), emailContext: this.analyzeEmailContext(input.email || ''), confidence: 0 } // Calculate overall confidence context.confidence = this.calculateContextConfidence(context) // Add convenience flags context.isProfessional = this.isProfessionalContext(context) context.isTechnical = context.domains.includes('DEVOPS') || context.domains.includes('SOFTWARE_DEV') || context.domains.includes('SECURITY') context.isMedical = context.domains.includes('CLINICAL') || context.domains.includes('EMERGENCY_MEDICAL') context.isAcademic = context.domains.includes('RESEARCH') || context.domains.includes('ENGINEERING') context.isBusiness = context.domains.includes('FINANCE') || context.domains.includes('MANAGEMENT') context.isSocialMedia = context.domains.includes('SOCIAL_MEDIA') return context } /** * Detect specific domain contexts */ detectDomains(content) { const detectedDomains = [] const text = content.allTextLower for (const [domain, vocabulary] of Object.entries(EXTENDED_CONTEXTS)) { if (!vocabulary || !Array.isArray(vocabulary)) continue const matches = vocabulary.filter(term => text.includes(term.toLowerCase())).length const threshold = Math.max(1, Math.ceil(vocabulary.length * 0.05)) if (matches >= threshold) { detectedDomains.push(domain) } } // Check for individual technical terms (more flexible) const technicalMatches = TECHNICAL_TERMS.filter(term => { const termWords = term.toLowerCase().split(' ') // If it's a phrase, all words must be present (but not necessarily together) if (termWords.length > 1) { return termWords.every(word => text.includes(word)) } // Single word matches return text.includes(term.toLowerCase()) }) if (technicalMatches.length > 0 && !detectedDomains.includes('DEVOPS')) { detectedDomains.push('DEVOPS') } // Check for business terms (more flexible) const businessMatches = BUSINESS_TERMS.filter(term => { const termWords = term.toLowerCase().split(' ') // If it's a phrase, all words must be present (but not necessarily together) if (termWords.length > 1) { return termWords.every(word => text.includes(word)) } // Single word matches return text.includes(term.toLowerCase()) }) if (businessMatches.length > 0 && !detectedDomains.includes('FINANCE')) { detectedDomains.push('FINANCE') } return detectedDomains } /** * Detect communication style patterns */ detectCommunicationStyle(content) { const styles = [] const text = content.allText for (const [style, patterns] of Object.entries(COMMUNICATION_PATTERNS)) { const matches = patterns.filter(pattern => pattern.test(text)).length if (matches > 0) { styles.push(style) } } return styles } /** * Detect professional indicators */ detectProfessionalIndicators(content, input) { const indicators = { hasBusinessTerms: false, hasTechnicalTerms: false, hasAcademicTerms: false, hasMedicalTerms: false, hasLegalTerms: false, isProfessionalEmail: false, usesBusinessLanguage: false } const text = content.allTextLower // Check for professional vocabulary indicators.hasBusinessTerms = BUSINESS_TERMS.some(term => text.includes(term.toLowerCase())) indicators.hasTechnicalTerms = TECHNICAL_TERMS.some(term => text.includes(term.toLowerCase())) indicators.hasAcademicTerms = ACADEMIC_TERMS.some(term => text.includes(term.toLowerCase())) indicators.hasMedicalTerms = MEDICAL_TERMS.some(term => text.includes(term.toLowerCase())) indicators.hasLegalTerms = LEGAL_TERMS.some(term => text.includes(term.toLowerCase())) // Check email domain if (input.email) { indicators.isProfessionalEmail = this.isProfessionalEmail(input.email) } // Check for business language patterns const businessPatterns = [ /quarterly (results|report|analysis)/i, /annual (report|review|meeting)/i, /board (meeting|approval|decision)/i, /stakeholder (meeting|communication|feedback)/i, /client (request|meeting|feedback)/i, /customer (service|support|inquiry)/i ] indicators.usesBusinessLanguage = businessPatterns.some(pattern => pattern.test(content.allText)) return indicators } /** * Analyze vocabulary sophistication and domain specificity */ analyzeVocabulary(content) { const words = content.allTextLower.split(/\s+/).filter(word => word.length > 3) return { totalWords: words.length, uniqueWords: new Set(words).size, averageWordLength: words.reduce((sum, word) => sum + word.length, 0) / words.length, technicalTermsCount: this.countTerms(content.allTextLower, TECHNICAL_TERMS), businessTermsCount: this.countTerms(content.allTextLower, BUSINESS_TERMS), academicTermsCount: this.countTerms(content.allTextLower, ACADEMIC_TERMS), medicalTermsCount: this.countTerms(content.allTextLower, MEDICAL_TERMS) } } /** * Analyze email context for professional indicators */ analyzeEmailContext(email) { if (!email) return { isProfessional: false, domain: '', type: 'unknown' } const domain = email.split('@')[1]?.toLowerCase() || '' for (const [type, domains] of Object.entries(PROFESSIONAL_DOMAINS)) { if (domains.some(profDomain => domain.includes(profDomain))) { return { isProfessional: true, domain, type: type.toLowerCase() } } } // Check for common professional patterns const professionalPatterns = [ /\.(com|org|net)$/, /^[a-zA-Z]+\.(com|org)$/, /company|corp|inc|ltd|llc/ ] const isProfessional = professionalPatterns.some(pattern => pattern.test(domain)) return { isProfessional, domain, type: isProfessional ? 'corporate' : 'personal' } } /** * Calculate scoring adjustments for potentially problematic words in context */ getContextualAdjustments(content, detectedContext) { const adjustments = [] const text = content.allTextLower for (const [word, config] of Object.entries(CONTEXTUAL_WORDS)) { if (text.includes(word)) { // Check if word appears in professional context const hasRelevantContext = config.professional_contexts.some(ctx => detectedContext.domains.includes(ctx) ) // Check if word appears in legitimate phrases const inLegitimatePhrase = config.legitimate_phrases.some(phrase => text.includes(phrase.toLowerCase()) ) if (hasRelevantContext || inLegitimatePhrase) { adjustments.push({ word, reason: hasRelevantContext ? 'professional_context' : 'legitimate_phrase', weightReduction: config.weight_reduction, contexts: config.professional_contexts }) } } } return adjustments } /** * Determine if overall context is professional */ isProfessionalContext(context) { const professionalIndicators = [ context.professionalIndicators.hasBusinessTerms, context.professionalIndicators.hasTechnicalTerms, context.professionalIndicators.hasAcademicTerms, context.professionalIndicators.hasMedicalTerms, context.professionalIndicators.hasLegalTerms, context.professionalIndicators.isProfessionalEmail, context.professionalIndicators.usesBusinessLanguage, context.communicationStyle.includes('FORMAL'), context.communicationStyle.includes('PROFESSIONAL'), context.domains.length > 0 ] const professionalScore = professionalIndicators.filter(Boolean).length / professionalIndicators.length return professionalScore >= 0.3 // 30% threshold } /** * Check if email is from professional domain */ isProfessionalEmail(email) { const domain = email.split('@')[1]?.toLowerCase() || '' return Object.values(PROFESSIONAL_DOMAINS) .flat() .some(profDomain => domain.includes(profDomain)) } /** * Count occurrences of terms in text */ countTerms(text, terms) { return terms.filter(term => text.includes(term.toLowerCase())).length } /** * Calculate overall context confidence */ calculateContextConfidence(context) { let confidence = 0 // Domain detection confidence confidence += context.domains.length * 0.2 // Communication style confidence confidence += context.communicationStyle.length * 0.15 // Professional indicators confidence const profIndicators = Object.values(context.professionalIndicators).filter(Boolean).length confidence += profIndicators * 0.1 // Email context confidence if (context.emailContext.isProfessional) { confidence += 0.2 } // Vocabulary analysis confidence if (context.vocabularyAnalysis.averageWordLength > 5) { confidence += 0.1 } return Math.min(1.0, confidence) } } module.exports = { ContextDetector, EXTENDED_CONTEXTS, CONTEXTUAL_WORDS, PROFESSIONAL_DOMAINS }