UNPKG

@himorishige/noren-core

Version:

Core PII detection, masking, and tokenization library built on Web Standards

350 lines (349 loc) 12 kB
/**
 * Context-based scoring system for PII detection accuracy improvement.
 * Analyzes the text surrounding a PII candidate to estimate how likely the
 * candidate is to be genuine PII.
 */
import { CONTEXT_KEYWORDS, NEGATIVE_CONTEXT_KEYWORDS } from './constants.js';

/** Detects keywords containing hiragana, katakana, or CJK ideographs (U+4E00-U+9FAF). */
const JAPANESE_CHAR_PATTERN = /[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FAF]/;

/** Characters that must be escaped before a keyword is embedded in a RegExp. */
const REGEX_SPECIAL_CHARS = /[.*+?^${}()|[\]\\]/g;

/** Score every candidate starts from before any context evidence is applied. */
const BASE_SCORE = 1.0;

/** Amount added to the score for each matching positive keyword. */
const POSITIVE_KEYWORD_BONUS = 0.2;

/** Amount subtracted from the score for each matching negative keyword. */
const NEGATIVE_KEYWORD_PENALTY = 0.4;

/**
 * Cache of keyword -> predicate(text). Building a RegExp per keyword on every
 * call is loop-invariant work, so each matcher is compiled once and reused.
 */
const keywordMatcherCache = new Map();

/**
 * Return a predicate that tells whether `keyword` occurs in a text.
 * Keywords containing Japanese characters use a plain substring test (word
 * boundaries are meaningless there); all other keywords are matched
 * case-insensitively between `\b` word boundaries.
 *
 * @param {string} keyword - Keyword to look for.
 * @returns {(text: string) => boolean} Matcher for the keyword.
 */
function getKeywordMatcher(keyword) {
  let matcher = keywordMatcherCache.get(keyword);
  if (matcher === undefined) {
    if (JAPANESE_CHAR_PATTERN.test(keyword)) {
      const needle = keyword.toLowerCase();
      matcher = (text) => text.includes(needle);
    } else {
      const escaped = keyword.replace(REGEX_SPECIAL_CHARS, '\\$&');
      // No `g` flag, so `.test()` keeps no state between calls.
      const pattern = new RegExp(`\\b${escaped}\\b`, 'i');
      matcher = (text) => pattern.test(text);
    }
    keywordMatcherCache.set(keyword, matcher);
  }
  return matcher;
}

/**
 * Calculate context score for a PII candidate.
 *
 * The score starts at 1.0, gains 0.2 per matching type-specific positive
 * keyword, loses 0.4 per matching negative keyword, is then adjusted by the
 * type-specific rules, and is finally floored at 0.
 *
 * @param {string} surroundingText - Text around the candidate.
 * @param {string} piiType - PII type used to select keywords and rules.
 * @param {number} [_windowSize=24] - Unused; kept so existing callers keep working.
 * @returns {{score: number, positiveMatches: string[], negativeMatches: string[], confidence: number, reasoning: string[]}}
 *   The floored score, the keywords that matched, a confidence in [0.5, 0.95]
 *   that grows with the number of keyword matches, and reasoning tags.
 */
export function calculateContextScore(surroundingText, piiType, _windowSize = 24) {
  let score = BASE_SCORE;
  const positiveMatches = [];
  const negativeMatches = [];
  const reasoning = [];

  // Normalize text for analysis
  const normalizedText = surroundingText.toLowerCase();

  // Type-specific positive keywords (none when the type has no entry)
  const positiveKeywords = CONTEXT_KEYWORDS[piiType] || new Set();
  for (const keyword of positiveKeywords) {
    if (getKeywordMatcher(keyword)(normalizedText)) {
      score += POSITIVE_KEYWORD_BONUS;
      positiveMatches.push(keyword);
      reasoning.push(`positive_keyword:${keyword}`);
    }
  }

  // Negative keywords apply to every type and weigh more than positives
  for (const keyword of NEGATIVE_CONTEXT_KEYWORDS) {
    if (getKeywordMatcher(keyword)(normalizedText)) {
      score -= NEGATIVE_KEYWORD_PENALTY;
      negativeMatches.push(keyword);
      reasoning.push(`negative_keyword:${keyword}`);
    }
  }

  // Type-specific context analysis
  score = adjustScoreByTypeSpecificContext(score, normalizedText, piiType, reasoning);

  // Confidence calculation based on evidence strength
  const evidenceStrength = positiveMatches.length + negativeMatches.length;
  const confidence = Math.min(0.95, 0.5 + evidenceStrength * 0.1);

  return {
    score: Math.max(0.0, score),
    positiveMatches,
    negativeMatches,
    confidence,
    reasoning,
  };
}

/**
 * Credit card rules. Each rule is applied in order; `delta` is added to the
 * score and `reason` is recorded when `pattern` matches the lowercased text.
 */
const CREDIT_CARD_RULES = [
  // Strong positive indicators
  { pattern: /\b(payment|transaction|purchase|billing|checkout|order)\b/, delta: 0.3, reason: 'payment_context' },
  // `expir\w*` so the stem also covers expiry / expires / expired / expiration;
  // a bare `expir` followed by `\b` could never match any of those words.
  { pattern: /\b(exp|expir\w*|cvv|cvc|security\s+code)\b/, delta: 0.4, reason: 'card_details_context' },
  // Strong negative indicators
  { pattern: /\b(example|sample|demo|test|dummy)\b/, delta: -0.5, reason: 'test_context' },
  { pattern: /\b(id|identifier|user|customer|account)\s*:?\s*\d+/, delta: -0.2, reason: 'identifier_context' },
  // Database/log context (less likely to be real card data)
  { pattern: /\b(insert|update|select|database|log|debug)\b/, delta: -0.3, reason: 'database_context' },
  // Generic number context; small penalty to avoid false negatives
  { pattern: /\b(random|number|digits?|value|data|string)\b/, delta: -0.15, reason: 'generic_number_context' },
];

/** IP address rules (shared by ipv4 and ipv6). */
const IP_RULES = [
  // Strong positive indicators
  { pattern: /\b(connect|connection|server|client|endpoint|gateway|router)\b/, delta: 0.3, reason: 'network_context' },
  { pattern: /\b(from|to|src|dst|source|destination)\b/, delta: 0.2, reason: 'direction_context' },
  { pattern: /\b(ping|traceroute|nslookup|dig|curl|wget|ssh|telnet|ftp)\b/, delta: 0.4, reason: 'network_tool_context' },
  // Negative indicators
  { pattern: /\b(version|release|build|package)\b/, delta: -0.4, reason: 'version_context' },
  { pattern: /\b(date|time|timestamp|year|month|day)\b/, delta: -0.3, reason: 'date_context' },
  // Configuration file context
  { pattern: /\b(config|conf|settings|properties|ini|yaml|json|xml)\b/, delta: 0.2, reason: 'config_context' },
];

/** Email rules. */
const EMAIL_RULES = [
  // Strong positive indicators
  { pattern: /\b(send|sent|receive|forward|reply|contact|reach)\b/, delta: 0.3, reason: 'communication_context' },
  { pattern: /\b(from|to|cc|bcc|subject|message|mail)\b/, delta: 0.4, reason: 'email_headers_context' },
  // Negative indicators; stronger penalty for automated emails
  { pattern: /\b(noreply|no-reply|donotreply|system|daemon|automated)\b/, delta: -0.5, reason: 'automated_email_context' },
  { pattern: /\b(example|test|demo|placeholder)\b/, delta: -0.4, reason: 'example_context' },
  // Domain validation context
  { pattern: /\b(domain|dns|mx|record|zone)\b/, delta: -0.2, reason: 'dns_context' },
];

/** Phone number rules. */
const PHONE_RULES = [
  // Strong positive indicators
  { pattern: /\b(call|dial|ring|mobile|cell|landline|extension|ext)\b/, delta: 0.3, reason: 'phone_action_context' },
  { pattern: /\b(contact|reach|emergency|support|help|service)\b/, delta: 0.2, reason: 'contact_context' },
  // Negative indicators
  { pattern: /\b(test|example|dummy|sample|fake)\b/, delta: -0.4, reason: 'test_number_context' },
  { pattern: /\b(id|identifier|code|reference|serial)\b/, delta: -0.2, reason: 'identifier_context' },
];

/** MAC address rules. */
const MAC_RULES = [
  // Strong positive indicators
  { pattern: /\b(ethernet|ether|nic|interface|adapter|card)\b/, delta: 0.4, reason: 'network_hardware_context' },
  { pattern: /\b(ifconfig|arp|bridge|switch|router|gateway)\b/, delta: 0.3, reason: 'network_config_context' },
  // Negative indicators
  { pattern: /\b(uuid|guid|hash|checksum|signature|key)\b/, delta: -0.5, reason: 'identifier_hash_context' },
  // Wireless context counts in favor of a MAC address
  { pattern: /\b(bluetooth|bt|wireless|wifi)\b/, delta: 0.2, reason: 'wireless_context' },
];

/**
 * PII type -> rule list. A Map (rather than a plain object) so that type names
 * such as "constructor" never resolve to inherited properties.
 */
const TYPE_CONTEXT_RULES = new Map([
  ['credit_card', CREDIT_CARD_RULES],
  ['ipv4', IP_RULES],
  ['ipv6', IP_RULES],
  ['email', EMAIL_RULES],
  ['phone_e164', PHONE_RULES],
  ['mac', MAC_RULES],
]);

/**
 * Apply a rule list to a score.
 *
 * @param {number} score - Score before the rules are applied.
 * @param {string} text - Lowercased surrounding text.
 * @param {Array<{pattern: RegExp, delta: number, reason: string}>} rules - Rules, applied in order.
 * @param {string[]} reasoning - Receives the `reason` of every matching rule (mutated).
 * @returns {number} The adjusted score.
 */
function applyContextRules(score, text, rules, reasoning) {
  let adjusted = score;
  for (const { pattern, delta, reason } of rules) {
    if (pattern.test(text)) {
      adjusted += delta;
      reasoning.push(reason);
    }
  }
  return adjusted;
}

/**
 * Type-specific context adjustments.
 *
 * @param {number} score - Score after keyword analysis.
 * @param {string} normalizedText - Lowercased surrounding text.
 * @param {string} piiType - PII type; types without rules leave the score unchanged.
 * @param {string[]} reasoning - Receives a tag for every matching rule (mutated).
 * @returns {number} The adjusted score.
 */
function adjustScoreByTypeSpecificContext(score, normalizedText, piiType, reasoning) {
  const rules = TYPE_CONTEXT_RULES.get(piiType);
  if (rules === undefined) {
    return score;
  }
  return applyContextRules(score, normalizedText, rules, reasoning);
}

/**
 * Extract surrounding text for context analysis.
 *
 * Returns up to `windowSize` characters before the match followed directly by
 * up to `windowSize` characters after it; the match itself is left out and no
 * separator is inserted between the two parts.
 *
 * @param {string} fullText - Text containing the match.
 * @param {number} matchStart - Index of the first character of the match.
 * @param {number} matchEnd - Index just past the last character of the match.
 * @param {number} [windowSize=24] - Maximum number of characters taken on each side.
 * @returns {string} The text before the match concatenated with the text after it.
 */
export function extractSurroundingText(fullText, matchStart, matchEnd, windowSize = 24) {
  const beforeStart = Math.max(0, matchStart - windowSize);
  const afterEnd = Math.min(fullText.length, matchEnd + windowSize);
  const before = fullText.slice(beforeStart, matchStart);
  const after = fullText.slice(matchEnd, afterEnd);
  return before + after;
}

/**
 * Minimum accepted score per strictness level and PII type. `default` is used
 * for any type that has no entry of its own.
 */
const CONTEXT_THRESHOLDS = {
  fast: {
    credit_card: 0.5,
    email: 0.3,
    ipv4: 0.3,
    ipv6: 0.3,
    phone_e164: 0.5,
    mac: 0.4,
    default: 0.3,
  },
  balanced: {
    credit_card: 1.1, // Slightly stricter for credit cards
    email: 0.8, // Stricter for emails to catch automated senders
    ipv4: 0.5,
    ipv6: 0.5,
    phone_e164: 0.8,
    mac: 0.6,
    default: 0.5,
  },
  strict: {
    credit_card: 1.5,
    email: 1.0,
    ipv4: 1.0,
    ipv6: 1.0,
    phone_e164: 1.2,
    mac: 1.0,
    default: 1.0,
  },
};

/**
 * Determine if context score meets threshold for acceptance.
 *
 * @param {{score: number}} analysis - Result of a context analysis.
 * @param {string} piiType - PII type; unknown types use the level's default threshold.
 * @param {string} [strictness='balanced'] - One of 'fast', 'balanced', 'strict'.
 * @returns {boolean} True when `analysis.score` is at least the threshold.
 * @throws {TypeError} When `strictness` is not a known level.
 */
export function meetsContextThreshold(analysis, piiType, strictness = 'balanced') {
  // Own-property check so inherited keys ("constructor", "toString", ...) are
  // rejected instead of being treated as a threshold table.
  if (!Object.prototype.hasOwnProperty.call(CONTEXT_THRESHOLDS, strictness)) {
    throw new TypeError(`Unknown strictness level: ${String(strictness)}`);
  }
  const levelThresholds = CONTEXT_THRESHOLDS[strictness];
  const typeThreshold = levelThresholds[piiType];
  // Only real numeric entries count; anything else falls back to the default.
  const threshold = typeof typeThreshold === 'number' ? typeThreshold : levelThresholds.default;
  return analysis.score >= threshold;
}

/**
 * Enhanced context analysis that includes pattern-based scoring.
 *
 * Runs `calculateContextScore` and then adds 0.2 for structured-data context,
 * adds 0.3 for form-field context, and subtracts 0.2 for log/debug context.
 *
 * @param {string} surroundingText - Text around the candidate.
 * @param {string} candidate - The candidate value (forwarded to the pattern checks).
 * @param {string} piiType - PII type of the candidate.
 * @returns {{score: number, positiveMatches: string[], negativeMatches: string[], confidence: number, reasoning: string[]}}
 *   The base analysis with the adjusted score (floored at 0) and extended reasoning.
 */
export function analyzeContextWithPatterns(surroundingText, candidate, piiType) {
  const baseAnalysis = calculateContextScore(surroundingText, piiType);

  // Add pattern-specific adjustments
  let patternScore = 0;
  const additionalReasoning = [];

  // Check for structured data patterns
  if (isInStructuredData(surroundingText, candidate)) {
    patternScore += 0.2;
    additionalReasoning.push('structured_data_context');
  }

  // Check for form field context
  if (isInFormField(surroundingText, candidate)) {
    patternScore += 0.3;
    additionalReasoning.push('form_field_context');
  }

  // Check for log/debug context
  if (isInLogContext(surroundingText)) {
    patternScore -= 0.2;
    additionalReasoning.push('log_debug_context');
  }

  return {
    ...baseAnalysis,
    // Keep the same 0 floor that calculateContextScore applies to its score.
    score: Math.max(0.0, baseAnalysis.score + patternScore),
    reasoning: [...baseAnalysis.reasoning, ...additionalReasoning],
  };
}

/**
 * Detect if candidate is in structured data (JSON, XML, etc.).
 *
 * @param {string} text - Surrounding text.
 * @param {string} _candidate - Unused.
 * @returns {boolean} True when the text looks like JSON, XML, or CSV.
 */
function isInStructuredData(text, _candidate) {
  // JSON context
  if (text.includes('"') && text.includes(':')) {
    return true;
  }
  // XML context
  if (text.includes('<') && text.includes('>')) {
    return true;
  }
  // CSV context: at least two commas
  return text.split(',').length > 2;
}

/** Patterns suggesting the text is part of a form or form markup. */
const FORM_FIELD_PATTERNS = [
  /\b(input|field|form|label|placeholder|value)\b/i,
  /\b(name|email|phone|address|card|number)\s*[:=]/i,
  /<input[^>]*>/i,
  /\btype\s*=\s*["']?(text|email|tel|number)/i,
];

/**
 * Detect if candidate is in a form field context.
 *
 * @param {string} text - Surrounding text.
 * @param {string} _candidate - Unused.
 * @returns {boolean} True when any form pattern matches.
 */
function isInFormField(text, _candidate) {
  return FORM_FIELD_PATTERNS.some((pattern) => pattern.test(text));
}

/** Patterns suggesting the text is log or debug output. */
const LOG_CONTEXT_PATTERNS = [
  /\b(log|debug|trace|info|warn|error|fatal)\b/i,
  /\b(console|print|echo|output)\b/i,
  /\d{4}-\d{2}-\d{2}|\d{2}:\d{2}:\d{2}/, // timestamp patterns
  /\[(DEBUG|INFO|WARN|ERROR|TRACE)\]/i,
];

/**
 * Detect if context suggests log/debug output.
 *
 * @param {string} text - Surrounding text.
 * @returns {boolean} True when any log pattern matches.
 */
function isInLogContext(text) {
  return LOG_CONTEXT_PATTERNS.some((pattern) => pattern.test(text));
}