UNPKG

@himorishige/noren-core

Version:

Core PII detection, masking, and tokenization library built on Web Standards

223 lines (222 loc) 7.69 kB
/**
 * Confidence scoring system for PII detection.
 * Provides rule-based confidence calculation for detected patterns.
 */

/**
 * Confidence thresholds for different sensitivity levels.
 * A hit passes when its confidence is >= the threshold for the chosen level.
 */
export const CONFIDENCE_THRESHOLDS = {
  strict: 0.5, // Allow false positives, avoid false negatives
  balanced: 0.7, // Balance precision and recall
  relaxed: 0.85, // Avoid false positives, allow false negatives
};

/**
 * Calculate a confidence score for a detected PII pattern.
 *
 * The score starts at 0.5 and is adjusted by a per-type base score, by
 * context features of the surrounding text, and by type-specific checks.
 *
 * @param {{type: string, value: string, start: number, end: number}} hit
 *   Detected pattern: its type, matched text and offsets into `text`.
 * @param {string} text - Full text the hit was found in.
 * @param {object} [features] - Optional feature overrides; merged over the
 *   features extracted from `text`.
 * @returns {{confidence: number, reasons: string[], features: object}}
 *   `confidence` is clamped to [0, 0.99]; `reasons` lists the rule tags that
 *   contributed to the score; `features` is the feature set that was used.
 */
export function calculateConfidence(hit, text, features) {
  const extractedFeatures = extractFeatures(hit, text, features);
  const reasons = [];
  let baseScore = 0.5; // Start with neutral confidence

  // Base pattern matching score
  baseScore += getBasePatternScore(hit.type, hit.value);
  reasons.push(`base-pattern-${hit.type}`);

  // Context-based adjustments
  const contextAdjustment = getContextAdjustment(extractedFeatures);
  baseScore += contextAdjustment.adjustment;
  reasons.push(...contextAdjustment.reasons);

  // Type-specific validation
  const typeAdjustment = getTypeSpecificAdjustment(hit.type, hit.value, extractedFeatures);
  baseScore += typeAdjustment.adjustment;
  reasons.push(...typeAdjustment.reasons);

  // Normalize to 0.0-1.0 range, but ensure it's not exactly 1.0 for better testing
  const confidence = Math.max(0.0, Math.min(0.99, baseScore));

  return {
    confidence,
    reasons,
    features: extractedFeatures,
  };
}

/**
 * Extract features from the text around the detected pattern.
 *
 * Looks at a window of up to 50 characters on each side of the hit
 * (lower-cased) for keywords, and at the whole text for code-block and
 * comment context.
 *
 * @param {{value: string, start: number, end: number}} hit
 * @param {string} text
 * @param {object} [providedFeatures] - Spread last, so its keys override the
 *   extracted ones.
 * @returns {object} Feature set consumed by the adjustment functions.
 */
function extractFeatures(hit, text, providedFeatures) {
  const surroundingStart = Math.max(0, hit.start - 50);
  const surroundingEnd = Math.min(text.length, hit.end + 50);
  const surroundingText = text.slice(surroundingStart, surroundingEnd).toLowerCase();
  const surroundingWords = surroundingText.match(/\b\w+\b/g) || [];

  // Test keywords that suggest non-real data (matched as substrings)
  const testKeywords = ['test', 'example', 'dummy', 'sample', 'demo', 'fake', 'mock'];
  const exampleKeywords = ['example.com', 'example.org', 'localhost', 'invalid', 'placeholder'];

  const features = {
    hasTestKeywords: testKeywords.some((keyword) => surroundingText.includes(keyword)),
    hasExampleKeywords: exampleKeywords.some((keyword) => surroundingText.includes(keyword)),
    isInCodeBlock: isInCodeBlock(text, hit.start),
    isInComment: isInComment(text, hit.start),
    surroundingWords,
    patternComplexity: calculatePatternComplexity(hit.value),
    patternLength: hit.value.length,
    hasValidFormat: true, // Will be overridden by type-specific validation
    typeSpecific: {},
    ...providedFeatures,
  };
  return features;
}

/**
 * Get the base confidence score for a pattern type.
 *
 * @param {string} type - Pattern type, e.g. 'email' or 'credit_card'.
 * @param {string} _value - Matched text (currently unused).
 * @returns {number} Score added to the neutral starting confidence.
 */
function getBasePatternScore(type, _value) {
  switch (type) {
    case 'email':
      return 0.6; // Email patterns are generally reliable
    case 'credit_card':
      return 0.7; // Credit cards have Luhn validation
    default:
      return 0.4; // Unknown types get lower base score
  }
}

/**
 * Adjust confidence based on context features.
 *
 * @param {object} features - Feature set produced by extractFeatures.
 * @returns {{adjustment: number, reasons: string[]}} Signed score delta and
 *   the tags of the rules that fired.
 */
function getContextAdjustment(features) {
  let adjustment = 0;
  const reasons = [];

  if (features.hasTestKeywords) {
    adjustment -= 0.1; // Reduced from 0.2
    reasons.push('test-keywords-present');
  }
  if (features.hasExampleKeywords) {
    adjustment -= 0.15; // Reduced from 0.3
    reasons.push('example-keywords-present');
  }
  if (features.isInCodeBlock) {
    adjustment -= 0.2; // Reduced from 0.3
    reasons.push('in-code-block');
  }
  if (features.isInComment) {
    adjustment -= 0.2;
    reasons.push('in-comment');
  }

  // Longer patterns are generally more reliable
  if (features.patternLength > 20) {
    adjustment += 0.1;
    reasons.push('long-pattern');
  } else if (features.patternLength < 5) {
    adjustment -= 0.1;
    reasons.push('short-pattern');
  }

  return { adjustment, reasons };
}

/**
 * Type-specific confidence adjustments.
 * Only 'email' has dedicated rules; every other type gets a zero adjustment.
 *
 * @param {string} type
 * @param {string} value
 * @param {object} _features
 * @returns {{adjustment: number, reasons: string[]}}
 */
function getTypeSpecificAdjustment(type, value, _features) {
  const adjustment = 0;
  const reasons = [];
  switch (type) {
    case 'email':
      return getEmailConfidenceAdjustment(value, _features);
    default:
      return { adjustment, reasons };
  }
}

/**
 * Email-specific confidence adjustments.
 *
 * @param {string} value - Matched email address (compared case-insensitively).
 * @param {object} _features - Extracted features (currently unused).
 * @returns {{adjustment: number, reasons: string[]}}
 */
function getEmailConfidenceAdjustment(value, _features) {
  let adjustment = 0;
  const reasons = [];
  const email = value.toLowerCase();

  // Check for disposable/test email domains.
  // Compare against the domain part only, and only on a label boundary, so
  // that e.g. "contest.com" or "myexample.com" are not mistaken for
  // "test.com" / "example.com". Subdomains such as "mail.example.com" still
  // count as test domains.
  const testDomains = ['example.com', 'example.org', 'test.com', 'localhost'];
  const domain = email.slice(email.lastIndexOf('@') + 1);
  const isTestDomain = testDomains.some(
    (testDomain) => domain === testDomain || domain.endsWith(`.${testDomain}`),
  );
  if (isTestDomain) {
    adjustment -= 0.4; // Reduced from 0.8 to prevent zero confidence
    reasons.push('test-domain');
  }

  // Check for no-reply patterns
  if (email.startsWith('noreply@') || email.startsWith('no-reply@')) {
    adjustment -= 0.4;
    reasons.push('noreply-pattern');
  }

  // Check for valid TLD (simplified check)
  const tld = email.split('.').pop();
  if (tld && tld.length >= 2 && tld.length <= 6 && /^[a-z]+$/.test(tld)) {
    adjustment += 0.2;
    reasons.push('valid-tld');
  }

  // Check for suspicious generic local parts. Anchored to the start of the
  // address so that "superuser@" or "latest@" do not match "user@" / "test@".
  if (/^(?:admin|test|user)@/.test(email)) {
    adjustment -= 0.2;
    reasons.push('generic-prefix');
  }

  return { adjustment, reasons };
}

/**
 * Check if a text position is within a markdown code block.
 *
 * @param {string} text
 * @param {number} position - Offset into `text`.
 * @returns {boolean} True when an odd number of ``` fences precede `position`.
 */
function isInCodeBlock(text, position) {
  const beforeText = text.slice(0, position);
  // Check for markdown code blocks
  const beforeTicks = (beforeText.match(/```/g) || []).length;
  // If odd number of ``` before position, we're inside a code block
  return beforeTicks % 2 === 1;
}

/**
 * Check if a text position is on a line that starts with a comment marker.
 *
 * @param {string} text
 * @param {number} position - Offset into `text`.
 * @returns {boolean} True when the trimmed line starts with `//` or `#`.
 */
function isInComment(text, position) {
  const line = getLineContaining(text, position);
  return line.trim().startsWith('//') || line.trim().startsWith('#');
}

/**
 * Get the line containing the specified position.
 *
 * @param {string} text
 * @param {number} position - Offset into `text`.
 * @returns {string} The line's text, without its terminating newline.
 */
function getLineContaining(text, position) {
  const beforeText = text.slice(0, position);
  const afterText = text.slice(position);
  const lineStart = beforeText.lastIndexOf('\n') + 1;
  // lineEnd is relative to `position`, hence the `position + lineEnd` below.
  const lineEnd = afterText.indexOf('\n');
  if (lineEnd === -1) {
    return text.slice(lineStart);
  }
  return text.slice(lineStart, position + lineEnd);
}

/**
 * Calculate a pattern complexity score.
 *
 * One point for each character class present (lowercase, uppercase, digit,
 * other) plus a length contribution of `length / 20`, capped at 2.
 *
 * @param {string} value
 * @returns {number} Score in the range [0, 6].
 */
function calculatePatternComplexity(value) {
  let complexity = 0;

  // Character variety
  if (/[a-z]/.test(value)) complexity++;
  if (/[A-Z]/.test(value)) complexity++;
  if (/\d/.test(value)) complexity++;
  if (/[^a-zA-Z0-9]/.test(value)) complexity++;

  // Length contribution
  complexity += Math.min(value.length / 20, 2);

  return complexity;
}

/**
 * Check if a confidence value meets the threshold for a given sensitivity.
 *
 * @param {number} confidence
 * @param {string} sensitivity - Key of CONFIDENCE_THRESHOLDS.
 * @param {number} [customThreshold] - When not null/undefined, used instead
 *   of the sensitivity's threshold.
 * @returns {boolean} True when `confidence >= threshold`.
 */
export function meetsConfidenceThreshold(confidence, sensitivity, customThreshold) {
  const threshold = customThreshold ?? CONFIDENCE_THRESHOLDS[sensitivity];
  return confidence >= threshold;
}

/**
 * Apply confidence-based filtering to hits.
 *
 * @param {Array<{confidence?: number}>} hits
 * @param {string} sensitivity - Key of CONFIDENCE_THRESHOLDS.
 * @param {number} [customThreshold] - Overrides the sensitivity threshold.
 * @returns {Array} New array with the hits that pass the threshold.
 */
export function filterByConfidence(hits, sensitivity, customThreshold) {
  return hits.filter((hit) => {
    if (hit.confidence === undefined) {
      // For backward compatibility, assume high confidence for hits without scores
      return true;
    }
    return meetsConfidenceThreshold(hit.confidence, sensitivity, customThreshold);
  });
}