UNPKG

@himorishige/noren-core

Version:

Core PII detection, masking, and tokenization library built on Web Standards

398 lines (397 loc) 17.5 kB
// Noren Core - Fast, lightweight PII detection and masking (Web Standards only)
export { AllowDenyManager } from './allowlist.js';
export { CONFIDENCE_THRESHOLDS, filterByConfidence } from './confidence.js';
export { CONTEXT_KEYWORDS, NEGATIVE_CONTEXT_KEYWORDS, STRICTNESS_LEVELS, } from './constants.js';
export { calculateContextScore, extractSurroundingText, meetsContextThreshold, } from './context-scoring.js';
export { createJSONDetector, JSONDetector, } from './json-detector.js';
export { clearPluginCache } from './lazy.js';
// MCP (Model Context Protocol) utilities
export { containsJsonRpcPattern, createMCPRedactionTransform, extractMethodName, extractSensitiveContent, getMessageType, isValidJsonRpcMessage, parseJsonLines, redactJsonRpcMessage, } from './mcp-utils.js';
export { HitPool } from './pool.js';
export { createRedactionTransform } from './stream-utils.js';
export { hmacToken, importHmacKey } from './utils.js';
export { debugValidation, validateCandidate, } from './validators.js';
import { AllowDenyManager } from './allowlist.js';
import { calculateConfidence, filterByConfidence } from './confidence.js';
import { builtinDetect } from './detection.js';
import { createJSONDetector } from './json-detector.js';
import { loadPlugin } from './lazy.js';
import { defaultMask } from './masking.js';
import { hitPool } from './pool.js';
import { hmacToken, importHmacKey, normalize, SECURITY_LIMITS } from './utils.js';
import { validateCandidate } from './validators.js';

// Risk level weights for tiebreaker comparison
const RISK_WEIGHTS = { high: 3, medium: 2, low: 1 };
// Priority assigned to built-in / JSON hits that did not set one themselves
const DEFAULT_BUILTIN_PRIORITY = 10;
// Priority given to hits produced by JSON detection (outranks default built-ins)
const JSON_DETECTION_PRIORITY = -5;
// Values accepted by validateOptions()
const VALID_STRICTNESS = ['fast', 'balanced', 'strict'];
const VALID_ACTIONS = ['mask', 'remove', 'tokenize'];
// Characters of surrounding text handed to validateCandidate() for plugin hits
const VALIDATION_WINDOW_SIZE = 48;
// Characters of surrounding text handed to the allowlist/denylist check
const ALLOWLIST_WINDOW_SIZE = 100;

/**
 * Priority comparison: lower number = higher priority (e.g., -5 > -1 > 1 > 5).
 *
 * @param {number} a - Candidate priority.
 * @param {number} b - Priority to compare against.
 * @returns {boolean} True when `a` outranks `b`.
 */
function isHigherPriority(a, b) {
    return a < b;
}

/**
 * Resolve conflicts between hits with same priority.
 * Tiebreakers, in order: higher risk weight, longer span, earlier start.
 *
 * @param {object} current - Hit under consideration (`risk`, `start`, `end`).
 * @param {object} existing - Hit already accepted.
 * @returns {boolean} True if `current` should replace `existing`.
 */
function resolveSamePriorityConflict(current, existing) {
    // 1. Prefer higher risk levels
    const currentRiskWeight = RISK_WEIGHTS[current.risk];
    const existingRiskWeight = RISK_WEIGHTS[existing.risk];
    if (currentRiskWeight !== existingRiskWeight) {
        return currentRiskWeight > existingRiskWeight;
    }
    // 2. Prefer longer hits (more specific matches)
    const currentLength = current.end - current.start;
    const existingLength = existing.end - existing.start;
    if (currentLength !== existingLength) {
        return currentLength > existingLength;
    }
    // 3. As final tiebreaker, prefer hits that appear earlier in text
    return current.start < existing.start;
}

/**
 * Compact `hits` in place so that the first N entries are non-overlapping.
 * `hits` must already be sorted by `start`. Whenever two hits overlap, the
 * loser is handed to `hitPool.releaseOne`.
 *
 * @param {object[]} hits - Sorted hits; mutated in place.
 * @returns {number} N, the count of accepted hits at the front of `hits`.
 */
function resolveOverlaps(hits) {
    let writeIndex = 0;
    let currentEnd = -1;
    for (let readIndex = 0; readIndex < hits.length; readIndex++) {
        const currentHit = hits[readIndex];
        if (currentHit.start >= currentEnd) {
            // No overlap, keep this hit
            hits[writeIndex] = currentHit;
            currentEnd = currentHit.end;
            writeIndex++;
            continue;
        }
        // Overlap detected - decide between the current hit and the last accepted one
        const lastAcceptedHit = hits[writeIndex - 1];
        const currentPriority = currentHit.priority ?? 0;
        const lastPriority = lastAcceptedHit.priority ?? 0;
        const shouldReplace = isHigherPriority(currentPriority, lastPriority) ||
            (currentPriority === lastPriority &&
                resolveSamePriorityConflict(currentHit, lastAcceptedHit));
        if (shouldReplace) {
            hits[writeIndex - 1] = currentHit;
            currentEnd = currentHit.end;
            hitPool.releaseOne(lastAcceptedHit);
        }
        else {
            // Keep the existing hit, discard current one
            hitPool.releaseOne(currentHit);
        }
    }
    return writeIndex;
}

/**
 * Holds detectors, maskers and policy, and runs the detection pipeline.
 */
export class Registry {
    detectors = [];
    maskers = new Map();
    base;
    contextHintsSet;
    allowDenyManager;
    enableConfidenceScoring;
    enableJsonDetection;

    /**
     * @param {object} [options] - Registry configuration. `environment`,
     *   `allowDenyConfig`, `enableConfidenceScoring` and `enableJsonDetection`
     *   are consumed here; every remaining key becomes the base policy.
     * @throws {Error} When `validateOptions` rejects the configuration.
     */
    constructor(options = {}) {
        // Validate configuration
        this.validateOptions(options);
        const { environment, allowDenyConfig, enableConfidenceScoring, enableJsonDetection, ...policy } = options;
        this.base = policy;
        this.contextHintsSet = new Set(policy.contextHints ?? []);
        this.enableConfidenceScoring = enableConfidenceScoring ?? true;
        this.enableJsonDetection = enableJsonDetection ?? false;
        // Initialize allowlist/denylist manager
        this.allowDenyManager = new AllowDenyManager({
            environment: environment ?? 'production',
            ...allowDenyConfig,
        });
    }

    /**
     * Validate registry configuration options.
     *
     * @param {object} [options] - Options as passed to the constructor.
     * @throws {Error} On an unknown strictness, a string HMAC key shorter than
     *   32 characters, non-array `contextHints`, or an unknown action.
     */
    validateOptions(options = {}) {
        // Validate validation strictness
        if (options.validationStrictness &&
            !VALID_STRICTNESS.includes(options.validationStrictness)) {
            throw new Error(`Invalid validationStrictness: ${options.validationStrictness}. Must be 'fast', 'balanced', or 'strict'`);
        }
        // Validate HMAC key strength (only for string keys)
        if (options.hmacKey && typeof options.hmacKey === 'string' && options.hmacKey.length < 32) {
            throw new Error('HMAC key must be at least 32 characters long for security');
        }
        // Validate context hints
        if (options.contextHints && !Array.isArray(options.contextHints)) {
            throw new Error('contextHints must be an array of strings');
        }
        // Validate rules structure (a rule without `action` is accepted here)
        if (options.rules) {
            for (const [type, rule] of Object.entries(options.rules)) {
                if (rule && typeof rule === 'object') {
                    if (rule.action && !VALID_ACTIONS.includes(rule.action)) {
                        throw new Error(`Invalid action '${rule.action}' for type '${type}'. Must be 'mask', 'remove', or 'tokenize'`);
                    }
                }
            }
        }
        // Validate default action
        if (options.defaultAction && !VALID_ACTIONS.includes(options.defaultAction)) {
            throw new Error(`Invalid defaultAction: ${options.defaultAction}. Must be 'mask', 'remove', or 'tokenize'`);
        }
    }

    /**
     * Register detectors, maskers and extra context hints.
     *
     * @param {object[]} [detectors] - Detectors exposing `match(utils)` and an
     *   optional numeric `priority`.
     * @param {object} [maskers] - Map of hit type to masker function.
     * @param {string[]} [ctx] - Context hints merged into the base policy.
     */
    use(detectors = [], maskers = {}, ctx = []) {
        for (const d of detectors)
            this.detectors.push(d);
        // Sort so that higher priority detectors run first (lower number first)
        this.detectors.sort((a, b) => {
            const ap = a.priority ?? 0;
            const bp = b.priority ?? 0;
            if (isHigherPriority(ap, bp))
                return -1;
            if (isHigherPriority(bp, ap))
                return 1;
            return 0;
        });
        for (const [k, m] of Object.entries(maskers))
            this.maskers.set(k, m);
        if (ctx.length) {
            for (const hint of ctx)
                this.contextHintsSet.add(hint);
            this.base.contextHints = Array.from(this.contextHintsSet);
        }
    }

    /**
     * Load a plugin through `loadPlugin` and register what it provides.
     *
     * @param {string} pluginName - Name passed to `loadPlugin`.
     * @param {*} plugin - Plugin value passed to `loadPlugin`.
     * @returns {Promise<void>}
     */
    async useLazy(pluginName, plugin) {
        const loaded = await loadPlugin(pluginName, plugin);
        this.use(loaded.detectors, loaded.maskers, loaded.contextHints);
    }

    /** @returns {object} The base policy object (not a copy). */
    getPolicy() {
        return this.base;
    }

    /**
     * @param {string} t - Hit type.
     * @returns {Function|undefined} The masker registered for `t`, if any.
     */
    maskerFor(t) {
        return this.maskers.get(t);
    }

    /**
     * Try JSON detection on the input text and push any resulting hits.
     *
     * @param {string} src - Normalized input text.
     * @param {object} utils - Detection utilities; only `push` is used here
     *   directly, the object is also handed to the JSON detector.
     */
    tryJsonDetection(src, utils) {
        const jsonDetector = createJSONDetector();
        const result = jsonDetector.detectInJson(src, utils);
        if (result.isValidJson && result.hits.length > 0) {
            // Convert JsonHit to regular Hit for integration
            for (const jsonHit of result.hits) {
                const hit = {
                    type: jsonHit.type,
                    start: jsonHit.start,
                    end: jsonHit.end,
                    value: jsonHit.value,
                    risk: jsonHit.risk,
                    priority: JSON_DETECTION_PRIORITY, // Higher priority than default built-ins for JSON-based detection
                    confidence: jsonHit.confidence,
                    reasons: jsonHit.reasons,
                    features: {
                        ...jsonHit.features,
                        isJsonDetection: true,
                        jsonPath: jsonHit.jsonPath,
                        keyName: jsonHit.keyName,
                    },
                };
                utils.push(hit);
            }
        }
    }

    /**
     * Run the full detection pipeline: built-in detection, optional JSON
     * detection, registered detectors, overlap resolution, allowlist
     * filtering, and optional confidence scoring/filtering.
     *
     * @param {string} raw - Text to scan.
     * @param {string[]} [ctxHints] - Context hints; defaults to the policy's.
     * @returns {Promise<{src: string, hits: object[]}>} The normalized text and
     *   the surviving hits, sorted by `start` and non-overlapping.
     * @throws {Error} When `raw` is not a string or exceeds
     *   `SECURITY_LIMITS.maxInputLength`.
     */
    async detect(raw, ctxHints = this.base.contextHints ?? []) {
        // Input validation and size limits
        if (typeof raw !== 'string') {
            throw new Error('Input must be a string');
        }
        if (raw.length > SECURITY_LIMITS.maxInputLength) {
            throw new Error(`Input too large: ${raw.length} chars exceeds limit of ${SECURITY_LIMITS.maxInputLength}`);
        }
        const src = normalize(raw);
        const hits = [];
        // Lazily compute lowercase and hints only when needed to reduce overhead on hot path
        let srcLower = null;
        const getSrcLower = () => {
            if (srcLower === null) {
                srcLower = src.toLowerCase();
            }
            return srcLower;
        };
        let contextCheckCache = null;
        const u = {
            src,
            hasCtx: (ws) => {
                const hay = getSrcLower();
                if (!ws) {
                    // No explicit words: test the configured hints once and cache the answer
                    if (contextCheckCache === null) {
                        const hints = ctxHints.length > 0 ? ctxHints : Array.from(this.contextHintsSet);
                        contextCheckCache = hints.some((w) => hay.includes(w.toLowerCase()));
                    }
                    return contextCheckCache;
                }
                return ws.some((w) => hay.includes(w.toLowerCase()));
            },
            push: (h) => {
                // Silently drop hits once the match cap is reached
                if (hits.length >= SECURITY_LIMITS.maxPatternMatches)
                    return;
                hits.push(h);
            },
            canPush: () => hits.length < SECURITY_LIMITS.maxPatternMatches,
        };
        // Get validation strictness from policy (default to 'fast' for backward compatibility)
        const validationStrictness = this.base.validationStrictness ?? 'fast';
        builtinDetect(u, validationStrictness);
        // Try JSON detection if enabled
        if (this.enableJsonDetection) {
            this.tryJsonDetection(src, u);
        }
        // Assign default priority for builtin hits that didn't set one
        for (let i = 0; i < hits.length; i++) {
            if (hits[i].priority === undefined)
                hits[i].priority = DEFAULT_BUILTIN_PRIORITY;
        }
        for (const d of this.detectors) {
            const originalPush = u.push;
            // Override push to set detector priority and apply validation if not already set
            u.push = (hit) => {
                if (hit.priority === undefined) {
                    hit.priority = d.priority ?? 0;
                }
                // Fast mode: no validation
                if (validationStrictness === 'fast') {
                    originalPush(hit);
                    return;
                }
                try {
                    // Extract context around the hit for validation
                    const beforeStart = Math.max(0, hit.start - VALIDATION_WINDOW_SIZE);
                    const afterEnd = Math.min(src.length, hit.end + VALIDATION_WINDOW_SIZE);
                    const validationContext = {
                        surroundingText: src.slice(beforeStart, afterEnd),
                        strictness: validationStrictness,
                        originalIndex: hit.start - beforeStart,
                    };
                    const validationResult = validateCandidate(hit.value, hit.type, validationContext);
                    // Only push if validation passes
                    if (validationResult.valid) {
                        originalPush(hit);
                    }
                    // For plugins, we don't log validation failures to avoid noise
                }
                catch (_error) {
                    // Deliberate best-effort: if validation itself throws, accept the hit
                    originalPush(hit);
                }
            };
            await d.match(u);
            // Restore original push
            u.push = originalPush;
        }
        if (hits.length === 0)
            return { src, hits: [] };
        // Sort by start; for equal starts the longer hit comes first
        hits.sort((a, b) => a.start - b.start || b.end - b.start - (a.end - a.start));
        const writeIndex = resolveOverlaps(hits);
        // Filter hits through allowlist/denylist
        const filteredHits = [];
        for (let i = 0; i < writeIndex; i++) {
            const hit = hits[i];
            // Extract context around the hit for allowlist/denylist checking
            const contextStart = Math.max(0, hit.start - ALLOWLIST_WINDOW_SIZE);
            const contextEnd = Math.min(src.length, hit.end + ALLOWLIST_WINDOW_SIZE);
            const context = src.slice(contextStart, contextEnd);
            // Check if this value should be allowed (not treated as PII)
            if (!this.allowDenyManager.isAllowed(hit.value, hit.type, context)) {
                filteredHits.push(hit);
            }
        }
        // Apply confidence scoring if enabled
        const scoredHits = this.enableConfidenceScoring
            ? filteredHits.map((hit) => {
                const confidenceResult = calculateConfidence(hit, src);
                return {
                    ...hit,
                    confidence: confidenceResult.confidence,
                    reasons: confidenceResult.reasons,
                    features: {
                        ...hit.features, // Preserve existing features (like JSON detection info)
                        ...confidenceResult.features, // Add confidence-related features
                    },
                };
            })
            : filteredHits;
        // Apply confidence-based filtering
        const finalFilteredHits = this.enableConfidenceScoring && this.base.sensitivity
            ? filterByConfidence(scoredHits, this.base.sensitivity, this.base.confidenceThreshold)
            : scoredHits;
        // Create clean copies of final hits to avoid pool reference issues
        const finalHits = finalFilteredHits.map((hit) => ({
            type: hit.type,
            start: hit.start,
            end: hit.end,
            value: hit.value,
            risk: hit.risk,
            priority: hit.priority,
            confidence: hit.confidence,
            reasons: hit.reasons,
            features: hit.features,
        }));
        // Return accepted hit objects to the pool; rejected ones were already released
        if (writeIndex > 0) {
            hitPool.releaseRange(hits, writeIndex);
        }
        return { src, hits: finalHits };
    }
}

/**
 * Detect PII in `input` and replace each hit according to the policy.
 *
 * The action for a hit is the matching rule's `action`, else
 * `cfg.defaultAction`, else `'mask'`. A rule that carries only options
 * (e.g. `{ preserveLast4: true }`) therefore still redacts instead of letting
 * the raw value through.
 *
 * @param {Registry} reg - Registry providing policy, detection and maskers.
 * @param {string} input - Text to redact.
 * @param {object} [override] - Policy keys overriding the registry policy for
 *   this call.
 * @returns {Promise<string>} The normalized text with hits replaced.
 * @throws {Error} When a hit requires tokenization and no `hmacKey` is set.
 */
export async function redactText(reg, input, override = {}) {
    const cfg = { ...reg.getPolicy(), ...override };
    const { src, hits } = await reg.detect(input, cfg.contextHints);
    if (hits.length === 0)
        return src;
    const needTok = Object.values(cfg.rules ?? {}).some((v) => v?.action === 'tokenize') ||
        cfg.defaultAction === 'tokenize';
    const key = needTok && cfg.hmacKey ? await importHmacKey(cfg.hmacKey) : undefined;
    const parts = [];
    let cur = 0;
    for (const h of hits) {
        const rule = cfg.rules?.[h.type];
        // Fall back per-field so a rule without `action` does not leak the value
        const action = rule?.action ?? cfg.defaultAction ?? 'mask';
        if (h.start > cur) {
            parts.push(src.slice(cur, h.start));
        }
        // NOTE(review): an unrecognized action string still leaves the value
        // unchanged, as before — confirm whether that should fail closed.
        let rep = h.value;
        if (action === 'remove') {
            rep = '';
        }
        else if (action === 'mask') {
            rep = reg.maskerFor(h.type)?.(h) ?? defaultMask(h, rule?.preserveLast4);
        }
        else if (action === 'tokenize') {
            if (!key)
                throw new Error(`hmacKey is required for tokenize action on type ${h.type}`);
            rep = `TKN_${String(h.type).toUpperCase()}_${await hmacToken(h.value, key)}`;
        }
        if (rep !== '') {
            parts.push(rep);
        }
        cur = h.end;
    }
    if (cur < src.length) {
        parts.push(src.slice(cur));
    }
    return parts.join('');
}