UNPKG

pii-paladin

Version:

A Node.js package to censor PII in a string using a hybrid NER and Regex approach.

221 lines (220 loc) 10.1 kB
/*
 * PII-PALADIN
 *
 * index.js: Core logic for PII censoring.
 */

import { pipeline, env } from '@xenova/transformers';
import { fileURLToPath } from 'url';
import path from 'path';

// --- Configuration ---

// Get the directory of the current module.
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Configure transformers.js to use local files ONLY.
// This is critical for ensuring offline operation.

// 1. Set the path to the local model directory.
//    Models are expected to be in the `models/` directory next to this module.
env.localModelPath = path.resolve(__dirname, 'models/');

// 2. Disable remote model downloads.
//    If the model is not found locally, it will throw an error instead of trying to download it.
env.allowRemoteModels = false;

// 3. Set the path to the local ONNX runtime WASM files.
//    WASM files are expected to be in the `wasm/` directory next to this module.
env.backends.onnx.wasm.wasmPaths = path.join(__dirname, 'wasm/');

/**
 * @typedef {object} PIIEntity
 * @property {string} entity_group - The type of PII detected (e.g., 'PER', 'ORG', 'LOC', 'SSN', 'EMAIL').
 * @property {string} word - The actual text of the detected PII.
 * @property {number} start - The starting character index of the PII in the original string.
 * @property {number} end - The ending character index (exclusive) of the PII in the original string.
 * @property {number} score - The confidence score of the detection (for NER, 1.0 for regex).
 */

/**
 * Singleton class to manage the Named Entity Recognition (NER) pipeline from transformers.js.
 * Ensures that the NER model is loaded only once per successful initialization.
 */
export class NerPipelineSingleton {
  /**
   * Retrieves the singleton instance of the NER pipeline, creating it on first use.
   *
   * @param {function} [progress_callback=null] - Optional callback function for progress
   *   updates during model loading. Only used by the call that actually creates the pipeline.
   * @returns {Promise<object>} A promise that resolves to the NER pipeline instance.
   * @throws {Error} If the pipeline cannot be created; the underlying error is attached as `cause`.
   */
  static async getInstance(progress_callback = null) {
    if (this.instance === null) {
      try {
        // Create a new pipeline instance
        this.instance = await pipeline(this.task, this.model, { progress_callback });
      } catch (error) {
        console.error('Error loading NER model or pipeline:', error);
        console.error(
          "Please ensure model files are in 'models/Xenova/bert-base-NER/onnx/' and WASM files are in 'wasm/'.",
        );
        // Preserve the original failure so callers can inspect the root cause.
        throw new Error(
          'Failed to initialize NER pipeline. Check model and WASM file paths.',
          { cause: error },
        );
      }
    }
    return this.instance;
  }
}
NerPipelineSingleton.task = 'token-classification';
NerPipelineSingleton.model = 'Xenova/bert-base-NER';
NerPipelineSingleton.instance = null;

// --- Regex Patterns for Structured PII ---
// Several of these patterns intentionally overlap (e.g. a bare 9-digit number matches
// SSN, ID_NUMBER, PASSPORT_NUMBER and POLICY_NUMBER); overlapping hits are merged
// before censoring.
const regexPIIPatterns = [
  { type: 'SSN', pattern: /\b\d{3}[- ]?\d{2}[- ]?\d{4}\b/g },
  { type: 'CREDIT_CARD', pattern: /\b(?:\d[ -]*?){13,16}\b/g }, // Basic pattern, not validating prefixes or Luhn
  {
    type: 'CREDIT_CARD_EXPIRATION',
    pattern:
      /\b(?:Exp(?:ires|iration Date)?:?|Valid Thru:?)\s*(0[1-9]|1[0-2])[-/]([0-9]{2}|[0-9]{4})\b/g,
  }, // MM/YY or MM/YYYY with mandatory prefixes
  { type: 'EMAIL', pattern: /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g },
  { type: 'PHONE', pattern: /\b(?:\(?\d{3}\)?[-. ]?\d{3}[-. ]?\d{4})\b/g }, // Common US formats
  {
    type: 'IP_ADDRESS',
    pattern:
      /\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b/g,
  },
  { type: 'VIN', pattern: /\b[A-HJ-NPR-Z0-9]{17}\b/g }, // Standard 17-character VIN, excluding I, O, Q
  { type: 'ID_NUMBER', pattern: /\b\d{9}\b/g }, // General 9-digit ID (e.g., some driver's licenses, state IDs)
  { type: 'PASSPORT_NUMBER', pattern: /\b[A-Z0-9]{8,12}\b/g }, // General 8-12 alphanumeric for passports
  { type: 'MEDICAL_RECORD_NUMBER', pattern: /\bMRN-\d{7}\b/g }, // Specific to MRN-XXXXXXX format
  { type: 'POLICY_NUMBER', pattern: /\b\d{5,10}\b/g }, // General 5-10 digit policy numbers
];

// NER entity groups that are treated as PII.
// DATE is excluded as the model doesn't reliably detect it.
const NER_PII_GROUPS = new Set(['PER', 'ORG', 'LOC', 'MISC']);

// Text substituted for every censored span.
const CENSOR_PLACEHOLDER = '[CENSORED]';

/**
 * Removes the WordPiece continuation marker (`##`) from a token, if present.
 *
 * @param {string} word - Token text as reported by the NER pipeline.
 * @returns {string} The token text without a leading `##`.
 */
function stripSubwordPrefix(word) {
  return word.startsWith('##') ? word.substring(2) : word;
}

/**
 * Groups per-token NER results (B-/I- tagged) into whole entities and locates them in `input`.
 *
 * Character offsets are recovered by searching `input` forward from the end of the previously
 * located token, because the offsets are not taken from the pipeline output.
 *
 * @param {string} input - The original text that was passed to the NER pipeline.
 * @param {Array<{entity: string, word: string, score: number}>} rawEntities - Token-level results.
 * @returns {PIIEntity[]} Aggregated entities in order of appearance.
 */
function aggregateNerEntities(input, rawEntities) {
  const aggregated = [];
  let current = null;
  let cursor = 0; // Position in `input` just past the last located token.

  for (const token of rawEntities) {
    const labelParts = token.entity.split('-');
    const tag = labelParts[0]; // e.g., B, I, O
    const type = labelParts[labelParts.length - 1]; // e.g., PER, ORG, LOC, MISC

    if (tag === 'B') {
      // Start of a new entity: flush whatever was being built.
      if (current !== null) {
        aggregated.push(current);
      }
      const word = stripSubwordPrefix(token.word);
      const start = input.indexOf(word, cursor);
      if (start !== -1) {
        current = {
          entity_group: type,
          word,
          start,
          end: start + word.length,
          score: token.score,
        };
        cursor = start + word.length;
      } else {
        // Token text could not be located in the input; skip it.
        current = null;
      }
    } else if (tag === 'I' && current !== null && current.entity_group === type) {
      // Continuation of the current entity. Sub-word pieces attach directly;
      // whole words are expected to be preceded by a space.
      const piece = token.word.startsWith('##') ? token.word.substring(2) : ` ${token.word}`;
      const start = input.indexOf(piece, cursor);
      if (start !== -1) {
        current.word += piece;
        current.end = start + piece.length;
        // Take the min score for the aggregated entity.
        current.score = Math.min(current.score, token.score);
        cursor = start + piece.length;
      } else {
        // Continuation not found: finalize the current entity and reset.
        aggregated.push(current);
        current = null;
      }
    } else {
      // Not a B- or I- tag, an I- tag without a matching open entity, or 'O' (Outside).
      if (current !== null) {
        aggregated.push(current);
      }
      current = null;
      // Advance the cursor past this token so later searches start after it.
      const word = stripSubwordPrefix(token.word);
      const next = input.indexOf(word, cursor);
      cursor = next !== -1 ? next + word.length : cursor + 1;
    }
  }

  // Push the last entity if it exists.
  if (current !== null) {
    aggregated.push(current);
  }
  return aggregated;
}

/**
 * Finds structured PII in `input` using `regexPIIPatterns`.
 *
 * @param {string} input - The text to scan.
 * @returns {PIIEntity[]} One entity per regex match, each with a score of 1.0.
 */
function findRegexEntities(input) {
  const found = [];
  for (const { type, pattern } of regexPIIPatterns) {
    // matchAll works on a copy of the regex, so the shared /g literals keep no state.
    for (const match of input.matchAll(pattern)) {
      found.push({
        entity_group: type,
        word: match[0],
        start: match.index,
        end: match.index + match[0].length,
        score: 1.0, // Assign a high score for regex matches
      });
    }
  }
  return found;
}

/**
 * Converts entities into a sorted list of non-overlapping `[start, end)` spans.
 *
 * Entities with unusable indices are reported via `console.warn` and dropped. Spans that
 * overlap are merged into one so that each region of text is censored exactly once;
 * spans that merely touch are kept separate.
 *
 * @param {PIIEntity[]} entities - Detected entities, in any order.
 * @returns {Array<{start: number, end: number}>} Merged spans in ascending order.
 */
function mergeCensorSpans(entities) {
  const spans = [];
  for (const entity of entities) {
    const { start, end } = entity;
    const isValid =
      typeof start === 'number' &&
      typeof end === 'number' &&
      start !== -1 &&
      end !== -1 &&
      start < end;
    if (isValid) {
      spans.push({ start, end });
    } else {
      console.warn(`Invalid entity indices received for entity: ${JSON.stringify(entity)}`);
    }
  }

  spans.sort((a, b) => a.start - b.start);

  const merged = [];
  for (const span of spans) {
    const last = merged[merged.length - 1];
    if (last !== undefined && span.start < last.end) {
      last.end = Math.max(last.end, span.end);
    } else {
      merged.push(span);
    }
  }
  return merged;
}

/**
 * Rebuilds `input` with every span replaced by the censor placeholder.
 *
 * @param {string} input - The original text.
 * @param {Array<{start: number, end: number}>} spans - Sorted, non-overlapping spans.
 * @returns {string} The censored text.
 */
function applyCensorship(input, spans) {
  let output = '';
  let cursor = 0;
  for (const { start, end } of spans) {
    output += input.slice(cursor, start) + CENSOR_PLACEHOLDER;
    cursor = end;
  }
  return output + input.slice(cursor);
}

/**
 * Censors detected Personally Identifiable Information (PII) in a string using a hybrid approach.
 *
 * This function combines Named Entity Recognition (NER) for contextual PII (names, orgs, locations)
 * with Regular Expressions (Regex) for structured PII (SSN, credit cards, emails, phones, etc.).
 * Overlapping detections are merged, so each affected region is replaced by a single `[CENSORED]`.
 *
 * @param {string} input The string to censor.
 * @returns {Promise<string>} A promise that resolves to the censored string.
 * @throws {Error} If the NER pipeline cannot be initialized.
 */
export async function censorPII(input) {
  const ner = await NerPipelineSingleton.getInstance();

  // 1. Get entities from the NER model (individual tokens, aggregated locally).
  const rawEntities = await ner(input, {
    aggregation_strategy: 'none',
  });
  const nerEntities = aggregateNerEntities(input, rawEntities).filter((entity) =>
    NER_PII_GROUPS.has(entity.entity_group),
  );

  // 2. Get entities from Regex patterns.
  const regexEntities = findRegexEntities(input);

  // 3. Combine, merge overlapping spans, and replace each with the placeholder.
  const allPiiEntities = [...nerEntities, ...regexEntities];
  return applyCensorship(input, mergeCensorSpans(allPiiEntities));
}