UNPKG

@naturalcycles/scrubber-lib

Version:

Scrub data in JavaScript plain objects by using rules defined in a configuration object

369 lines (367 loc) 14.5 kB
import crypto from 'node:crypto';
import { _assert } from '@naturalcycles/js-lib/error/assert.js';
import { nanoIdCustomAlphabet } from '@naturalcycles/nodejs-lib';

/*
  Every scrubber below comes in two flavours:
  - `xyzScrubber(value, params)` transforms a JavaScript value directly;
  - `xyzScrubberSQL(params)` returns a SQL expression (as a string) that performs the
    equivalent transformation on the column placeholder `VAL`.

  NOTE(review): the SQL functions used (TIMESTAMP_NTZ_FROM_PARTS, RANDSTR, IFF, ARRAY_SLICE...)
  look like the Snowflake dialect - confirm before targeting another engine.
  NOTE(review): params are interpolated into the SQL verbatim (no quote escaping), so they
  must come from trusted configuration.
*/

/**
 * Renders a replacement value as a SQL literal.
 *
 * @param value the value to render
 * @param type 'STRING' wraps the value in single quotes; anything else is stringified as is
 * @returns the SQL literal
 */
function encloseValueForSQL(value, type) {
  if (type === 'STRING') return `'${value}'`;
  return String(value);
}

// The name of the original value in the SQL statement
const sqlValueToReplace = 'VAL';

// Seed for all random functions. If `HASH(${sqlValueToReplace})` is used,
// the random value will be the same every time the table is queried
// With `HASH(RANDOM())`, the random value will be different every time, but is safer cryptographically
const randomGeneratorSeed = `HASH(RANDOM())`;

/**
 * Wraps a SQL expression so that it is only applied when the original value matches `ifMatch`.
 *
 * @param ifMatch regex source; when `undefined` the expression is returned untouched
 * @param expression SQL expression to apply on match
 * @returns the (possibly wrapped) SQL expression
 */
function wrapIfMatchSQL(ifMatch, expression) {
  if (ifMatch === undefined) return expression;
  return `CASE WHEN REGEXP_LIKE(${sqlValueToReplace}, '${ifMatch}') THEN ${expression} ELSE ${sqlValueToReplace} END`;
}

/** Drops the value: always returns `undefined`. */
export const undefinedScrubber = () => undefined;

/** SQL twin of `undefinedScrubber`: always `NULL`. */
export const undefinedScrubberSQL = () => 'NULL';

/** Returns the value untouched. */
export const preserveOriginalScrubber = value => value;

/** SQL twin of `preserveOriginalScrubber`: the column placeholder itself. */
export const preserveOriginalScrubberSQL = () => sqlValueToReplace;

/**
 * Replaces the value by `params.replacement`.
 * When `params.ifMatch` is set, a truthy value that does NOT match it is kept as is.
 *
 * @param value the value to scrub (must support `.match()` when `ifMatch` is set)
 * @param params `{ replacement, ifMatch? }`
 * @returns the original value or the replacement
 */
export const staticScrubber = (value, params = { replacement: '' }) => {
  const keepOriginal = params.ifMatch && !value.match(params.ifMatch) && value;
  return keepOriginal || params.replacement;
};

/**
 * SQL twin of `staticScrubber`. Numeric replacements are emitted bare, everything else quoted.
 *
 * @param params `{ replacement, ifMatch? }`
 * @returns SQL expression
 */
export const staticScrubberSQL = (params = { replacement: '' }) => {
  const { ifMatch, replacement } = params;
  const type = typeof replacement === 'number' ? 'NUMBER' : 'STRING';
  return wrapIfMatchSQL(ifMatch, encloseValueForSQL(replacement, type));
};

/**
 * Coarsens a date string by fixed character positions (YYYY-MM-DD layout).
 *
 * @param value date string; falsy values yield `undefined`
 * @param params `{ excludeDay?, excludeMonth?, excludeYear? }` - each flag overwrites that part
 *   with '01' (day, month) or '1970' (year)
 * @returns the coarsened string
 */
export const isoDateStringScrubber = (value, params = {}) => {
  if (!value) return;
  let scrubbed = value;
  if (params.excludeDay) {
    scrubbed = scrubbed.slice(0, 8) + '01';
  }
  if (params.excludeMonth) {
    scrubbed = scrubbed.slice(0, 5) + '01' + scrubbed.slice(7, 10);
  }
  if (params.excludeYear) {
    scrubbed = '1970' + scrubbed.slice(4, 11);
  }
  return scrubbed;
};

/**
 * SQL twin of `isoDateStringScrubber`. Each enabled flag wraps the expression built so far.
 *
 * @param params `{ excludeDay?, excludeMonth?, excludeYear? }`
 * @returns SQL expression, e.g. "SUBSTR(VAL, 0, 8) || '01'"
 */
export const isoDateStringScrubberSQL = (params = {}) => {
  let replacement = sqlValueToReplace;
  if (params.excludeDay) {
    replacement = `SUBSTR(${replacement}, 0, 8) || '01'`;
  }
  if (params.excludeMonth) {
    replacement = `SUBSTR(${replacement}, 0, 5) || '01' || SUBSTR(${replacement}, 8, 10)`;
  }
  if (params.excludeYear) {
    replacement = `'1970' || SUBSTR(${replacement}, 5, 10)`;
  }
  return replacement;
};

/**
 * Coarsens a unix timestamp given in seconds.
 * NOTE: the non-UTC `Date` setters are used, so the result depends on the process time zone.
 *
 * @param value timestamp in seconds; falsy values (including 0) yield `undefined`
 * @param params `{ excludeTime?, excludeDay?, excludeMonth?, excludeYear? }`
 * @returns the coarsened timestamp in seconds, rounded
 */
export const unixTimestampScrubber = (value, params = {}) => {
  if (!value) return;
  const date = new Date(value * 1000);
  if (params.excludeTime) {
    date.setSeconds(0);
    date.setMinutes(0);
    date.setHours(0);
  }
  if (params.excludeDay) {
    date.setDate(1);
  }
  if (params.excludeMonth) {
    date.setMonth(0);
  }
  if (params.excludeYear) {
    date.setFullYear(1970);
  }
  return Math.round(date.getTime() / 1000);
};

/**
 * SQL twin of `unixTimestampScrubber`: rebuilds a timestamp from its parts,
 * substituting constants for the excluded ones.
 *
 * @param params `{ excludeTime?, excludeDay?, excludeMonth?, excludeYear? }`
 * @returns SQL expression
 */
export const unixTimestampScrubberSQL = (params = {}) => {
  const part = (name, excluded, constant) =>
    excluded ? constant : `DATE_PART('${name}', ${sqlValueToReplace})`;
  const parts = [
    part('YEAR', params.excludeYear, '1970'),
    part('MONTH', params.excludeMonth, '1'),
    part('DAY', params.excludeDay, '1'),
    part('HOUR', params.excludeTime, '0'),
    part('MINUTE', params.excludeTime, '0'),
    part('SECOND', params.excludeTime, '0'),
  ];
  return `TIMESTAMP_NTZ_FROM_PARTS(${parts.join(', ')})`;
};

/**
 * Replaces the last `count` characters of a string.
 *
 * @param value string to scrub; falsy values yield `undefined`
 * @param params `{ count = 99, replacement = 'X', replaceFull = false }`.
 *   With `replaceFull` the removed tail is replaced by ONE `replacement`;
 *   otherwise each removed character is replaced by `replacement`.
 * @returns the scrubbed string
 */
export const charsFromRightScrubber = (value, params = {}) => {
  if (!value) return;
  // Per-field defaults, so that a partial params object does not yield `undefined` fields
  const { count = 99, replacement = 'X', replaceFull = false } = params;
  // Clamp at 0: a count larger than the string must keep nothing (not slice from the end)
  const kept = value.slice(0, Math.max(0, value.length - count));
  if (replaceFull) {
    return kept + replacement;
  }
  return kept + replacement.repeat(Math.min(count, value.length));
};

/**
 * SQL twin of `charsFromRightScrubber`.
 *
 * @param params `{ count = 99, replacement = 'X', replaceFull = false }`
 * @returns SQL expression
 */
export const charsFromRightScrubberSQL = (params = {}) => {
  const { count = 99, replacement = 'X', replaceFull = false } = params;
  const kept = `SUBSTR(${sqlValueToReplace}, 0, LEN(${sqlValueToReplace}) - ${count})`;
  if (replaceFull) {
    // remove $count chars from the right, and replace it by $replacement
    return `${kept} || '${replacement}'`;
  }
  // replace each chars from the right by $replacement until $count chars are replaced
  return `${kept} || REPEAT('${replacement}', LEAST(${count}, LEN(${sqlValueToReplace})))`;
};

/**
 * Keeps the first `count` characters of a string and replaces the rest.
 * Strings of at most `count` characters are returned unchanged.
 *
 * @param value string to scrub; falsy values yield `undefined`
 * @param params `{ count = 99, replacement = 'X', replaceFull = false }`.
 *   With `replaceFull` the removed tail is replaced by ONE `replacement`;
 *   otherwise each removed character is replaced by `replacement`.
 * @returns the scrubbed string
 */
export const keepCharsFromLeftScrubber = (value, params = {}) => {
  if (!value) return;
  const { count = 99, replacement = 'X', replaceFull = false } = params;
  if (value.length <= count) {
    return value;
  }
  const kept = value.slice(0, count);
  if (replaceFull) {
    return kept + replacement;
  }
  return kept + replacement.repeat(value.length - count);
};

/**
 * SQL twin of `keepCharsFromLeftScrubber`.
 *
 * @param params `{ count = 99, replacement = 'X', replaceFull = false }`
 * @returns SQL expression
 */
export const keepCharsFromLeftScrubberSQL = (params = {}) => {
  const { count = 99, replacement = 'X', replaceFull = false } = params;
  if (replaceFull) {
    // keep $count chars from the left, and replace rest by $replacement
    return `IFF(LEN(${sqlValueToReplace}) > ${count}, SUBSTR(${sqlValueToReplace}, 0, ${count}) || '${replacement}', ${sqlValueToReplace})`;
  }
  // keep $count chars and fill out with $replacement if string was longer.
  // GREATEST (not LEAST) so that the number of fill chars is LEN - count, floored at 0.
  return `SUBSTR(${sqlValueToReplace}, 0, ${count}) || REPEAT('${replacement}', GREATEST(0, LEN(${sqlValueToReplace}) - ${count}))`;
};

/*
  Random scrubber

  Uses the package nanoid to generate a random string given an alphabet and a length
 */
const ALPHABET_NUMBER = '0123456789';
const ALPHABET_LOWERCASE = 'abcdefghijklmnopqrstuvwxyz';
const ALPHABET_ALPHANUMERIC_LOWERCASE = [ALPHABET_NUMBER, ALPHABET_LOWERCASE].join('');

/**
 * Replaces the value by a random string.
 *
 * @param _value ignored
 * @param additionalParams `{ alphabet?, length? }` (defaults: lowercase alphanumeric, 16)
 * @returns random string
 */
export const randomScrubber = (_value, additionalParams) => {
  const params = { alphabet: ALPHABET_ALPHANUMERIC_LOWERCASE, length: 16, ...additionalParams };
  return nanoIdCustomAlphabet(params.alphabet, params.length)();
};

/**
 * SQL twin of `randomScrubber`.
 *
 * @param additionalParams `{ length? }` (default 16)
 * @returns SQL expression
 */
export const randomScrubberSQL = additionalParams => {
  const { length } = { length: 16, ...additionalParams };
  // This doesn't respect the alphabet :(
  return `RANDSTR(${length}, ${randomGeneratorSeed})`;
};

/**
 * Replaces the value by a random email address.
 *
 * @param _value ignored
 * @param additionalParams `{ alphabet?, length?, domain? }`
 *   (defaults: lowercase alphanumeric, 16, '@example.com')
 * @returns random local part followed by `domain`
 */
export const randomEmailScrubber = (_value, additionalParams) => {
  const params = {
    alphabet: ALPHABET_ALPHANUMERIC_LOWERCASE,
    length: 16,
    domain: '@example.com',
    ...additionalParams,
  };
  return nanoIdCustomAlphabet(params.alphabet, params.length)() + params.domain;
};

/**
 * SQL twin of `randomEmailScrubber`.
 *
 * @param additionalParams `{ length?, domain? }` (defaults: 16, '@example.com')
 * @returns SQL expression
 */
export const randomEmailScrubberSQL = additionalParams => {
  const { length, domain } = {
    length: 16,
    domain: '@example.com',
    ...additionalParams,
  };
  // This doesn't respect the alphabet :(
  return `RANDSTR(${length}, ${randomGeneratorSeed}) || '${domain}'`;
};

// Email pattern shared by the JS and SQL "email in content" scrubbers.
// The character classes use 0-9: with 1-9 an address containing the digit 0 would be
// matched only partially (or not at all) and leak through.
const EMAIL_PATTERN = String.raw`[a-zA-Z0-9._-]*@[a-zA-Z0-9._-]*\.[a-zA-Z_-]{2,63}`;

/**
 * Replaces every email address found inside a text by a random email address.
 *
 * @param value text to scrub; non-string values are returned as is
 * @param additionalParams forwarded to `randomEmailScrubber`
 * @returns the text with all email addresses replaced
 */
export const randomEmailInContentScrubber = (value, additionalParams) => {
  if (typeof value !== 'string') return value;
  // Function replacer: one fresh random address per match, and no `$` pattern expansion
  return value.replace(new RegExp(EMAIL_PATTERN, 'g'), () =>
    randomEmailScrubber(value, additionalParams),
  );
};

/**
 * SQL twin of `randomEmailInContentScrubber`.
 *
 * @param additionalParams `{ length?, domain? }` (defaults: 16, '@example.com')
 * @returns SQL expression
 */
export const randomEmailInContentScrubberSQL = additionalParams => {
  const { length, domain } = {
    length: 16,
    domain: '@example.com',
    ...additionalParams,
  };
  // The domain belongs to the replacement address, not to the end of the whole content
  return `REGEXP_REPLACE(
    ${sqlValueToReplace},
    '${EMAIL_PATTERN}',
    RANDSTR(${length}, ${randomGeneratorSeed}) || '${domain}'
  )`;
};

/**
 * Replaces the value by the hex SHA-256 of the value followed by the initialization vector.
 *
 * @param value value to hash (passed to `Hash.update`)
 * @param params `{ initializationVector }` - required, asserted
 * @returns hex digest
 */
export const saltedHashScrubber = (value, params) => {
  _assert(params?.initializationVector, 'Initialization vector is missing');
  return crypto
    .createHash('sha256')
    .update(value)
    .update(params.initializationVector)
    .digest('hex');
};

/**
 * SQL twin of `saltedHashScrubber`.
 *
 * @param params `{ initializationVector }` - required, asserted
 * @returns SQL expression
 */
export const saltedHashScrubberSQL = params => {
  _assert(params?.initializationVector, 'Initialization vector is missing');
  return `SHA2(${sqlValueToReplace} || '${params.initializationVector}', 256)`;
};

/**
 * Like `saltedHashScrubber`, with an email domain appended to the digest.
 *
 * @param value value to hash
 * @param additionalParams `{ initializationVector, domain? }` (domain default '@example.com')
 * @returns hex digest followed by `domain`
 */
export const saltedHashEmailScrubber = (value, additionalParams) => {
  const params = {
    domain: '@example.com',
    ...additionalParams,
  };
  _assert(params?.initializationVector, 'Initialization vector is missing');
  return saltedHashScrubber(value, params) + params.domain;
};

/**
 * SQL twin of `saltedHashEmailScrubber`.
 *
 * @param additionalParams `{ initializationVector, domain? }` (domain default '@example.com')
 * @returns SQL expression
 */
export const saltedHashEmailScrubberSQL = additionalParams => {
  const { initializationVector, domain } = {
    domain: '@example.com',
    ...additionalParams,
  };
  _assert(initializationVector, 'Initialization vector is missing');
  return `SHA2(${sqlValueToReplace} || '${initializationVector}', 256) || '${domain}'`;
};

/**
 * Scrubs a string that contains `$`-separated sections: keeps everything up to and including
 * the 3rd `$` and replaces the rest by 53 random characters.
 *
 * @param value string to scrub; falsy values are returned as is
 * @param params `{ replacements? }` - comma-separated `prefix:fullReplacement` pairs; when the
 *   prefix of `value` equals one of the keys, the paired replacement is returned instead
 * @returns the scrubbed string; `$2a$12$` + random chars when `value` has fewer than 3 `$`
 */
export const bcryptStringScrubber = (value, params) => {
  if (!value) return value;
  // Keep value until 3rd $
  const cutoff = nthChar(value, '$', 3);
  if (!cutoff) return `$2a$12$${nanoIdCustomAlphabet(ALPHABET_ALPHANUMERIC_LOWERCASE, 53)()}`;
  const prefix = value.slice(0, cutoff);
  if (params?.replacements) {
    for (const kvPair of params.replacements.split(',')) {
      const [k, v] = kvPair.split(':');
      if (prefix === k) return v;
    }
  }
  return `${prefix}${nanoIdCustomAlphabet(ALPHABET_ALPHANUMERIC_LOWERCASE, 53)()}`;
};

/**
 * SQL twin of `bcryptStringScrubber`.
 *
 * @param params `{ replacements? }` - comma-separated `prefix:fullReplacement` pairs
 * @returns SQL expression
 */
export const bcryptStringScrubberSQL = params => {
  const prefixSQL = `ARRAY_TO_STRING(ARRAY_SLICE(SPLIT(${sqlValueToReplace}, '$'), 0, 3), '$') || '$'`;
  // to have at least one WHEN clause, so the ELSE clause is valid
  let replacementDLL = "WHEN FALSE THEN ''\n ";
  // unpack the replacements here rather than in SQL
  if (params?.replacements) {
    for (const kvPair of params.replacements.split(',')) {
      const [k, v] = kvPair.split(':');
      replacementDLL += `WHEN '${k}' THEN '${v}'\n `;
    }
  }
  replacementDLL += `ELSE ${prefixSQL} || RANDSTR(53, ${randomGeneratorSeed})`;
  // N occurrences of '$' split the string into N + 1 parts, hence ">= 4" for "at least 3 $".
  // Each SQL line comment is followed by a line break so it cannot swallow the next clause.
  return `CASE
    WHEN ARRAY_SIZE(SPLIT(${sqlValueToReplace}, '$')) >= 4 -- If there are at least 3 $ in the string
    THEN
      CASE ${prefixSQL} -- this is the prefix
        ${replacementDLL}
      END
    ELSE '$2a$12$' || RANDSTR(53, ${randomGeneratorSeed})
  END`;
};

/**
 * Replaces every substring matching `params.regex` by the hex SHA-256 of that substring
 * followed by the initialization vector.
 *
 * @param value string to scrub; falsy values are returned as is
 * @param params `{ initializationVector, regex }` - both required, asserted
 * @returns the scrubbed string
 */
export const saltedHashSubstringScrubber = (value, params) => {
  _assert(params?.initializationVector, 'Initialization vector is missing');
  _assert(params?.regex, 'Substring or regex is missing');
  if (!value) return value;
  const regex = new RegExp(params.regex, 'g');
  return value.replace(regex, substring =>
    crypto
      .createHash('sha256')
      .update(substring)
      .update(params.initializationVector)
      .digest('hex'),
  );
};

/**
 * SQL twin of `saltedHashSubstringScrubber`. The hash is computed from the substring returned
 * by `REGEXP_SUBSTR` (empty string when there is none) and used as the replacement.
 *
 * @param params `{ initializationVector, regex }` - both required, asserted
 * @returns SQL expression
 */
export const saltedHashSubstringScrubberSQL = params => {
  _assert(params?.initializationVector, 'Initialization vector is missing');
  _assert(params?.regex, 'Substring or regex is missing');
  const substringToReplace = `COALESCE(REGEXP_SUBSTR(${sqlValueToReplace}, '${params.regex}'), '')`;
  const hashedValue = `SHA2(${substringToReplace} || '${params.initializationVector}', 256)`;
  return `REGEXP_REPLACE(${sqlValueToReplace}, '${params.regex}', ${hashedValue})`;
};

/**
 * ZIP areas to scrub completely, due to less than 20,000 inhabitants
 */
const restrictedZipAreas = [
  '036',
  '059',
  '063',
  '102',
  '203',
  '556',
  '692',
  '790',
  '821',
  '823',
  '830',
  '831',
  '878',
  '879',
  '884',
  '890',
  '893',
];

/**
 * Keeps the first 3 characters of a ZIP code, unless they denote a restricted area.
 *
 * @param value ZIP code; falsy values yield `undefined`
 * @returns 'XXXXX' for restricted areas, otherwise the 3-character prefix followed by 'XX'
 */
export const zipScrubber = value => {
  if (!value) return;
  const leftPart = value.slice(0, 3);
  if (restrictedZipAreas.includes(leftPart)) return 'XXXXX';
  return `${leftPart}XX`; // de-identify length of zip code as well.
};

/**
 * SQL twin of `zipScrubber`.
 *
 * @returns SQL expression
 */
export const zipScrubberSQL = () => `CASE
    WHEN ARRAY_CONTAINS(
      SUBSTR(${sqlValueToReplace}, 0, 3),
      ['${restrictedZipAreas.join("', '")}']::ARRAY(STRING)
    )
    THEN 'XXXXX'
    ELSE SUBSTR(${sqlValueToReplace}, 0, 3) || 'XX'
  END`;

/**
 * Finds the position right after the n-th occurrence of `character` in `str`.
 *
 * @param str string to search
 * @param character string to look for
 * @param n 1-based occurrence number
 * @returns index following the n-th occurrence, or `undefined` when there are fewer than n
 */
function nthChar(str, character, n) {
  let count = 0;
  let i = 0;
  while (count < n) {
    // indexOf yields -1 when not found, i.e. 0 after the +1
    i = str.indexOf(character, i) + 1;
    if (i < 1) {
      return;
    }
    count++;
    if (count === n) return i;
  }
}

/** Registry of the JavaScript scrubbers, keyed by scrubber name. */
export const defaultScrubbers = {
  staticScrubber,
  preserveOriginalScrubber,
  isoDateStringScrubber,
  unixTimestampScrubber,
  undefinedScrubber,
  charsFromRightScrubber,
  randomScrubber,
  randomEmailScrubber,
  randomEmailInContentScrubber,
  saltedHashScrubber,
  saltedHashEmailScrubber,
  bcryptStringScrubber,
  saltedHashSubstringScrubber,
  keepCharsFromLeftScrubber,
  zipScrubber,
};

/** Registry of the SQL expression builders, under the same keys as `defaultScrubbers`. */
export const defaultScrubbersSQL = {
  staticScrubber: staticScrubberSQL,
  preserveOriginalScrubber: preserveOriginalScrubberSQL,
  isoDateStringScrubber: isoDateStringScrubberSQL,
  unixTimestampScrubber: unixTimestampScrubberSQL,
  undefinedScrubber: undefinedScrubberSQL,
  charsFromRightScrubber: charsFromRightScrubberSQL,
  randomScrubber: randomScrubberSQL,
  randomEmailScrubber: randomEmailScrubberSQL,
  randomEmailInContentScrubber: randomEmailInContentScrubberSQL,
  saltedHashScrubber: saltedHashScrubberSQL,
  saltedHashEmailScrubber: saltedHashEmailScrubberSQL,
  bcryptStringScrubber: bcryptStringScrubberSQL,
  saltedHashSubstringScrubber: saltedHashSubstringScrubberSQL,
  keepCharsFromLeftScrubber: keepCharsFromLeftScrubberSQL,
  zipScrubber: zipScrubberSQL,
};