@naturalcycles/scrubber-lib
Version:
Scrub data in JavaScript plain objects by using rules defined in a configuration object
369 lines (367 loc) • 14.5 kB
JavaScript
import crypto from 'node:crypto';
import { _assert } from '@naturalcycles/js-lib/error/assert.js';
import { nanoIdCustomAlphabet } from '@naturalcycles/nodejs-lib';
/**
 * Renders a value as a SQL literal.
 * A value of type 'STRING' is wrapped in single quotes; any other type is
 * stringified as-is. The value itself is not escaped.
 */
function encloseValueForSQL(value, type) {
  return type === 'STRING' ? `'${value}'` : String(value);
}
// Name under which the original value appears in every generated SQL expression
const sqlValueToReplace = 'VAL';
// Seed expression handed to the random SQL functions (see the RANDSTR calls below).
// If `HASH(${sqlValueToReplace})` is used, the random value will be the same every time the table is queried.
// With `HASH(RANDOM())`, the random value will be different every time, but is safer cryptographically.
const randomGeneratorSeed = `HASH(RANDOM())`;
/** Scrubber that drops the value: it always returns `undefined`. */
export const undefinedScrubber = () => {
  return undefined;
};
/** SQL form of the undefined scrubber: always yields the literal `NULL`. */
export const undefinedScrubberSQL = () => {
  return 'NULL';
};
/** Identity scrubber: hands the given value back unchanged. */
export const preserveOriginalScrubber = (value) => {
  return value;
};
/** SQL form of the identity scrubber: the expression is just the original value's name. */
export const preserveOriginalScrubberSQL = () => {
  return sqlValueToReplace;
};
/**
 * Replaces the value with a static replacement.
 *
 * @param value the original value; `.match` is called on it when `params.ifMatch` is set
 * @param params.replacement what to return instead of the original
 *   (defaults to '' only when `params` is omitted entirely)
 * @param params.ifMatch optional pattern; when set, only values matching it are replaced
 * @returns the replacement, or the untouched original when `ifMatch` is set and does not match
 */
export const staticScrubber = (value, params = { replacement: '' }) => {
  const { ifMatch, replacement } = params;
  if (!ifMatch) return replacement;
  // Decide on the match result itself, not on the truthiness of `value`,
  // so a non-matching empty string is preserved instead of being replaced.
  return value.match(ifMatch) ? replacement : value;
};
/**
 * SQL form of the static scrubber.
 * The replacement is rendered as a NUMBER literal when it is a JS number and
 * as a quoted STRING literal otherwise, then optionally guarded by `ifMatch`.
 */
export const staticScrubberSQL = (params = { replacement: '' }) => {
  const sqlType = typeof params.replacement === 'number' ? 'NUMBER' : 'STRING';
  const literal = encloseValueForSQL(params.replacement, sqlType);
  return wrapIfMatchSQL(params.ifMatch, literal);
};
/**
 * Coarsens a date string laid out as `YYYY-MM-DD...` by fixed character positions.
 *
 * @param value the date string; falsy input yields `undefined`
 * @param params.excludeDay replace the day with `01` (anything after the day is dropped)
 * @param params.excludeMonth replace the month with `01` (result is cut to 10 characters)
 * @param params.excludeYear replace the year with `1970` (result is cut to 10 characters)
 * @returns the scrubbed string, or the original when no flag is set
 */
export const isoDateStringScrubber = (value, params = {}) => {
  if (!value) return;
  const { excludeDay, excludeMonth, excludeYear } = params;
  let scrubbed = value;
  if (excludeDay) {
    scrubbed = `${scrubbed.slice(0, 8)}01`;
  }
  if (excludeMonth) {
    scrubbed = `${scrubbed.slice(0, 5)}01${scrubbed.slice(7, 10)}`;
  }
  if (excludeYear) {
    // Stop at index 10 like the month branch does; slicing to 11 would leak
    // the first character after the date (e.g. the `T` of a date-time string).
    scrubbed = `1970${scrubbed.slice(4, 10)}`;
  }
  return scrubbed;
};
/**
 * SQL form of the ISO date string scrubber.
 * Each enabled flag wraps the expression built so far, in the order day, month, year.
 * Example with only `excludeDay`: "SUBSTR(VAL, 0, 8) || '01'"
 */
export const isoDateStringScrubberSQL = (params = {}) => {
  const { excludeDay, excludeMonth, excludeYear } = params;
  let expr = sqlValueToReplace;
  if (excludeDay) expr = `SUBSTR(${expr}, 0, 8) || '01'`;
  if (excludeMonth) expr = `SUBSTR(${expr}, 0, 5) || '01' || SUBSTR(${expr}, 8, 10)`;
  if (excludeYear) expr = `'1970' || SUBSTR(${expr}, 5, 10)`;
  return expr;
};
/**
 * Coarsens a unix timestamp given in seconds.
 * Enabled flags reset the matching date fields through the `Date` setters,
 * which operate in the runtime's local time zone.
 *
 * @param value timestamp in seconds; falsy input (including 0) yields `undefined`
 * @param params.excludeTime reset hours, minutes and seconds to 0
 * @param params.excludeDay reset the day of month to 1
 * @param params.excludeMonth reset the month to January
 * @param params.excludeYear reset the year to 1970
 * @returns the scrubbed timestamp in seconds, rounded to an integer
 */
export const unixTimestampScrubber = (value, params = {}) => {
  if (!value) return undefined;
  const scrubbed = new Date(value * 1000);
  // Applied in this fixed order: time, day, month, year
  const steps = [
    [
      params.excludeTime,
      (d) => {
        d.setSeconds(0);
        d.setMinutes(0);
        d.setHours(0);
      },
    ],
    [params.excludeDay, (d) => d.setDate(1)],
    [params.excludeMonth, (d) => d.setMonth(0)],
    [params.excludeYear, (d) => d.setFullYear(1970)],
  ];
  for (const [enabled, apply] of steps) {
    if (enabled) apply(scrubbed);
  }
  return Math.round(scrubbed.getTime() / 1000);
};
/**
 * SQL form of the unix timestamp scrubber.
 * Builds a `TIMESTAMP_NTZ_FROM_PARTS(...)` call with six arguments; each excluded
 * component becomes a constant, every other one a `DATE_PART` of the original value.
 */
export const unixTimestampScrubberSQL = (params = {}) => {
  const part = (name) => `DATE_PART('${name}', ${sqlValueToReplace})`;
  const timeArgs = params.excludeTime
    ? ['0', '0', '0']
    : [part('HOUR'), part('MINUTE'), part('SECOND')];
  const args = [
    params.excludeYear ? '1970' : part('YEAR'),
    params.excludeMonth ? '1' : part('MONTH'),
    params.excludeDay ? '1' : part('DAY'),
    ...timeArgs,
  ];
  return `TIMESTAMP_NTZ_FROM_PARTS(${args.join(', ')})`;
};
/**
 * Masks characters counted from the right end of a string.
 *
 * @param value the string to scrub; falsy input yields `undefined`
 * @param params.count how many trailing characters to mask (default 99)
 * @param params.replacement the masking text (default 'X')
 * @param params.replaceFull when true, the masked tail is replaced by a single
 *   `replacement`; otherwise `replacement` is repeated once per masked character
 * @returns the untouched head of the string followed by the mask
 */
export const charsFromRightScrubber = (value, params = { count: 99, replacement: 'X', replaceFull: false }) => {
  if (!value) return;
  // Per-field defaults, so a partial params object (e.g. `{ count: 2 }`) still works
  const { count = 99, replacement = 'X', replaceFull = false } = params;
  // Clamp at 0: a negative end index would make `slice` count from the right
  const kept = value.slice(0, Math.max(0, value.length - count));
  if (replaceFull) {
    return kept + replacement;
  }
  return kept + replacement.repeat(Math.min(count, value.length));
};
/**
 * SQL form of the chars-from-right scrubber.
 *
 * @param params.count how many trailing characters to mask (default 99)
 * @param params.replacement the masking text (default 'X'); inserted into the SQL unescaped
 * @param params.replaceFull when true, the tail is replaced by a single `replacement`;
 *   otherwise `replacement` is repeated for each masked character
 * @returns a SQL expression string
 */
export const charsFromRightScrubberSQL = (params = { count: 99, replacement: 'X', replaceFull: false }) => {
  // Per-field defaults, so a partial params object does not put 'undefined' into the SQL
  const { count = 99, replacement = 'X', replaceFull = false } = params;
  const head = `SUBSTR(${sqlValueToReplace}, 0, LEN(${sqlValueToReplace}) - ${count})`;
  if (replaceFull) {
    // remove $count chars from the right, and replace them by $replacement
    return `${head} || '${replacement}'`;
  }
  // replace each char from the right by $replacement until $count chars are replaced
  return `${head} || REPEAT('${replacement}', LEAST(${count}, LEN(${sqlValueToReplace})))`;
};
/**
 * Keeps the first characters of a string and masks the rest.
 *
 * @param value the string to scrub; falsy input yields `undefined`
 * @param params.count how many leading characters to keep (default 99)
 * @param params.replacement the masking text (default 'X')
 * @param params.replaceFull when true, the masked rest is replaced by a single
 *   `replacement`; otherwise `replacement` is repeated once per masked character
 * @returns the value itself when it is not longer than `count`, else head plus mask
 */
export const keepCharsFromLeftScrubber = (value, params = { count: 99, replacement: 'X', replaceFull: false }) => {
  if (!value) return;
  // Per-field defaults, so a partial params object (e.g. `{ count: 2 }`) still works
  const { count = 99, replacement = 'X', replaceFull = false } = params;
  if (value.length <= count) {
    return value;
  }
  const kept = value.slice(0, count);
  const mask = replaceFull ? replacement : replacement.repeat(value.length - count);
  return kept + mask;
};
/**
 * SQL form of the keep-chars-from-left scrubber.
 *
 * @param params.count how many leading characters to keep (default 99)
 * @param params.replacement the masking text (default 'X'); inserted into the SQL unescaped
 * @param params.replaceFull when true, the rest is replaced by a single `replacement`;
 *   otherwise `replacement` is repeated for each character beyond `count`
 * @returns a SQL expression string
 */
export const keepCharsFromLeftScrubberSQL = (params = { count: 99, replacement: 'X', replaceFull: false }) => {
  // Per-field defaults, so a partial params object does not put 'undefined' into the SQL
  const { count = 99, replacement = 'X', replaceFull = false } = params;
  const head = `SUBSTR(${sqlValueToReplace}, 0, ${count})`;
  if (replaceFull) {
    // keep $count chars from the left, and replace the rest by $replacement
    return `IFF(LEN(${sqlValueToReplace}) > ${count}, ${head} || '${replacement}', ${sqlValueToReplace})`;
  }
  // keep $count chars and fill up with $replacement if the string was longer.
  // GREATEST clamps the repeat count at 0 for short strings; LEAST(0, ...) could never be positive.
  return `${head} || REPEAT('${replacement}', GREATEST(0, LEN(${sqlValueToReplace}) - ${count}))`;
};
/*
Random scrubber
Uses the package nanoid to generate a random string given an alphabet and a length
*/
// Character pools for the random scrubbers
const ALPHABET_NUMBER = '0123456789';
const ALPHABET_LOWERCASE = 'abcdefghijklmnopqrstuvwxyz';
// Digits first, then lowercase letters
const ALPHABET_ALPHANUMERIC_LOWERCASE = `${ALPHABET_NUMBER}${ALPHABET_LOWERCASE}`;
/**
 * Replaces the value with a random string; the original value is ignored.
 *
 * @param additionalParams.alphabet characters to draw from (default: digits + lowercase letters)
 * @param additionalParams.length length of the generated string (default 16)
 */
export const randomScrubber = (_value, additionalParams) => {
  const { alphabet, length } = {
    alphabet: ALPHABET_ALPHANUMERIC_LOWERCASE,
    length: 16,
    ...additionalParams,
  };
  const generate = nanoIdCustomAlphabet(alphabet, length);
  return generate();
};
/**
 * SQL form of the random scrubber: a `RANDSTR` call of the requested length (default 16).
 * Unlike the JS scrubber, no alphabet is passed on to the SQL expression.
 */
export const randomScrubberSQL = additionalParams => {
  const options = { length: 16, ...additionalParams };
  return `RANDSTR(${options.length}, ${randomGeneratorSeed})`;
};
/**
 * Replaces the value with a random email address; the original value is ignored.
 *
 * @param additionalParams.alphabet characters for the local part (default: digits + lowercase letters)
 * @param additionalParams.length length of the local part (default 16)
 * @param additionalParams.domain suffix appended after the local part, including the `@` (default '@example.com')
 */
export const randomEmailScrubber = (_value, additionalParams) => {
  const defaults = {
    alphabet: ALPHABET_ALPHANUMERIC_LOWERCASE,
    length: 16,
    domain: '@example.com',
  };
  const { alphabet, length, domain } = { ...defaults, ...additionalParams };
  const localPart = nanoIdCustomAlphabet(alphabet, length)();
  return `${localPart}${domain}`;
};
/**
 * SQL form of the random email scrubber: a `RANDSTR` local part (default length 16)
 * concatenated with the quoted domain (default '@example.com').
 * Unlike the JS scrubber, no alphabet is passed on to the SQL expression.
 */
export const randomEmailScrubberSQL = additionalParams => {
  const options = { length: 16, domain: '@example.com', ...additionalParams };
  const localPart = `RANDSTR(${options.length}, ${randomGeneratorSeed})`;
  return `${localPart} || '${options.domain}'`;
};
/**
 * Replaces every email address found inside a text with a freshly generated random one.
 *
 * @param value the text to scan; non-string input is returned untouched
 * @param additionalParams forwarded to `randomEmailScrubber` for each replacement
 * @returns the text with all matched addresses replaced, or the input as-is when none match
 */
export const randomEmailInContentScrubber = (value, additionalParams) => {
  if (typeof value !== 'string') return value;
  // `0-9` rather than `1-9`: with the digit 0 excluded, an address such as
  // `a0b@x.com` only matched from `b@` on and leaked its beginning.
  // The `g` flag makes this replace every address, not just the first one.
  const emailRegex = /[a-zA-Z0-9._-]*@[a-zA-Z0-9._-]*\.[a-zA-Z_-]{2,63}/g;
  // A replacer function gives each match its own address and keeps `$`
  // sequences in a custom domain from being read as replacement patterns.
  return value.replace(emailRegex, () => randomEmailScrubber(value, additionalParams));
};
/**
 * SQL form of the email-in-content scrubber: a `REGEXP_REPLACE` that swaps
 * email addresses inside the value for a `RANDSTR` local part plus the domain.
 *
 * @param additionalParams.length length of the random local part (default 16)
 * @param additionalParams.domain suffix including the `@` (default '@example.com'); inserted unescaped
 * @returns a SQL expression string
 */
export const randomEmailInContentScrubberSQL = additionalParams => {
  const { length, domain } = {
    length: 16,
    domain: '@example.com',
    ...additionalParams,
  };
  // `0-9` rather than `1-9`, so addresses containing the digit 0 match in full.
  // `[.]` matches a literal dot without relying on how backslashes inside a
  // single-quoted SQL string literal are treated.
  const emailPattern = '[a-zA-Z0-9._-]*@[a-zA-Z0-9._-]*[.][a-zA-Z_-]{2,63}';
  // The domain belongs to the replacement itself; appended after REGEXP_REPLACE
  // it would be glued to the end of the whole text instead of the address.
  return `REGEXP_REPLACE(
  ${sqlValueToReplace},
  '${emailPattern}',
  RANDSTR(${length}, ${randomGeneratorSeed}) || '${domain}'
)`;
};
/**
 * Replaces the value with the hex SHA-256 digest of the value followed by the salt.
 *
 * @param value the data to hash
 * @param params.initializationVector the salt; asserted to be present
 * @returns the digest as a hex string
 */
export const saltedHashScrubber = (value, params) => {
  _assert(params?.initializationVector, 'Initialization vector is missing');
  const hash = crypto.createHash('sha256');
  hash.update(value);
  hash.update(params.initializationVector);
  return hash.digest('hex');
};
/**
 * SQL form of the salted hash scrubber: `SHA2` (256) over the value concatenated with the salt.
 * The salt (`params.initializationVector`) is asserted to be present and inserted unescaped.
 */
export const saltedHashScrubberSQL = params => {
  _assert(params?.initializationVector, 'Initialization vector is missing');
  const salted = `${sqlValueToReplace} || '${params.initializationVector}'`;
  return `SHA2(${salted}, 256)`;
};
/**
 * Replaces the value with its salted hash (see `saltedHashScrubber`) followed by a domain,
 * producing an email-shaped string.
 *
 * @param additionalParams.initializationVector the salt; asserted to be present
 * @param additionalParams.domain suffix including the `@` (default '@example.com')
 */
export const saltedHashEmailScrubber = (value, additionalParams) => {
  const params = { domain: '@example.com', ...additionalParams };
  _assert(params.initializationVector, 'Initialization vector is missing');
  const hashed = saltedHashScrubber(value, params);
  return hashed + params.domain;
};
/**
 * SQL form of the salted hash email scrubber: the `SHA2` (256) expression over
 * value + salt, concatenated with the quoted domain (default '@example.com').
 * The salt is asserted to be present; salt and domain are inserted unescaped.
 */
export const saltedHashEmailScrubberSQL = additionalParams => {
  const merged = { domain: '@example.com', ...additionalParams };
  _assert(merged.initializationVector, 'Initialization vector is missing');
  const hashExpr = `SHA2(${sqlValueToReplace} || '${merged.initializationVector}', 256)`;
  return `${hashExpr} || '${merged.domain}'`;
};
/**
 * Scrubs a bcrypt-style string: keeps everything up to and including the third `$`
 * and replaces the remainder with 53 random lowercase alphanumeric characters.
 *
 * @param value the string to scrub; falsy input is returned as-is
 * @param params.replacements optional comma-separated `prefix:replacement` pairs; when the
 *   kept prefix equals one of the prefixes, that replacement is returned instead
 * @returns the scrubbed string; when the value has fewer than three `$`,
 *   the fixed prefix `$2a$12$` is used
 */
export const bcryptStringScrubber = (value, params) => {
  if (!value) return value;
  const randomTail = () => nanoIdCustomAlphabet(ALPHABET_ALPHANUMERIC_LOWERCASE, 53)();
  // Index just past the third `$`, or undefined when there are fewer than three
  const cutoff = nthChar(value, '$', 3);
  if (!cutoff) {
    return `$2a$12$${randomTail()}`;
  }
  const prefix = value.slice(0, cutoff);
  if (params?.replacements) {
    const hit = params.replacements
      .split(',')
      .map(kvPair => kvPair.split(':'))
      .find(([k]) => k === prefix);
    if (hit) return hit[1];
  }
  return `${prefix}${randomTail()}`;
};
/**
 * SQL form of the bcrypt string scrubber.
 * When the value contains at least three `$`, the prefix up to and including the
 * third `$` is kept (or swapped for a configured replacement) and 53 random
 * characters are appended; otherwise the fixed prefix `$2a$12$` is used.
 *
 * @param params.replacements optional comma-separated `prefix:replacement` pairs,
 *   unpacked here into WHEN clauses; inserted into the SQL unescaped
 * @returns a SQL CASE expression string
 */
export const bcryptStringScrubberSQL = params => {
  const parts = `SPLIT(${sqlValueToReplace}, '$')`;
  const prefix = `ARRAY_TO_STRING(ARRAY_SLICE(${parts}, 0, 3), '$') || '$'`;
  // A never-matching first branch guarantees at least one WHEN clause, so the ELSE clause is valid
  const whenClauses = ["WHEN FALSE THEN ''"];
  // unpack the replacements here rather than in SQL
  if (params?.replacements) {
    for (const kvPair of params.replacements.split(',')) {
      const [k, v] = kvPair.split(':');
      whenClauses.push(`WHEN '${k}' THEN '${v}'`);
    }
  }
  // Splitting on `$` yields one more part than there are `$`, so three `$` means
  // at least 4 parts. Measuring the 3-element slice instead also let values with
  // only two `$` through.
  return `CASE WHEN ARRAY_SIZE(${parts}) >= 4 -- If there are at least 3 $ in the string
  THEN
    CASE ${prefix} -- this is the prefix
      ${whenClauses.join('\n      ')}
      ELSE ${prefix} || RANDSTR(53, ${randomGeneratorSeed})
    END
  ELSE '$2a$12$' || RANDSTR(53, ${randomGeneratorSeed})
END`;
};
/**
 * Replaces every substring matching a regex with its salted SHA-256 hex digest.
 *
 * @param value the text to scan; falsy input is returned as-is
 * @param params.initializationVector the salt; asserted to be present
 * @param params.regex pattern source, compiled with the global flag; asserted to be present
 * @returns the text with each match replaced by the digest of match + salt
 */
export const saltedHashSubstringScrubber = (value, params) => {
  _assert(params?.initializationVector, 'Initialization vector is missing');
  _assert(params?.regex, 'Substring or regex is missing');
  if (!value) return value;
  const hashSubstring = (substring) => {
    const hash = crypto.createHash('sha256');
    hash.update(substring);
    hash.update(params.initializationVector);
    return hash.digest('hex');
  };
  const pattern = new RegExp(params.regex, 'g');
  return value.replace(pattern, (substring) => hashSubstring(substring));
};
/**
 * SQL form of the salted hash substring scrubber.
 * The hash is computed over the `REGEXP_SUBSTR` result (or '' when it is NULL)
 * plus the salt, and `REGEXP_REPLACE` puts that hash in place of the regex matches.
 * Both params are asserted to be present and are inserted into the SQL unescaped.
 */
export const saltedHashSubstringScrubberSQL = params => {
  _assert(params?.initializationVector, 'Initialization vector is missing');
  _assert(params?.regex, 'Substring or regex is missing');
  const { regex, initializationVector } = params;
  const matched = `COALESCE(REGEXP_SUBSTR(${sqlValueToReplace}, '${regex}'), '')`;
  const hashed = `SHA2(${matched} || '${initializationVector}', 256)`;
  return `REGEXP_REPLACE(${sqlValueToReplace}, '${regex}', ${hashed})`;
};
/**
 * Three-character ZIP prefixes that are scrubbed completely,
 * due to less than 20,000 inhabitants in the area.
 */
const restrictedZipAreas = [
  '036', '059', '063', '102', '203', '556',
  '692', '790', '821', '823', '830', '831',
  '878', '879', '884', '890', '893',
];
/**
 * Scrubs a ZIP code down to its first three characters.
 *
 * @param value the ZIP code; falsy input yields `undefined`
 * @returns 'XXXXX' when the three-character prefix is a restricted area, otherwise
 *   the prefix followed by 'XX' (which also hides the original length)
 */
export const zipScrubber = value => {
  if (!value) return undefined;
  const prefix = value.slice(0, 3);
  return restrictedZipAreas.includes(prefix) ? 'XXXXX' : `${prefix}XX`;
};
/**
 * SQL form of the ZIP scrubber.
 * Yields 'XXXXX' when the first three characters of the value are one of
 * `restrictedZipAreas`, otherwise those three characters followed by 'XX'.
 * The restricted prefixes are inlined into the expression as a string array literal.
 */
export const zipScrubberSQL = () => `CASE WHEN ARRAY_CONTAINS(
SUBSTR(${sqlValueToReplace}, 0, 3),
['${restrictedZipAreas.join("', '")}']::ARRAY(STRING)
)
THEN 'XXXXX'
ELSE SUBSTR(${sqlValueToReplace}, 0, 3) || 'XX'
END`;
/**
 * Finds the n-th occurrence of `character` in `str`.
 *
 * @returns the index just past that occurrence (a suitable `slice` end to keep it),
 *   or `undefined` when there are fewer than `n` occurrences or `n` is not reached
 */
function nthChar(str, character, n) {
  let cursor = 0;
  for (let found = 0; found < n; ) {
    const hit = str.indexOf(character, cursor);
    if (hit === -1) {
      return undefined;
    }
    cursor = hit + 1;
    found += 1;
    if (found === n) {
      return cursor;
    }
  }
  return undefined;
}
/**
 * Registry of the JavaScript scrubber functions defined in this file,
 * keyed by scrubber name.
 */
export const defaultScrubbers = {
staticScrubber,
preserveOriginalScrubber,
isoDateStringScrubber,
unixTimestampScrubber,
undefinedScrubber,
charsFromRightScrubber,
randomScrubber,
randomEmailScrubber,
randomEmailInContentScrubber,
saltedHashScrubber,
saltedHashEmailScrubber,
bcryptStringScrubber,
saltedHashSubstringScrubber,
keepCharsFromLeftScrubber,
zipScrubber,
};
/**
 * Registry of the SQL-generating scrubber functions defined in this file.
 * Each key is a scrubber name (without the `SQL` suffix) mapped to the function
 * that builds the corresponding SQL expression.
 */
export const defaultScrubbersSQL = {
staticScrubber: staticScrubberSQL,
preserveOriginalScrubber: preserveOriginalScrubberSQL,
isoDateStringScrubber: isoDateStringScrubberSQL,
unixTimestampScrubber: unixTimestampScrubberSQL,
undefinedScrubber: undefinedScrubberSQL,
charsFromRightScrubber: charsFromRightScrubberSQL,
randomScrubber: randomScrubberSQL,
randomEmailScrubber: randomEmailScrubberSQL,
randomEmailInContentScrubber: randomEmailInContentScrubberSQL,
saltedHashScrubber: saltedHashScrubberSQL,
saltedHashEmailScrubber: saltedHashEmailScrubberSQL,
bcryptStringScrubber: bcryptStringScrubberSQL,
saltedHashSubstringScrubber: saltedHashSubstringScrubberSQL,
keepCharsFromLeftScrubber: keepCharsFromLeftScrubberSQL,
zipScrubber: zipScrubberSQL,
};
/**
 * Guards a SQL expression with an optional regex condition.
 * Without `ifMatch` (strictly `undefined`) the expression is returned untouched;
 * otherwise it is applied only where the original value satisfies `REGEXP_LIKE`,
 * and the original value is kept elsewhere. `ifMatch` is inserted unescaped.
 */
const wrapIfMatchSQL = (ifMatch, expression) =>
  ifMatch === undefined
    ? expression
    : `CASE WHEN REGEXP_LIKE(${sqlValueToReplace}, '${ifMatch}') THEN ${expression} ELSE ${sqlValueToReplace} END`;