UNPKG

@naturalcycles/scrubber-lib

Version:

Scrub data in JavaScript plain objects by using rules defined in a configuration object

658 lines (530 loc) 20.8 kB
import crypto from 'node:crypto'
import { _assert } from '@naturalcycles/js-lib/error/assert.js'
import { nanoIdCustomAlphabet } from '@naturalcycles/nodejs-lib'
import type { ScrubberFn, ScrubbersMap, ScrubberSQLFn, ScrubbersSQLMap } from './scrubber.model.js'

/**
 * Renders `value` as a SQL literal: wrapped in single quotes when `type` is 'STRING',
 * stringified as-is for any other type.
 *
 * NOTE(review): the value is interpolated verbatim - quotes inside it are not escaped,
 * so callers must pass values that are safe to embed in a SQL string literal.
 */
function encloseValueForSQL(value: string | number, type: string): string {
  return type === 'STRING' ? `'${value}'` : String(value)
}

// The name of the original value in the SQL statement
const sqlValueToReplace = 'VAL'

// Seed for all random functions. If `HASH(${sqlValueToReplace})` is used,
// the random value will be the same every time the table is queried
// With `HASH(RANDOM())`, the random value will be different every time, but is safer cryptographically
const randomGeneratorSeed = `HASH(RANDOM())`

/*
 Undefined scrubber

 Replace value with `undefined`
 */
export type UndefinedScrubberFn = ScrubberFn<any, undefined>
export type UndefinedScrubberSQLFn = ScrubberSQLFn<undefined>

/** Always returns `undefined`, regardless of the input value. */
export const undefinedScrubber: UndefinedScrubberFn = () => undefined

/** SQL counterpart of `undefinedScrubber`: always yields the `NULL` expression. */
export const undefinedScrubberSQL: UndefinedScrubberSQLFn = () => 'NULL'

/*
 Preserve original scrubber

 Useful for profiles that inherit from another and want to keep original value
 (eg: "removing" scrubber from parent)
 */
export type PreserveOriginalScrubberFn = ScrubberFn<any, undefined>
export type PreserveOriginalScrubberSQLFn = ScrubberSQLFn<undefined>

/** Returns the value untouched. */
export const preserveOriginalScrubber: PreserveOriginalScrubberFn = value => value

/** SQL counterpart of `preserveOriginalScrubber`: yields the original value placeholder. */
export const preserveOriginalScrubberSQL: PreserveOriginalScrubberSQLFn = () => sqlValueToReplace

/*
 Static scrubber

 Replace value with `params.replacement`
 */
export interface StaticScrubberParams {
  /**
   * Only scrub if value matches given Regex
   */
  ifMatch?: string
  replacement: string | number
}

export type StaticScrubberFn = ScrubberFn<any, StaticScrubberParams>
export type StaticScrubberSQLFn = ScrubberSQLFn<StaticScrubberParams>

/**
 * Replaces the value with `params.replacement`.
 *
 * When `params.ifMatch` is given, a truthy value that does NOT match the regex is kept as-is.
 * Falsy values (empty string, 0, null, undefined) always yield the replacement.
 * Non-string values are matched against their string form instead of throwing.
 */
export const staticScrubber: StaticScrubberFn = (value, params = { replacement: '' }) => {
  const { ifMatch, replacement } = params
  if (!ifMatch || !value) return replacement
  // String(value) guards against non-string input, which has no `.match()` method
  return new RegExp(ifMatch).test(String(value)) ? replacement : value
}

/**
 * SQL counterpart of `staticScrubber`.
 * The replacement is rendered as a number literal or a quoted string literal,
 * then passed through `wrapIfMatchSQL` together with `ifMatch`.
 */
export const staticScrubberSQL: StaticScrubberSQLFn = (params = { replacement: '' }) => {
  const { ifMatch, replacement } = params
  const type = typeof replacement === 'number' ? 'NUMBER' : 'STRING'
  return wrapIfMatchSQL(ifMatch, encloseValueForSQL(replacement, type))
}

/*
 ISO Date string scrubber

 excludeDay: 2019-05-25 -> 2019-05-01
 excludeMonth: 2019-05-25 -> 2019-01-25
 excludeYear: 2019-05-25 -> 1970-05-25
 */
export interface ISODateStringScrubberParams {
  excludeDay?: boolean
  excludeMonth?: boolean
  excludeYear?: boolean
}

export type ISODateStringScrubberFn = ScrubberFn<string | undefined, ISODateStringScrubberParams>
export type ISODateStringScrubberSQLFn = ScrubberSQLFn<ISODateStringScrubberParams>

/**
 * Overwrites the selected parts of a `YYYY-MM-DD` string by fixed character positions.
 * Returns `undefined` for an empty/undefined value.
 */
export const isoDateStringScrubber: ISODateStringScrubberFn = (value, params = {}) => {
  if (!value) return

  let scrubbed = value
  // "YYYY-MM-" + "01" (anything after the month separator is dropped)
  if (params.excludeDay) scrubbed = `${scrubbed.slice(0, 8)}01`
  // "YYYY-" + "01" + "-DD"
  if (params.excludeMonth) scrubbed = `${scrubbed.slice(0, 5)}01${scrubbed.slice(7, 10)}`
  // "1970" + up to 9 characters following the year
  if (params.excludeYear) scrubbed = `1970${scrubbed.slice(4, 13)}`
  return scrubbed
}

/**
 * SQL counterpart of `isoDateStringScrubber`.
 * Each enabled option wraps the expression built so far in further SUBSTR/concat calls.
 */
export const isoDateStringScrubberSQL: ISODateStringScrubberSQLFn = (params = {}) => {
  let replacement = sqlValueToReplace
  if (params.excludeDay) {
    replacement = `SUBSTR(${replacement}, 0, 8) || '01'`
  }
  if (params.excludeMonth) {
    replacement = `SUBSTR(${replacement}, 0, 5) || '01' || SUBSTR(${replacement}, 8, 10)`
  }
  if (params.excludeYear) {
    replacement = `'1970' || SUBSTR(${replacement}, 5, 10)`
  }
  return replacement // "SUBSTR(VAL, 0, 8) || '01'"
}

/*
 Unix timestamp (timestamp in seconds) scrubber
 */
export interface UnixTimestampScrubberParams {
  excludeTime?: boolean
  excludeDay?: boolean
  excludeMonth?: boolean
  excludeYear?: boolean
}

export type UnixTimestampScrubberFn = ScrubberFn<
  number | string | undefined,
  UnixTimestampScrubberParams
>
export type UnixTimestampScrubberSQLFn = ScrubberSQLFn<UnixTimestampScrubberParams>

/**
 * Resets the selected components of a unix timestamp (seconds) and returns the new timestamp,
 * rounded to whole seconds. Returns `undefined` for a falsy value.
 *
 * NOTE(review): components are reset with the local-time `Date` setters, so the result
 * depends on the timezone of the running process.
 */
export const unixTimestampScrubber: UnixTimestampScrubberFn = (value, params = {}) => {
  if (!value) return

  const date = new Date((value as number) * 1000)
  // Also clears milliseconds, so fractional timestamps cannot leak sub-second time
  if (params.excludeTime) date.setHours(0, 0, 0, 0)
  if (params.excludeDay) date.setDate(1)
  if (params.excludeMonth) date.setMonth(0)
  if (params.excludeYear) date.setFullYear(1970)
  return Math.round(date.getTime() / 1000)
}

/**
 * SQL counterpart of `unixTimestampScrubber`.
 * Builds a `TIMESTAMP_NTZ_FROM_PARTS(...)` call where each excluded component
 * is a constant and each kept component is a `DATE_PART` of the original value.
 */
export const unixTimestampScrubberSQL: UnixTimestampScrubberSQLFn = (params = {}) => {
  const part = (name: string): string => `DATE_PART('${name}', ${sqlValueToReplace})`
  const args = [
    params.excludeYear ? '1970' : part('YEAR'),
    params.excludeMonth ? '1' : part('MONTH'),
    params.excludeDay ? '1' : part('DAY'),
    params.excludeTime ? '0, 0, 0' : [part('HOUR'), part('MINUTE'), part('SECOND')].join(', '),
  ]
  return `TIMESTAMP_NTZ_FROM_PARTS(${args.join(', ')})`
}

/*
 Chars From Right scrubber

 Replace `params.count` characters, from the right to the left, with `params.replacement`
 Useful for anonymizing zip codes
 */
export interface CharsFromRightScrubberParams {
  count: number
  replacement: string
  /**
   * Should replacement be for "full" replacement? default is false, each replaced char will be replaced with replacement.
   */
  replaceFull?: boolean
}

export type CharsFromRightScrubberFn = ScrubberFn<string | undefined, CharsFromRightScrubberParams>
export type CharsFromRightScrubberSQLFn = ScrubberSQLFn<CharsFromRightScrubberParams>

/**
 * Drops the last `count` characters and appends either a single `replacement` (`replaceFull`)
 * or one `replacement` per dropped character. Returns `undefined` for an empty/undefined value.
 */
export const charsFromRightScrubber: CharsFromRightScrubberFn = (
  value,
  params = { count: 99, replacement: 'X', replaceFull: false },
) => {
  if (!value) return

  const { count, replacement, replaceFull } = params
  // Clamp at 0: a negative end index would make slice() count from the end of the string
  const kept = value.slice(0, Math.max(0, value.length - count))
  if (replaceFull) return kept + replacement
  return kept + replacement.repeat(Math.min(count, value.length))
}

/** SQL counterpart of `charsFromRightScrubber`. */
export const charsFromRightScrubberSQL: CharsFromRightScrubberSQLFn = (
  params = { count: 99, replacement: 'X', replaceFull: false },
) => {
  const { count, replacement, replaceFull } = params
  if (replaceFull) {
    // remove $count chars from the right, and replace it by $replacement
    return `SUBSTR(${sqlValueToReplace}, 0, LEN(${sqlValueToReplace}) - ${count}) || '${replacement}'`
  }
  // replace each chars from the right by $replacement until $count chars are replaced
  return `SUBSTR(${sqlValueToReplace}, 0, LEN(${sqlValueToReplace}) - ${count}) || REPEAT('${replacement}', LEAST(${count}, LEN(${sqlValueToReplace})))`
}

/*
 Keep Chars From Left scrubber

 Keep `params.count` characters from the left and replace the rest with `params.replacement`
 Useful for anonymizing zip codes
 */
export interface KeepCharsFromLeftScrubberParams {
  count: number
  replacement: string
  /**
   * Should replacement be for "full" replacement? default is false, each replaced char will be replaced with replacement.
   */
  replaceFull?: boolean
}

export type KeepCharsFromLeftScrubberFn = ScrubberFn<
  string | undefined,
  KeepCharsFromLeftScrubberParams
>
export type KeepCharsFromLeftScrubberSQLFn = ScrubberSQLFn<KeepCharsFromLeftScrubberParams>

/**
 * Keeps the first `count` characters. Values that are not longer than `count` are returned as-is;
 * otherwise the remainder becomes a single `replacement` (`replaceFull`) or one per removed character.
 * Returns `undefined` for an empty/undefined value.
 */
export const keepCharsFromLeftScrubber: KeepCharsFromLeftScrubberFn = (
  value,
  params = { count: 99, replacement: 'X', replaceFull: false },
) => {
  if (!value) return

  const { count, replacement, replaceFull } = params
  if (value.length <= count) return value

  const kept = value.slice(0, count)
  return replaceFull ? kept + replacement : kept + replacement.repeat(value.length - count)
}

/** SQL counterpart of `keepCharsFromLeftScrubber`. */
export const keepCharsFromLeftScrubberSQL: KeepCharsFromLeftScrubberSQLFn = (
  params = { count: 99, replacement: 'X', replaceFull: false },
) => {
  const { count, replacement, replaceFull } = params
  if (replaceFull) {
    // keep $count chars from the left, and replace rest by $replacement
    return `IFF(LEN(${sqlValueToReplace}) > ${count}, SUBSTR(${sqlValueToReplace}, 0, ${count}) || '${replacement}', ${sqlValueToReplace})`
  }
  // keep $count chars and fill out with $replacement if string was longer.
  // GREATEST(0, ...) clamps the repeat count for values shorter than $count
  return `SUBSTR(${sqlValueToReplace}, 0, ${count}) || REPEAT('${replacement}', GREATEST(0, LEN(${sqlValueToReplace}) - ${count}))`
}

/*
 Random scrubber

 Uses the package nanoid to generate a random string given an alphabet and a length
 */
const ALPHABET_NUMBER = '0123456789'
const ALPHABET_LOWERCASE = 'abcdefghijklmnopqrstuvwxyz'
const ALPHABET_ALPHANUMERIC_LOWERCASE = [ALPHABET_NUMBER, ALPHABET_LOWERCASE].join('')

export interface RandomScrubberParams {
  alphabet?: string
  length?: number
}

export type RandomScrubberFn = ScrubberFn<string, RandomScrubberParams>
export type RandomScrubberSQLFn = ScrubberSQLFn<RandomScrubberParams>

/**
 * Ignores the value and returns a random string.
 * Defaults: lowercase alphanumeric alphabet, length 16.
 */
export const randomScrubber: RandomScrubberFn = (_value, additionalParams) => {
  const { alphabet, length } = {
    alphabet: ALPHABET_ALPHANUMERIC_LOWERCASE,
    length: 16,
    ...additionalParams,
  }
  return nanoIdCustomAlphabet(alphabet, length)()
}
/**
 * SQL counterpart of `randomScrubber`.
 * Default length is 16.
 */
export const randomScrubberSQL: RandomScrubberSQLFn = additionalParams => {
  const { length } = { length: 16, ...additionalParams }
  // This doesn't respect the alphabet :(
  return `RANDSTR(${length}, ${randomGeneratorSeed})`
}

/*
 Random email scrubber

 Uses the package nanoid to generate a random string given an alphabet and a length
 and appends a given domain (should include '@') at the end of it
 */
export interface RandomEmailScrubberParams {
  alphabet?: string
  length?: number
  domain?: string
}

export type RandomEmailScrubberFn = ScrubberFn<string, RandomEmailScrubberParams>
export type RandomEmailScrubberSQLFn = ScrubberSQLFn<RandomEmailScrubberParams>

/**
 * Ignores the value and returns `<random string><domain>`.
 * Defaults: lowercase alphanumeric alphabet, length 16, domain '@example.com'.
 */
export const randomEmailScrubber: RandomEmailScrubberFn = (_value, additionalParams) => {
  const { alphabet, length, domain } = {
    alphabet: ALPHABET_ALPHANUMERIC_LOWERCASE,
    length: 16,
    domain: '@example.com',
    ...additionalParams,
  }
  return nanoIdCustomAlphabet(alphabet, length)() + domain
}

/** SQL counterpart of `randomEmailScrubber`. */
export const randomEmailScrubberSQL: RandomEmailScrubberSQLFn = additionalParams => {
  const { length, domain } = {
    // alphabet: ALPHABET_ALPHANUMERIC_LOWERCASE,
    length: 16,
    domain: '@example.com',
    ...additionalParams,
  }
  // This doesn't respect the alphabet :(
  return `RANDSTR(${length}, ${randomGeneratorSeed}) || '${domain}'`
}

/*
 Random email in content scrubber

 Extends the random email scrubber and allows scrubbing emails within strings
 while maintaining the rest of the string
 */
export interface RandomEmailInContentScrubberParams {
  alphabet?: string
  length?: number
  domain?: string
}

export type RandomEmailInContentScrubberFn = ScrubberFn<string, RandomEmailInContentScrubberParams>
export type RandomEmailInContentScrubberSQLFn = ScrubberSQLFn<RandomEmailInContentScrubberParams>

/**
 * Replaces every email-looking substring of `value` with a freshly generated random email
 * and leaves the rest of the string intact. Non-string values are returned unchanged.
 */
export const randomEmailInContentScrubber: RandomEmailInContentScrubberFn = (
  value,
  additionalParams,
) => {
  if (typeof value !== 'string') return value

  // Letters, digits (including 0), dot, underscore and dash around '@', followed by a TLD.
  // Global flag: all occurrences are replaced, each one with its own random email.
  const emailRegex = /[a-zA-Z0-9._-]*@[a-zA-Z0-9._-]*\.[a-zA-Z_-]{2,63}/g
  return value.replace(emailRegex, () => randomEmailScrubber(value, additionalParams))
}

/**
 * SQL counterpart of `randomEmailInContentScrubber`.
 * The domain is concatenated inside the replacement argument, so it lands right after
 * the random local part instead of at the end of the whole content.
 */
export const randomEmailInContentScrubberSQL: RandomEmailInContentScrubberSQLFn =
  additionalParams => {
    const { length, domain } = {
      // alphabet: ALPHABET_ALPHANUMERIC_LOWERCASE,
      length: 16,
      domain: '@example.com',
      ...additionalParams,
    }
    // `[.]` matches a literal dot without relying on backslash escaping inside the SQL string literal
    return `REGEXP_REPLACE(
    ${sqlValueToReplace},
    '[a-zA-Z0-9._-]*@[a-zA-Z0-9._-]*[.][a-zA-Z_-]{2,63}',
    RANDSTR(${length}, ${randomGeneratorSeed}) || '${domain}'
  )`
  }

/*
 Salted hash scrubber.

 Takes an initializationVector param and uses it to salt the value before hashing it.
 */
export interface SaltedHashScrubberParams {
  initializationVector: string
}

export type SaltedHashScrubberFn = ScrubberFn<string, SaltedHashScrubberParams>
export type SaltedHashSQLScrubberFn = ScrubberSQLFn<SaltedHashScrubberParams>
export type SaltedHashScrubberSQLFn = ScrubberSQLFn<SaltedHashScrubberParams>

/**
 * Returns the hex SHA-256 digest of `value` followed by `params.initializationVector`.
 * Asserts that the initialization vector is present.
 */
export const saltedHashScrubber: SaltedHashScrubberFn = (value, params) => {
  _assert(params?.initializationVector, 'Initialization vector is missing')
  const hash = crypto.createHash('sha256')
  hash.update(value)
  hash.update(params.initializationVector)
  return hash.digest('hex')
}

/** SQL counterpart of `saltedHashScrubber`. */
export const saltedHashScrubberSQL: SaltedHashScrubberSQLFn = params => {
  _assert(params?.initializationVector, 'Initialization vector is missing')
  return `SHA2(${sqlValueToReplace} || '${params.initializationVector}', 256)`
}

/*
 Salted hash email scrubber.

 Takes an initializationVector param and uses it to salt the value before hashing it
 and suffixing email domain
 */
export interface SaltedHashEmailScrubberParams {
  initializationVector: string
  domain?: string
}

export type SaltedHashEmailScrubberFn = ScrubberFn<string, SaltedHashEmailScrubberParams>
export type SaltedHashEmailScrubberSQLFn = ScrubberSQLFn<SaltedHashEmailScrubberParams>

/**
 * Returns the salted hash of `value` (see `saltedHashScrubber`) with the domain appended.
 * Default domain is '@example.com'.
 */
export const saltedHashEmailScrubber: SaltedHashEmailScrubberFn = (value, additionalParams) => {
  const params = {
    domain: '@example.com',
    ...additionalParams,
  } as SaltedHashEmailScrubberParams
  _assert(params?.initializationVector, 'Initialization vector is missing')
  return saltedHashScrubber(value, params) + params.domain
}

/** SQL counterpart of `saltedHashEmailScrubber`. */
export const saltedHashEmailScrubberSQL: SaltedHashEmailScrubberSQLFn = additionalParams => {
  const { initializationVector, domain } = {
    domain: '@example.com',
    ...additionalParams,
  } as SaltedHashEmailScrubberParams
  _assert(initializationVector, 'Initialization vector is missing')
  return `SHA2(${sqlValueToReplace} || '${initializationVector}', 256) || '${domain}'`
}

/*
 Bcrypt string scrubber.

 Scrubs both salt and hash while maintaining algo and cost factor,
 thus resulting in a valid, but nonsense bcrypt string
 */
export type BcryptStringScrubberFn = ScrubberFn<string | undefined, BcryptStringScrubberParams>
export type BcryptStringScrubberSQLFn = ScrubberSQLFn<BcryptStringScrubberParams>

/*
 replacements string is a comma separated list of key-value pairs (separated by :)
 that maps bcrypt string prefix (algo + cost factor) to a resulting string replacement.
 e.g. replacements: '$2a$10$:$2a$10$456,$2a$12$:$2a$12$123'
 */
export interface BcryptStringScrubberParams {
  replacements: string
}

/**
 * Keeps the prefix up to and including the 3rd `$` and replaces the rest with 53 random characters.
 * A prefix listed in `params.replacements` yields its configured replacement instead.
 * Values with fewer than three `$` get the fixed prefix `$2a$12$`. Falsy values are returned as-is.
 */
export const bcryptStringScrubber: BcryptStringScrubberFn = (value, params) => {
  if (!value) return value

  // Keep value until 3rd $
  const cutoff = nthChar(value, '$', 3)
  if (!cutoff) return `$2a$12$${nanoIdCustomAlphabet(ALPHABET_ALPHANUMERIC_LOWERCASE, 53)()}`

  const prefix = value.substring(0, cutoff)
  if (params?.replacements) {
    for (const kvPair of params.replacements.split(',')) {
      const [k, v] = kvPair.split(':')
      if (prefix === k) return v
    }
  }
  return `${prefix}${nanoIdCustomAlphabet(ALPHABET_ALPHANUMERIC_LOWERCASE, 53)()}`
}

/**
 * SQL counterpart of `bcryptStringScrubber`.
 * The `replacements` pairs are unpacked here into WHEN clauses of a CASE over the prefix.
 */
export const bcryptStringScrubberSQL: BcryptStringScrubberSQLFn = params => {
  const splitParts = `SPLIT(${sqlValueToReplace}, '$')`
  const prefix = `ARRAY_TO_STRING(ARRAY_SLICE(${splitParts}, 0, 3), '$') || '$'`

  // to have at least one WHEN clause, so the ELSE clause is valid
  let replacementDLL = "WHEN FALSE THEN ''\n      "
  // unpack the replacements here rather than in SQL
  if (params?.replacements) {
    for (const kvPair of params.replacements.split(',')) {
      const [k, v] = kvPair.split(':')
      replacementDLL += `WHEN '${k}' THEN '${v}'\n      `
    }
  }
  replacementDLL += `ELSE ${prefix} || RANDSTR(53, ${randomGeneratorSeed})`

  // Three `$` separators split the string into at least 4 parts.
  // The check runs on the unsliced array: the slice has at most 3 elements
  // and would also accept values containing only two `$`.
  return [
    'CASE',
    `    WHEN ARRAY_SIZE(${splitParts}) >= 4 -- If there are at least 3 $ in the string`,
    '    THEN',
    `      CASE ${prefix} -- this is the prefix`,
    `      ${replacementDLL}`,
    '      END',
    `    ELSE '$2a$12$' || RANDSTR(53, ${randomGeneratorSeed})`,
    '  END',
  ].join('\n')
}

export type SaltedHashSubstringScrubberFn = ScrubberFn<
  string | undefined,
  SaltedHashSubstringScrubberParams
>
export type SaltedHashSubstringScrubberSQLFn = ScrubberSQLFn<SaltedHashSubstringScrubberParams>

export interface SaltedHashSubstringScrubberParams {
  initializationVector: string
  regex: string
}

/**
 * Replaces every match of `params.regex` inside `value` with the hex SHA-256 digest
 * of that match followed by `params.initializationVector`. Falsy values are returned as-is.
 * Asserts that both params are present.
 */
export const saltedHashSubstringScrubber: SaltedHashSubstringScrubberFn = (value, params) => {
  _assert(params?.initializationVector, 'Initialization vector is missing')
  _assert(params?.regex, 'Substring or regex is missing')
  if (!value) return value

  const pattern = new RegExp(params.regex, 'g')
  const hashOf = (substring: string): string =>
    crypto.createHash('sha256').update(substring).update(params.initializationVector).digest('hex')
  return value.replace(pattern, substring => hashOf(substring))
}

/**
 * SQL counterpart of `saltedHashSubstringScrubber`.
 *
 * NOTE(review): a single REGEXP_SUBSTR result is hashed and used as the replacement for the
 * whole REGEXP_REPLACE call, so multiple distinct matches presumably all receive the hash of
 * one match - unlike the JS version, which hashes each match separately. Confirm if that matters.
 */
export const saltedHashSubstringScrubberSQL: SaltedHashSubstringScrubberSQLFn = params => {
  _assert(params?.initializationVector, 'Initialization vector is missing')
  _assert(params?.regex, 'Substring or regex is missing')

  const substringToReplace = `COALESCE(REGEXP_SUBSTR(${sqlValueToReplace}, '${params.regex}'), '')`
  const hashedValue = `SHA2(${substringToReplace} || '${params.initializationVector}', 256)`
  return `REGEXP_REPLACE(${sqlValueToReplace}, '${params.regex}', ${hashedValue})`
}

/*
 A scrubber based on USA recommendations from HIPAA
 https://www.hhs.gov/hipaa/for-professionals/special-topics/de-identification/index.html#standard
 */
export type ZipScrubberFn = ScrubberFn<string | undefined, undefined>

/**
 * ZIP areas to scrub completely, due to less than 20,000 inhabitants
 */
const restrictedZipAreas = [
  '036',
  '059',
  '063',
  '102',
  '203',
  '556',
  '692',
  '790',
  '821',
  '823',
  '830',
  '831',
  '878',
  '879',
  '884',
  '890',
  '893',
]

export type ZipScrubberSQLFn = ScrubberSQLFn<undefined>

/**
 * Keeps the first 3 characters of the zip code and pads with 'XX';
 * restricted areas are replaced by 'XXXXX' entirely.
 * Returns `undefined` for an empty/undefined value.
 */
export const zipScrubber: ZipScrubberFn = value => {
  if (!value) return

  const leftPart = value.slice(0, 3)
  // The result is always 5 characters, to de-identify the length of the zip code as well
  return restrictedZipAreas.includes(leftPart) ? 'XXXXX' : `${leftPart}XX`
}

/** SQL counterpart of `zipScrubber`. */
export const zipScrubberSQL: ZipScrubberSQLFn = () => `CASE
    WHEN ARRAY_CONTAINS(
      SUBSTR(${sqlValueToReplace}, 0, 3),
      ['${restrictedZipAreas.join("', '")}']::ARRAY(STRING)
    )
    THEN 'XXXXX'
    ELSE SUBSTR(${sqlValueToReplace}, 0, 3) || 'XX'
  END`

/**
 * Returns the index right after the `n`-th occurrence of `character` in `str`,
 * or `undefined` when there are fewer than `n` occurrences (or `n` is not positive).
 */
function nthChar(str: string, character: string, n: number): number | undefined {
  if (!(n > 0)) return undefined

  let position = 0
  for (let found = 0; found < n; found++) {
    const index = str.indexOf(character, position)
    if (index === -1) return undefined
    position = index + 1
  }
  return position
}

/** All built-in scrubbers, keyed by the name used in configuration. */
export const defaultScrubbers: ScrubbersMap = {
  staticScrubber,
  preserveOriginalScrubber,
  isoDateStringScrubber,
  unixTimestampScrubber,
  undefinedScrubber,
  charsFromRightScrubber,
  randomScrubber,
  randomEmailScrubber,
  randomEmailInContentScrubber,
  saltedHashScrubber,
  saltedHashEmailScrubber,
  bcryptStringScrubber,
  saltedHashSubstringScrubber,
  keepCharsFromLeftScrubber,
  zipScrubber,
}

/** SQL-generating counterparts of `defaultScrubbers`, under the same keys. */
export const defaultScrubbersSQL: ScrubbersSQLMap = {
  staticScrubber: staticScrubberSQL,
  preserveOriginalScrubber: preserveOriginalScrubberSQL,
  isoDateStringScrubber: isoDateStringScrubberSQL,
  unixTimestampScrubber: unixTimestampScrubberSQL,
  undefinedScrubber: undefinedScrubberSQL,
  charsFromRightScrubber: charsFromRightScrubberSQL,
  randomScrubber: randomScrubberSQL,
  randomEmailScrubber: randomEmailScrubberSQL,
  randomEmailInContentScrubber: randomEmailInContentScrubberSQL,
  saltedHashScrubber: saltedHashScrubberSQL,
  saltedHashEmailScrubber: saltedHashEmailScrubberSQL,
  bcryptStringScrubber: bcryptStringScrubberSQL,
  saltedHashSubstringScrubber: saltedHashSubstringScrubberSQL,
  keepCharsFromLeftScrubber: keepCharsFromLeftScrubberSQL,
  zipScrubber: zipScrubberSQL,
}

/**
 * Wraps `expression` so that it is only applied when the original value matches `ifMatch`;
 * otherwise the original value is kept. Without `ifMatch` the expression is returned unchanged.
 */
const wrapIfMatchSQL = (ifMatch: string | undefined, expression: string): string => {
  if (ifMatch === undefined) return expression
  return `CASE WHEN REGEXP_LIKE(${sqlValueToReplace}, '${ifMatch}') THEN ${expression} ELSE ${sqlValueToReplace} END`
}