redact-pii
Version:
Remove personally identifiable information from text.
266 lines (243 loc) • 9.48 kB
text/typescript
import { get } from 'lodash';
import { IAsyncRedactor } from '../types';
import DLP, { DlpServiceClient } from '@google-cloud/dlp';
export const MAX_DLP_CONTENT_LENGTH = 524288;
// a finding quote length that is too short (e.g. 1 char like "S") causes too many false replacements
const MIN_FINDING_QUOTE_LENGTH = 2;
const minLikelihood = 'LIKELIHOOD_UNSPECIFIED';
const maxFindings = 0;
export const defaultInfoTypes = [
{ name: 'AMERICAN_BANKERS_CUSIP_ID' },
{ name: 'AUSTRALIA_MEDICARE_NUMBER' },
{ name: 'AUSTRALIA_TAX_FILE_NUMBER' },
{ name: 'BRAZIL_CPF_NUMBER' },
{ name: 'CANADA_BC_PHN' },
{ name: 'CANADA_DRIVERS_LICENSE_NUMBER' },
{ name: 'CANADA_OHIP' },
{ name: 'CANADA_PASSPORT' },
{ name: 'CANADA_QUEBEC_HIN' },
{ name: 'CANADA_SOCIAL_INSURANCE_NUMBER' },
{ name: 'CHINA_PASSPORT' },
{ name: 'CREDIT_CARD_NUMBER' },
{ name: 'EMAIL_ADDRESS' },
{ name: 'ETHNIC_GROUP' },
{ name: 'FEMALE_NAME' },
{ name: 'FIRST_NAME' },
{ name: 'FRANCE_CNI' },
{ name: 'FRANCE_NIR' },
{ name: 'FRANCE_PASSPORT' },
{ name: 'GCP_CREDENTIALS' },
{ name: 'GERMANY_PASSPORT' },
{ name: 'IBAN_CODE' },
{ name: 'IMEI_HARDWARE_ID' },
{ name: 'INDIA_PAN_INDIVIDUAL' },
{ name: 'IP_ADDRESS' },
{ name: 'JAPAN_INDIVIDUAL_NUMBER' },
{ name: 'JAPAN_PASSPORT' },
{ name: 'KOREA_PASSPORT' },
{ name: 'KOREA_RRN' },
{ name: 'LAST_NAME' },
{ name: 'MAC_ADDRESS_LOCAL' },
{ name: 'MAC_ADDRESS' },
{ name: 'MALE_NAME' },
{ name: 'MEXICO_CURP_NUMBER' },
{ name: 'MEXICO_PASSPORT' },
{ name: 'NETHERLANDS_BSN_NUMBER' },
{ name: 'PHONE_NUMBER' },
{ name: 'SPAIN_NIE_NUMBER' },
{ name: 'SPAIN_NIF_NUMBER' },
{ name: 'SPAIN_PASSPORT' },
{ name: 'SWIFT_CODE' },
{ name: 'UK_DRIVERS_LICENSE_NUMBER' },
{ name: 'UK_NATIONAL_HEALTH_SERVICE_NUMBER' },
{ name: 'UK_NATIONAL_INSURANCE_NUMBER' },
{ name: 'UK_PASSPORT' },
{ name: 'UK_TAXPAYER_REFERENCE' },
{ name: 'US_ADOPTION_TAXPAYER_IDENTIFICATION_NUMBER' },
{ name: 'US_BANK_ROUTING_MICR' },
{ name: 'US_DEA_NUMBER' },
{ name: 'US_DRIVERS_LICENSE_NUMBER' },
{ name: 'US_HEALTHCARE_NPI' },
{ name: 'US_INDIVIDUAL_TAXPAYER_IDENTIFICATION_NUMBER' },
{ name: 'US_PASSPORT' },
{ name: 'US_PREPARER_TAXPAYER_IDENTIFICATION_NUMBER' },
{ name: 'US_SOCIAL_SECURITY_NUMBER' },
{ name: 'US_TOLLFREE_PHONE_NUMBER' },
{ name: 'US_VEHICLE_IDENTIFICATION_NUMBER' },
{ name: 'US_STATE' },
{ name: 'FDA_CODE' },
{ name: 'ICD9_CODE' },
{ name: 'ICD10_CODE' },
{ name: 'US_EMPLOYER_IDENTIFICATION_NUMBER' },
{ name: 'LOCATION' },
{ name: 'DATE' },
{ name: 'DATE_OF_BIRTH' },
{ name: 'TIME' },
{ name: 'PERSON_NAME' },
{ name: 'AGE' },
{ name: 'GENDER' },
{ name: 'ARGENTINA_DNI_NUMBER' },
{ name: 'CHILE_CDI_NUMBER' },
{ name: 'COLOMBIA_CDC_NUMBER' },
{ name: 'NETHERLANDS_PASSPORT' },
{ name: 'PARAGUAY_CIC_NUMBER' },
{ name: 'PERU_DNI_NUMBER' },
{ name: 'PORTUGAL_CDC_NUMBER' },
{ name: 'URUGUAY_CDI_NUMBER' },
{ name: 'VENEZUELA_CDI_NUMBER' },
];
const customInfoTypes = [
{
infoType: {
name: 'URL',
},
regex: {
pattern: '([^\\s:/?#]+):\\/\\/([^/?#\\s]*)([^?#\\s]*)(\\?([^#\\s]*))?(#([^\\s]*))?',
},
},
];
const likelihoodPriority: { [likelyHoodName: string]: number } = {
LIKELIHOOD_UNSPECIFIED: 0,
VERY_UNLIKELY: 1,
UNLIKELY: 2,
POSSIBLE: 3,
LIKELY: 4,
VERY_LIKELY: 5,
};
const includeQuote = true;
interface Finding {
likelihood: string;
quote: string;
infoType: {
name: string;
};
location: {
byteRange: {
start: string;
end: string;
};
};
}
// finding location.byteRange.start and end are strings for some reason, so must convert to numbers
const getFindingStart = (finding: Finding) => Number(get(finding, 'location.byteRange.start', 0));
const getFindingEnd = (finding: Finding) => Number(get(finding, 'location.byteRange.end', 0));
/**
* Remove overlapping findings which can cause messed up tokens.
*
* For example "My name is John D." will cause 3 findings:
* - PERSON_NAME for text "John S." at range 11-17
* - FIRST_NAME for text "John" at range 11-15
* - LAST_NAME for text "S." at range 15-17
*
* The FIRST_NAME and LAST_NAME findings overlap the first finding so there is no need to search for them
*/
function removeOverlappingFindings(findings: Finding[]): Finding[] {
// early return if only have 0 or 1 findings
if (findings.length <= 1) {
return findings;
}
// sort findings by ascending start
findings.sort((a, b) => getFindingStart(a) - getFindingStart(b));
// remove findings that overlap (but keep the one with higher likelihood)
const resultFindings = [findings[0]];
for (let i = 1; i < findings.length; i++) {
const current = findings[i];
const previous = resultFindings[resultFindings.length - 1];
// when findings overlap, keep the one with the higher likelihood
if (getFindingStart(current) < getFindingEnd(previous)) {
if (likelihoodPriority[current.likelihood] > likelihoodPriority[previous.likelihood]) {
resultFindings[resultFindings.length - 1] = current;
}
} else {
// no overlap
resultFindings.push(current);
}
}
return resultFindings;
}
/** @public */
export interface GoogleDLPRedactorOptions {
/** options to pass down to the Google Cloud DLP client. Check https://cloud.google.com/nodejs/docs/reference/dlp/0.10.x/v2.DlpServiceClient for the available options */
clientOptions?: any;
/** object containing `inspectConfig` options that should override the default `inspectConfig` options.
* For example, this can be used to set `customInfoTypes` or define a `ruleSet` to modify behavior of info types (e.g. exclude certain patterns).
* Check https://cloud.google.com/nodejs/docs/reference/dlp/0.10.x/v2.DlpServiceClient#inspectContent for details. */
inspectConfig?: any;
/** Array of extra DLP info type names to also include in addition to the default set */
includeInfoTypes?: string[];
/** Array of DLP info type names from the default set that should be excluded */
excludeInfoTypes?: string[];
/** If auto batching when content length exceeds DLP's limit should be disabled */
disableAutoBatchWhenContentSizeExceedsLimit?: boolean;
/** Maximum content size for when auto batching is turned on. */
maxContentSizeForBatch?: number;
}
/** @public */
export class GoogleDLPRedactor implements IAsyncRedactor {
dlpClient: DlpServiceClient;
constructor(private opts: GoogleDLPRedactorOptions = {}) {
this.dlpClient = new DLP.DlpServiceClient(this.opts.clientOptions);
}
async redactAsync(textToRedact: string): Promise<string> {
// default batch size is MAX_DLP_CONTENT_LENGTH/2 because some unicode characters can take more than 1 byte
// and its difficult to get a substring of a desired target length in bytes
const maxContentSize = this.opts.maxContentSizeForBatch || MAX_DLP_CONTENT_LENGTH / 2;
if (textToRedact.length > maxContentSize && !this.opts.disableAutoBatchWhenContentSizeExceedsLimit) {
const batchPromises = [];
let batchStartIndex = 0;
while (batchStartIndex < textToRedact.length) {
const batchEndIndex = batchStartIndex + maxContentSize;
const batchText = textToRedact.substring(batchStartIndex, batchEndIndex);
batchPromises.push(this.doRedactAsync(batchText));
batchStartIndex = batchEndIndex;
}
const batchResults = await Promise.all(batchPromises);
return batchResults.join('');
} else {
return this.doRedactAsync(textToRedact);
}
}
async doRedactAsync(textToRedact: string): Promise<string> {
const projectId = await this.dlpClient.getProjectId();
// handle info type excludes and includes
const infoTypes = defaultInfoTypes
.filter((infoType) => !this.opts.excludeInfoTypes || !this.opts.excludeInfoTypes.includes(infoType.name))
.concat((this.opts.includeInfoTypes || []).map((infoTypeName) => ({ name: infoTypeName })));
const response: any = await this.dlpClient.inspectContent({
parent: this.dlpClient.projectPath(projectId),
inspectConfig: Object.assign(
{
infoTypes,
customInfoTypes,
minLikelihood,
includeQuote,
limits: {
maxFindingsPerRequest: maxFindings,
},
},
this.opts.inspectConfig
),
item: { value: textToRedact },
});
const findings = response[0].result.findings;
if (findings.length > 0) {
// this is necessary to prevent tokens getting messed up with other repeated partial tokens (e.g. "my name is PERLALALALALALALALALALALALALALALALALAL...")
const findingsWithoutOverlaps = removeOverlappingFindings(findings);
// sort findings by highest likelihood first
findingsWithoutOverlaps.sort(function (a: any, b: any) {
return likelihoodPriority[b.likelihood] - likelihoodPriority[a.likelihood];
});
// in order of highest likelihood replace finding with info type name
findingsWithoutOverlaps.forEach((finding: any) => {
let find = finding.quote;
if (find !== finding.infoType.name && find.length >= MIN_FINDING_QUOTE_LENGTH) {
let numSearches = 0;
while (numSearches++ < 1000 && textToRedact.indexOf(find) >= 0) {
textToRedact = textToRedact.replace(find, finding.infoType.name);
}
}
});
}
return textToRedact;
}
}