UNPKG

redact-pii

Version:

Remove personally identifiable information from text.

266 lines (243 loc) 9.48 kB
import { get } from 'lodash';
import { IAsyncRedactor } from '../types';
import DLP, { DlpServiceClient } from '@google-cloud/dlp';

/** Content-length limit that triggers auto batching; by default batches are half this size. */
export const MAX_DLP_CONTENT_LENGTH = 524288;

// a finding quote length that is too short (e.g. 1 char like "S") causes too many false replacements
const MIN_FINDING_QUOTE_LENGTH = 2;

/** Lowest likelihood the service should report (sent as `inspectConfig.minLikelihood`). */
const minLikelihood = 'LIKELIHOOD_UNSPECIFIED';

// NOTE(review): sent as `limits.maxFindingsPerRequest`; 0 presumably means "no client-side cap" - confirm against the DLP docs
const maxFindings = 0;

/** Built-in DLP info types that are inspected unless excluded via `excludeInfoTypes`. */
export const defaultInfoTypes = [
  { name: 'AMERICAN_BANKERS_CUSIP_ID' },
  { name: 'AUSTRALIA_MEDICARE_NUMBER' },
  { name: 'AUSTRALIA_TAX_FILE_NUMBER' },
  { name: 'BRAZIL_CPF_NUMBER' },
  { name: 'CANADA_BC_PHN' },
  { name: 'CANADA_DRIVERS_LICENSE_NUMBER' },
  { name: 'CANADA_OHIP' },
  { name: 'CANADA_PASSPORT' },
  { name: 'CANADA_QUEBEC_HIN' },
  { name: 'CANADA_SOCIAL_INSURANCE_NUMBER' },
  { name: 'CHINA_PASSPORT' },
  { name: 'CREDIT_CARD_NUMBER' },
  { name: 'EMAIL_ADDRESS' },
  { name: 'ETHNIC_GROUP' },
  { name: 'FEMALE_NAME' },
  { name: 'FIRST_NAME' },
  { name: 'FRANCE_CNI' },
  { name: 'FRANCE_NIR' },
  { name: 'FRANCE_PASSPORT' },
  { name: 'GCP_CREDENTIALS' },
  { name: 'GERMANY_PASSPORT' },
  { name: 'IBAN_CODE' },
  { name: 'IMEI_HARDWARE_ID' },
  { name: 'INDIA_PAN_INDIVIDUAL' },
  { name: 'IP_ADDRESS' },
  { name: 'JAPAN_INDIVIDUAL_NUMBER' },
  { name: 'JAPAN_PASSPORT' },
  { name: 'KOREA_PASSPORT' },
  { name: 'KOREA_RRN' },
  { name: 'LAST_NAME' },
  { name: 'MAC_ADDRESS_LOCAL' },
  { name: 'MAC_ADDRESS' },
  { name: 'MALE_NAME' },
  { name: 'MEXICO_CURP_NUMBER' },
  { name: 'MEXICO_PASSPORT' },
  { name: 'NETHERLANDS_BSN_NUMBER' },
  { name: 'PHONE_NUMBER' },
  { name: 'SPAIN_NIE_NUMBER' },
  { name: 'SPAIN_NIF_NUMBER' },
  { name: 'SPAIN_PASSPORT' },
  { name: 'SWIFT_CODE' },
  { name: 'UK_DRIVERS_LICENSE_NUMBER' },
  { name: 'UK_NATIONAL_HEALTH_SERVICE_NUMBER' },
  { name: 'UK_NATIONAL_INSURANCE_NUMBER' },
  { name: 'UK_PASSPORT' },
  { name: 'UK_TAXPAYER_REFERENCE' },
  { name: 'US_ADOPTION_TAXPAYER_IDENTIFICATION_NUMBER' },
  { name: 'US_BANK_ROUTING_MICR' },
  { name: 'US_DEA_NUMBER' },
  { name: 'US_DRIVERS_LICENSE_NUMBER' },
  { name: 'US_HEALTHCARE_NPI' },
  { name: 'US_INDIVIDUAL_TAXPAYER_IDENTIFICATION_NUMBER' },
  { name: 'US_PASSPORT' },
  { name: 'US_PREPARER_TAXPAYER_IDENTIFICATION_NUMBER' },
  { name: 'US_SOCIAL_SECURITY_NUMBER' },
  { name: 'US_TOLLFREE_PHONE_NUMBER' },
  { name: 'US_VEHICLE_IDENTIFICATION_NUMBER' },
  { name: 'US_STATE' },
  { name: 'FDA_CODE' },
  { name: 'ICD9_CODE' },
  { name: 'ICD10_CODE' },
  { name: 'US_EMPLOYER_IDENTIFICATION_NUMBER' },
  { name: 'LOCATION' },
  { name: 'DATE' },
  { name: 'DATE_OF_BIRTH' },
  { name: 'TIME' },
  { name: 'PERSON_NAME' },
  { name: 'AGE' },
  { name: 'GENDER' },
  { name: 'ARGENTINA_DNI_NUMBER' },
  { name: 'CHILE_CDI_NUMBER' },
  { name: 'COLOMBIA_CDC_NUMBER' },
  { name: 'NETHERLANDS_PASSPORT' },
  { name: 'PARAGUAY_CIC_NUMBER' },
  { name: 'PERU_DNI_NUMBER' },
  { name: 'PORTUGAL_CDC_NUMBER' },
  { name: 'URUGUAY_CDI_NUMBER' },
  { name: 'VENEZUELA_CDI_NUMBER' },
];

/** Regex-based custom info types sent with every request (currently only a URL matcher). */
const customInfoTypes = [
  {
    infoType: {
      name: 'URL',
    },
    regex: {
      pattern: '([^\\s:/?#]+):\\/\\/([^/?#\\s]*)([^?#\\s]*)(\\?([^#\\s]*))?(#([^\\s]*))?',
    },
  },
];

/** Numeric rank for each likelihood label; a larger number wins when findings conflict. */
const likelihoodPriority: { [likelyHoodName: string]: number } = {
  LIKELIHOOD_UNSPECIFIED: 0,
  VERY_UNLIKELY: 1,
  UNLIKELY: 2,
  POSSIBLE: 3,
  LIKELY: 4,
  VERY_LIKELY: 5,
};

/** Ask the service to return the matched text; redaction works by replacing that quote. */
const includeQuote = true;

/** Single-character whitespace test, hoisted so it is not re-created inside the batching loop. */
const WHITESPACE = /\s/;

/** Subset of a DLP finding that this module reads. */
interface Finding {
  likelihood: string;
  quote: string;
  infoType: {
    name: string;
  };
  location: {
    byteRange: {
      start: string;
      end: string;
    };
  };
}

// finding location.byteRange.start and end are strings for some reason, so must convert to numbers
const getFindingStart = (finding: Finding): number => Number(get(finding, 'location.byteRange.start', 0));
const getFindingEnd = (finding: Finding): number => Number(get(finding, 'location.byteRange.end', 0));

/** Rank of a finding's likelihood; labels missing from `likelihoodPriority` rank lowest (0) instead of yielding NaN. */
const getLikelihoodPriority = (finding: Finding): number => likelihoodPriority[finding.likelihood] || 0;

/**
 * Remove overlapping findings which can cause messed up tokens.
 *
 * For example "My name is John S." will cause 3 findings:
 * - PERSON_NAME for text "John S." at range 11-17
 * - FIRST_NAME for text "John" at range 11-15
 * - LAST_NAME for text "S." at range 15-17
 *
 * The FIRST_NAME and LAST_NAME findings overlap the first finding so there is no need to search for them.
 *
 * @param findings findings as returned by the service; the array itself is not modified
 * @returns a new array, ordered by ascending start, in which no two findings overlap
 */
function removeOverlappingFindings(findings: Finding[]): Finding[] {
  // early return if only have 0 or 1 findings
  if (findings.length <= 1) {
    return findings.slice();
  }

  // Sort a copy by ascending start. When two findings start at the same offset the wider one goes
  // first, so the outcome does not depend on the order in which the service returned them.
  const ordered = findings
    .slice()
    .sort((a, b) => getFindingStart(a) - getFindingStart(b) || getFindingEnd(b) - getFindingEnd(a));

  const kept: Finding[] = [ordered[0]];
  for (let i = 1; i < ordered.length; i++) {
    const current = ordered[i];
    const previous = kept[kept.length - 1];

    if (getFindingStart(current) >= getFindingEnd(previous)) {
      // no overlap
      kept.push(current);
    } else if (getLikelihoodPriority(current) > getLikelihoodPriority(previous)) {
      // when findings overlap, keep the one with the higher likelihood
      kept[kept.length - 1] = current;
    }
  }
  return kept;
}

/**
 * Replace every occurrence of each finding's quote in `text` with the finding's info type name.
 *
 * @param text the text that was inspected
 * @param findings findings reported for `text`
 * @returns the text with quotes replaced by info type names
 */
function replaceFindings(text: string, findings: Finding[]): string {
  // Dropping overlaps first is necessary to prevent tokens getting messed up with other repeated
  // partial tokens (e.g. "my name is PERLALALALALALALALALALALALALALALALALAL...").
  const prioritized = removeOverlappingFindings(findings).sort(
    // highest likelihood first
    (a, b) => getLikelihoodPriority(b) - getLikelihoodPriority(a)
  );

  return prioritized.reduce((redacted, finding) => {
    const quote = finding.quote;
    const token = get(finding, 'infoType.name');

    // The quote can be absent when an `inspectConfig` override disables `includeQuote`.
    if (typeof quote !== 'string' || typeof token !== 'string') {
      return redacted;
    }
    if (quote === token || quote.length < MIN_FINDING_QUOTE_LENGTH) {
      return redacted;
    }

    // split/join replaces every occurrence in one pass: there is no cap on the number of
    // occurrences, `$` sequences in the token are inserted literally, and text that was just
    // inserted is not rescanned (which matters when the token itself contains the quote).
    return redacted.split(quote).join(token);
  }, text);
}

/**
 * Pick the batch size to use, falling back to the default when the configured value is unusable.
 *
 * @param configured value of `maxContentSizeForBatch`, if any
 * @returns a batch size of at least 1
 */
function resolveMaxBatchSize(configured?: number): number {
  // Anything below 1 (or NaN) would keep the batching loop from advancing, so it is ignored.
  if (typeof configured === 'number' && configured >= 1) {
    return Math.floor(configured);
  }
  // default batch size is MAX_DLP_CONTENT_LENGTH/2 because some unicode characters can take more than 1 byte
  // and its difficult to get a substring of a desired target length in bytes
  return MAX_DLP_CONTENT_LENGTH / 2;
}

/**
 * Cut `text` into consecutive pieces of at most `maxBatchSize` UTF-16 code units.
 * Joining the pieces with `''` reproduces `text` exactly.
 *
 * @param text the text to split
 * @param maxBatchSize maximum piece length; must be an integer of at least 1
 * @returns the pieces in their original order
 */
function splitIntoBatches(text: string, maxBatchSize: number): string[] {
  const batches: string[] = [];
  let start = 0;

  while (start < text.length) {
    let end = Math.min(start + maxBatchSize, text.length);

    if (end < text.length) {
      // Prefer cutting next to whitespace so a value (e-mail address, phone number, ...) is not
      // split across two requests. Only the second half of the window is searched, which keeps
      // every batch at least half full.
      const earliestCut = start + Math.ceil(maxBatchSize / 2);
      let cut = -1;
      for (let i = end; i > earliestCut; i--) {
        if (WHITESPACE.test(text.charAt(i - 1)) || WHITESPACE.test(text.charAt(i))) {
          cut = i;
          break;
        }
      }

      if (cut !== -1) {
        end = cut;
      } else {
        // Hard cut: step back one unit rather than separating a surrogate pair.
        const lastUnit = text.charCodeAt(end - 1);
        if (lastUnit >= 0xd800 && lastUnit <= 0xdbff && end - 1 > start) {
          end -= 1;
        }
      }
    }

    batches.push(text.substring(start, end));
    start = end;
  }
  return batches;
}

/** @public */
export interface GoogleDLPRedactorOptions {
  /** options to pass down to the Google Cloud DLP client. Check https://cloud.google.com/nodejs/docs/reference/dlp/0.10.x/v2.DlpServiceClient for the available options */
  clientOptions?: any;
  /** object containing `inspectConfig` options that should override the default `inspectConfig` options.
   * For example, this can be used to set `customInfoTypes` or define a `ruleSet` to modify behavior of info types (e.g. exclude certain patterns).
   * Check https://cloud.google.com/nodejs/docs/reference/dlp/0.10.x/v2.DlpServiceClient#inspectContent for details.
   */
  inspectConfig?: any;
  /** Array of extra DLP info type names to also include in addition to the default set */
  includeInfoTypes?: string[];
  /** Array of DLP info type names from the default set that should be excluded */
  excludeInfoTypes?: string[];
  /** If auto batching when content length exceeds DLP's limit should be disabled */
  disableAutoBatchWhenContentSizeExceedsLimit?: boolean;
  /** Maximum content size for when auto batching is turned on. Values below 1 are ignored and the default is used. */
  maxContentSizeForBatch?: number;
}

/**
 * Redactor that sends text to Google Cloud DLP for inspection and replaces each reported quote
 * with the name of its info type.
 *
 * @public
 */
export class GoogleDLPRedactor implements IAsyncRedactor {
  dlpClient: DlpServiceClient;

  /**
   * @param opts redactor options; `opts.clientOptions` is passed to the DLP client constructor
   */
  constructor(private opts: GoogleDLPRedactorOptions = {}) {
    this.dlpClient = new DLP.DlpServiceClient(this.opts.clientOptions);
  }

  /**
   * Redact `textToRedact`, splitting it into batches that are inspected in parallel when it is
   * longer than the batch size (unless auto batching is disabled).
   *
   * @param textToRedact the text to redact
   * @returns the redacted text
   */
  async redactAsync(textToRedact: string): Promise<string> {
    const maxContentSize = resolveMaxBatchSize(this.opts.maxContentSizeForBatch);

    if (textToRedact.length <= maxContentSize || this.opts.disableAutoBatchWhenContentSizeExceedsLimit) {
      return this.doRedactAsync(textToRedact);
    }

    const batchResults = await Promise.all(
      splitIntoBatches(textToRedact, maxContentSize).map((batchText) => this.doRedactAsync(batchText))
    );
    return batchResults.join('');
  }

  /**
   * Inspect one piece of text with a single `inspectContent` call and replace what was found.
   *
   * @param textToRedact the text to inspect; no batching is applied here
   * @returns the text with every reported quote replaced by its info type name
   */
  async doRedactAsync(textToRedact: string): Promise<string> {
    const projectId = await this.dlpClient.getProjectId();

    // handle info type excludes and includes
    const excluded = this.opts.excludeInfoTypes || [];
    const infoTypes = defaultInfoTypes
      .filter((infoType) => !excluded.includes(infoType.name))
      .concat((this.opts.includeInfoTypes || []).map((infoTypeName) => ({ name: infoTypeName })));

    const response: any = await this.dlpClient.inspectContent({
      parent: this.dlpClient.projectPath(projectId),
      inspectConfig: Object.assign(
        {
          infoTypes,
          customInfoTypes,
          minLikelihood,
          includeQuote,
          limits: {
            maxFindingsPerRequest: maxFindings,
          },
        },
        this.opts.inspectConfig
      ),
      item: { value: textToRedact },
    });

    // Treat a response without a result or findings list as "nothing found" instead of throwing.
    const findings: Finding[] = get(response, [0, 'result', 'findings']) || [];
    if (findings.length === 0) {
      return textToRedact;
    }
    return replaceFindings(textToRedact, findings);
  }
}