UNPKG

redact-pii

Version:

Remove personally identifiable information from text.

229 lines 9.84 kB
"use strict"; var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } return new (P || (P = Promise))(function (resolve, reject) { function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } step((generator = generator.apply(thisArg, _arguments || [])).next()); }); }; Object.defineProperty(exports, "__esModule", { value: true }); exports.GoogleDLPRedactor = exports.defaultInfoTypes = exports.MAX_DLP_CONTENT_LENGTH = void 0; const lodash_1 = require("lodash"); const dlp_1 = require("@google-cloud/dlp"); exports.MAX_DLP_CONTENT_LENGTH = 524288; // a finding quote length that is too short (e.g. 1 char like "S") causes too many false replacements const MIN_FINDING_QUOTE_LENGTH = 2; const minLikelihood = 'LIKELIHOOD_UNSPECIFIED'; const maxFindings = 0; exports.defaultInfoTypes = [ { name: 'AMERICAN_BANKERS_CUSIP_ID' }, { name: 'AUSTRALIA_MEDICARE_NUMBER' }, { name: 'AUSTRALIA_TAX_FILE_NUMBER' }, { name: 'BRAZIL_CPF_NUMBER' }, { name: 'CANADA_BC_PHN' }, { name: 'CANADA_DRIVERS_LICENSE_NUMBER' }, { name: 'CANADA_OHIP' }, { name: 'CANADA_PASSPORT' }, { name: 'CANADA_QUEBEC_HIN' }, { name: 'CANADA_SOCIAL_INSURANCE_NUMBER' }, { name: 'CHINA_PASSPORT' }, { name: 'CREDIT_CARD_NUMBER' }, { name: 'EMAIL_ADDRESS' }, { name: 'ETHNIC_GROUP' }, { name: 'FEMALE_NAME' }, { name: 'FIRST_NAME' }, { name: 'FRANCE_CNI' }, { name: 'FRANCE_NIR' }, { name: 'FRANCE_PASSPORT' }, { name: 'GCP_CREDENTIALS' }, { name: 'GERMANY_PASSPORT' }, { name: 'IBAN_CODE' }, { name: 'IMEI_HARDWARE_ID' }, { name: 'INDIA_PAN_INDIVIDUAL' }, { name: 'IP_ADDRESS' }, { name: 'JAPAN_INDIVIDUAL_NUMBER' }, { name: 'JAPAN_PASSPORT' }, { name: 'KOREA_PASSPORT' }, { name: 'KOREA_RRN' }, { name: 'LAST_NAME' }, { name: 'MAC_ADDRESS_LOCAL' }, { name: 'MAC_ADDRESS' }, { name: 'MALE_NAME' }, { name: 'MEXICO_CURP_NUMBER' }, { name: 'MEXICO_PASSPORT' }, { name: 'NETHERLANDS_BSN_NUMBER' }, { name: 'PHONE_NUMBER' }, { name: 'SPAIN_NIE_NUMBER' }, { name: 'SPAIN_NIF_NUMBER' }, { name: 'SPAIN_PASSPORT' }, { name: 'SWIFT_CODE' }, { name: 'UK_DRIVERS_LICENSE_NUMBER' }, { name: 'UK_NATIONAL_HEALTH_SERVICE_NUMBER' }, { name: 'UK_NATIONAL_INSURANCE_NUMBER' }, { name: 'UK_PASSPORT' }, { name: 'UK_TAXPAYER_REFERENCE' }, { name: 'US_ADOPTION_TAXPAYER_IDENTIFICATION_NUMBER' }, { name: 'US_BANK_ROUTING_MICR' }, { name: 'US_DEA_NUMBER' }, { name: 'US_DRIVERS_LICENSE_NUMBER' }, { name: 'US_HEALTHCARE_NPI' }, { name: 'US_INDIVIDUAL_TAXPAYER_IDENTIFICATION_NUMBER' }, { name: 'US_PASSPORT' }, { name: 'US_PREPARER_TAXPAYER_IDENTIFICATION_NUMBER' }, { name: 'US_SOCIAL_SECURITY_NUMBER' }, { name: 'US_TOLLFREE_PHONE_NUMBER' }, { name: 'US_VEHICLE_IDENTIFICATION_NUMBER' }, { name: 'US_STATE' }, { name: 'FDA_CODE' }, { name: 'ICD9_CODE' }, { name: 'ICD10_CODE' }, { name: 'US_EMPLOYER_IDENTIFICATION_NUMBER' }, { name: 'LOCATION' }, { name: 'DATE' }, { name: 'DATE_OF_BIRTH' }, { name: 'TIME' }, { name: 'PERSON_NAME' }, { name: 'AGE' }, { name: 'GENDER' }, { name: 'ARGENTINA_DNI_NUMBER' }, { name: 'CHILE_CDI_NUMBER' }, { name: 'COLOMBIA_CDC_NUMBER' }, { name: 'NETHERLANDS_PASSPORT' }, { name: 'PARAGUAY_CIC_NUMBER' }, { name: 'PERU_DNI_NUMBER' }, { name: 'PORTUGAL_CDC_NUMBER' }, { name: 'URUGUAY_CDI_NUMBER' }, { name: 'VENEZUELA_CDI_NUMBER' }, ]; const customInfoTypes = [ { infoType: { name: 'URL', }, regex: { pattern: '([^\\s:/?#]+):\\/\\/([^/?#\\s]*)([^?#\\s]*)(\\?([^#\\s]*))?(#([^\\s]*))?', }, }, ]; const likelihoodPriority = { LIKELIHOOD_UNSPECIFIED: 0, VERY_UNLIKELY: 1, UNLIKELY: 2, POSSIBLE: 3, LIKELY: 4, VERY_LIKELY: 5, }; const includeQuote = true; // finding location.byteRange.start and end are strings for some reason, so must convert to numbers const getFindingStart = (finding) => Number((0, lodash_1.get)(finding, 'location.byteRange.start', 0)); const getFindingEnd = (finding) => Number((0, lodash_1.get)(finding, 'location.byteRange.end', 0)); /** * Remove overlapping findings which can cause messed up tokens. * * For example "My name is John D." will cause 3 findings: * - PERSON_NAME for text "John S." at range 11-17 * - FIRST_NAME for text "John" at range 11-15 * - LAST_NAME for text "S." at range 15-17 * * The FIRST_NAME and LAST_NAME findings overlap the first finding so there is no need to search for them */ function removeOverlappingFindings(findings) { // early return if only have 0 or 1 findings if (findings.length <= 1) { return findings; } // sort findings by ascending start findings.sort((a, b) => getFindingStart(a) - getFindingStart(b)); // remove findings that overlap (but keep the one with higher likelihood) const resultFindings = [findings[0]]; for (let i = 1; i < findings.length; i++) { const current = findings[i]; const previous = resultFindings[resultFindings.length - 1]; // when findings overlap, keep the one with the higher likelihood if (getFindingStart(current) < getFindingEnd(previous)) { if (likelihoodPriority[current.likelihood] > likelihoodPriority[previous.likelihood]) { resultFindings[resultFindings.length - 1] = current; } } else { // no overlap resultFindings.push(current); } } return resultFindings; } /** @public */ class GoogleDLPRedactor { constructor(opts = {}) { this.opts = opts; this.dlpClient = new dlp_1.default.DlpServiceClient(this.opts.clientOptions); } redactAsync(textToRedact) { return __awaiter(this, void 0, void 0, function* () { // default batch size is MAX_DLP_CONTENT_LENGTH/2 because some unicode characters can take more than 1 byte // and its difficult to get a substring of a desired target length in bytes const maxContentSize = this.opts.maxContentSizeForBatch || exports.MAX_DLP_CONTENT_LENGTH / 2; if (textToRedact.length > maxContentSize && !this.opts.disableAutoBatchWhenContentSizeExceedsLimit) { const batchPromises = []; let batchStartIndex = 0; while (batchStartIndex < textToRedact.length) { const batchEndIndex = batchStartIndex + maxContentSize; const batchText = textToRedact.substring(batchStartIndex, batchEndIndex); batchPromises.push(this.doRedactAsync(batchText)); batchStartIndex = batchEndIndex; } const batchResults = yield Promise.all(batchPromises); return batchResults.join(''); } else { return this.doRedactAsync(textToRedact); } }); } doRedactAsync(textToRedact) { return __awaiter(this, void 0, void 0, function* () { const projectId = yield this.dlpClient.getProjectId(); // handle info type excludes and includes const infoTypes = exports.defaultInfoTypes .filter((infoType) => !this.opts.excludeInfoTypes || !this.opts.excludeInfoTypes.includes(infoType.name)) .concat((this.opts.includeInfoTypes || []).map((infoTypeName) => ({ name: infoTypeName }))); const response = yield this.dlpClient.inspectContent({ parent: this.dlpClient.projectPath(projectId), inspectConfig: Object.assign({ infoTypes, customInfoTypes, minLikelihood, includeQuote, limits: { maxFindingsPerRequest: maxFindings, }, }, this.opts.inspectConfig), item: { value: textToRedact }, }); const findings = response[0].result.findings; if (findings.length > 0) { // this is necessary to prevent tokens getting messed up with other repeated partial tokens (e.g. "my name is PERLALALALALALALALALALALALALALALALALAL...") const findingsWithoutOverlaps = removeOverlappingFindings(findings); // sort findings by highest likelihood first findingsWithoutOverlaps.sort(function (a, b) { return likelihoodPriority[b.likelihood] - likelihoodPriority[a.likelihood]; }); // in order of highest likelihood replace finding with info type name findingsWithoutOverlaps.forEach((finding) => { let find = finding.quote; if (find !== finding.infoType.name && find.length >= MIN_FINDING_QUOTE_LENGTH) { let numSearches = 0; while (numSearches++ < 1000 && textToRedact.indexOf(find) >= 0) { textToRedact = textToRedact.replace(find, finding.infoType.name); } } }); } return textToRedact; }); } } exports.GoogleDLPRedactor = GoogleDLPRedactor; //# sourceMappingURL=GoogleDLPRedactor.js.map