yahdlp
Version:
Enterprise-grade PII detection and redaction tool by Yahya Enterprises
211 lines (189 loc) • 5.81 kB
JavaScript
#!/usr/bin/env node
const fs = require('fs');
const yargs = require('yargs');
const { createWorker } = require('tesseract.js');
const sharp = require('sharp');
const path = require('path');
// Drop every listener registered for the process 'warning' event
// (presumably to keep the CLI output free of Node.js warning noise).
process.removeAllListeners('warning');

// JSON file in the current working directory that saveInspectedValues()
// writes and getInspectedValues() reads back.
const TEMP_FILE = path.resolve(process.cwd(), '.yahdlp-temp.json');
// RGBA colours used for the overlay drawn by redactImage():
//   found    - background when an inspected value is detected in the image
//   notFound - background when nothing matches
//   textBg   - not referenced elsewhere in this file
// The table and each entry are frozen so the shared constants cannot be
// modified by accident at runtime.
const COLORS = Object.freeze({
  found: Object.freeze({ r: 100, g: 100, b: 200, alpha: 1 }),
  notFound: Object.freeze({ r: 64, g: 64, b: 153, alpha: 0.7 }),
  textBg: Object.freeze({ r: 0, g: 0, b: 0, alpha: 0.7 })
});
// Detection patterns keyed by PII type name. Neither pattern is global, so
// String.prototype.match() returns only the first occurrence.
const patterns = {
  // local-part @ domain . TLD (two or more letters). The TLD class is
  // [A-Za-z]; a '|' inside a character class is a literal pipe, not an
  // alternation, so it must not appear there.
  EMAIL: /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/,
  // Optional leading '+', then at least ten digits, each of which may be
  // followed by whitespace or hyphens.
  PHONE: /[+]?(?:\d[\s-]*){10,}/
};
/**
 * Remove formatting characters from a phone number string.
 *
 * Strips whitespace, hyphens, plus signs, parentheses and dots; every other
 * character is kept as-is.
 *
 * @param {string} phone - Phone number text as written.
 * @returns {string} The input with the formatting characters removed.
 */
function normalizePhoneNumber(phone) {
  const formattingChars = /[\s\-\+()\.]/g;
  const stripped = phone.replace(formattingChars, '');
  return stripped;
}
/**
 * Persist the inspected values to TEMP_FILE as JSON, replacing any previous
 * content of that file.
 *
 * @param {object} values - Values to store (serialised with JSON.stringify).
 */
function saveInspectedValues(values) {
  const serialized = JSON.stringify(values);
  fs.writeFileSync(TEMP_FILE, serialized);
}
/**
 * Load the values stored by the last inspect run from TEMP_FILE.
 *
 * This is deliberately best-effort: a missing, unreadable or malformed file
 * yields the empty default instead of throwing. Content that parses as JSON
 * but is not a plain object (for example `null` or an array) is treated the
 * same way, so callers can always read `.EMAIL` / `.PHONE` safely.
 *
 * @returns {{EMAIL: (string|null), PHONE: (string|null)}} Stored values;
 *   keys absent from the file are reported as null.
 */
function getInspectedValues() {
  const fallback = { EMAIL: null, PHONE: null };
  let parsed;
  try {
    parsed = JSON.parse(fs.readFileSync(TEMP_FILE, 'utf8'));
  } catch {
    return fallback;
  }
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    return fallback;
  }
  return { ...fallback, ...parsed };
}
/**
 * Find the first occurrence of each requested PII type in `text`, persist the
 * detected values through saveInspectedValues(), and return the findings.
 *
 * Values for EMAIL and PHONE that are not found (or not requested) are stored
 * as null, so every call replaces what a previous call saved.
 *
 * @param {string|number} text - Text to scan. Non-string input is converted
 *   to a string first, because a CLI parser may hand over digit-only input
 *   as a Number.
 * @param {string[]} [types=['EMAIL', 'PHONE']] - PII type names to look for;
 *   names without an entry in `patterns` are ignored.
 * @returns {Promise<Array<{type: string, value: string, normalized?: string}>>}
 *   One finding per detected type; `normalized` is present for PHONE only.
 */
async function inspectText(text, types = ['EMAIL', 'PHONE']) {
  const input = text == null ? '' : String(text);
  const findings = [];
  const values = { EMAIL: null, PHONE: null };

  for (const type of types) {
    const pattern = patterns[type];
    // Without this guard, match(undefined) would match the empty string and
    // record a bogus empty finding for an unknown type.
    if (!pattern) continue;

    const match = input.match(pattern);
    if (!match) continue;

    if (type === 'PHONE') {
      // The phone pattern also consumes separators after the last digit;
      // drop them so the reported value ends on a digit.
      const value = match[0].replace(/[\s-]+$/, '');
      const normalized = normalizePhoneNumber(value);
      values[type] = normalized;
      findings.push({ type, value, normalized });
    } else {
      // Stored in lower case so later comparisons are case-insensitive.
      values[type] = match[0].toLowerCase();
      findings.push({ type, value: match[0] });
    }
  }

  saveInspectedValues(values);
  return findings;
}
/**
 * Check whether `text` contains the value of the given type that was stored
 * by the last inspect run (read through getInspectedValues()).
 *
 * PHONE: all digits in `text` are concatenated and compared with the stored
 * number by substring containment in either direction, so a partially
 * recognised number still counts as a match. Text without any digit never
 * matches.
 *
 * Other types: every occurrence of the type's pattern in `text` is compared,
 * case-insensitively, with the stored value.
 *
 * @param {string} text - Text to search (e.g. OCR output).
 * @param {string} type - PII type name such as 'EMAIL' or 'PHONE'.
 * @returns {Promise<boolean>} True when the stored value is found in `text`.
 */
async function matchesInspectedValue(text, type) {
  const values = getInspectedValues();
  const target = values[type];
  if (!target) return false;

  if (type === 'PHONE') {
    const normalizedTarget = normalizePhoneNumber(target);
    const normalizedText = (text.match(/\d+/g) || []).join('');
    // ''.includes / includes('') is always true, so an empty side would turn
    // every digit-free text into a false positive.
    if (normalizedText === '' || normalizedTarget === '') return false;
    return normalizedText.includes(normalizedTarget) || normalizedTarget.includes(normalizedText);
  }

  const pattern = patterns[type];
  if (!pattern) return false;

  // Use a global copy of the pattern so that all occurrences are checked,
  // not just the first one.
  const flags = pattern.flags.includes('g') ? pattern.flags : `${pattern.flags}g`;
  const candidates = text.match(new RegExp(pattern.source, flags)) || [];
  return candidates.some((candidate) => candidate.toLowerCase() === target);
}
/**
 * OCR an image and write a copy covered by a coloured overlay and a status
 * message to `outputPath`.
 *
 * The recognised text is checked against the stored inspected values for each
 * requested type; the first match decides the outcome. With a match the
 * overlay uses COLORS.found and a "redacted" message, otherwise
 * COLORS.notFound and a "not found" message.
 *
 * Errors are logged to stderr and not rethrown. The OCR worker is terminated
 * in every case, including when recognition or image writing fails.
 *
 * @param {string} inputPath - Image to analyse.
 * @param {string} outputPath - Destination of the overlaid image.
 * @param {string[]} types - PII type names to check, e.g. ['EMAIL', 'PHONE'].
 * @returns {Promise<void>}
 */
async function redactImage(inputPath, outputPath, types) {
  let worker;
  try {
    worker = await createWorker();
    const { data } = await worker.recognize(inputPath);

    let hasMatch = false;
    for (const type of types) {
      if (await matchesInspectedValue(data.text, type)) {
        hasMatch = true;
        break;
      }
    }

    const image = sharp(inputPath);
    const metadata = await image.metadata();
    const colors = hasMatch ? COLORS.found : COLORS.notFound;

    // The markup is kept flush-left on purpose: everything between the
    // backticks is part of the string handed to sharp.
    const overlaySvg = `
<svg width="${metadata.width}" height="${metadata.height}">
<style>
.message-container {
font-family: Arial;
font-weight: bold;
}
.title {
fill: white;
font-size: 38px;
}
.subtitle {
fill: white;
font-size: 28px;
}
</style>
<g class="message-container">
${hasMatch ? `
<text x="50%" y="45%" class="title" text-anchor="middle">
The asset has been thoroughly processed and redacted.
</text>
<text x="50%" y="55%" class="subtitle" text-anchor="middle">
As an inclusion of PIIs are detected by yahSystems.
</text>
` : `
<text x="50%" y="50%" class="title" text-anchor="middle">
Inspected PII not found in this asset
</text>
`}
</g>
</svg>
`;

    await image
      .composite([{
        // Layer 1: solid colour rectangle the size of the whole image.
        input: {
          create: {
            width: metadata.width,
            height: metadata.height,
            channels: 4,
            background: colors
          }
        },
        top: 0,
        left: 0
      }, {
        // Layer 2: the status message.
        input: Buffer.from(overlaySvg),
        top: 0,
        left: 0
      }])
      .toFile(outputPath);

    // Remove the language data file if one was left in the working directory.
    if (fs.existsSync('eng.traineddata')) fs.unlinkSync('eng.traineddata');
    console.log(hasMatch ? 'Found and redacted PII' : 'No matching PII found');
  } catch (error) {
    console.error('Error:', error);
  } finally {
    // Always release the OCR worker; on the failure path it would otherwise
    // be left running.
    if (worker) {
      try {
        await worker.terminate();
      } catch (terminateError) {
        console.error('Error:', terminateError);
      }
    }
  }
}
// Translate the --email / --phone flags into PII type names.
// When neither flag is given, both types are selected.
function resolveTypes(argv) {
  const types = [];
  if (argv.email) types.push('EMAIL');
  if (argv.phone) types.push('PHONE');
  if (types.length === 0) types.push('EMAIL', 'PHONE');
  return types;
}

// Command-line interface:
//   inspect --text <text> [-e] [-p]    detect PII in text and remember it
//   redact <input> <output> [-e] [-p]  overlay an image based on that PII
yargs
  .usage('\nUsage: $0 <command> [options]')
  .command('inspect', 'Inspect text for PII', {
    text: {
      alias: 't',
      // Keep the value a string: without an explicit type, digit-only input
      // such as a bare phone number is parsed as a Number.
      type: 'string',
      describe: 'Text to inspect',
      demandOption: true
    },
    email: {
      alias: 'e',
      type: 'boolean',
      describe: 'Inspect for email'
    },
    phone: {
      alias: 'p',
      type: 'boolean',
      describe: 'Inspect for phone'
    }
  }, async (argv) => {
    const findings = await inspectText(argv.text, resolveTypes(argv));
    console.log(JSON.stringify(findings, null, 2));
  })
  .command('redact <input> <output>', 'Redact PII from image', {
    email: {
      alias: 'e',
      type: 'boolean',
      describe: 'Redact email'
    },
    phone: {
      alias: 'p',
      type: 'boolean',
      describe: 'Redact phone'
    }
  }, async (argv) => {
    // Redaction compares against what the last inspect run stored, so there
    // is nothing to do without it.
    const values = getInspectedValues();
    if (!values.EMAIL && !values.PHONE) {
      console.error('Please run inspect command first');
      // Report the failure to the calling shell instead of exiting with 0.
      process.exitCode = 1;
      return;
    }
    await redactImage(argv.input, argv.output, resolveTypes(argv));
  })
  .demandCommand()
  .help()
  .argv;