webform-privacy-consent-scanner
Version:
Advanced web form scanner detecting Google Forms, HubSpot Forms, Microsoft Forms, Formstack Forms with comprehensive CMP detection including Cookiebot, OneTrust, Efilli, and GDPR compliance auditing. Also detects 3rd party application links and embedded f
687 lines (606 loc) • 25.2 kB
JavaScript
#!/usr/bin/env node
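/**
* form-scanner: Detect embedded or linked web forms on a list of URLs.
* Form types and their patterns are read from form-detection-config.json in the
* current directory. Optional extras: consent-management (CMP) detection with
* --cmp and third-party "collector" link/embed matching with --collectors.
* Requires Node.js >= 18 (built-in fetch). Optional: playwright for --dynamic
* (also used when --cmp is set); curl is tried as a fallback when fetch fails.
*/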
import fs from 'node:fs';
import path from 'node:path';
import { setTimeout as delay } from 'node:timers/promises';
import pLimit from 'p-limit';
import { normalizeUrl, globMatch, parseCollectorPatterns, extractContext } from './lib/patterns.mjs';
// ---- Command-line parsing ----
const argv = process.argv.slice(2);

/**
 * Look up a command-line option given as `--name value`, `--name=value`
 * or as a bare `--name` flag.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {*} [def] - Value returned when the option is absent.
 * @returns {*} The option's string value, `true` for a bare flag, or `def`
 *   when the option does not appear in argv.
 */
const getArg = (name, def = undefined) => {
  const flag = `--${name}`;
  const idx = argv.findIndex(a => a === flag || a.startsWith(`${flag}=`));
  if (idx === -1) return def;
  const token = argv[idx];
  // `--name=value`: everything after the first '=' is the value.
  if (token.includes('=')) return token.split('=').slice(1).join('=');
  // `--name value`: the next token is the value, unless it is itself another
  // option (or missing), in which case `--name` was used as a bare flag.
  const next = argv[idx + 1];
  return next === undefined || next.startsWith('--') ? true : next;
};

/**
 * Read an integer option, falling back to `def` when the option is absent,
 * is a bare flag, is not a number, or is below `min`.
 *
 * @param {string} name - Option name without the leading dashes.
 * @param {number} def - Default value.
 * @param {number} min - Smallest accepted value.
 * @returns {number} The parsed value or `def`.
 */
const getIntArg = (name, def, min) => {
  const raw = getArg(name);
  if (raw === undefined) return def;
  const parsed = Number.parseInt(raw, 10);
  if (Number.isNaN(parsed) || parsed < min) {
    console.error(`Ignoring invalid --${name} value "${raw}"; using ${def}`);
    return def;
  }
  return parsed;
};

const input = getArg('input');
const out = getArg('out');
const concurrency = getIntArg('concurrency', 8, 1);
const timeoutMs = getIntArg('timeout', 15000, 1);
const dynamicFlag = !!getArg('dynamic', false);
const dynamicWaitMs = getIntArg('wait', 6000, 0);
const cmpFlag = !!getArg('cmp', false);
// A bare `--collectors` carries no patterns; treat it like an absent option.
const collectorsRaw = getArg('collectors', '');
const collectorsArg = typeof collectorsRaw === 'string' ? collectorsRaw : '';
/**
 * Insert a filesystem-safe timestamp between a file name and its extension,
 * e.g. `results.csv` -> `results_2024-12-19T14-30-22.csv`.
 *
 * Only the trailing extension is treated as such; a name without an
 * extension simply gets `_<timestamp>` appended.
 *
 * @param {string} baseName - File name (optionally with directories).
 * @returns {string} The name with `_<YYYY-MM-DDTHH-MM-SS>` before the extension.
 */
function generateTimestampedFilename(baseName) {
  // toISOString() gives e.g. 2024-12-19T14:30:22.123Z; replacing ':' and '.'
  // and dropping the last five characters leaves YYYY-MM-DDTHH-MM-SS.
  const timestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, -5);
  const extension = path.extname(baseName); // '' when there is no extension
  const stem = extension ? baseName.slice(0, -extension.length) : baseName;
  return `${stem}_${timestamp}${extension}`;
}
// --input is mandatory. Check it before anything else so that a malformed
// command line always ends in the usage text rather than a stack trace.
// (`input` is `true` when --input was given without a value.)
if (typeof input !== 'string' || input === '') {
  console.error('Usage: node scanner.mjs --input urls.txt [--out results.csv] [--concurrency 8] [--timeout 15000] [--dynamic] [--wait 6000] [--cmp] [--collectors "pattern1/*,pattern2/*"]');
  console.error('Note: If --out is not specified, files will be saved with timestamp (e.g., results_2024-12-19T14-30-22.csv)');
  process.exit(1);
}
// Determine output filenames
let outputCsv;
let outputJson;
if (typeof out === 'string' && out !== '') {
  outputCsv = out;
  // The JSON file sits next to the CSV. When --out has no .csv suffix, append
  // .json instead of reusing the same path; otherwise the JSON write would
  // overwrite the CSV that was just written.
  outputJson = /\.csv$/i.test(out) ? out.replace(/\.csv$/i, '.json') : `${out}.json`;
} else {
  // No usable --out: name both files after the current time (YYYY-MM-DDTHH-MM-SS).
  const timestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, -5);
  outputCsv = `results_${timestamp}.csv`;
  outputJson = `results_${timestamp}.json`;
}
// Prepare patterns from config (will be loaded in main)
// Module-level detection state: starts empty and is filled in by main() once
// form-detection-config.json has been read.
// Per enabled form type: list of RegExp objects built from that type's
// non-URL patterns.
let patterns = {};
// RegExp objects built from config.cmp.patterns; stays empty when CMP is
// disabled in the config.
let cmpPatterns = [];
// Collector patterns, taken from --collectors or otherwise from
// config.collectors.patterns.
let collectorPatterns = [];
let globalConfig = null; // Global config for fetchDynamic
// Detection functions will be defined in main after config loading
/**
 * Shorten text to at most `n` characters, appending an ellipsis when cut.
 * `null`/`undefined` become the empty string and other non-string values are
 * stringified, so a config entry without a description cannot crash a scan.
 *
 * @param {*} s - Text to shorten.
 * @param {number} [n=220] - Maximum number of characters kept.
 * @returns {string} The (possibly shortened) text.
 */
function truncate(s, n = 220) {
  const text = s == null ? '' : String(s);
  return text.length > n ? text.slice(0, n) + '…' : text;
}
/**
 * Fetch a page without a browser.
 *
 * Uses the built-in fetch with browser-like headers and aborts after
 * `timeoutMs`. If fetch throws (network error, abort, invalid URL, ...) the
 * request is retried once through the `curl` command-line tool.
 *
 * @param {string} url - Page to download.
 * @returns {Promise<{status: number, text: string, contentType?: string, error?: string}>}
 *   `text` is empty for non-text responses and on failure; `status` is 0 on
 *   failure. A successful curl fallback always reports status 200 because the
 *   real status code is not captured.
 */
async function fetchStatic(url) {
  const ctrl = new AbortController();
  const t = setTimeout(() => ctrl.abort(), timeoutMs);
  try {
    const r = await fetch(url, {
      signal: ctrl.signal,
      redirect: 'follow',
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
      }
    });
    const status = r.status;
    const contentType = r.headers.get('content-type') || '';
    // Only read the body of textual responses; binary payloads are skipped.
    const text = contentType.includes('text/') || contentType.includes('html')
      ? await r.text()
      : '';
    return { status, text, contentType };
  } catch (err) {
    const errorMsg = err?.message || err?.code || String(err);
    console.log(`STATIC_FETCH_ERROR for ${url}: ${errorMsg}, trying curl fallback...`);
    // Try curl fallback
    try {
      const { spawn } = await import('node:child_process');
      const curl = spawn('curl', [
        '-s', '-L', '--max-time', Math.ceil(timeoutMs / 1000).toString(),
        '-H', 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        '-H', 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        '-H', 'Accept-Language: en-US,en;q=0.9',
        // The URL comes from the input list: only allow web protocols (no
        // file://, ftp://, ...) and end option parsing with `--` so that a
        // line starting with '-' cannot be interpreted as a curl option.
        '--proto', '=http,https',
        '--', url
      ], { stdio: ['pipe', 'pipe', 'pipe'] });
      // Decode as a stream so multi-byte UTF-8 characters that straddle two
      // chunks are not corrupted.
      curl.stdout.setEncoding('utf8');
      curl.stderr.setEncoding('utf8');
      let output = '';
      let errorOutput = '';
      curl.stdout.on('data', (chunk) => {
        output += chunk;
      });
      curl.stderr.on('data', (chunk) => {
        errorOutput += chunk;
      });
      // Adapt the event-based child process to a promise. The promise always
      // resolves (never rejects) with a result object.
      return new Promise((resolve) => {
        curl.on('close', (code) => {
          if (code === 0 && output) {
            console.log(`CURL_FALLBACK_SUCCESS for ${url}: ${output.length} bytes`);
            resolve({ status: 200, text: output, contentType: 'text/html' });
          } else {
            console.log(`CURL_FALLBACK_FAILED for ${url}: exit code ${code}, error: ${errorOutput}`);
            resolve({ status: 0, text: '', error: `fetch failed: ${errorMsg}, curl fallback failed: ${errorOutput}` });
          }
        });
        curl.on('error', (error) => {
          console.log(`CURL_FALLBACK_ERROR for ${url}: ${error.message}`);
          resolve({ status: 0, text: '', error: `fetch failed: ${errorMsg}, curl error: ${error.message}` });
        });
      });
    } catch (curlErr) {
      console.log(`CURL_FALLBACK_UNAVAILABLE for ${url}: ${curlErr.message}`);
      return { status: 0, text: '', error: `fetch failed: ${errorMsg}, curl unavailable: ${curlErr.message}` };
    }
  } finally {
    clearTimeout(t);
  }
}
/**
 * Load a page in headless Chromium (via Playwright) and return the rendered
 * HTML. When both --cmp and the config's `cmp.enabled` are set, the function
 * first tries to click a consent button so content gated behind the banner
 * can load.
 *
 * Never rejects: failures are reported through the `error` field.
 *
 * @param {string} url - Page to render.
 * @returns {Promise<{status: number, text: string, contentType?: string, error?: string}>}
 *   `error` is `'playwright_not_available'` when Playwright cannot be imported.
 */
async function fetchDynamic(url) {
  console.log('DYNAMIC_MODE_START for:', url);
  let pw;
  try {
    pw = await import('playwright');
  } catch (e) {
    console.log('⚠️ Playwright not installed, falling back to static mode');
    console.log('💡 Tip: Install Playwright for dynamic scanning: npm i -D playwright && npx playwright install');
    return { status: 0, text: '', error: 'playwright_not_available' };
  }
  const { chromium } = pw;
  let browser;
  let ctx;
  try {
    browser = await chromium.launch({ headless: true });
    ctx = await browser.newContext({
      viewport: { width: 1366, height: 900 },
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
    });
    const page = await ctx.newPage();
    const resp = await page.goto(url, { waitUntil: 'domcontentloaded', timeout: timeoutMs });
    // Handle CMP banners if enabled
    console.log('CMP_FLAG_CHECK:', cmpFlag, 'GLOBAL_CONFIG_CMP_ENABLED:', globalConfig?.cmp?.enabled, 'GLOBAL_CONFIG_EXISTS:', !!globalConfig);
    if (cmpFlag && globalConfig?.cmp?.enabled) {
      console.log('CMP_CONDITION_MET');
      try {
        // Try to find and click common consent buttons
        const consentSelectors = globalConfig.cmp.selectors || [];
        for (const selector of consentSelectors) {
          try {
            const element = await page.$(selector);
            if (element) {
              const isVisible = await element.isVisible();
              if (isVisible) {
                await element.click();
                console.log(`Clicked consent button: ${selector}`);
                await delay(1000); // Wait for consent to be processed
                break;
              }
            }
          } catch (e) {
            // Best effort: a selector that fails is skipped, try the next one
          }
        }
        // Wait a bit more after consent handling
        await delay(2000);
        // Enhanced CMP detection for GTM and Efilli
        console.log('CMP_START');
        console.log('CMP_CHECK');
        // First check for immediate CMP presence
        const initialHtml = await page.content();
        const immediateCMP = {
          hasEfilli: initialHtml.includes('efilli') || initialHtml.includes('efilli.com'),
          hasGTM: initialHtml.includes('googletagmanager.com') || initialHtml.includes('GTM-'),
          hasCookieConsent: initialHtml.includes('cookie') && (initialHtml.includes('consent') || initialHtml.includes('gdpr'))
        };
        console.log(`📊 Immediate CMP check: Efilli=${immediateCMP.hasEfilli}, GTM=${immediateCMP.hasGTM}, Consent=${immediateCMP.hasCookieConsent}`);
        // Wait for GTM to be available
        if (immediateCMP.hasGTM) {
          await page.waitForFunction(() => {
            return window.google_tag_manager ||
              window.gtag ||
              document.querySelector('script[src*="googletagmanager.com"]') ||
              document.querySelector('script[src*="gtm.start"]');
          }, { timeout: 8000 }).catch(() => {
            console.log('GTM initialization timeout');
          });
        }
        // Special handling for Efilli (also entered when only GTM was seen)
        if (immediateCMP.hasEfilli || immediateCMP.hasGTM) {
          console.log('Waiting for Efilli/CMP initialization...');
          // Wait for Efilli script to be available
          await page.waitForFunction(() => {
            return document.querySelector('script[src*="efilli"]') ||
              document.querySelector('script[src*="bundles.efilli.com"]') ||
              window.Efilli ||
              document.documentElement.innerHTML.includes('efilli.com');
          }, { timeout: 10000 }).catch(() => {
            console.log('Efilli script detection timeout');
          });
          // Wait for Efilli to initialize and show consent UI
          await delay(2000);
          // Check for Efilli-specific consent elements
          const efilliSelectors = [
            '[data-efilli]',
            '.efilli-consent',
            '.efilli-banner',
            '[class*="efilli"]',
            'button[onclick*="efilli"]',
            'button[onclick*="consent"]',
            'button[onclick*="cookie"]',
            '.cookie-consent',
            '.gdpr-banner',
            '.consent-modal',
            '#cookie-consent',
            '#gdpr-consent'
          ];
          console.log('Checking for Efilli consent elements...');
          // Reuse the guarded selector list: spreading an undefined
          // `cmp.selectors` would throw and skip the generic fallbacks below.
          for (const selector of [...consentSelectors, ...efilliSelectors]) {
            try {
              const element = await page.$(selector);
              if (element) {
                const isVisible = await element.isVisible();
                const box = await element.boundingBox().catch(() => null);
                if (isVisible && box) {
                  console.log(`Found visible consent element: ${selector}`);
                  await element.click();
                  console.log(`Clicked consent button: ${selector}`);
                  await delay(1500); // Wait for consent to be processed
                  break;
                }
              }
            } catch (e) {
              // Best effort: a selector that fails is skipped, try the next one
            }
          }
        }
        // Additional wait for any dynamic content
        await delay(2000);
      } catch (cmpError) {
        console.log(`Enhanced CMP detection failed: ${cmpError.message}`);
      }
    }
    // Give client-side scripts time to render before taking the snapshot.
    await delay(dynamicWaitMs);
    const html = await page.content();
    const status = resp ? resp.status() : 0;
    return { status, text: html, contentType: 'text/html' };
  } catch (err) {
    return { status: 0, text: '', error: String(err && err.message || err) };
  } finally {
    // Always release the browser. Cleanup errors are ignored on purpose so
    // they cannot discard HTML that was already captured.
    if (ctx) await ctx.close().catch(() => {});
    if (browser) await browser.close().catch(() => {});
  }
}
/**
 * Serialise one row of values as a CSV line (without a trailing newline).
 *
 * `null`/`undefined` become empty fields; everything else is stringified.
 * Fields containing a quote, comma, carriage return or line feed are wrapped
 * in double quotes with embedded quotes doubled.
 *
 * @param {Array<*>} values - Cell values in column order.
 * @returns {string} The comma-joined, escaped row.
 */
function toCsvRow(values) {
  return values.map((v) => {
    const s = v == null ? '' : String(v);
    // '\r' must be quoted as well: a bare carriage return is read as a record
    // separator by many CSV parsers.
    return /[",\r\n]/.test(s) ? `"${s.replace(/"/g, '""')}"` : s;
  }).join(',');
}
import * as cheerio from 'cheerio'; // reserved for future selectors if needed
async function main() {
// Load configuration
let config;
try {
  const configPath = path.resolve(process.cwd(), 'form-detection-config.json');
  const configContent = await fs.promises.readFile(configPath, 'utf8');
  config = JSON.parse(configContent);
  if (config === null || typeof config !== 'object' || Array.isArray(config)) {
    throw new Error('top-level JSON value must be an object');
  }
  // The `forms` and `cmp` sections are optional. Fill in empty defaults so
  // the code below (and the detection functions) can read them without
  // crashing on a minimal config file.
  if (!config.forms) config.forms = {};
  if (!config.cmp) config.cmp = { enabled: false, patterns: [], selectors: [] };
  globalConfig = config; // Set global config for fetchDynamic
} catch (err) {
  console.error('Error loading config file:', err.message);
  console.error('Make sure form-detection-config.json exists in the current directory');
  process.exit(1);
}
// Prepare patterns from config
for (const [formType, formConfig] of Object.entries(config.forms)) {
  if (formConfig.enabled) {
    patterns[formType] = (formConfig.patterns || [])
      .filter(p => p.type !== 'url') // URL patterns are handled separately
      .map(p => new RegExp(p.pattern, 'i'));
  }
}
// Prepare CMP patterns
cmpPatterns = config.cmp.enabled ?
  (config.cmp.patterns || []).map(p => new RegExp(p.pattern, 'i')) : [];
// Prepare collector patterns (CLI takes precedence over config)
if (collectorsArg) {
  collectorPatterns = parseCollectorPatterns(collectorsArg);
} else if (config.collectors && config.collectors.enabled) {
  collectorPatterns = config.collectors.patterns || [];
}
console.log('CONFIG_LOADED: cmp.enabled =', config.cmp.enabled, 'cmpFlag =', cmpFlag);
if (collectorPatterns.length > 0) {
  console.log('COLLECTOR_PATTERNS_LOADED:', collectorPatterns.length, 'patterns');
}
// Define detection functions after config is loaded
/**
 * Look for a consent-management platform (CMP) in a page's HTML.
 *
 * Patterns come from `config.cmp.patterns` and are tried in order; the first
 * hit wins. A `url`-type pattern is matched by plain substring search on the
 * part of the pattern that precedes its first backslash; every other type is
 * matched as a case-insensitive regular expression.
 *
 * @param {string|null|undefined} html - Page HTML.
 * @returns {{has_cmp: boolean, cmp_vendor: (string|null), cmp_evidence: (string|null)}}
 *   All-negative result when --cmp is off, CMP is disabled in the config, or
 *   nothing matches.
 */
function detectCMP(html) {
  if (!cmpFlag || !config.cmp?.enabled) {
    return { has_cmp: false, cmp_vendor: null, cmp_evidence: null };
  }
  const hay = html || '';
  for (const pattern of config.cmp.patterns || []) {
    if (pattern.type === 'url') {
      const literalPrefix = pattern.pattern.replace(/\\.*$/, '');
      // A pattern that starts with an escape has no literal prefix. Searching
      // for '' would match every page, so use the full regex in that case.
      const found = literalPrefix
        ? hay.includes(literalPrefix)
        : new RegExp(pattern.pattern, 'i').test(hay);
      if (found) {
        return {
          has_cmp: true,
          cmp_vendor: pattern.vendor,
          cmp_evidence: truncate(pattern.description || '')
        };
      }
      continue;
    }
    const match = hay.match(new RegExp(pattern.pattern, 'i'));
    if (match) {
      return {
        has_cmp: true,
        cmp_vendor: pattern.vendor,
        cmp_evidence: truncate(match[0])
      };
    }
  }
  return { has_cmp: false, cmp_vendor: null, cmp_evidence: null };
}
/**
 * Analyse one page: which configured form types it contains, whether a CMP
 * is present, which links/embeds match the collector patterns, and which
 * anchors point at a known form service.
 *
 * All matching is regex-based on the raw HTML (no DOM parser).
 *
 * @param {string|null|undefined} html - Page HTML.
 * @param {string} [url=''] - URL of the page; matched against `url`-type form
 *   patterns and used as the base for relative links.
 * @returns {object} Result with `detected_types`, `evidence`, the CMP fields,
 *   the collector fields, `linked_forms`, plus one `<formType>: true`
 *   property per detected form type.
 */
function detect(html, url = '') {
  const result = {
    detected_types: [],
    evidence: null,
    has_cmp: false,
    cmp_vendor: null,
    cmp_evidence: null,
    collectors_detected: false,
    collector_link_count: 0,
    collector_embed_count: 0,
    collectors: [],
    linked_forms: []
  };
  const hay = html || '';
  // Compile every enabled form type's URL patterns once per call instead of
  // once per pattern per link.
  const enabledForms = Object.entries(config.forms || {})
    .filter(([, formConfig]) => formConfig.enabled)
    .map(([formType, formConfig]) => ({
      formType,
      urlRules: (formConfig.patterns || [])
        .filter(p => p.type === 'url')
        .map(p => ({ rx: new RegExp(p.pattern, 'i'), description: p.description || '' }))
    }));
  // Check for forms in enabled config
  for (const { formType, urlRules } of enabledForms) {
    let formDetected = false;
    // Check URL patterns against the page's own URL
    const urlHit = urlRules.find(({ rx }) => rx.test(url));
    if (urlHit) {
      formDetected = true;
      result.evidence = truncate(urlHit.description);
    } else {
      // Check HTML patterns
      for (const rx of patterns[formType] || []) {
        const match = hay.match(rx);
        if (match) {
          formDetected = true;
          result.evidence = truncate(match[0]);
          break;
        }
      }
    }
    if (formDetected) {
      result.detected_types.push(formType);
      result[formType] = true;
    }
  }
  // Check for CMP
  if (cmpFlag) {
    const cmpResult = detectCMP(hay);
    result.has_cmp = cmpResult.has_cmp;
    result.cmp_vendor = cmpResult.cmp_vendor;
    result.cmp_evidence = cmpResult.cmp_evidence;
    // Debug logging for CMP detection
    if (cmpResult.has_cmp) {
      console.log(`✅ CMP detected: ${cmpResult.cmp_vendor} (${cmpResult.cmp_evidence})`);
    }
  }
  // Collector pattern detection
  if (collectorPatterns.length > 0) {
    const baseUrl = url || '';
    const collectedItems = new Map(); // Deduplication key -> item
    // Element kinds to inspect: tag name, URL-carrying attribute, relation label
    const elementTypes = [
      { tag: 'a', attr: 'href', relation: 'link' },
      { tag: 'iframe', attr: 'src', relation: 'embed-iframe' },
      { tag: 'script', attr: 'src', relation: 'embed-script' },
      { tag: 'form', attr: 'action', relation: 'form-action' }
    ];
    for (const { tag, attr, relation } of elementTypes) {
      // Match whole opening tags such as `<iframe ...>`, then pull the
      // attribute out of each tag. Requiring whitespace after the tag name
      // keeps `<a ...>` from matching `<abbr ...>`; requiring whitespace
      // before the attribute keeps `href` from matching `data-href`.
      const tagRegex = new RegExp(`<${tag}\\s[^>]*>`, 'gi');
      const attrRegex = new RegExp(`\\s${attr}\\s*=\\s*["']([^"']+)["']`, 'i');
      for (const tagHtml of hay.match(tagRegex) || []) {
        const attrMatch = tagHtml.match(attrRegex);
        const attrValue = attrMatch ? attrMatch[1] : '';
        if (!attrValue) continue;
        // Normalize URL
        const normalizedUrl = normalizeUrl(attrValue, baseUrl);
        if (!normalizedUrl) continue;
        // Check against collector patterns
        for (const pattern of collectorPatterns) {
          if (!globMatch(normalizedUrl, pattern)) continue;
          const dedupKey = `${pattern}|${normalizedUrl.href}|${relation}`;
          if (collectedItems.has(dedupKey)) continue;
          collectedItems.set(dedupKey, {
            target_pattern: pattern,
            matched_url: normalizedUrl.href,
            relation: relation,
            match_type: 'wildcard', // Simple wildcard for now
            text_or_context: extractContext(hay, tagHtml)
          });
          // Update counts
          if (relation === 'link') {
            result.collector_link_count++;
          } else if (['embed-iframe', 'embed-script'].includes(relation)) {
            result.collector_embed_count++;
          }
        }
      }
    }
    // Convert collected items to array
    result.collectors = Array.from(collectedItems.values());
    result.collectors_detected = result.collectors.length > 0;
    // Debug logging
    if (result.collectors_detected) {
      console.log(`📊 Collector matches: ${result.collectors.length} items (${result.collector_link_count} links, ${result.collector_embed_count} embeds)`);
      if (globalThis.process?.env?.VERBOSE) {
        result.collectors.forEach(item => {
          console.log(` → ${item.relation}: ${item.matched_url} (${item.target_pattern})`);
        });
      }
    }
  }
  // Form link detection
  if (config.forms) {
    // Extract all anchors whose content is plain text (anchors that wrap
    // child elements are not matched by this expression).
    const linkRegex = /<a\b[^>]*?\shref\s*=\s*["']([^"']+)["'][^>]*>([^<]*)<\/a>/gi;
    // The page URL may be schemeless or otherwise unparsable (the curl
    // fallback accepts such input). Parse it once and, if that fails, resolve
    // only absolute links instead of throwing and aborting the whole scan.
    let pageUrl;
    try {
      pageUrl = new URL(url);
    } catch {
      pageUrl = undefined;
    }
    let match;
    const foundLinks = new Set(); // Deduplication
    while ((match = linkRegex.exec(hay)) !== null) {
      const href = match[1];
      const linkText = match[2] || '';
      if (!href || href.startsWith('#') || href.startsWith('javascript:') || href.startsWith('mailto:')) continue;
      // Normalize URL
      let absoluteUrl;
      try {
        absoluteUrl = new URL(href, pageUrl);
      } catch (e) {
        continue; // Invalid URL
      }
      const fullUrl = absoluteUrl.href;
      // Skip if already processed
      if (foundLinks.has(fullUrl)) continue;
      foundLinks.add(fullUrl);
      // Check if this link matches any form pattern
      for (const { formType, urlRules } of enabledForms) {
        const linkHit = urlRules.find(({ rx }) => rx.test(fullUrl));
        if (linkHit) {
          result.linked_forms.push({
            url: fullUrl,
            form_type: formType,
            evidence: truncate(linkHit.description),
            link_text: truncate(linkText, 50),
            context: extractContext(hay, match[0])
          });
          break; // Only add to first matching form type
        }
      }
    }
    // Debug logging for linked forms
    if (result.linked_forms.length > 0) {
      console.log(`🔗 Found ${result.linked_forms.length} form links on ${url}`);
      if (globalThis.process?.env?.VERBOSE) {
        result.linked_forms.forEach(link => {
          console.log(` → ${link.form_type}: ${link.url} (${link.link_text})`);
        });
      }
    }
  }
  return result;
}
// ---- Scan every URL from the input list and write the CSV/JSON reports ----
const start = Date.now();
const listRaw = await fs.promises.readFile(path.resolve(process.cwd(), input), 'utf8');
// One URL per line; blank lines are ignored.
const urls = listRaw.split(/\r?\n/).map(l => l.trim()).filter(Boolean);
const limit = pLimit(concurrency);
const headers = ['url','method','status','is_ozyegin_form','is_yourcompany_form','is_example_form','is_google_form','is_hubspot_form','is_microsoft_form','is_formstack_form','detected_types','evidence','has_cmp','cmp_vendor','cmp_evidence','collectors_detected','collector_link_count','collector_embed_count','linked_forms_detected','linked_forms_count','note'];
let idx = 0;
// Each task resolves to its own { row, record } pair. Collecting them through
// Promise.all keeps the reports in input order no matter which request
// finishes first.
const tasks = urls.map(url => limit(async () => {
  const n = ++idx; // progress counter, in start order
  const st = await fetchStatic(url);
  let det = detect(st.text, url);
  let method = 'static';
  let note = st.error ? `static_error: ${st.error}` : '';
  const shouldUseDynamic = (det.detected_types.length === 0 && dynamicFlag) || cmpFlag;
  console.log(`[${n}/${urls.length}] ${url} -> ${shouldUseDynamic ? 'will use dynamic' : 'static only'} (${cmpFlag ? 'CMP enabled' : 'CMP disabled'})`);
  if (shouldUseDynamic) {
    const dy = await fetchDynamic(url);
    const det2 = detect(dy.text, url);
    // NOTE(review): the dynamic result replaces the static one only when it
    // detected a form, so CMP/collector/link findings from a dynamic pass
    // without forms are dropped. Confirm this is intended.
    if (det2.detected_types.length > 0) {
      det = det2;
      method = 'dynamic';
    }
    if (dy.error) {
      if (dy.error === 'playwright_not_available') {
        note = (note ? note + ' | ' : '') + 'dynamic_mode_skipped: Playwright not available (install with: npm i -D playwright && npx playwright install)';
      } else {
        note = (note ? note + ' | ' : '') + `dynamic_error: ${dy.error}`;
      }
    }
    if (dy.status) st.status = dy.status;
  }
  const detectedTypes = det.detected_types.join(';');
  const record = {
    url,
    method,
    status: st.status || 0,
    is_ozyegin_form: det.ozyegin || false,
    is_yourcompany_form: det.yourcompany || false,
    is_example_form: det.example || false,
    is_google_form: det.google || false,
    is_hubspot_form: det.hubspot || false,
    is_microsoft_form: det.microsoft || false,
    is_formstack_form: det.formstack || false,
    detected_types: det.detected_types,
    evidence: det.evidence,
    has_cmp: det.has_cmp || false,
    cmp_vendor: det.cmp_vendor,
    cmp_evidence: det.cmp_evidence,
    collectors_detected: det.collectors_detected || false,
    collector_link_count: det.collector_link_count || 0,
    collector_embed_count: det.collector_embed_count || 0,
    collectors: det.collectors || [],
    linked_forms_detected: (det.linked_forms && det.linked_forms.length > 0) || false,
    linked_forms_count: (det.linked_forms && det.linked_forms.length) || 0,
    linked_forms: det.linked_forms || [],
    note
  };
  // The CSV row is the flat subset of the JSON record, in header order.
  const row = toCsvRow([
    record.url,
    record.method,
    record.status,
    record.is_ozyegin_form,
    record.is_yourcompany_form,
    record.is_example_form,
    record.is_google_form,
    record.is_hubspot_form,
    record.is_microsoft_form,
    record.is_formstack_form,
    detectedTypes,
    record.evidence || '',
    record.has_cmp,
    record.cmp_vendor || '',
    record.cmp_evidence || '',
    record.collectors_detected,
    record.collector_link_count,
    record.collector_embed_count,
    record.linked_forms_detected,
    record.linked_forms_count,
    record.note || ''
  ]);
  process.stderr.write(`[${n}/${urls.length}] ${url} -> ${detectedTypes || 'none'} (${method}, ${st.status || 0})\n`);
  return { row, record };
}));
const outcomes = await Promise.all(tasks);
const rows = [toCsvRow(headers), ...outcomes.map(o => o.row)];
const resultsJson = outcomes.map(o => o.record);
await fs.promises.writeFile(path.resolve(process.cwd(), outputCsv), rows.join('\n'), 'utf8');
await fs.promises.writeFile(path.resolve(process.cwd(), outputJson), JSON.stringify(resultsJson, null, 2), 'utf8');
const dur = ((Date.now() - start)/1000).toFixed(2);
console.log(`\nSaved CSV to ${outputCsv}`);
console.log(`Saved JSON to ${outputJson}`);
console.log(`Done in ${dur}s`);
}
// Entry point: run the scan. Any rejection that escapes main() is printed
// and turned into a non-zero exit code.
const exitOnFatalError = (error) => {
  console.error(error);
  process.exit(1);
};
main().catch(exitOnFatalError);