UNPKG

@fanboynz/network-scanner

Version:

A Puppeteer-based network scanner for analyzing web traffic, generating adblock filter rules, and identifying third-party requests. Features include fingerprint spoofing, Cloudflare bypass, content analysis with curl/grep, and multiple output formats.

715 lines (609 loc) 25.4 kB
// === Post-Processing Module for Network Scanner === // Handles cleanup and validation of scan results after scanning is complete const { formatLogMessage, messageColors } = require('./colorize'); const psl = require('psl'); // Precompiled regex patterns (avoids recompilation per rule) const REGEX_ADBLOCK = /^\|\|([^/\^]+)/; const REGEX_DNSMASQ_LOCAL = /local=\/([^/]+)\//; const REGEX_DNSMASQ_SERVER = /server=\/([^/]+)\//; const REGEX_UNBOUND = /local-zone:\s*"([^"]+)\.?"/; const REGEX_PRIVOXY = /\{\s*\+block\s*\}\s*\.?([^\s]+)/; const REGEX_PIHOLE = /^\(\^\|\\\.\)(.+)\\\.\w+\$$/; const REGEX_DOMAIN_FALLBACK = /([a-zA-Z0-9][a-zA-Z0-9.-]*\.[a-zA-Z]{2,})/; const REGEX_WHITESPACE = /\s+/; const REGEX_UNESCAPE_DOT = /\\\./g; // Cache for compiled wildcard regex patterns const wildcardRegexCache = new Map(); /** * Get or compile a wildcard pattern regex (cached) * @param {string} pattern - Wildcard pattern string * @returns {RegExp} Compiled regex */ function getWildcardRegex(pattern) { let regex = wildcardRegexCache.get(pattern); if (!regex) { regex = new RegExp('^' + pattern.replace(/\./g, '\\.').replace(/\*/g, '.*') + '$'); wildcardRegexCache.set(pattern, regex); // Cap cache size if (wildcardRegexCache.size > 200) { const firstKey = wildcardRegexCache.keys().next().value; wildcardRegexCache.delete(firstKey); } } return regex; } /** * Safely extracts hostname from a URL, handling malformed URLs gracefully * @param {string} url - The URL string to parse * @param {boolean} getFullHostname - If true, returns full hostname; if false, returns root domain * @returns {string} The hostname/domain, or empty string if URL is invalid */ function safeGetDomain(url, getFullHostname = false) { try { const parsedUrl = new URL(url); if (getFullHostname) { return parsedUrl.hostname; } const parsed = psl.parse(parsedUrl.hostname); return parsed.domain || parsedUrl.hostname; } catch (urlError) { return ''; } } /** * Enhanced domain extraction helper - single source of truth for all rule formats * (Was duplicated inline in cleanupIgnoreDomains and cleanupFirstPartyDomains) * @param {string} rule - Rule string in various formats * @returns {string|null} Extracted domain or null if not found */ function extractDomainFromRule(rule) { if (!rule || typeof rule !== 'string') { return null; } // Adblock: ||domain.com^ if (rule.charCodeAt(0) === 124 && rule.charCodeAt(1) === 124 && rule.includes('^')) { // '||' + '^' const match = REGEX_ADBLOCK.exec(rule); return match ? match[1] : null; } // Hosts file: 127.0.0.1 domain / 0.0.0.0 domain if (rule.charCodeAt(0) === 49 || rule.charCodeAt(0) === 48) { // '1' or '0' if (rule.startsWith('127.0.0.1 ') || rule.startsWith('0.0.0.0 ')) { const parts = rule.split(REGEX_WHITESPACE); return parts.length >= 2 ? parts[1] : null; } } // dnsmasq: local=/domain.com/ if (rule.includes('local=/')) { const match = REGEX_DNSMASQ_LOCAL.exec(rule); return match ? match[1] : null; } // dnsmasq old: server=/domain.com/ if (rule.includes('server=/')) { const match = REGEX_DNSMASQ_SERVER.exec(rule); return match ? match[1] : null; } // Unbound: local-zone: "domain.com." always_null if (rule.includes('local-zone:') && rule.includes('always_null')) { const match = REGEX_UNBOUND.exec(rule); return match ? match[1] : null; } // Privoxy: { +block } .domain.com if (rule.includes('+block') && rule.includes('.')) { const match = REGEX_PRIVOXY.exec(rule); return match ? match[1] : null; } // Pi-hole regex: (^|\.)domain\.com$ -- single match (was tested then matched separately) if (rule.charCodeAt(0) === 40) { // '(' const match = REGEX_PIHOLE.exec(rule); return match ? match[1].replace(REGEX_UNESCAPE_DOT, '.') : null; } // Fallback: any domain-like pattern const domainMatch = REGEX_DOMAIN_FALLBACK.exec(rule); return domainMatch ? domainMatch[1] : null; } /** * Enhanced domain matching for ignoreDomains patterns (including wildcards) * @param {string} domain - Domain to check * @param {Array} ignorePatterns - Array of ignore patterns (supports wildcards) * @param {boolean} forceDebug - Debug logging flag * @returns {Object} Match result with shouldIgnore flag and reason */ function shouldIgnoreAsIgnoreDomain(domain, ignorePatterns, forceDebug) { if (!domain || !ignorePatterns || ignorePatterns.length === 0) { return { shouldIgnore: false, reason: 'No ignore patterns' }; } for (let i = 0; i < ignorePatterns.length; i++) { const pattern = ignorePatterns[i]; if (pattern.includes('*')) { if (pattern.startsWith('*.')) { // Pattern: *.example.com const wildcardDomain = pattern.substring(2); const wildcardRoot = safeGetDomain('http://' + wildcardDomain, false); const domainRoot = safeGetDomain('http://' + domain, false); if (wildcardRoot === domainRoot) { return { shouldIgnore: true, reason: 'Matches wildcard ignore pattern: ' + pattern }; } } else if (pattern.endsWith('.*')) { // Pattern: example.* const baseDomain = pattern.slice(0, -2); if (domain.startsWith(baseDomain + '.')) { return { shouldIgnore: true, reason: 'Matches wildcard TLD ignore pattern: ' + pattern }; } } else { // Complex wildcard -- use cached regex const wildcardRegex = getWildcardRegex(pattern); if (wildcardRegex.test(domain)) { return { shouldIgnore: true, reason: 'Matches complex wildcard ignore pattern: ' + pattern }; } } } else { // Exact pattern matching if (domain === pattern || domain.endsWith('.' + pattern)) { return { shouldIgnore: true, reason: 'Matches exact ignore pattern: ' + pattern }; } } } return { shouldIgnore: false, reason: 'No ignore pattern matches' }; } /** * Enhanced domain matching that handles wildcards and first-party detection * @param {string} extractedDomain - Domain extracted from rule * @param {string} scannedRootDomain - Root domain of the scanned site * @param {boolean} forceDebug - Debug logging flag * @returns {Object} Match result with shouldRemove flag and reason */ function shouldRemoveAsFirstParty(extractedDomain, scannedRootDomain, forceDebug) { if (!extractedDomain || !scannedRootDomain) { return { shouldRemove: false, reason: 'Missing domain data' }; } if (extractedDomain.includes('*')) { if (extractedDomain.startsWith('*.')) { const wildcardDomain = extractedDomain.substring(2); const wildcardRoot = safeGetDomain('http://' + wildcardDomain, false); if (wildcardRoot === scannedRootDomain) { return { shouldRemove: true, reason: 'Wildcard subdomain pattern matches root domain (*.' + wildcardRoot + ')' }; } } else if (extractedDomain.endsWith('.*')) { const baseDomain = extractedDomain.slice(0, -2); if (scannedRootDomain.startsWith(baseDomain + '.')) { return { shouldRemove: true, reason: 'Wildcard TLD pattern matches base domain (' + baseDomain + '.*)' }; } } else { // Complex wildcard -- use cached regex const wildcardRegex = getWildcardRegex(extractedDomain); if (wildcardRegex.test(scannedRootDomain)) { return { shouldRemove: true, reason: 'Complex wildcard pattern matches root domain (' + extractedDomain + ')' }; } } } // Standard exact root domain matching const extractedRoot = safeGetDomain('http://' + extractedDomain, false); if (extractedRoot === scannedRootDomain) { return { shouldRemove: true, reason: 'Exact root domain match (' + extractedRoot + ')' }; } return { shouldRemove: false, reason: 'No first-party match detected' }; } /** * Build URL-to-site-config mapping (shared between cleanup functions) * @param {Array} sites - Array of site configurations * @returns {Map} URL to site config mapping */ function buildUrlToSiteConfig(sites) { const map = new Map(); for (let i = 0; i < sites.length; i++) { const site = sites[i]; const urls = Array.isArray(site.url) ? site.url : [site.url]; for (let j = 0; j < urls.length; j++) { map.set(urls[j], site); } } return map; } /** * Post-scan cleanup function to remove ignoreDomains from results * This is a final safety net to catch any domains that should have been ignored * * @param {Array} results - Array of scan results from all sites * @param {Array} ignoreDomains - Array of domains/patterns to ignore * @param {Object} options - Options object * @param {boolean} options.forceDebug - Debug logging flag * @param {boolean} options.silentMode - Silent mode flag * @returns {Array} Cleaned results with ignoreDomains removed */ function cleanupIgnoreDomains(results, ignoreDomains, options = {}) { const { forceDebug = false, silentMode = false } = options; if (!results || results.length === 0 || !ignoreDomains || ignoreDomains.length === 0) { return results; } if (forceDebug) { console.log(formatLogMessage('debug', '[ignoreDomains cleanup] Processing ' + results.length + ' results against ' + ignoreDomains.length + ' ignore patterns')); } const cleanedResults = []; let totalRulesRemoved = 0; let sitesAffected = 0; for (let ri = 0; ri < results.length; ri++) { const result = results[ri]; if (!result.rules || result.rules.length === 0) { cleanedResults.push(result); continue; } const cleanedRules = []; const removedRules = []; for (let j = 0; j < result.rules.length; j++) { const rule = result.rules[j]; let kept = true; try { // Use shared extractDomainFromRule (was duplicated inline) const extractedDomain = extractDomainFromRule(rule); if (extractedDomain) { const ignoreResult = shouldIgnoreAsIgnoreDomain(extractedDomain, ignoreDomains, forceDebug); if (ignoreResult.shouldIgnore) { removedRules.push({ rule: rule, domain: extractedDomain, reason: 'ignoreDomains: ' + ignoreResult.reason, matchType: ignoreResult.reason.includes('wildcard') ? 'wildcard' : 'exact' }); kept = false; } } } catch (parseErr) { if (forceDebug) { console.log(formatLogMessage('debug', '[ignoreDomains cleanup] Failed to parse rule: ' + rule + ' - ' + parseErr.message)); } } if (kept) { cleanedRules.push(rule); } } // Mutate rules directly instead of spreading entire result object result.rules = cleanedRules; cleanedResults.push(result); if (removedRules.length > 0) { sitesAffected++; totalRulesRemoved += removedRules.length; if (!silentMode) { // Single-pass count instead of two .filter() calls let wildcardCount = 0; for (let k = 0; k < removedRules.length; k++) { if (removedRules[k].matchType === 'wildcard') wildcardCount++; } const exactCount = removedRules.length - wildcardCount; let cleanupMessage = '?? Removed ' + removedRules.length + ' ignoreDomains rule(s) from ' + safeGetDomain(result.url) + ' (final cleanup)'; if (wildcardCount > 0) { cleanupMessage += ' [' + wildcardCount + ' wildcard, ' + exactCount + ' exact]'; } if (messageColors && messageColors.cleanup) { console.log(messageColors.cleanup(cleanupMessage)); } else { console.log(cleanupMessage); } } if (forceDebug) { console.log(formatLogMessage('debug', '[ignoreDomains cleanup] Removed rules from ' + result.url + ':')); for (let k = 0; k < removedRules.length; k++) { console.log(formatLogMessage('debug', ' [' + (k + 1) + '] ' + removedRules[k].rule + ' (' + removedRules[k].reason + ') [' + removedRules[k].matchType + ']')); } } } } // Summary if (totalRulesRemoved > 0 && !silentMode) { const summaryMessage = '\n?? ignoreDomains cleanup completed: Removed ' + totalRulesRemoved + ' rules from ' + sitesAffected + ' site(s)'; if (messageColors && messageColors.cleanup) { console.log(messageColors.cleanup(summaryMessage)); } else { console.log(summaryMessage); } } else if (forceDebug) { console.log(formatLogMessage('debug', '[ignoreDomains cleanup] No ignoreDomains rules found to remove')); } return cleanedResults; } /** * Post-scan cleanup function to remove first-party domains from results * Only processes sites that have firstParty: false in their configuration * * @param {Array} results - Array of scan results from all sites * @param {Array} sites - Array of site configurations * @param {Object} options - Options object * @param {boolean} options.forceDebug - Debug logging flag * @param {boolean} options.silentMode - Silent mode flag * @param {Map} [options._urlToSiteConfig] - Pre-built URL mapping (internal optimization) * @returns {Array} Cleaned results with conditional first-party removal */ function cleanupFirstPartyDomains(results, sites, options = {}) { const { forceDebug = false, silentMode = false } = options; if (!results || results.length === 0) { return results; } // Use pre-built map if passed, otherwise build it const urlToSiteConfig = options._urlToSiteConfig || buildUrlToSiteConfig(sites); const cleanedResults = []; let totalRulesRemoved = 0; let sitesAffected = 0; for (let ri = 0; ri < results.length; ri++) { const result = results[ri]; const siteConfig = urlToSiteConfig.get(result.url); const shouldCleanFirstParty = siteConfig && siteConfig.firstParty === false; if (!shouldCleanFirstParty || !result.rules || result.rules.length === 0) { cleanedResults.push(result); continue; } if (forceDebug) { console.log(formatLogMessage('debug', '[cleanup] Processing ' + result.url + ' (firstParty: false detected)')); } const scannedDomain = safeGetDomain(result.url, false); if (!scannedDomain) { cleanedResults.push(result); continue; } const cleanedRules = []; const removedRules = []; for (let j = 0; j < result.rules.length; j++) { const rule = result.rules[j]; let kept = true; try { // Use shared extractDomainFromRule (was duplicated inline) const extractedDomain = extractDomainFromRule(rule); if (extractedDomain) { const matchResult = shouldRemoveAsFirstParty(extractedDomain, scannedDomain, forceDebug); if (matchResult.shouldRemove) { removedRules.push({ rule: rule, domain: extractedDomain, rootDomain: scannedDomain, reason: 'First-party: ' + matchResult.reason + ' (firstParty: false)', matchType: matchResult.reason.includes('Wildcard') ? 'wildcard' : 'exact' }); kept = false; } } } catch (parseErr) { if (forceDebug) { console.log(formatLogMessage('debug', '[cleanup] Failed to parse rule: ' + rule + ' - ' + parseErr.message)); } } if (kept) { cleanedRules.push(rule); } } // Mutate rules directly instead of { ...result, rules: cleanedRules } result.rules = cleanedRules; cleanedResults.push(result); if (removedRules.length > 0) { sitesAffected++; totalRulesRemoved += removedRules.length; if (!silentMode) { // Single-pass count let wildcardCount = 0; for (let k = 0; k < removedRules.length; k++) { if (removedRules[k].matchType === 'wildcard') wildcardCount++; } const exactCount = removedRules.length - wildcardCount; let cleanupMessage = '?? Cleaned ' + removedRules.length + ' first-party rule(s) from ' + scannedDomain + ' (firstParty: false)'; if (wildcardCount > 0) { cleanupMessage += ' [' + wildcardCount + ' wildcard, ' + exactCount + ' exact]'; } if (messageColors && messageColors.cleanup) { console.log(messageColors.cleanup(cleanupMessage)); } else { console.log(cleanupMessage); } } if (forceDebug) { console.log(formatLogMessage('debug', '[cleanup] Removed rules from ' + result.url + ':')); for (let k = 0; k < removedRules.length; k++) { console.log(formatLogMessage('debug', ' [' + (k + 1) + '] ' + removedRules[k].rule + ' (' + removedRules[k].reason + ') [' + removedRules[k].matchType + ']')); } } } } // Summary if (totalRulesRemoved > 0 && !silentMode) { const summaryMessage = '\n?? First-party cleanup completed: Removed ' + totalRulesRemoved + ' rules from ' + sitesAffected + ' site(s) with firstParty: false'; if (messageColors && messageColors.cleanup) { console.log(messageColors.cleanup(summaryMessage)); } else { console.log(summaryMessage); } } else if (forceDebug) { console.log(formatLogMessage('debug', '[cleanup] No first-party rules found to remove')); } return cleanedResults; } /** * Validates scan results and removes any obvious false positives * * @param {Array} results - Array of scan results * @param {Object} options - Options object * @param {boolean} options.forceDebug - Debug logging flag * @param {Array} options.ignoreDomains - Domains to ignore * @returns {Array} Validated results */ function validateScanResults(results, options = {}) { const { forceDebug = false, ignoreDomains = [] } = options; if (!results || results.length === 0) { return results; } let totalValidated = 0; let totalRemoved = 0; // Pre-strip wildcards from ignore patterns once (was done per rule per pattern) let strippedIgnorePatterns = null; if (ignoreDomains.length > 0) { strippedIgnorePatterns = new Array(ignoreDomains.length); for (let i = 0; i < ignoreDomains.length; i++) { strippedIgnorePatterns[i] = ignoreDomains[i].replace('*', ''); } } for (let ri = 0; ri < results.length; ri++) { const result = results[ri]; if (!result.rules || result.rules.length === 0) { continue; } const originalCount = result.rules.length; const validRules = []; for (let j = 0; j < result.rules.length; j++) { const rule = result.rules[j]; // Basic validation if (!rule || typeof rule !== 'string' || rule.trim().length === 0) { if (forceDebug) { console.log(formatLogMessage('debug', '[validation] Removed empty/invalid rule')); } totalRemoved++; continue; } // Check against stripped ignore patterns let ignored = false; if (strippedIgnorePatterns) { for (let k = 0; k < strippedIgnorePatterns.length; k++) { if (rule.includes(strippedIgnorePatterns[k])) { if (forceDebug) { console.log(formatLogMessage('debug', '[validation] Removed rule matching ignore pattern: ' + ignoreDomains[k])); } totalRemoved++; ignored = true; break; } } } if (!ignored) { validRules.push(rule); } } totalValidated += originalCount; // Mutate in place instead of spread result.rules = validRules; } if (forceDebug && totalRemoved > 0) { console.log(formatLogMessage('debug', '[validation] Validated ' + totalValidated + ' rules, removed ' + totalRemoved + ' invalid rules')); } return results; } /** * Final validation check for firstParty: false violations * Reuses existing domain extraction and matching logic * * @param {Array} results - Array of scan results * @param {Array} sites - Array of site configurations * @param {Object} options - Options object * @param {Map} [options._urlToSiteConfig] - Pre-built URL mapping (internal optimization) * @returns {Array} Results with any remaining first-party domains removed */ function finalFirstPartyValidation(results, sites, options = {}) { const { forceDebug = false, silentMode = false } = options; if (!results || results.length === 0) { return results; } // Use pre-built map if passed, otherwise build it const urlToSiteConfig = options._urlToSiteConfig || buildUrlToSiteConfig(sites); const finalResults = []; let totalViolationsFound = 0; let sitesWithViolations = 0; for (let ri = 0; ri < results.length; ri++) { const result = results[ri]; const siteConfig = urlToSiteConfig.get(result.url); const shouldValidate = siteConfig && siteConfig.firstParty === false; if (!shouldValidate || !result.rules || result.rules.length === 0) { finalResults.push(result); continue; } const scannedDomain = safeGetDomain(result.url, false); if (!scannedDomain) { finalResults.push(result); continue; } const cleanedRules = []; const violatingRules = []; for (let j = 0; j < result.rules.length; j++) { const rule = result.rules[j]; const extractedDomain = extractDomainFromRule(rule); if (extractedDomain) { const matchResult = shouldRemoveAsFirstParty(extractedDomain, scannedDomain, forceDebug); if (matchResult.shouldRemove) { violatingRules.push({ rule: rule, domain: extractedDomain, reason: 'VALIDATION FAILURE: ' + matchResult.reason }); totalViolationsFound++; continue; } } cleanedRules.push(rule); } if (violatingRules.length > 0) { sitesWithViolations++; if (!silentMode) { const errorMessage = '? CONFIG VIOLATION: Found ' + violatingRules.length + ' first-party rule(s) in ' + scannedDomain + ' (firstParty: false)'; if (messageColors && messageColors.error) { console.log(messageColors.error(errorMessage)); } else { console.log(errorMessage); } } if (forceDebug) { console.log(formatLogMessage('debug', '[final-validation] Violations found for ' + result.url + ':')); for (let k = 0; k < violatingRules.length; k++) { console.log(formatLogMessage('debug', ' [' + (k + 1) + '] ' + violatingRules[k].rule + ' -> ' + violatingRules[k].domain)); } } } // Mutate in place result.rules = cleanedRules; finalResults.push(result); } // Summary if (totalViolationsFound > 0 && !silentMode) { const summaryMessage = '\n? SCAN FILTERING FAILURE: Removed ' + totalViolationsFound + ' first-party rules from ' + sitesWithViolations + ' site(s) in post-processing'; console.log(summaryMessage); console.log('?? This indicates firstParty: false filtering failed during scan - consider investigating root cause.'); } else if (forceDebug) { console.log(formatLogMessage('debug', '[final-validation] No first-party violations found - filtering working correctly')); } return finalResults; } /** * Main post-processing function that runs all cleanup and validation steps * * @param {Array} results - Array of scan results from all sites * @param {Array} sites - Array of site configurations * @param {Object} options - Options object * @param {boolean} options.forceDebug - Debug logging flag * @param {boolean} options.silentMode - Silent mode flag * @param {Array} options.ignoreDomains - Domains to ignore during validation * @returns {Array} Fully processed and cleaned results */ function processResults(results, sites, options = {}) { const { forceDebug = false, silentMode = false } = options; if (forceDebug) { console.log(formatLogMessage('debug', '[post-processing] Starting post-processing of ' + results.length + ' results')); } // Build URL-to-config map once, share across all steps const urlToSiteConfig = buildUrlToSiteConfig(sites); const sharedOptions = Object.assign({}, options, { _urlToSiteConfig: urlToSiteConfig }); // Step 1: Clean up first-party domains let processedResults = cleanupFirstPartyDomains(results, sites, sharedOptions); // Step 2: Clean up ignoreDomains (final safety net) processedResults = cleanupIgnoreDomains(processedResults, options.ignoreDomains || [], options); // Step 3: Final validation for firstParty: false configurations processedResults = finalFirstPartyValidation(processedResults, sites, sharedOptions); // Step 4: Validate results processedResults = validateScanResults(processedResults, options); if (forceDebug) { let totalRules = 0; for (let i = 0; i < processedResults.length; i++) { totalRules += processedResults[i].rules ? processedResults[i].rules.length : 0; } console.log(formatLogMessage('debug', '[post-processing] Completed: ' + totalRules + ' total rules remaining')); } return processedResults; } module.exports = { cleanupFirstPartyDomains, cleanupIgnoreDomains, finalFirstPartyValidation, extractDomainFromRule, validateScanResults, processResults };