UNPKG

@fanboynz/network-scanner

Version:

A Puppeteer-based network scanner for analyzing web traffic, generating adblock filter rules, and identifying third-party requests. Features include fingerprint spoofing, Cloudflare bypass, content analysis with curl/grep, and multiple output formats.

github.com/ryanbr/network-scanner

ryanbr/network-scanner

1,079 lines (958 loc) • 124 kB

JavaScript

// === Network scanner script (nwss.js) v1.0.57 === // puppeteer for browser automation, fs for file system operations, psl for domain parsing. // const pLimit = require('p-limit'); // Will be dynamically imported const puppeteer = require('puppeteer'); const fs = require('fs'); const psl = require('psl'); const path = require('path'); const { createGrepHandler, validateGrepAvailability } = require('./lib/grep'); const { compressMultipleFiles, formatFileSize } = require('./lib/compress'); const { parseSearchStrings, createResponseHandler, createCurlHandler } = require('./lib/searchstring'); const { applyAllFingerprintSpoofing } = require('./lib/fingerprint'); const { formatRules, handleOutput, getFormatDescription } = require('./lib/output'); // Rule validation const { validateRulesetFile, validateFullConfig, testDomainValidation, cleanRulesetFile } = require('./lib/validate_rules'); // CF Bypass const { handleCloudflareProtection } = require('./lib/cloudflare'); // FP Bypass const { handleFlowProxyProtection, getFlowProxyTimeouts } = require('./lib/flowproxy'); // ignore_similar rules const { shouldIgnoreSimilarDomain, calculateSimilarity } = require('./lib/ignore_similar'); // Graceful exit const { handleBrowserExit, cleanupChromeTempFiles } = require('./lib/browserexit'); // Whois & Dig const { createNetToolsHandler, createEnhancedDryRunCallback, validateWhoisAvailability, validateDigAvailability } = require('./lib/nettools'); // File compare const { loadComparisonRules, filterUniqueRules } = require('./lib/compare'); // CDP functionality const { createCDPSession } = require('./lib/cdp'); // Colorize various text when used const { colorize, colors, messageColors, tags, formatLogMessage } = require('./lib/colorize'); // Enhanced mouse interaction and page simulation const { performPageInteraction, createInteractionConfig } = require('./lib/interaction'); // Domain detection cache for performance optimization const { createGlobalHelpers, getTotalDomainsSkipped, 
getDetectedDomainsCount } = require('./lib/domain-cache'); const { createSmartCache } = require('./lib/smart-cache'); // Smart cache system // Enhanced redirect handling const { navigateWithRedirectHandling, handleRedirectTimeout } = require('./lib/redirect'); // Ensure web browser is working correctly const { monitorBrowserHealth, isBrowserHealthy } = require('./lib/browserhealth'); // --- Script Configuration & Constants --- const VERSION = '1.0.57'; // Script version // get startTime const startTime = Date.now(); // Initialize domain cache helpers with debug logging if enabled const domainCacheOptions = { enableLogging: false }; // Set to true for cache debug logs const { isDomainAlreadyDetected, markDomainAsDetected } = createGlobalHelpers(domainCacheOptions); // Smart cache will be initialized after config is loaded let smartCache = null; // --- Command-Line Argument Parsing --- const args = process.argv.slice(2); if (args.length === 0) { args.push('--help'); } const headfulMode = args.includes('--headful'); const SOURCES_FOLDER = 'sources'; let outputFile = null; const outputIndex = args.findIndex(arg => arg === '--output' || arg === '-o'); if (outputIndex !== -1 && args[outputIndex + 1]) { outputFile = args[outputIndex + 1]; } const appendMode = args.includes('--append'); let compareFile = null; const compareIndex = args.findIndex(arg => arg === '--compare'); if (compareIndex !== -1 && args[compareIndex + 1]) { compareFile = args[compareIndex + 1]; } const forceVerbose = args.includes('--verbose'); const forceDebug = args.includes('--debug'); const silentMode = args.includes('--silent'); const showTitles = args.includes('--titles'); const dumpUrls = args.includes('--dumpurls'); const subDomainsMode = args.includes('--sub-domains'); const localhostMode = args.includes('--localhost'); const localhostModeAlt = args.includes('--localhost-0.0.0.0'); const disableInteract = args.includes('--no-interact'); const plainOutput = args.includes('--plain'); const 
enableCDP = args.includes('--cdp'); const dnsmasqMode = args.includes('--dnsmasq'); const dnsmasqOldMode = args.includes('--dnsmasq-old'); const unboundMode = args.includes('--unbound'); const removeDupes = args.includes('--remove-dupes') || args.includes('--remove-dubes'); const privoxyMode = args.includes('--privoxy'); const piholeMode = args.includes('--pihole'); const globalEvalOnDoc = args.includes('--eval-on-doc'); // For Fetch/XHR interception const dryRunMode = args.includes('--dry-run'); const compressLogs = args.includes('--compress-logs'); const removeTempFiles = args.includes('--remove-tempfiles'); const validateConfig = args.includes('--validate-config'); const validateRules = args.includes('--validate-rules'); const testValidation = args.includes('--test-validation'); let cleanRules = args.includes('--clean-rules'); let validateRulesFile = null; const validateRulesIndex = args.findIndex(arg => arg === '--validate-rules'); if (validateRulesIndex !== -1 && args[validateRulesIndex + 1] && !args[validateRulesIndex + 1].startsWith('--')) { validateRulesFile = args[validateRulesIndex + 1]; validateRules = true; // Override the boolean if file specified } let cleanRulesFile = null; const cleanRulesIndex = args.findIndex(arg => arg === '--clean-rules'); if (cleanRulesIndex !== -1 && args[cleanRulesIndex + 1] && !args[cleanRulesIndex + 1].startsWith('--')) { cleanRulesFile = args[cleanRulesIndex + 1]; cleanRules = true; // Override the boolean if file specified } let maxConcurrentSites = null; const maxConcurrentIndex = args.findIndex(arg => arg === '--max-concurrent'); if (maxConcurrentIndex !== -1 && args[maxConcurrentIndex + 1]) { maxConcurrentSites = parseInt(args[maxConcurrentIndex + 1]); } let cleanupInterval = null; const cleanupIntervalIndex = args.findIndex(arg => arg === '--cleanup-interval'); if (cleanupIntervalIndex !== -1 && args[cleanupIntervalIndex + 1]) { cleanupInterval = parseInt(args[cleanupIntervalIndex + 1]); } const enableColors = 
args.includes('--color') || args.includes('--colour'); let adblockRulesMode = args.includes('--adblock-rules'); // Validate --adblock-rules usage - ignore if used incorrectly instead of erroring if (adblockRulesMode) { if (!outputFile) { if (forceDebug) console.log(formatLogMessage('debug', `--adblock-rules ignored: requires --output (-o) to specify an output file`)); adblockRulesMode = false; } else if (localhostMode || localhostModeAlt || plainOutput || dnsmasqMode || dnsmasqOldMode || unboundMode || privoxyMode || piholeMode) { if (forceDebug) console.log(formatLogMessage('debug', `--adblock-rules ignored: incompatible with localhost/plain output modes`)); adblockRulesMode = false; } } // Validate --dnsmasq usage if (dnsmasqMode) { if (localhostMode || localhostModeAlt || plainOutput || adblockRulesMode || dnsmasqOldMode || unboundMode || privoxyMode || piholeMode) { if (forceDebug) console.log(formatLogMessage('debug', `--dnsmasq-old ignored: incompatible with localhost/plain/adblock-rules/dnsmasq output modes`)); dnsmasqMode = false; } } // Validate --dnsmasq-old usage if (dnsmasqOldMode) { if (localhostMode || localhostModeAlt || plainOutput || adblockRulesMode || dnsmasqMode || unboundMode || privoxyMode || piholeMode) { if (forceDebug) console.log(formatLogMessage('debug', `--dnsmasq-old ignored: incompatible with localhost/plain/adblock-rules/dnsmasq output modes`)); dnsmasqOldMode = false; } } // Validate --unbound usage if (unboundMode) { if (localhostMode || localhostModeAlt || plainOutput || adblockRulesMode || dnsmasqMode || dnsmasqOldMode || privoxyMode || piholeMode) { if (forceDebug) console.log(formatLogMessage('debug', `--unbound ignored: incompatible with localhost/plain/adblock-rules/dnsmasq output modes`)); unboundMode = false; } } // Validate --privoxy usage if (privoxyMode) { if (localhostMode || localhostModeAlt || plainOutput || adblockRulesMode || dnsmasqMode || dnsmasqOldMode || unboundMode || piholeMode) { if (forceDebug) 
console.log(formatLogMessage('debug', `--privoxy ignored: incompatible with localhost/plain/adblock-rules/dnsmasq/unbound output modes`)); privoxyMode = false; } } // Validate --pihole usage if (piholeMode) { if (localhostMode || localhostModeAlt || plainOutput || adblockRulesMode || dnsmasqMode || dnsmasqOldMode || unboundMode || privoxyMode) { if (forceDebug) console.log(formatLogMessage('debug', `--pihole ignored: incompatible with localhost/plain/adblock-rules/dnsmasq/unbound/privoxy output modes`)); piholeMode = false; } } // Validate --compress-logs usage if (compressLogs && !dumpUrls) { console.error(`❌ --compress-logs can only be used with --dumpurls`); process.exit(1); } // Validate --append usage if (appendMode && !outputFile) { console.error(`❌ --append requires --output (-o) to specify an output file`); process.exit(1); } if (appendMode && (compareFile || dryRunMode)) { console.error(`❌ --append cannot be used with --compare or --dry-run`); process.exit(1); } // Validate --dry-run usage if (dryRunMode) { if (compressLogs || compareFile) { console.error(`❌ --dry-run cannot be used with --compress-logs or --compare`); process.exit(1); } } // Validate --compare usage if (compareFile && !outputFile) { console.error(`❌ --compare requires --output (-o) to specify an output file`); process.exit(1); } if (compareFile && !fs.existsSync(compareFile)) { console.error(`❌ Compare file not found: ${compareFile}`); process.exit(1); } if (args.includes('--version')) { console.log(`nwss.js version ${VERSION}`); process.exit(0); } // Handle validation-only operations before main help if (testValidation) { console.log(`\n${messageColors.processing('Running domain validation tests...')}`); const testResult = testDomainValidation(); if (testResult) { console.log(`${messageColors.success('✅ All validation tests passed!')}`); process.exit(0); } else { console.log(`${messageColors.error('❌ Some validation tests failed!')}`); process.exit(1); } } if (validateConfig) { 
console.log(`\n${messageColors.processing('Validating configuration file...')}`); try { const validation = validateFullConfig(config, { forceDebug, silentMode }); // Validate referrer_headers format for (const site of sites) { if (site.referrer_headers && typeof site.referrer_headers === 'object' && !Array.isArray(site.referrer_headers)) { const validModes = ['random_search', 'social_media', 'direct_navigation', 'custom']; if (site.referrer_headers.mode && !validModes.includes(site.referrer_headers.mode)) { console.warn(`⚠ Invalid referrer_headers mode: ${site.referrer_headers.mode}. Valid modes: ${validModes.join(', ')}`); } } } if (validation.isValid) { console.log(`${messageColors.success('✅ Configuration is valid!')}`); console.log(`${messageColors.info('Summary:')} ${validation.summary.validSites}/${validation.summary.totalSites} sites valid`); if (validation.summary.sitesWithWarnings > 0) { console.log(`${messageColors.warn('⚠ Warnings:')} ${validation.summary.sitesWithWarnings} sites have warnings`); } process.exit(0); } else { console.log(`${messageColors.error('❌ Configuration validation failed!')}`); console.log(`${messageColors.error('Errors:')} ${validation.globalErrors.length} global, ${validation.summary.sitesWithErrors} site-specific`); process.exit(1); } } catch (validationErr) { console.error(`❌ Validation failed: ${validationErr.message}`); process.exit(1); } } if (validateRules || validateRulesFile) { const filesToValidate = validateRulesFile ? 
[validateRulesFile] : [outputFile, compareFile].filter(Boolean); if (filesToValidate.length === 0) { console.error('❌ --validate-rules requires either a file argument or --output/--compare files to be specified'); process.exit(1); } console.log(`\n${messageColors.processing('Validating rule files...')}`); let overallValid = true; for (const file of filesToValidate) { console.log(`\n${messageColors.info('Validating:')} ${file}`); try { const validation = validateRulesetFile(file, { forceDebug, silentMode, maxErrors: 20 }); if (validation.isValid) { console.log(`${messageColors.success('✅ Valid:')} ${validation.stats.valid} rules, ${validation.stats.comments} comments`); if (validation.duplicates.length > 0) { console.log(`${messageColors.warn('⚠ Duplicates:')} ${validation.duplicates.length} duplicate rules found`); } if (Object.keys(validation.stats.formats).length > 0) { console.log(`${messageColors.info('Formats:')} ${Object.entries(validation.stats.formats).map(([f, c]) => `${f}(${c})`).join(', ')}`); } } else { console.log(`${messageColors.error('❌ Invalid:')} ${validation.stats.invalid} invalid rules out of ${validation.stats.total} total`); overallValid = false; } } catch (validationErr) { console.error(`❌ Failed to validate ${file}: ${validationErr.message}`); overallValid = false; } } if (overallValid) { console.log(`\n${messageColors.success('✅ All rule files are valid!')}`); process.exit(0); } else { console.log(`\n${messageColors.error('❌ Some rule files have validation errors!')}`); process.exit(1); } } if (args.includes('--help') || args.includes('-h')) { console.log(`Usage: node nwss.js [options] Options: --color, --colour Enable colored console output for status messages -o, --output <file> Output file for rules. 
If omitted, prints to console --compare <file> Remove rules that already exist in this file before output --append Append new rules to output file instead of overwriting (requires -o) Output Format Options: --localhost Output as 127.0.0.1 domain.com --localhost-0.0.0.0 Output as 0.0.0.0 domain.com --plain Output just domains (no adblock formatting) --dnsmasq Output as local=/domain.com/ (dnsmasq format) --dnsmasq-old Output as server=/domain.com/ (dnsmasq old format) --unbound Output as local-zone: "domain.com." always_null (unbound format) --privoxy Output as { +block } .domain.com (Privoxy format) --pihole Output as (^|\\.)domain\\.com$ (Pi-hole regex format) --adblock-rules Generate adblock filter rules with resource type modifiers (requires -o) General Options: --verbose Force verbose mode globally --debug Force debug mode globally --silent Suppress normal console logs --titles Add ! <url> title before each site's group --dumpurls Dump matched URLs into matched_urls.log --dry-run Console output only: show matching regex, titles, whois/dig/searchstring results, and adblock rules --compress-logs Compress log files with gzip (requires --dumpurls) --sub-domains Output full subdomains instead of collapsing to root --no-interact Disable page interactions globally --custom-json <file> Use a custom config JSON file instead of config.json --headful Launch browser with GUI (not headless) --cdp Enable Chrome DevTools Protocol logging (now per-page if enabled) --remove-dupes Remove duplicate domains from output (only with -o) --eval-on-doc Globally enable evaluateOnNewDocument() for Fetch/XHR interception --help, -h Show this help menu --version Show script version --max-concurrent <number> Maximum concurrent site processing (1-50, overrides config/default) --cleanup-interval <number> Browser restart interval in URLs processed (1-1000, overrides config/default) --remove-tempfiles Remove Chrome/Puppeteer temporary files before exit Validation Options: --validate-config 
Validate config.json file and exit --validate-rules [file] Validate rule file format (uses --output/--compare files if no file specified) --clean-rules [file] Clean rule files by removing invalid lines and optionally duplicates (uses --output/--compare files if no file specified) --test-validation Run domain validation tests and exit Global config.json options: ignoreDomains: ["domain.com", "*.ads.com"] Domains to completely ignore (supports wildcards) blocked: ["regex1", "regex2"] Global regex patterns to block requests (combined with per-site blocked) whois_server_mode: "random" or "cycle" Default server selection mode for all sites (default: random) ignore_similar: true/false Ignore domains similar to already found domains (default: true) ignore_similar_threshold: 80 Similarity threshold percentage for ignore_similar (default: 80) ignore_similar_ignored_domains: true/false Ignore domains similar to ignoreDomains list (default: true) max_concurrent_sites: 6 Maximum concurrent site processing (1-50, default: 6) resource_cleanup_interval: 180 Browser restart interval in URLs processed (1-1000, default: 180) Per-site config.json options: url: "site" or ["site1", "site2"] Single URL or list of URLs filterRegex: "regex" or ["regex1", "regex2"] Patterns to match requests Redirect Handling Options: follow_redirects: true/false Follow redirects to new domains (default: true) max_redirects: 10 Maximum number of redirects to follow (default: 10) js_redirect_timeout: 5000 Milliseconds to wait for JavaScript redirects (default: 5000) detect_js_patterns: true/false Analyze page source for redirect patterns (default: true) redirect_timeout_multiplier: 1.5 Increase timeout for redirected URLs (default: 1.5) comments: "text" or ["text1", "text2"] Documentation/notes - ignored by script searchstring: "text" or ["text1", "text2"] Text to search in response content (requires filterRegex match) ignore_similar: true/false Override global ignore_similar setting for this site 
ignore_similar_threshold: 80 Override global similarity threshold for this site ignore_similar_ignored_domains: true/false Override global ignore_similar_ignored_domains for this site searchstring_and: "text" or ["text1", "text2"] Text to search with AND logic - ALL terms must be present (requires filterRegex match) curl: true/false Use curl to download content for analysis (default: false) Note: curl respects filterRegex but ignores resourceTypes filtering grep: true/false Use grep instead of JavaScript for pattern matching (default: false) Note: requires curl=true, uses system grep command for faster searches blocked: ["regex"] Regex patterns to block requests css_blocked: ["#selector", ".class"] CSS selectors to hide elements resourceTypes: ["script", "stylesheet"] Only process requests of these resource types (default: all types) interact: true/false Simulate mouse movements/clicks isBrave: true/false Spoof Brave browser detection userAgent: "chrome"|"firefox"|"safari" Custom desktop User-Agent interact_intensity: "low"|"medium"|"high" Interaction simulation intensity (default: medium) delay: <milliseconds> Delay after load (default: 4000) reload: <number> Reload page n times after load (default: 1) forcereload: true/false Force an additional reload after reloads clear_sitedata: true/false Clear all cookies, cache, storage before each load (default: false) subDomains: 1/0 Output full subdomains (default: 0) localhost: true/false Force localhost output (127.0.0.1) localhost_0_0_0_0: true/false Force localhost output (0.0.0.0) dnsmasq: true/false Force dnsmasq output (local=/domain.com/) dnsmasq_old: true/false Force dnsmasq old output (server=/domain.com/) unbound: true/false Force unbound output (local-zone: "domain.com." 
always_null) privoxy: true/false Force Privoxy output ({ +block } .domain.com) pihole: true/false Force Pi-hole regex output ((^|\\.)domain\\.com$) source: true/false Save page source HTML after load firstParty: true/false Allow first-party matches (default: false) thirdParty: true/false Allow third-party matches (default: true) screenshot: true/false Capture screenshot on load failure headful: true/false Launch browser with GUI for this site fingerprint_protection: true/false/"random" Enable fingerprint spoofing: true/false/"random" adblock_rules: true/false Generate adblock filter rules with resource types for this site even_blocked: true/false Add matching rules even if requests are blocked (default: false) referrer_headers: "url" or ["url1", "url2"] Set referrer header for realistic traffic sources custom_headers: {"Header": "value"} Add custom HTTP headers to requests Cloudflare Protection Options: cloudflare_phish: true/false Auto-click through Cloudflare phishing warnings (default: false) cloudflare_bypass: true/false Auto-solve Cloudflare "Verify you are human" challenges (default: false) FlowProxy Protection Options: flowproxy_detection: true/false Enable flowProxy protection detection and handling (default: false) flowproxy_page_timeout: <milliseconds> Page timeout for flowProxy sites (default: 45000) flowproxy_nav_timeout: <milliseconds> Navigation timeout for flowProxy sites (default: 45000) flowproxy_js_timeout: <milliseconds> JavaScript challenge timeout (default: 15000) flowproxy_delay: <milliseconds> Delay for rate limiting (default: 30000) flowproxy_additional_delay: <milliseconds> Additional processing delay (default: 5000) Advanced Options: evaluateOnNewDocument: true/false Inject fetch/XHR interceptor in page (for this site) cdp: true/false Enable CDP logging for this site Inject fetch/XHR interceptor in page interact_duration: <milliseconds> Duration of interaction simulation (default: 2000) interact_scrolling: true/false Enable scrolling 
simulation (default: true) interact_clicks: true/false Enable element clicking simulation (default: false) interact_typing: true/false Enable typing simulation (default: false) whois: ["term1", "term2"] Check whois data for ALL specified terms (AND logic) whois-or: ["term1", "term2"] Check whois data for ANY specified term (OR logic) whois_server_mode: "random" or "cycle" Server selection mode: random (default) or cycle through list whois_server: "whois.domain.com" or ["server1", "server2"] Custom whois server(s) - single server or randomized list (default: system default) whois_max_retries: 2 Maximum retry attempts per domain (default: 2) whois_timeout_multiplier: 1.5 Timeout increase multiplier per retry (default: 1.5) whois_use_fallback: true Add TLD-specific fallback servers (default: true) whois_retry_on_timeout: true Retry on timeout errors (default: true) whois_retry_on_error: false Retry on connection/other errors (default: false) whois_delay: <milliseconds> Delay between whois requests for this site (default: global whois_delay) dig: ["term1", "term2"] Check dig output for ALL specified terms (AND logic) dig-or: ["term1", "term2"] Check dig output for ANY specified term (OR logic) goto_options: {"waitUntil": "domcontentloaded"} Custom page.goto() options (default: {"waitUntil": "load"}) dig_subdomain: true/false Use subdomain for dig lookup instead of root domain (default: false) digRecordType: "A" DNS record type for dig (default: A) Referrer Header Options: referrer_headers: "https://google.com" Single referrer URL referrer_headers: ["url1", "url2"] Random selection from array referrer_headers: {"mode": "random_search", "search_terms": ["term1"]} Smart search engine traffic referrer_headers: {"mode": "social_media"} Random social media referrers referrer_headers: {"mode": "direct_navigation"} No referrer (direct access) custom_headers: {"Header": "Value"} Additional HTTP headers `); process.exit(0); } // --- Configuration File Loading --- const 
configPathIndex = args.findIndex(arg => arg === '--custom-json'); const configPath = (configPathIndex !== -1 && args[configPathIndex + 1]) ? args[configPathIndex + 1] : 'config.json'; let config; try { if (!fs.existsSync(configPath)) { console.error(`❌ Config file not found: ${configPath}`); process.exit(1); } if (forceDebug && configPath !== 'config.json') { console.log(formatLogMessage('debug', `Using custom config file: ${configPath}`)); } const raw = fs.readFileSync(configPath, 'utf8'); config = JSON.parse(raw); } catch (e) { console.error(`❌ Failed to load config file (${configPath}):`, e.message); process.exit(1); } // Extract config values while ignoring 'comments' field at global and site levels const { sites = [], ignoreDomains = [], blocked: globalBlocked = [], whois_delay = 3000, whois_server_mode = 'random', ignore_similar = true, ignore_similar_threshold = 80, ignore_similar_ignored_domains = true, max_concurrent_sites = 6, resource_cleanup_interval = 180, comments: globalComments, ...otherGlobalConfig } = config; // Apply global configuration overrides with validation // Priority: Command line args > config.json > defaults const MAX_CONCURRENT_SITES = (() => { // Check command line argument first if (maxConcurrentSites !== null) { if (maxConcurrentSites > 0 && maxConcurrentSites <= 50) { if (forceDebug) console.log(formatLogMessage('debug', `Using command line max_concurrent_sites: ${maxConcurrentSites}`)); return maxConcurrentSites; } else { console.warn(`⚠ Invalid --max-concurrent value: ${maxConcurrentSites}. Must be 1-50. 
Using config/default value.`); } } // Check config.json value if (typeof max_concurrent_sites === 'number' && max_concurrent_sites > 0 && max_concurrent_sites <= 50) { if (forceDebug) console.log(formatLogMessage('debug', `Using config max_concurrent_sites: ${max_concurrent_sites}`)); return max_concurrent_sites; } else if (max_concurrent_sites !== 6) { console.warn(`⚠ Invalid config max_concurrent_sites value: ${max_concurrent_sites}. Using default: 6`); } // Use default return 6; })(); const RESOURCE_CLEANUP_INTERVAL = (() => { // Check command line argument first if (cleanupInterval !== null) { if (cleanupInterval > 0 && cleanupInterval <= 1000) { if (forceDebug) console.log(formatLogMessage('debug', `Using command line resource_cleanup_interval: ${cleanupInterval}`)); return cleanupInterval; } else { console.warn(`⚠ Invalid --cleanup-interval value: ${cleanupInterval}. Must be 1-1000. Using config/default value.`); } } // Check config.json value if (typeof resource_cleanup_interval === 'number' && resource_cleanup_interval > 0 && resource_cleanup_interval <= 1000) { if (forceDebug) console.log(formatLogMessage('debug', `Using config resource_cleanup_interval: ${resource_cleanup_interval}`)); return resource_cleanup_interval; } else if (resource_cleanup_interval !== 180) { console.warn(`⚠ Invalid config resource_cleanup_interval value: ${resource_cleanup_interval}. Using default: 180`); } // Use default return 180; })(); // Initialize smart cache system AFTER config is loaded smartCache = createSmartCache({ ...config, forceDebug, cache_persistence: config.cache_persistence !== false, // Enable by default cache_autosave: config.cache_autosave !== false, cache_autosave_minutes: config.cache_autosave_minutes || 1, cache_max_size: config.cache_max_size || 5000 }); // Handle --clean-rules after config is loaded (so we have access to sites) if (cleanRules || cleanRulesFile) { const filesToClean = cleanRulesFile ? 
[cleanRulesFile] : [outputFile, compareFile].filter(Boolean); if (filesToClean.length === 0) { console.error('❌ --clean-rules requires either a file argument or --output/--compare files to be specified'); process.exit(1); } console.log(`\n${messageColors.processing('Cleaning rule files...')}`); let overallSuccess = true; let totalCleaned = 0; // Check if we're cleaning the same file we want to use for output const cleaningOutputFile = outputFile && filesToClean.includes(outputFile); if (cleaningOutputFile && forceDebug) { console.log(formatLogMessage('debug', `Output file detected: will clean ${outputFile} first, then continue with scan`)); } for (const file of filesToClean) { console.log(`\n${messageColors.info('Cleaning:')} ${file}`); // Check if file exists before trying to clean it if (!fs.existsSync(file)) { if (file === outputFile) { // If it's the output file that doesn't exist, that's OK - we'll create it during scan const modeText = appendMode ? 'created (append mode)' : 'created'; console.log(`${messageColors.info('📄 Note:')} Output file ${file} doesn't exist yet - will be ${modeText} during scan`); continue; } else { // For other files (like compare files), this is an error console.log(`${messageColors.error('❌ Failed:')} File not found: ${file}`); overallSuccess = false; continue; } } try { const cleanResult = cleanRulesetFile(file, null, { forceDebug, silentMode, removeDuplicates: removeDupes, backupOriginal: true, dryRun: dryRunMode }); if (cleanResult.success) { if (dryRunMode) { if (cleanResult.wouldModify) { console.log(`${messageColors.info('🔍 Dry run:')} Would remove ${cleanResult.stats.removed} lines (${cleanResult.stats.invalid} invalid, ${cleanResult.stats.duplicates} duplicates)`); } else { console.log(`${messageColors.success('✅ Dry run:')} File is already clean - no changes needed`); } } else { if (cleanResult.modified) { console.log(`${messageColors.success('✅ Cleaned:')} Removed ${cleanResult.stats.removed} lines, preserved 
${cleanResult.stats.valid} valid rules`); if (cleanResult.backupCreated) { console.log(`${messageColors.info('💾 Backup:')} Original file backed up`); } totalCleaned += cleanResult.stats.removed; if (cleaningOutputFile && file === outputFile) { console.log(`${messageColors.info('📄 Note:')} File cleaned - new rules will be ${appendMode ? 'appended' : 'written'} during scan`); } } else { console.log(`${messageColors.success('✅ Clean:')} File was already valid - no changes needed`); } } } else { console.log(`${messageColors.error('❌ Failed:')} ${cleanResult.error}`); overallSuccess = false; } } catch (cleanErr) { console.error(`❌ Failed to clean ${file}: ${cleanErr.message}`); overallSuccess = false; } } // Determine if we should continue with scanning const shouldContinueScanning = sites && sites.length > 0 && outputFile; const cleanedOutputFileForScanning = outputFile && filesToClean.includes(outputFile); if (overallSuccess) { if (dryRunMode) { console.log(`\n${messageColors.info('🔍 Dry run completed successfully!')}`); process.exit(0); } else { console.log(`\n${messageColors.success('✅ All rule files cleaned successfully!')} Total lines removed: ${totalCleaned}`); // Continue with scan if we have sites to process and we cleaned the output file if (shouldContinueScanning && cleanedOutputFileForScanning) { const actionText = appendMode ? 
'append new rules to' : 'write rules to'; console.log(`${messageColors.info('📄 Continuing:')} Proceeding with scan to ${actionText} ${outputFile}`); // Don't exit - continue with scanning } else { process.exit(0); } } } else { console.log(`\n${messageColors.error('❌ Some rule files failed to clean!')}`); process.exit(1); } } // Add global cycling index tracker for whois server selection let globalWhoisServerIndex = 0; // Track dry run output for file writing let dryRunOutput = []; // --- Log File Setup --- let debugLogFile = null; let matchedUrlsLogFile = null; let adblockRulesLogFile = null; if (forceDebug || dumpUrls) { // Create logs folder if it doesn't exist const logsFolder = 'logs'; if (!fs.existsSync(logsFolder)) { fs.mkdirSync(logsFolder, { recursive: true }); console.log(formatLogMessage('debug', `Created logs folder: ${logsFolder}`)); } // Generate timestamped log filenames const timestamp = new Date().toISOString().replace(/[:.]/g, '-').replace('T', '_').slice(0, -5); if (forceDebug) { debugLogFile = path.join(logsFolder, `debug_requests_${timestamp}.log`); console.log(formatLogMessage('debug', `Debug requests will be logged to: ${debugLogFile}`)); } if (dumpUrls) { matchedUrlsLogFile = path.join(logsFolder, `matched_urls_${timestamp}.log`); console.log(messageColors.processing('Matched URLs will be logged to:') + ` ${matchedUrlsLogFile}`); // Also create adblock rules log file with same timestamp adblockRulesLogFile = path.join(logsFolder, `adblock_rules_${timestamp}.txt`); console.log(messageColors.processing('Adblock rules will be saved to:') + ` ${adblockRulesLogFile}`); } } // Log comments if debug mode is enabled and comments exist if (forceDebug && globalComments) { const commentList = Array.isArray(globalComments) ? 
globalComments : [globalComments]; console.log(formatLogMessage('debug', `Global comments found: ${commentList.length} item(s)`)); commentList.forEach((comment, idx) => console.log(formatLogMessage('debug', ` Comment ${idx + 1}: ${comment}`))); } // --- Global CDP Override Logic --- [COMMENT RE-ADDED PREVIOUSLY, relevant to old logic] // If globalCDP is not already enabled by the --cdp flag, // check if any site in config.json has `cdp: true`. If so, enable globalCDP. // This allows site-specific config to trigger CDP logging for the entire session. // Note: Analysis suggests CDP should ideally be managed per-page for comprehensive logging. // (The code block that utilized this logic for a global CDP variable has been removed // as CDP is now handled per-page based on 'enableCDP' and 'siteConfig.cdp') /** * Extracts the root domain from a given URL string using the psl library. * For example, for 'http://sub.example.com/path', it returns 'example.com'. * * @param {string} url - The URL string to parse. * @returns {string} The root domain, or the original hostname if parsing fails (e.g., for IP addresses or invalid URLs), or an empty string on error. 
/* */ // Closes the JSDoc header for getRootDomain that opens on the preceding line.
function getRootDomain(url) {
  try {
    const { hostname } = new URL(url);
    const parsed = psl.parse(hostname);
    // When psl reports no registrable domain, fall back to the raw hostname.
    return parsed.domain || hostname;
  } catch {
    // Unparseable URL (or a psl failure): callers treat '' as "no domain".
    return '';
  }
}

/**
 * Safely extracts a hostname or root domain from a URL, handling malformed URLs gracefully.
 *
 * @param {string} url - The URL string to parse.
 * @param {boolean} [getFullHostname=false] - If true, returns the full hostname; if false,
 *   returns the root domain as computed by getRootDomain().
 * @returns {string} The hostname/domain, or an empty string if the URL is invalid.
 */
function safeGetDomain(url, getFullHostname = false) {
  try {
    const { hostname } = new URL(url);
    return getFullHostname ? hostname : getRootDomain(url);
  } catch (urlError) {
    // Log malformed URLs for debugging
    if (forceDebug) {
      console.log(formatLogMessage('debug', `Malformed URL skipped: ${url} (${urlError.message})`));
    }
    return '';
  }
}

/**
 * Prints the dry run results for one URL to the console and, when the module-level
 * `outputFile` is set, appends an uncolored copy of the same report to `dryRunOutput`.
 *
 * @param {string} url - The URL being processed.
 * @param {Array} matchedItems - Matched items; each is read for `regex`, `domain`,
 *   `resourceType`, `fullUrl` and optionally `searchStringMatch` / `searchStringChecked`.
 * @param {Array} netToolsResults - Whois/dig results; each is read for `domain`, `tool`,
 *   `matchType`, `matchedTerm` and optionally `details`.
 * @param {string} pageTitle - Title of the page (if available).
 * @returns {void}
 */
function outputDryRunResults(url, matchedItems, netToolsResults, pageTitle) {
  // Treat a missing list as "no matches" rather than crashing on `.length`.
  const regexMatches = Array.isArray(matchedItems) ? matchedItems : [];
  const netToolsMatches = Array.isArray(netToolsResults) ? netToolsResults : [];
  const lines = [];

  // Records the plain line for the file copy and prints the (optionally colorized) console form.
  const emit = (plain, colored = plain) => {
    lines.push(plain);
    console.log(colored);
  };

  // Appends this report plus a blank separator line to the shared dry run buffer.
  const storeForFile = () => {
    if (outputFile) {
      dryRunOutput.push(...lines, '');
    }
  };

  emit(
    `\n=== DRY RUN RESULTS === ${url}`,
    `\n${messageColors.scanning('=== DRY RUN RESULTS ===')} ${url}`
  );

  const title = typeof pageTitle === 'string' ? pageTitle.trim() : '';
  if (title) {
    emit(`Title: ${title}`, `${messageColors.info('Title:')} ${title}`);
  }

  if (regexMatches.length === 0 && netToolsMatches.length === 0) {
    const notice = `No matching rules found on ${url}`;
    emit(notice, messageColors.warn(notice));
    storeForFile();
    return;
  }

  const totalMatches = regexMatches.length + netToolsMatches.length;
  emit(`Matches found: ${totalMatches}`, `${messageColors.success('Matches found:')} ${totalMatches}`);

  regexMatches.forEach((item, index) => {
    const label = `[${index + 1}]`;
    // The file copy gets an explicit blank line; the console gets it via the leading "\n".
    lines.push('');
    emit(
      `${label} Regex Match:`,
      `\n${messageColors.highlight(label)} ${messageColors.match('Regex Match:')}`
    );
    emit(` Pattern: ${item.regex}`);
    emit(` Domain: ${item.domain}`);
    emit(` Resource Type: ${item.resourceType}`);
    emit(` Full URL: ${item.fullUrl}`);

    // Show searchstring results if available
    if (item.searchStringMatch) {
      const detail = `${item.searchStringMatch.type} - "${item.searchStringMatch.term}"`;
      emit(
        ` ✓ Searchstring Match: ${detail}`,
        ` ${messageColors.success('✓ Searchstring Match:')} ${detail}`
      );
    } else if (item.searchStringChecked) {
      emit(
        ` ✗ Searchstring: No matches found in content`,
        ` ${messageColors.warn('✗ Searchstring:')} No matches found in content`
      );
    }

    // Generate adblock rule
    const adblockRule = `||${item.domain}^$${item.resourceType}`;
    emit(` Adblock Rule: ${adblockRule}`, ` ${messageColors.info('Adblock Rule:')} ${adblockRule}`);
  });

  // Display nettools results, numbered after the regex matches
  netToolsMatches.forEach((result, index) => {
    const label = `[${regexMatches.length + index + 1}]`;
    lines.push('');
    emit(
      `${label} NetTools Match:`,
      `\n${messageColors.highlight(label)} ${messageColors.match('NetTools Match:')}`
    );
    emit(` Domain: ${result.domain}`);
    emit(` Tool: ${result.tool.toUpperCase()}`);

    const matchDetail = `${result.matchType} - "${result.matchedTerm}"`;
    emit(` ✓ Match: ${matchDetail}`, ` ${messageColors.success('✓ Match:')} ${matchDetail}`);
    if (result.details) {
      emit(` Details: ${result.details}`);
    }

    // Generate adblock rule for nettools matches
    const adblockRule = `||${result.domain}^`;
    emit(` Adblock Rule: ${adblockRule}`, ` ${messageColors.info('Adblock Rule:')} ${adblockRule}`);
  });

  storeForFile();
}

/**
 * Tests whether a domain is covered by any ignoreDomains pattern. Wildcards are supported.
 *
 * Matching rules (case-insensitive):
 *  - A pattern containing `*` is matched against the whole domain, with each `*` standing for
 *    any run of characters; every other character is taken literally.
 *  - A pattern starting with `.` matches any domain ending with it.
 *  - Any other pattern matches the domain itself and its subdomains, on a label boundary:
 *    `example.com` matches `example.com` and `cdn.example.com` but not `notexample.com`.
 *  - Empty or non-string patterns never match.
 *
 * @param {string} domain - The domain to test.
 * @param {string[]} ignorePatterns - Patterns from the ignoreDomains configuration.
 * @returns {boolean} True when at least one pattern matches; false for a non-string domain
 *   or a non-array pattern list.
 */
function matchesIgnoreDomain(domain, ignorePatterns) {
  if (typeof domain !== 'string' || !Array.isArray(ignorePatterns)) {
    return false;
  }
  const candidate = domain.toLowerCase();

  return ignorePatterns.some((rawPattern) => {
    if (typeof rawPattern !== 'string') {
      return false;
    }
    const pattern = rawPattern.trim().toLowerCase();
    if (pattern === '') {
      // An empty suffix would otherwise match every domain.
      return false;
    }

    if (pattern.includes('*')) {
      // Escape every regex metacharacter in the literal segments, then join them with ".*".
      const regexSource = pattern
        .split('*')
        .map((segment) => segment.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
        .join('.*');
      return new RegExp(`^${regexSource}$`).test(candidate);
    }

    if (pattern.startsWith('.')) {
      return candidate.endsWith(pattern);
    }
    // Require a label boundary so "example.com" does not swallow "notexample.com".
    return candidate === pattern || candidate.endsWith(`.${pattern}`);
  });
}

/**
 * Registers frame lifecycle listeners on a page. The listeners only inspect and (in debug
 * mode) log frame URLs; frames are never navigated manually.
 *
 * @param {object} page - Page object exposing `on(eventName, handler)`; handlers receive a
 *   frame exposing `url()` and, for 'frameattached', `parentFrame()`.
 * @param {boolean} forceDebug - When true, frame events are logged via formatLogMessage.
 * @returns {void}
 */
function setupFrameHandling(page, forceDebug) {
  // URL prefixes treated as internal/special for each event's filter.
  const INTERNAL_PREFIXES = ['about:', 'chrome-error://', 'chrome-extension://'];
  const ATTACH_SKIP_PREFIXES = [...INTERNAL_PREFIXES, 'data:', 'blob:'];
  const NAVIGATE_SKIP_PREFIXES = [...INTERNAL_PREFIXES, 'data:'];

  const startsWithAny = (value, prefixes) => prefixes.some((prefix) => value.startsWith(prefix));
  const debug = (message) => {
    if (forceDebug) {
      console.log(formatLogMessage('debug', message));
    }
  };

  // Handle frame creation with error suppression.
  // Declared async so an unexpected throw surfaces as a promise rejection instead of
  // propagating synchronously into the emitter that dispatched the event.
  page.on('frameattached', async (frame) => {
    try {
      if (!frame.parentFrame()) {
        return; // Only handle child frames, not the main frame
      }

      const frameUrl = frame.url();
      debug(`New frame attached: ${frameUrl || 'about:blank'}`);

      // Ignore frames with empty or special-scheme URLs
      if (!frameUrl || startsWithAny(frameUrl, ATTACH_SKIP_PREFIXES)) {
        debug(`Skipping frame with invalid/special URL: ${frameUrl}`);
        return;
      }

      // Validate URL format; only http/https frames are of interest
      let protocol;
      try {
        ({ protocol } = new URL(frameUrl));
      } catch {
        debug(`Skipping frame with malformed URL: ${frameUrl}`);
        return;
      }
      if (protocol !== 'http:' && protocol !== 'https:') {
        debug(`Skipping frame with non-http protocol: ${frameUrl}`);
        return;
      }

      // Frames are deliberately left to load naturally: manually navigating them
      // (frame.goto) often caused Protocol errors, so that call was removed.
      debug(`Frame will load naturally: ${frameUrl}`);
    } catch (err) {
      // Suppress "Cannot navigate to invalid URL" / "Protocol error" noise but log others
      const message = err instanceof Error ? err.message : String(err);
      if (!message.includes('Cannot navigate to invalid URL') && !message.includes('Protocol error')) {
        debug(`Frame handling error: ${message}`);
      }
    }
  });

  // Handle frame navigations (kept for monitoring)
  page.on('framenavigated', (frame) => {
    const frameUrl = frame.url();
    if (frameUrl && !startsWithAny(frameUrl, NAVIGATE_SKIP_PREFIXES)) {
      debug(`Frame navigated to: ${frameUrl}`);
    }
  });

  // Optional: log frame detachment
  page.on('framedetached', (frame) => {
    if (!forceDebug) {
      return;
    }
    const frameUrl = frame.url();
    if (frameUrl && !startsWithAny(frameUrl, INTERNAL_PREFIXES)) {
      debug(`Frame detached: ${frameUrl}`);
    }
  });
}

// --- Main Asynchronous IIFE (Immediately Invoked Function Expression) ---
// This is the main entry point and execution block for the network scanner script.
(async () => { // Declare userDataDir in outer scope for cleanup access let userDataDir = null; /** * Creates a new browser instance with consistent configuration * Uses system Chrome and temporary directories to minimize disk usage * @returns {Promise<import('puppeteer').Browser>} Browser instance */ async function createBrowser() { // Create temporary user data directory that we can fully control and clean up const tempUserDataDir = `/tmp/puppeteer-${Date.now()}-${Math.random().toString(36).substring(7)}`; userDataDir = tempUserDataDir; // Store for cleanup tracking (use outer scope variable) // Try to find system Chrome installation to avoid Puppeteer downloads const systemChromePaths = [ '/usr/bin/google-chrome-stable', '/usr/bin/google-chrome', '/usr/bin/chromium-browser', '/usr/bin/chromium', '/snap/bin/chromium' ]; let executablePath = null; for (const chromePath of systemChromePaths) { if (fs.existsSync(chromePath)) { executablePath = chromePath; if (forceDebug) { console.log(formatLogMessage('debug', `Using system Chrome: ${chromePath}`)); } break; } } const browser = await puppeteer.launch({ // Use system Chrome if available to avoid downloads executablePath: executablePath, // Force temporary user data directory for complete cleanup control userDataDir: tempUserDataDir, args: [ // Disk space controls - 50MB cache limits '--disk-cache-size=52428800', // 50MB disk cache (50 * 1024 * 1024) '--media-cache-size=52428800', // 50MB media cache '--disable-application-cache', '--disable-offline-load-stale-cache', '--disable-background-downloads', '--no-first-run', '--disable-default-apps', '--disable-component-extensions-with-background-pages', '--disable-background-networking', '--no-sandbox', '--disable-setuid-sandbox', '--disable-features=SafeBrowsing', '--disable-dev-shm-usage', '--disable-sync', '--disable-gpu', '--mute-audio', '--disable-translate', '--window-size=1920,1080', '--disable-extensions', '--no-default-browser-check', 
'--safebrowsing-disable-auto-update', '--max_old_space_size=1024', '--ignore-ssl-errors', '--ignore-certificate-errors', '--ignore-certificate-errors-spki-list', '--ignore-certificate-errors-ca-list', '--disable-web-security', '--allow-running-insecure-content', '--disable-background-timer-throttling', '--disable-backgrounding-occluded-windows', '--disable-renderer-backgrounding', '--disable-features=TranslateUI', '--disable-features=VizDisplayCompositor', '--run-all-compositor-stages-before-draw', '--disable-threaded-animation', '--disable-threaded-scrolling', '--disable-checker-imaging', '--disable-image-animation-resync' ], headless: launchHeadless ? 'shell' : false, protocolTimeout: 60000 // 60 seconds }); // Store the user data directory on the browser object for cleanup browser._nwssUserDataDir = tempUserDataDir; return browser; } const pLimit = (await import('p-limit')).default; const limit = pLimit(MAX_CONCURRENT_SITES); const perSiteHeadful = sites.some(site => site.headful === true); const launchHeadless = !(headfulMode || perSiteHeadful); // launch with no safe browsing let browser = await createBrowser(); if (forceDebug) console.log(formatLogMessage('debug', `Launching browser with headless: ${launchHeadless}`)); // Log which headless mode is being used if (forceDebug && launchHeadless) { console.log(formatLogMessage('debug', `Using chrome-headless-shell for maximum performance`)); } // Initial cleanup of any existing Chrome temp files - always comprehensive on startup if (forceDebug) console.log(formatLogMessage('debug', 'Cleaning up any leftover temp files from previous runs...')); await cleanupChromeTempFiles({ includeSnapTemp: true, // Always clean snap dirs on startup forceDebug, comprehensive: true // Always comprehensive on startup to clean leftovers }); // Set up cleanup on process termination process.on('SIGINT', async () => { if (forceDebug) console.log(formatLogMessage('debug', 'SIGINT received, performing cle