UNPKG

@fanboynz/network-scanner

Version:

A Puppeteer-based network scanner for analyzing web traffic, generating adblock filter rules, and identifying third-party requests. Features include fingerprint spoofing, Cloudflare bypass, content analysis with curl/grep, and multiple output formats.

925 lines (835 loc) 283 kB
// === Network scanner script (nwss.js) v2.0.51 === // puppeteer for browser automation, fs for file system operations, psl for domain parsing. // const pLimit = require('p-limit'); // Will be dynamically imported const useObscura = process.argv.includes('--use-obscura'); const usePuppeteerCore = process.argv.includes('--use-puppeteer-core') || useObscura; const puppeteer = usePuppeteerCore ? require('puppeteer-core') : require('puppeteer'); const fs = require('fs'); const os = require('os'); const psl = require('psl'); const path = require('path'); const dnsPromises = require('node:dns/promises'); const { createGrepHandler, validateGrepAvailability } = require('./lib/grep'); const { compressMultipleFiles, formatFileSize } = require('./lib/compress'); const { parseSearchStrings, createResponseHandler, createCurlHandler } = require('./lib/searchstring'); const { applyAllFingerprintSpoofing } = require('./lib/fingerprint'); const { formatRules, handleOutput, getFormatDescription } = require('./lib/output'); // Curl functionality (replace searchstring curl handler) const { validateCurlAvailability, createCurlHandler: createCurlModuleHandler } = require('./lib/curl'); // Rule validation const { validateRulesetFile, validateFullConfig, testDomainValidation, cleanRulesetFile } = require('./lib/validate_rules'); // CF Bypass const { handleCloudflareProtection, getCacheStats, clearDetectionCache, parallelChallengeDetection, cleanup: cleanupCloudflareCache } = require('./lib/cloudflare'); // FP Bypass const { handleFlowProxyProtection, getFlowProxyTimeouts, attachFlowProxyHeaderListener } = require('./lib/flowproxy'); // ignore_similar rules const { shouldIgnoreSimilarDomain, calculateSimilarity } = require('./lib/ignore_similar'); // Graceful exit const { handleBrowserExit, cleanupChromeTempFiles, cleanupUserDataDir } = require('./lib/browserexit'); // Whois & Dig const { createNetToolsHandler, createEnhancedDryRunCallback, validateWhoisAvailability, 
validateDigAvailability, enableDiskCache, getDnsCacheStats, domainKnownToResolve } = require('./lib/nettools'); // File compare const { loadComparisonRules, filterUniqueRules } = require('./lib/compare'); // CDP functionality const { createCDPSession, createPageWithTimeout, setRequestInterceptionWithTimeout } = require('./lib/cdp'); // Post-processing cleanup const { processResults } = require('./lib/post-processing'); // Colorize various text when used const { messageColors, formatLogMessage } = require('./lib/colorize'); const TIMEOUT_TAG = messageColors.processing('[TIMEOUT]'); const INTERACTION_TAG = messageColors.processing('[interaction]'); const GHOST_CURSOR_TAG = messageColors.processing('[ghost-cursor]'); const PROXY_TAG = messageColors.processing('[proxy]'); const GREP_RESPONSE_TAG = messageColors.processing('[grep-response]'); const IGNORE_DOMAINS_BY_URL_TAG = messageColors.processing('[ignoreDomainsByUrl]'); const BLOCK_DOMAINS_BY_URL_TAG = messageColors.processing('[blockDomainsByUrl]'); const IGNORE_SIMILAR_IGNORED_DOMAINS_TAG = messageColors.processing('[ignore_similar_ignored_domains]'); const IGNORE_SIMILAR_TAG = messageColors.processing('[ignore_similar]'); const CLEAR_SITEDATA_TAG = messageColors.processing('[clear_sitedata]'); const CSS_BLOCKED_TAG = messageColors.processing('[css_blocked]'); const EVAL_ON_DOC_TAG = messageColors.processing('[evalOnDoc]'); const REALTIME_CLEANUP_TAG = messageColors.processing('[realtime_cleanup]'); const VPN_TAG = messageColors.processing('[vpn]'); // Precomputed colored '[SmartCache]' subsystem prefix — paired with the // same constant in lib/smart-cache.js so debug lines from both files // produce consistently colored output. formatLogMessage only colors the // [severity] tag; this constant colors the subsystem prefix. const SMART_CACHE_TAG = messageColors.processing('[SmartCache]'); // Precomputed colored '[CONCURRENCY]' subsystem prefix for batch-throughput // log lines (start/completed). 
Same cyan as the other monitoring tags. const CONCURRENCY_TAG = messageColors.processing('[CONCURRENCY]'); // Enhanced mouse interaction and page simulation const { performPageInteraction, createInteractionConfig, performContentClicks, humanLikeMouseMove } = require('./lib/interaction'); // Optional ghost-cursor support for advanced Bezier-based mouse movements const { isGhostCursorAvailable, createGhostCursor, ghostMove, ghostClick, ghostRandomMove, resolveGhostCursorConfig } = require('./lib/ghost-cursor'); // Domain detection cache for performance optimization const { createGlobalHelpers, getTotalDomainsSkipped, getDetectedDomainsCount } = require('./lib/domain-cache'); const { createSmartCache } = require('./lib/smart-cache'); // Smart cache system const { clearPersistentCache } = require('./lib/smart-cache'); const { needsProxy, getProxyArgs, applyProxyAuth, getProxyInfo, testProxy, prepareSocksRelays, closeAllSocksRelays } = require('./lib/proxy'); // Dry run functionality const { initializeDryRunCollections, addDryRunMatch, addDryRunNetTools, processDryRunResults, writeDryRunOutput } = require('./lib/dry-run'); // Enhanced site data clearing functionality const { clearSiteData } = require('./lib/clear_sitedata'); // Referrer header generation const { getReferrerForUrl, validateReferrerConfig, validateReferrerDisable } = require('./lib/referrer'); // Adblock rules parser const adblockJs = require('./lib/adblock'); const adblockRust = require('./lib/adblock-rust'); // WireGuard VPN const { connectForSite: wgConnect, disconnectForSite: wgDisconnect, disconnectAll: wgDisconnectAll, validateVpnConfig, normalizeVpnConfig } = require('./lib/wireguard_vpn'); // OpenVPN const { connectForSite: ovpnConnect, disconnectForSite: ovpnDisconnect, disconnectAll: ovpnDisconnectAll, validateOvpnConfig, normalizeOvpnConfig } = require('./lib/openvpn_vpn'); // Fast setTimeout helper for Puppeteer 22.x compatibility // Uses standard Promise constructor for better performance 
// than node:timers/promises
/**
 * Promise-based sleep built on the global setTimeout.
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function fastTimeout(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

// --- Configuration Constants ---
const TIMEOUTS = Object.freeze({
  DEFAULT_PAGE: 35000,               // Standard page load timeout (35s)
  DEFAULT_NAVIGATION: 25000,         // Navigation operation timeout
  DEFAULT_NAVIGATION_REDUCED: 20000, // Reduced timeout for faster failures
  DEFAULT_PAGE_REDUCED: 15000,       // Faster page timeout for quick failures
  FRAME_LOAD_WAIT: 2000,             // Wait time for iframes to load
  DEFAULT_DELAY: 6000,               // Default delay: after page load
  NETWORK_IDLE: 2000,                // Network idle detection time
  NETWORK_IDLE_MAX: 10000,           // Maximum network idle wait time
  FAST_SITE_THRESHOLD: 15000,        // Threshold for "fast site" optimizations
  EMERGENCY_RESTART_DELAY: 2000,     // Delay after emergency browser restart
  BROWSER_STABILIZE_DELAY: 1000,     // Browser stabilization after restart
  CURL_HANDLER_DELAY: 3000,          // Wait for async curl operations
  PROTOCOL_TIMEOUT: 180000,          // Chrome DevTools Protocol timeout
  REDIRECT_JS_TIMEOUT: 5000          // JavaScript redirect detection timeout
});

const CACHE_LIMITS = Object.freeze({
  DISK_CACHE_SIZE: 1,  // Effectively disabled — forcereload clears cache between loads
  MEDIA_CACHE_SIZE: 1, // Effectively disabled — no media caching needed for scanning
  DEFAULT_CACHE_PATH: '.cache',
  DEFAULT_MAX_SIZE: 5000
});

const CONCURRENCY_LIMITS = Object.freeze({
  MIN: 1,
  MAX: 50,
  DEFAULT: 6,
  HIGH_CONCURRENCY_THRESHOLD: 12 // Auto-enable aggressive caching above this
});

// V8 Optimization: Use Map for user agent lookups instead of object
// NOTE: Object.freeze() does not block Map#set / Map#delete, so this Map is
// read-only by convention only.
const USER_AGENTS = Object.freeze(new Map([
  ['chrome', "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36"],
  ['chrome_mac', "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36"],
  ['chrome_linux', "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36"],
  ['firefox', "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:148.0) Gecko/20100101 Firefox/148.0"],
  ['firefox_mac', "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:148.0) Gecko/20100101 Firefox/148.0"],
  ['firefox_linux', "Mozilla/5.0 (X11; Linux x86_64; rv:148.0) Gecko/20100101 Firefox/148.0"],
  ['safari', "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.6 Safari/605.1.15"]
]));

const REALTIME_CLEANUP_THRESHOLD = 8; // Default pages to keep for realtime cleanup

/**
 * Detects the installed Puppeteer version dynamically.
 *
 * Reads `version` (or `_version`) from the same package the script loads
 * (puppeteer-core when `usePuppeteerCore` is set, puppeteer otherwise).
 *
 * @returns {{version?: string, majorVersion: number, useShellMode: boolean, detected: boolean}}
 *   Version info and compatibility settings. When the version cannot be read
 *   or parsed, the fallback `{ majorVersion: 22, useShellMode: true, detected: false }`
 *   is returned (no `version` key).
 */
function detectPuppeteerVersion() {
  // Safe fallback - assume newer version. Built fresh per call so callers
  // can never mutate a shared object.
  const fallback = () => ({ majorVersion: 22, useShellMode: true, detected: false });

  try {
    // require() is served from the module cache here; a distinct local name
    // avoids shadowing the module-level `puppeteer` binding.
    const puppeteerModule = usePuppeteerCore ? require('puppeteer-core') : require('puppeteer');

    // Try multiple properties to get the version
    const versionString = puppeteerModule.version || puppeteerModule._version;
    if (!versionString) {
      return fallback();
    }

    // Explicit radix; String() guards against a non-string version value.
    const majorVersion = Number.parseInt(String(versionString).split('.')[0], 10);
    if (Number.isNaN(majorVersion)) {
      // Unparseable version: report "not detected" instead of a NaN major
      // version, which would silently disable shell mode.
      return fallback();
    }

    return {
      version: versionString,
      majorVersion,
      useShellMode: majorVersion >= 22,
      detected: true
    };
  } catch (err) {
    // forceDebug is a module-level const declared further down the file.
    if (forceDebug) {
      console.log(formatLogMessage('debug', `Could not detect Puppeteer version: ${err.message}`));
    }
    return fallback();
  }
}

// Enhanced redirect handling
const { navigateWithRedirectHandling, handleRedirectTimeout } = require('./lib/redirect');

// Ensure web browser is working correctly
// purgeStaleTrackers removed from import: browserhealth's pageCreationTracker
// and pageUsageTracker are now WeakMaps, so GC reclaims dead-page entries
// automatically — manual purging is no longer needed.
const {
  monitorBrowserHealth,
  isBrowserHealthy,
  isQuicklyResponsive,
  performGroupWindowCleanup,
  performRealtimeWindowCleanup,
  trackPageForRealtime,
  updatePageUsage,
  untrackPage,
  cleanupPageBeforeReload
} = require('./lib/browserhealth');

// --- Script Configuration & Constants ---
// NOTE(review): the banner comment at the top of this file says v2.0.51 —
// confirm which of the two version strings is current.
const VERSION = '2.0.33'; // Script version

// get startTime
const startTime = Date.now();

// Initialize domain cache helpers with debug logging if enabled
const domainCacheOptions = { enableLogging: false }; // Set to true for cache debug logs
const { isDomainAlreadyDetected, markDomainAsDetected } = createGlobalHelpers(domainCacheOptions);

// Smart cache will be initialized after config is loaded
let smartCache = null;

// --- Command-Line Argument Parsing ---
const args = process.argv.slice(2);

if (args.length === 0) {
  args.push('--help');
}

// --- .nwssconfig support: inject per-config settings into args ---
const NWSSCONFIG_PATH = path.join(__dirname, '.nwssconfig');
if (fs.existsSync(NWSSCONFIG_PATH)) {
  try {
    const nwssConfig = JSON.parse(fs.readFileSync(NWSSCONFIG_PATH, 'utf-8'));

    // Find which config file is being used (--custom-json <file> or positional .json arg)
    const customJsonIdx = args.findIndex(arg => arg === '--custom-json');
    const positionalJson = (customJsonIdx === -1)
      ? args.find(a => a.endsWith('.json') && !a.startsWith('--'))
      : null;
    const configFilename = (customJsonIdx !== -1 && args[customJsonIdx + 1])
      ? args[customJsonIdx + 1]
      : positionalJson;

    // If a positional .json was used (not --custom-json), wire it to --custom-json
    // so the real config loader picks it up instead of defaulting to config.json
    if (positionalJson && customJsonIdx === -1) {
      args.push('--custom-json', positionalJson);
      process.argv.push('--custom-json', positionalJson);
    }

    if (configFilename && nwssConfig.configs && nwssConfig.configs[configFilename]) {
      const settings = nwssConfig.configs[configFilename];

      // Exact flag names already claimed (CLI first, then anything injected
      // below). `--flag=value` tokens are reduced to `--flag`. Matching is
      // exact on purpose: a substring search over the joined args treats
      // `-o` as present inside `--keep-open` and `--dnsmasq` as present
      // inside `--dnsmasq-old`, which wrongly suppresses the setting.
      const claimedFlags = new Set(args.map(token => token.split('=')[0]));

      // Map settings keys to CLI flags — only inject if not already in args
      // NOTE(review): `use_puppeteer_core` is injected into `args` only; the
      // puppeteer/puppeteer-core choice at the top of the file reads
      // process.argv before this code runs, so the injection cannot affect it.
      const settingsMap = {
        output: ['-o', '--output'],
        max_concurrent: ['--max-concurrent'],
        dns_cache: ['--dns-cache'],
        cache_requests: ['--cache-requests'],
        dumpurls: ['--dumpurls'],
        remove_tempfiles: ['--remove-tempfiles'],
        color: ['--color'],
        remove_dupes: ['--remove-dupes', '--remove-dubes'],
        'remove-dupes': ['--remove-dupes', '--remove-dubes'],
        'remove-dubes': ['--remove-dupes', '--remove-dubes'],
        compress_logs: ['--compress-logs'],
        debug: ['--debug'],
        silent: ['--silent'],
        verbose: ['--verbose'],
        headful: ['--headful'],
        keep_open: ['--keep-open'],
        dry_run: ['--dry-run'],
        titles: ['--titles'],
        sub_domains: ['--sub-domains'],
        no_interact: ['--no-interact'],
        ghost_cursor: ['--ghost-cursor'],
        plain: ['--plain'],
        cdp: ['--cdp'],
        dnsmasq: ['--dnsmasq'],
        unbound: ['--unbound'],
        privoxy: ['--privoxy'],
        pihole: ['--pihole'],
        eval_on_doc: ['--eval-on-doc'],
        use_puppeteer_core: ['--use-puppeteer-core'],
        ignore_cache: ['--ignore-cache'],
        clear_cache: ['--clear-cache'],
        block_ads: ['--block-ads'],
        compare: ['--compare'],
        localhost: ['--localhost'],
        append: ['--append']
      };

      for (const [key, flags] of Object.entries(settingsMap)) {
        // Support both underscore and hyphen variants (e.g. dns_cache or dns-cache)
        const value = [key, key.replace(/_/g, '-'), key.replace(/-/g, '_')]
          .map(variant => settings[variant])
          .find(candidate => candidate !== undefined);
        if (value === undefined) continue;

        // Skip if any variant of the flag is already on the CLI (or was
        // already injected through an alias key).
        if (flags.some(flag => claimedFlags.has(flag))) continue;

        const flag = flags[flags.length - 1];
        if (typeof value === 'boolean') {
          if (!value) continue;
          args.push(flag);
        } else if (typeof value === 'string' || typeof value === 'number') {
          if (flag === '--localhost') {
            // The --localhost parser below only reads the `--localhost=IP`
            // form; a separate value token would be ignored.
            args.push(`${flag}=${value}`);
          } else {
            args.push(flag, String(value));
          }
        } else {
          continue;
        }
        flags.forEach(f => claimedFlags.add(f));
      }
    }
  } catch (e) {
    console.error(`Warning: Failed to parse .nwssconfig: ${e.message}`);
  }
}

const headfulMode = args.includes('--headful');
// Sites (esp. video/streaming) call element.requestFullscreen() on load or
// click. In --headful that hijacks the real Chrome window into true
// fullscreen, forcing a manual ESC. Neutralize the Fullscreen API by
// default so it can't. Harmless in headless (no screen — the API is
// already inert there), so default-on keeps headful consistent with the
// primary headless path. --allow-fullscreen restores native behavior.
const allowFullscreen = args.includes('--allow-fullscreen');
const SOURCES_FOLDER = 'sources';

let outputFile = null;
const outputIndex = args.findIndex(arg => arg === '--output' || arg === '-o');
if (outputIndex !== -1 && args[outputIndex + 1]) {
  outputFile = args[outputIndex + 1];
}

const appendMode = args.includes('--append');

let compareFile = null;
const compareIndex = args.findIndex(arg => arg === '--compare');
if (compareIndex !== -1 && args[compareIndex + 1]) {
  compareFile = args[compareIndex + 1];
}

const forceVerbose = args.includes('--verbose');
const forceDebug = args.includes('--debug');
const silentMode = args.includes('--silent');
const showTitles = args.includes('--titles');
const dumpUrls = args.includes('--dumpurls');
const subDomainsMode = args.includes('--sub-domains');

// Parse --localhost with optional IP address
let localhostIP = null;
const localhostIndex = args.findIndex(arg => arg.startsWith('--localhost'));
if (localhostIndex !== -1) {
  localhostIP = args[localhostIndex].includes('=')
    ? args[localhostIndex].split('=')[1]
    : '127.0.0.1';
}

const keepBrowserOpen = args.includes('--keep-open');

const loadExtensionPaths = [];
args.forEach((arg, idx) => {
  if (arg === '--load-extension' && args[idx + 1] && !args[idx + 1].startsWith('--')) {
    loadExtensionPaths.push(path.resolve(args[idx + 1]));
  }
});

const disableInteract = args.includes('--no-interact');
const globalGhostCursor = args.includes('--ghost-cursor');
const plainOutput = args.includes('--plain');
const enableCDP = args.includes('--cdp');

// The five output-format flags below are declared with `let` because the
// usage-validation section later in this file assigns `false` to them when
// they conflict with another output mode. Declared as `const`, those
// assignments throw "TypeError: Assignment to constant variable".
let dnsmasqMode = args.includes('--dnsmasq');
let dnsmasqOldMode = args.includes('--dnsmasq-old');
let unboundMode = args.includes('--unbound');
const removeDupes = args.includes('--remove-dupes') || args.includes('--remove-dubes');
let privoxyMode = args.includes('--privoxy');
let piholeMode = args.includes('--pihole');

const globalEvalOnDoc = args.includes('--eval-on-doc'); // For Fetch/XHR interception
const dryRunMode = args.includes('--dry-run');
const compressLogs = args.includes('--compress-logs');
const removeTempFiles = args.includes('--remove-tempfiles');
const validateConfig = args.includes('--validate-config');
let validateRules = args.includes('--validate-rules');
const testValidation = args.includes('--test-validation');
let cleanRules = args.includes('--clean-rules');
const clearCache = args.includes('--clear-cache');
const ignoreCache = args.includes('--ignore-cache');
const cacheRequests = args.includes('--cache-requests');
const dnsCacheMode = args.includes('--dns-cache');
if (dnsCacheMode) enableDiskCache();

// DNS pre-check before page.goto() — default-on, --no-dns-precheck disables.
// Filters NXDOMAIN / unresolvable hostnames in <100ms before paying the
// ~5-15s Puppeteer + Cloudflare detection round-trip on each.
const dnsPrecheckEnabled = !args.includes('--no-dns-precheck');
const dnsPrecheckTimeoutMs = 2000;

// Per-scan cache of negative DNS lookups.
// OS resolvers don't always cache NXDOMAIN responses, and one scan can hit
// the same dead hostname many times (different URL paths on the same site).
// Positive results are left to the OS cache; this failure-cache only avoids
// repeated lookup latency for known-dead hosts.
// Eviction is FIFO once DNS_NEGATIVE_CACHE_MAX entries are held, so a
// pathological scan (thousands of unique dead hosts) can't grow the cache
// unboundedly. Same pattern as the rest of the codebase's in-memory caches.

/** hostname -> { error, timestamp } */
const dnsNegativeCache = new Map();

/** Lifetime of a negative entry: 5 minutes, in milliseconds. */
const DNS_NEGATIVE_CACHE_TTL_MS = 1000 * 60 * 5;

/** Maximum number of hostnames held in dnsNegativeCache. */
const DNS_NEGATIVE_CACHE_MAX = 1000;

/** URLs skipped because hostname is NXDOMAIN-cached. */
let dnsPrecheckSkips = 0;

/** URLs skipped because dig/whois cache proves resolution. */
let dnsPositiveSkips = 0;

/** Unique hostnames that triggered the positive skip path. */
const dnsPositiveSkippedHosts = new Set();

// c-ares transient codes — read-only, hoisted out of the per-task DNS
// pre-check so we don't allocate a fresh Set per URL.
const DNS_TRANSIENT_ERRORS = new Set(['ETIMEOUT', 'ESERVFAIL', 'EREFUSED', 'ECONNREFUSED']);

/**
 * Records a failed DNS lookup in the module-level dnsNegativeCache.
 *
 * A hostname that is already cached is removed and re-inserted, so it moves
 * to the tail of the Map's insertion order and is not the next entry evicted.
 * Only a genuinely new hostname can trigger eviction of the oldest entry
 * once the cache holds DNS_NEGATIVE_CACHE_MAX entries. (Previously,
 * refreshing an existing hostname at capacity evicted an unrelated entry.)
 *
 * @param {string} hostname - Hostname whose lookup failed (cache key).
 * @param {*} error - Failure detail stored alongside the timestamp.
 * @returns {void}
 */
function dnsNegativeCacheSet(hostname, error) {
  if (dnsNegativeCache.has(hostname)) {
    // Map#set on an existing key keeps its original position; delete first
    // so the refreshed entry is ordered as the newest.
    dnsNegativeCache.delete(hostname);
  } else if (dnsNegativeCache.size >= DNS_NEGATIVE_CACHE_MAX) {
    // Map iterates in insertion order, so the first key is the oldest entry.
    dnsNegativeCache.delete(dnsNegativeCache.keys().next().value);
  }
  dnsNegativeCache.set(hostname, { error, timestamp: Date.now() });
}

// --validate-rules [file]: an optional file argument also forces validateRules on
let validateRulesFile = null;
const validateRulesIndex = args.findIndex(arg => arg === '--validate-rules');
if (validateRulesIndex !== -1 && args[validateRulesIndex + 1] && !args[validateRulesIndex + 1].startsWith('--')) {
  validateRulesFile = args[validateRulesIndex + 1];
  validateRules = true; // Override the boolean if file specified
}

// --clean-rules [file]: an optional file argument also forces cleanRules on
let cleanRulesFile = null;
const cleanRulesIndex = args.findIndex(arg => arg === '--clean-rules');
if (cleanRulesIndex !== -1 && args[cleanRulesIndex + 1] && !args[cleanRulesIndex + 1].startsWith('--')) {
  cleanRulesFile = args[cleanRulesIndex + 1];
  cleanRules = true; // Override the boolean if file specified
}

// --max-concurrent <n>: parsed as base 10; a non-numeric value yields NaN
let maxConcurrentSites = null;
const maxConcurrentIndex = args.findIndex(arg => arg === '--max-concurrent');
if (maxConcurrentIndex !== -1 && args[maxConcurrentIndex + 1]) {
  maxConcurrentSites = Number.parseInt(args[maxConcurrentIndex + 1], 10);
}

// --cleanup-interval <n>: parsed as base 10; a non-numeric value yields NaN
let cleanupInterval = null;
const cleanupIntervalIndex = args.findIndex(arg => arg === '--cleanup-interval');
if (cleanupIntervalIndex !== -1 && args[cleanupIntervalIndex + 1]) {
  cleanupInterval = Number.parseInt(args[cleanupIntervalIndex + 1], 10);
}

const enableColors = args.includes('--color') || args.includes('--colour');
let adblockRulesMode = args.includes('--adblock-rules');

// Adblock variables (request blocking)
let adblockEnabled = false;
let adblockMatcher = null;
let adblockStats = { blocked: 0, allowed: 0 };

// Cloudflare scan-wide stats. errorPages counts URLs where the returned page
// was a Cloudflare-served 5xx origin error (522/523/etc.) — no bypass
// possible, useful signal for diagnosing dead-origin scans.
Named distinct // from the local cloudflareStats = getCacheStats() in the debug stats block. let cloudflareScanStats = { errorPages: 0 }; // Validate --adblock-rules usage - ignore if used incorrectly instead of erroring if (adblockRulesMode) { if (!outputFile) { if (forceDebug) console.log(formatLogMessage('debug', `--adblock-rules ignored: requires --output (-o) to specify an output file`)); adblockRulesMode = false; } else if (localhostIP || plainOutput || dnsmasqMode || dnsmasqOldMode || unboundMode || privoxyMode || piholeMode) { if (forceDebug) console.log(formatLogMessage('debug', `--adblock-rules ignored: incompatible with localhost/plain output modes`)); adblockRulesMode = false; } } // Validate --dnsmasq usage if (dnsmasqMode) { if (localhostIP || plainOutput || adblockRulesMode || dnsmasqOldMode || unboundMode || privoxyMode || piholeMode) { if (forceDebug) console.log(formatLogMessage('debug', `--dnsmasq-old ignored: incompatible with localhost/plain/adblock-rules/dnsmasq output modes`)); dnsmasqMode = false; } } // Validate --dnsmasq-old usage if (dnsmasqOldMode) { if (localhostIP || plainOutput || adblockRulesMode || dnsmasqMode || unboundMode || privoxyMode || piholeMode) { if (forceDebug) console.log(formatLogMessage('debug', `--dnsmasq-old ignored: incompatible with localhost/plain/adblock-rules/dnsmasq output modes`)); dnsmasqOldMode = false; } } // Validate --unbound usage if (unboundMode) { if (localhostIP || plainOutput || adblockRulesMode || dnsmasqMode || dnsmasqOldMode || privoxyMode || piholeMode) { if (forceDebug) console.log(formatLogMessage('debug', `--unbound ignored: incompatible with localhost/plain/adblock-rules/dnsmasq output modes`)); unboundMode = false; } } // Validate --privoxy usage if (privoxyMode) { if (localhostIP || plainOutput || adblockRulesMode || dnsmasqMode || dnsmasqOldMode || unboundMode || piholeMode) { if (forceDebug) console.log(formatLogMessage('debug', `--privoxy ignored: incompatible with 
localhost/plain/adblock-rules/dnsmasq/unbound output modes`)); privoxyMode = false; } } // Validate --pihole usage if (piholeMode) { if (localhostIP || plainOutput || adblockRulesMode || dnsmasqMode || dnsmasqOldMode || unboundMode || privoxyMode) { if (forceDebug) console.log(formatLogMessage('debug', `--pihole ignored: incompatible with localhost/plain/adblock-rules/dnsmasq/unbound/privoxy output modes`)); piholeMode = false; } } // Validate --compress-logs usage if (compressLogs && !dumpUrls) { console.error(`❌ --compress-logs can only be used with --dumpurls`); process.exit(1); } // Validate --append usage if (appendMode && !outputFile) { console.error(`❌ --append requires --output (-o) to specify an output file`); process.exit(1); } if (appendMode && (compareFile || dryRunMode)) { console.error(`❌ --append cannot be used with --compare or --dry-run`); process.exit(1); } // Validate --dry-run usage if (dryRunMode) { if (compressLogs || compareFile) { console.error(`❌ --dry-run cannot be used with --compress-logs or --compare`); process.exit(1); } } // Validate --compare usage if (compareFile && !outputFile) { console.error(`❌ --compare requires --output (-o) to specify an output file`); process.exit(1); } if (compareFile && !fs.existsSync(compareFile)) { console.error(`❌ Compare file not found: ${compareFile}`); process.exit(1); } if (args.includes('--version')) { console.log(`nwss.js version ${VERSION}`); process.exit(0); } // Handle --clear-cache before config loading (uses default cache path) if (clearCache && !dryRunMode) { clearPersistentCache({ silent: silentMode, forceDebug, cachePath: CACHE_LIMITS.DEFAULT_CACHE_PATH // Default path, will be updated after config loads if needed }); // Also clear Cloudflare detection cache clearDetectionCache(); if (forceDebug) console.log(formatLogMessage('debug', 'Cleared Cloudflare detection cache')); } // Handle validation-only operations before main help if (testValidation) { 
console.log(`\n${messageColors.processing('Running domain validation tests...')}`); const testResult = testDomainValidation(); if (testResult) { console.log(`${messageColors.success('✅ All validation tests passed!')}`); process.exit(0); } else { console.log(`${messageColors.error('❌ Some validation tests failed!')}`); process.exit(1); } } // Note: --validate-config is handled further down, AFTER the config file is // loaded and `config`/`sites` are populated. Running it here would fail with // "Cannot access 'config' before initialization" since those are declared // later in the module. if (validateRules || validateRulesFile) { const filesToValidate = validateRulesFile ? [validateRulesFile] : [outputFile, compareFile].filter(Boolean); if (filesToValidate.length === 0) { console.error('❌ --validate-rules requires either a file argument or --output/--compare files to be specified'); process.exit(1); } console.log(`\n${messageColors.processing('Validating rule files...')}`); let overallValid = true; for (const file of filesToValidate) { console.log(`\n${messageColors.info('Validating:')} ${file}`); try { const validation = validateRulesetFile(file, { forceDebug, silentMode, maxErrors: 20 }); if (validation.isValid) { console.log(`${messageColors.success('✅ Valid:')} ${validation.stats.valid} rules, ${validation.stats.comments} comments`); if (validation.duplicates.length > 0) { console.log(`${messageColors.warn('⚠ Duplicates:')} ${validation.duplicates.length} duplicate rules found`); } if (Object.keys(validation.stats.formats).length > 0) { console.log(`${messageColors.info('Formats:')} ${Object.entries(validation.stats.formats).map(([f, c]) => `${f}(${c})`).join(', ')}`); } } else { console.log(`${messageColors.error('❌ Invalid:')} ${validation.stats.invalid} invalid rules out of ${validation.stats.total} total`); overallValid = false; } } catch (validationErr) { console.error(`❌ Failed to validate ${file}: ${validationErr.message}`); overallValid = false; } } if 
(overallValid) { console.log(`\n${messageColors.success('✅ All rule files are valid!')}`); process.exit(0); } else { console.log(`\n${messageColors.error('❌ Some rule files have validation errors!')}`); process.exit(1); } } // Parse --adblock-engine=<js|rust> (default: js). Selects the matcher backend // used by --block-ads. The rust engine requires the optional adblock-rs package. const adblockEngineIndex = args.findIndex(arg => arg.startsWith('--adblock-engine')); let adblockEngineName = 'js'; if (adblockEngineIndex !== -1) { const engineArg = args[adblockEngineIndex].includes('=') ? args[adblockEngineIndex].split('=')[1] : args[adblockEngineIndex + 1]; if (engineArg === 'rust' || engineArg === 'js') { adblockEngineName = engineArg; } else { console.log(`Error: --adblock-engine must be 'js' or 'rust' (got: ${engineArg})`); process.exit(1); } } // Parse --block-ads argument for request-level ad blocking (supports comma-separated lists) const blockAdsIndex = args.findIndex(arg => arg.startsWith('--block-ads')); if (blockAdsIndex !== -1) { const rulesArg = args[blockAdsIndex].includes('=') ? args[blockAdsIndex].split('=')[1] : args[blockAdsIndex + 1]; if (!rulesArg) { console.log('Error: No adblock rules file specified'); process.exit(1); } const rulesFiles = rulesArg.split(',').map(f => f.trim()).filter(f => f); for (const file of rulesFiles) { if (!fs.existsSync(file)) { console.log(`Error: Adblock rules file not found: ${file}`); process.exit(1); } } adblockEnabled = true; const engine = adblockEngineName === 'rust' ? adblockRust : adblockJs; try { if (engine === adblockRust) { // Rust wrapper accepts an array directly — no temp file needed. adblockMatcher = engine.parseAdblockRules(rulesFiles, { enableLogging: forceDebug }); } else { // JS engine takes a single path; concat to a temp file when multiple lists. 
let rulesFile = rulesFiles[0]; if (rulesFiles.length > 1) { rulesFile = path.join(os.tmpdir(), `nwss-adblock-combined-${Date.now()}.txt`); const combined = rulesFiles.map(f => fs.readFileSync(f, 'utf-8')).join('\n'); fs.writeFileSync(rulesFile, combined); } adblockMatcher = engine.parseAdblockRules(rulesFile, { enableLogging: forceDebug }); } } catch (err) { console.log(`Error: Failed to load adblock engine '${adblockEngineName}': ${err.message}`); process.exit(1); } const stats = adblockMatcher.getStats(); const ruleDesc = stats.total != null ? `${stats.total} blocking rules` : `compiled engine (cached)`; if (!silentMode) console.log(messageColors.success(`Adblock enabled (${adblockEngineName}): Loaded ${ruleDesc} from ${rulesFiles.length} list${rulesFiles.length > 1 ? 's' : ''}`)); } if (args.includes('--help') || args.includes('-h')) { console.log(`Usage: node nwss.js [options] Options: --color, --colour Enable colored console output for status messages -o, --output <file> Output file for rules. If omitted, prints to console --compare <file> Remove rules that already exist in this file before output --append Append new rules to output file instead of overwriting (requires -o) Output Format Options: --localhost[=IP] Output as IP domain.com (default: 127.0.0.1) Examples: --localhost, --localhost=0.0.0.0, --localhost=192.168.1.1 --plain Output just domains (no adblock formatting) --dnsmasq Output as local=/domain.com/ (dnsmasq format) --dnsmasq-old Output as server=/domain.com/ (dnsmasq old format) --unbound Output as local-zone: "domain.com." 
always_null (unbound format) --privoxy Output as { +block } .domain.com (Privoxy format) --pihole Output as (^|\\.)domain\\.com$ (Pi-hole regex format) --adblock-rules Generate adblock filter rules with resource type modifiers (requires -o) Request Blocking: --block-ads=<file> Block ads/trackers using EasyList format rules (||domain.com^, /ads/*, etc) Works at request-level for maximum performance Supports comma-separated lists: --block-ads=easylist.txt,easyprivacy.txt --adblock-engine=<js|rust> Matcher backend for --block-ads (default: js) 'rust' uses Brave's adblock-rs (faster on large lists; needs: npm i adblock-rs) Per-config settings file (.nwssconfig): Place a .nwssconfig file in the project root to define per-config settings. When a config filename matches a key in .nwssconfig, those settings are used. CLI flags merge with and override .nwssconfig settings. See README.md for format details. General Options: --verbose Force verbose mode globally --debug Force debug mode globally --silent Suppress normal console logs --titles Add ! <url> title before each site's group --dumpurls Dump matched URLs into matched_urls.log --dry-run Console output only: show matching regex, titles, whois/dig/searchstring results, and adblock rules --compress-logs Compress log files with gzip (requires --dumpurls) --sub-domains Output full subdomains instead of collapsing to root --no-interact Disable page interactions globally --ghost-cursor Use ghost-cursor Bezier mouse movements (requires: npm i ghost-cursor) --custom-json <file> Use a custom config JSON file instead of config.json --headful Launch browser with GUI (not headless) --keep-open Keep browser open after scan completes (use with --headful) --allow-fullscreen Allow sites to use the Fullscreen API. 
By default it is neutralized so sites can't hijack the window in --headful --use-puppeteer-core Use puppeteer-core with system Chrome instead of bundled Chromium --use-obscura Connect to running Obscura CDP server (ws://127.0.0.1:9222 or OBSCURA_WS env) Skips fingerprint injection — Obscura provides built-in stealth --load-extension <path> Load unpacked Chrome extension from directory --cdp Enable Chrome DevTools Protocol logging (now per-page if enabled) --remove-dupes Remove duplicate domains from output (only with -o) --eval-on-doc Globally enable evaluateOnNewDocument() for Fetch/XHR interception --help, -h Show this help menu --version Show script version --max-concurrent <number> Maximum concurrent site processing (1-50, overrides config/default) --cleanup-interval <number> Browser restart interval in URLs processed (1-1000, overrides config/default) --remove-tempfiles Remove Chrome/Puppeteer temporary files before exit Validation Options: --cache-requests Cache HTTP requests to avoid re-requesting same URLs within scan --dns-cache Persist dig/whois results to disk between runs (20h TTL, 2000-entry cap each) --no-dns-precheck Disable per-URL DNS resolution check before page navigation. By default, URLs whose hostname doesn't resolve are skipped immediately (saves ~5-15s of Puppeteer time per dead host). 
--validate-config Validate config.json file and exit --validate-rules [file] Validate rule file format (uses --output/--compare files if no file specified) --clean-rules [file] Clean rule files by removing invalid lines and optionally duplicates (uses --output/--compare files if no file specified) --test-validation Run domain validation tests and exit --clear-cache Clear persistent cache before scanning (improves fresh start performance) --ignore-cache Bypass all smart caching functionality during scanning Global config.json options: ignoreDomains: ["domain.com", "*.ads.com"] Domains to completely ignore (supports wildcards) ignoreDomainsByUrl: ["regex1", "regex2"] Regex patterns; if any request URL matches, the request's root domain is ignored for the rest of the scan blockDomainsByUrl: ["regex1", "regex2"] Regex patterns; if any request URL matches, ALL subsequent requests on that root domain (and subdomains) are aborted via Puppeteer for the rest of the scan blocked: ["regex1", "regex2"] Global regex patterns to block requests (combined with per-site blocked) whois_server_mode: "random" or "cycle" Default server selection mode for all sites (default: random) ignore_similar: true/false Ignore domains similar to already found domains (default: true) ignore_similar_threshold: 80 Similarity threshold percentage for ignore_similar (default: 80) ignore_similar_ignored_domains: true/false Ignore domains similar to ignoreDomains list (default: true) max_concurrent_sites: 8 Maximum concurrent site processing (1-50, default: 8) resource_cleanup_interval: 80 Browser restart interval in URLs processed (1-1000, default: 80) disable_ad_tagging: true/false Disable Chrome AdTagging to prevent ad frame throttling (default: true) Per-site config.json options: url: "site" or ["site1", "site2"] Single URL or list of URLs filterRegex: "regex" or ["regex1", "regex2"] Patterns to match requests regex_and: true/false Use AND logic for multiple filterRegex patterns (default: false) When 
true, ALL regex patterns must match the same URL Redirect Handling Options: follow_redirects: true/false Follow redirects to new domains (default: true) max_redirects: 10 Maximum number of redirects to follow (default: 10) js_redirect_timeout: 5000 Milliseconds to wait for JavaScript redirects (default: 5000) detect_js_patterns: true/false Analyze page source for redirect patterns (default: true) redirect_timeout_multiplier: 1.5 Increase timeout for redirected URLs (default: 1.5) comments: "text" or ["text1", "text2"] Documentation/notes - ignored by script searchstring: "text" or ["text1", "text2"] Text to search in response content (requires filterRegex match) ignore_similar: true/false Override global ignore_similar setting for this site ignore_similar_threshold: 80 Override global similarity threshold for this site ignore_similar_ignored_domains: true/false Override global ignore_similar_ignored_domains for this site searchstring_and: "text" or ["text1", "text2"] Text to search with AND logic - ALL terms must be present (requires filterRegex match) curl: true/false Use curl to download content for analysis (default: false) Note: curl respects filterRegex but ignores resourceTypes filtering grep: true/false Use grep instead of JavaScript for pattern matching (default: false) Note: requires curl=true, uses system grep command for faster searches blocked: ["regex"] Regex patterns to block requests css_blocked: ["#selector", ".class"] CSS selectors to hide elements resourceTypes: ["script", "stylesheet"] Only process requests of these resource types (default: all types) interact: true/false Simulate mouse movements/clicks isBrave: true/false Spoof Brave browser detection userAgent: "chrome"|"chrome_mac"|"chrome_linux"|"firefox"|"firefox_mac"|"firefox_linux"|"safari" Custom desktop User-Agent interact_intensity: "low"|"medium"|"high" Interaction simulation intensity (default: medium) delay: <milliseconds> Delay after load (default: 4000) reload: <number> Reload page 
n times after load (default: 1) forcereload: true/false or ["domain1.com", "domain2.com"] Force cache-clearing reload for all URLs or specific domains clear_sitedata: true/false Clear all cookies, cache, storage before each load (default: false) subDomains: 1/0 Output full subdomains (default: 0) localhost: true/false Force localhost output (127.0.0.1) localhost_0_0_0_0: true/false Force localhost output (0.0.0.0) dnsmasq: true/false Force dnsmasq output (local=/domain.com/) dnsmasq_old: true/false Force dnsmasq old output (server=/domain.com/) unbound: true/false Force unbound output (local-zone: "domain.com." always_null) privoxy: true/false Force Privoxy output ({ +block } .domain.com) pihole: true/false Force Pi-hole regex output ((^|\\.)domain\\.com$) source: true/false Save page source HTML after load firstParty: true/false Allow first-party matches (default: false) thirdParty: true/false Allow third-party matches (default: true) screenshot: true/false/\"force\" Capture screenshot (true=on failure, \"force\"=always) headful: true/false Launch browser with GUI for this site fingerprint_protection: true/false/"random" Enable fingerprint spoofing: true/false/"random" adblock_rules: true/false Generate adblock filter rules with resource types for this site even_blocked: true/false Add matching rules even if requests are blocked (default: false) bypass_cache: true/false Skip all caching for this site's URLs (default: false) referrer_headers: "url" or ["url1", "url2"] Set referrer header for realistic traffic sources custom_headers: {"Header": "value"} Add custom HTTP headers to requests referrer_disable: ["url1", "url2"] Disable referrer headers for specific URLs Cloudflare Protection Options: cloudflare_phish: true/false Auto-click through Cloudflare phishing warnings (default: false) cloudflare_bypass: true/false Auto-solve Cloudflare "Verify you are human" challenges (default: false) cloudflare_parallel_detection: true/false Use parallel detection for faster 
Cloudflare checks (default: true) cloudflare_max_retries: <number> Maximum retry attempts for Cloudflare operations (default: 3) cloudflare_cache_ttl: <milliseconds> TTL for Cloudflare detection cache (default: 300000 - 5 minutes) cloudflare_retry_on_error: true/false Enable retry logic for Cloudflare operations (default: true) Note: Automatically detects and exits on redirect loops to prevent endless loading cloudflare_retry_on_error: true/false Enable retry logic for Cloudflare operations (default: true) FlowProxy Protection Options: flowproxy_detection: true/false Enable flowProxy protection detection and handling (default: false) flowproxy_page_timeout: <milliseconds> Page timeout for flowProxy sites (default: 45000) flowproxy_nav_timeout: <milliseconds> Navigation timeout for flowProxy sites (default: 45000) flowproxy_js_timeout: <milliseconds> JavaScript challenge timeout (default: 15000) flowproxy_delay: <milliseconds> Delay for rate limiting (default: 30000) flowproxy_additional_delay: <milliseconds> Additional processing delay (default: 5000) Advanced Options: evaluateOnNewDocument: true/false Inject fetch/XHR interceptor in page (for this site) cdp: true/false Enable CDP logging for this site Inject fetch/XHR interceptor in page cdp_specific: ["domain1.com", "domain2.com"] Enable CDP logging only for specific domains in the URL list interact_duration: <milliseconds> Duration of interaction simulation (default: 2000) interact_scrolling: true/false Enable scrolling simulation (default: true) interact_clicks: true/false Enable element clicking simulation (default: false) interact_typing: true/false Enable typing simulation (default: false) cursor_mode: "ghost" Use ghost-cursor Bezier mouse (requires: npm i ghost-cursor) ghost_cursor_speed: <number> Ghost-cursor speed multiplier (default: auto) ghost_cursor_hesitate: <milliseconds> Delay before ghost-cursor clicks (default: 50) ghost_cursor_overshoot: <pixels> Max ghost-cursor overshoot distance (default: 
auto) ghost_cursor_duration: <milliseconds> Ghost-cursor interaction duration (default: interact_duration or 2000) whois: ["term1", "term2"] Check whois data for ALL specified terms (AND logic) whois-or: ["term1", "term2"] Check whois data for ANY specified term (OR logic) whois_server_mode: "random" or "cycle" Server selection mode: random (default) or cycle through list whois_server: "whois.domain.com" or ["server1", "server2"] Custom whois server(s) - single server or randomized list (default: system default) whois_max_retries: 2 Maximum retry attempts per domain (default: 2) whois_timeout_multiplier: 1.5 Timeout increase multiplier per retry (default: 1.5) whois_use_fallback: true Add TLD-specific fallback servers (default: true) whois_retry_on_timeout: true Retry on timeout errors (default: true) whois_retry_on_error: true Retry on connection/other errors (default: true) whois_delay: <milliseconds> Delay between whois requests for this site (default: global whois_delay) dig: ["term1", "term2"] Check dig output for ALL specified terms (AND logic) dig-or: ["term1", "term2"] Check dig output for ANY specified term (OR logic) goto_options: {"waitUntil": "domcontentloaded"} Custom page.goto() options (default: {"waitUntil": "load"}) dig_subdomain: true/false Use subdomain for dig lookup instead of root domain (default: false) digRecordType: "A" DNS record type for dig (default: A) VPN Options (requires sudo, affects system routing — not isolated per-site during concurrent scans): vpn: "/etc/wireguard/wg0.conf" WireGuard config file path vpn: { config: "wg-us", interface: "wg0", WireGuard with options: health_check, test_host, health_check: true, retry: true } retry, max_retries openvpn: "/path/to/server.ovpn" OpenVPN config file path (uses embedded credentials) openvpn: { config: "server.ovpn", OpenVPN with options: username, password, username: "user", auth_file, health_check, test_host, retry, password: "pass", max_retries, connect_timeout, extra_args 
health_check: true, retry: true, max_retries: 2, connect_timeout: 30000 } window_cleanup: true/false/"realtime"/"all" Window cleanup mode: true/false - Close extra windows after URL group completes (default: false) "realtime" - Continuously cleanup oldest pages when threshold exceeded "all" - Aggressive cleanup of all content pages after group window_cleanup_threshold: <number> For realtime mode: max pages to keep open (default: 8) Referrer Header Options: referrer_headers: "https://google.com" Single referrer URL referrer_headers: ["url1", "url2"] Random selection from array referrer_headers: {"mode": "random_search", "search_terms": ["term1"]} Smart search engine traffic referrer_headers: {"mode": "social_media"} Random social media referrers referrer_headers: {"mode": "direct_navigation"} No referrer (direct access) referrer_headers: {"mode": "news_sites"} Random news website referrers referrer_headers: {"mode": "custom", "url": "https://example.com"} Custom referrer URL referrer_headers: {"mode": "mixed"} Mixed referrer types for varied traffic referrer_disable: ["https://example.com/no-ref", "sensitive-site.com"] Disable referrer for specific URLs custom_headers: {"Header": "Value"} Additional HTTP headers `); process.exit(0); } // --- Configuration File Loading --- const configPathIndex = args.findIndex(arg => arg === '--custom-json'); const configPath = (configPathIndex !== -1 && args[configPathIndex + 1]) ? 
args[configPathIndex + 1] : 'config.json'; let config; try { if (!fs.existsSync(configPath)) { console.error(`❌ Config file not found: ${configPath}`); process.exit(1); } if (forceDebug && configPath !== 'config.json') { console.log(formatLogMessage('debug', `Using custom config file: ${configPath}`)); } const raw = fs.readFileSync(configPath, 'utf8'); config = JSON.parse(raw); } catch (e) { console.error(`❌ Failed to load config file (${configPath}):`, e.message); process.exit(1); } // Extract config values while ignoring 'comments' field at global and site levels const { sites = [], ignoreDomains = [], ignoreDomainsByUrl = [], blockDomainsByUrl = [], blocked: globalBlocked = [], whois_delay = 3000, whois_server_mode = 'random', ignore_similar = true, ignore_similar_threshold = 80, ignore_similar_ignored_domains = true, disable_ad_tagging = true, max_concurrent_sites = 6, resource_cleanup_interval = 80, comments: globalComments, ...otherGlobalConfig } = config; // --validate-config runs here, after `config` and `sites` are populated. // Previously this block lived above t