// @fanboynz/network-scanner
// Version:
// A Puppeteer-based network scanner for analyzing web traffic, generating adblock filter rules, and identifying third-party requests. Features include fingerprint spoofing, Cloudflare bypass, content analysis with curl/grep, and multiple output formats.
// 925 lines (835 loc) • 283 kB
// JavaScript
// === Network scanner script (nwss.js) v2.0.51 ===
// puppeteer for browser automation, fs for file system operations, psl for domain parsing.
// const pLimit = require('p-limit'); // Will be dynamically imported
const useObscura = process.argv.includes('--use-obscura');
const usePuppeteerCore = process.argv.includes('--use-puppeteer-core') || useObscura;
const puppeteer = usePuppeteerCore ? require('puppeteer-core') : require('puppeteer');
const fs = require('fs');
const os = require('os');
const psl = require('psl');
const path = require('path');
const dnsPromises = require('node:dns/promises');
const { createGrepHandler, validateGrepAvailability } = require('./lib/grep');
const { compressMultipleFiles, formatFileSize } = require('./lib/compress');
const { parseSearchStrings, createResponseHandler, createCurlHandler } = require('./lib/searchstring');
const { applyAllFingerprintSpoofing } = require('./lib/fingerprint');
const { formatRules, handleOutput, getFormatDescription } = require('./lib/output');
// Curl functionality (replace searchstring curl handler)
const { validateCurlAvailability, createCurlHandler: createCurlModuleHandler } = require('./lib/curl');
// Rule validation
const { validateRulesetFile, validateFullConfig, testDomainValidation, cleanRulesetFile } = require('./lib/validate_rules');
// CF Bypass
const {
handleCloudflareProtection,
getCacheStats,
clearDetectionCache,
parallelChallengeDetection,
cleanup: cleanupCloudflareCache
} = require('./lib/cloudflare');
// FP Bypass
const { handleFlowProxyProtection, getFlowProxyTimeouts, attachFlowProxyHeaderListener } = require('./lib/flowproxy');
// ignore_similar rules
const { shouldIgnoreSimilarDomain, calculateSimilarity } = require('./lib/ignore_similar');
// Graceful exit
const { handleBrowserExit, cleanupChromeTempFiles, cleanupUserDataDir } = require('./lib/browserexit');
// Whois & Dig
const { createNetToolsHandler, createEnhancedDryRunCallback, validateWhoisAvailability, validateDigAvailability, enableDiskCache, getDnsCacheStats, domainKnownToResolve } = require('./lib/nettools');
// File compare
const { loadComparisonRules, filterUniqueRules } = require('./lib/compare');
// CDP functionality
const { createCDPSession, createPageWithTimeout, setRequestInterceptionWithTimeout } = require('./lib/cdp');
// Post-processing cleanup
const { processResults } = require('./lib/post-processing');
// Colorize various text when used
const { messageColors, formatLogMessage } = require('./lib/colorize');
// Precomputed subsystem prefixes for log lines. Each constant is the bracketed
// label passed once, at module load, through messageColors.processing().
const TIMEOUT_TAG = messageColors.processing('[TIMEOUT]');
const INTERACTION_TAG = messageColors.processing('[interaction]');
const GHOST_CURSOR_TAG = messageColors.processing('[ghost-cursor]');
const PROXY_TAG = messageColors.processing('[proxy]');
const GREP_RESPONSE_TAG = messageColors.processing('[grep-response]');
const IGNORE_DOMAINS_BY_URL_TAG = messageColors.processing('[ignoreDomainsByUrl]');
const BLOCK_DOMAINS_BY_URL_TAG = messageColors.processing('[blockDomainsByUrl]');
const IGNORE_SIMILAR_IGNORED_DOMAINS_TAG = messageColors.processing('[ignore_similar_ignored_domains]');
const IGNORE_SIMILAR_TAG = messageColors.processing('[ignore_similar]');
const CLEAR_SITEDATA_TAG = messageColors.processing('[clear_sitedata]');
const CSS_BLOCKED_TAG = messageColors.processing('[css_blocked]');
const EVAL_ON_DOC_TAG = messageColors.processing('[evalOnDoc]');
const REALTIME_CLEANUP_TAG = messageColors.processing('[realtime_cleanup]');
const VPN_TAG = messageColors.processing('[vpn]');
// Precomputed colored '[SmartCache]' subsystem prefix — paired with the
// same constant in lib/smart-cache.js so debug lines from both files
// produce consistently colored output. formatLogMessage only colors the
// [severity] tag; this constant colors the subsystem prefix.
const SMART_CACHE_TAG = messageColors.processing('[SmartCache]');
// Precomputed colored '[CONCURRENCY]' subsystem prefix for batch-throughput
// log lines (start/completed). Same cyan as the other monitoring tags.
const CONCURRENCY_TAG = messageColors.processing('[CONCURRENCY]');
// Enhanced mouse interaction and page simulation
const { performPageInteraction, createInteractionConfig, performContentClicks, humanLikeMouseMove } = require('./lib/interaction');
// Optional ghost-cursor support for advanced Bezier-based mouse movements
const { isGhostCursorAvailable, createGhostCursor, ghostMove, ghostClick, ghostRandomMove, resolveGhostCursorConfig } = require('./lib/ghost-cursor');
// Domain detection cache for performance optimization
const { createGlobalHelpers, getTotalDomainsSkipped, getDetectedDomainsCount } = require('./lib/domain-cache');
const { createSmartCache } = require('./lib/smart-cache'); // Smart cache system
const { clearPersistentCache } = require('./lib/smart-cache');
const { needsProxy, getProxyArgs, applyProxyAuth, getProxyInfo, testProxy, prepareSocksRelays, closeAllSocksRelays } = require('./lib/proxy');
// Dry run functionality
const { initializeDryRunCollections, addDryRunMatch, addDryRunNetTools, processDryRunResults, writeDryRunOutput } = require('./lib/dry-run');
// Enhanced site data clearing functionality
const { clearSiteData } = require('./lib/clear_sitedata');
// Referrer header generation
const { getReferrerForUrl, validateReferrerConfig, validateReferrerDisable } = require('./lib/referrer');
// Adblock rules parser
const adblockJs = require('./lib/adblock');
const adblockRust = require('./lib/adblock-rust');
// WireGuard VPN
const { connectForSite: wgConnect, disconnectForSite: wgDisconnect, disconnectAll: wgDisconnectAll, validateVpnConfig, normalizeVpnConfig } = require('./lib/wireguard_vpn');
// OpenVPN
const { connectForSite: ovpnConnect, disconnectForSite: ovpnDisconnect, disconnectAll: ovpnDisconnectAll, validateOvpnConfig, normalizeOvpnConfig } = require('./lib/openvpn_vpn');
/**
 * Promise-based sleep used for Puppeteer 22.x compatibility.
 * Wraps setTimeout in a plain Promise instead of using node:timers/promises.
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
function fastTimeout(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
// --- Configuration Constants ---
// Frozen lookup tables; the TIMEOUTS values are millisecond durations.
const TIMEOUTS = Object.freeze({
  DEFAULT_PAGE: 35_000,               // standard page load timeout (35s)
  DEFAULT_NAVIGATION: 25_000,         // navigation operation timeout
  DEFAULT_NAVIGATION_REDUCED: 20_000, // reduced navigation timeout for faster failures
  DEFAULT_PAGE_REDUCED: 15_000,       // faster page timeout for quick failures
  FRAME_LOAD_WAIT: 2_000,             // wait time for iframes to load
  DEFAULT_DELAY: 6_000,               // default delay applied after page load
  NETWORK_IDLE: 2_000,                // network idle detection time
  NETWORK_IDLE_MAX: 10_000,           // maximum network idle wait time
  FAST_SITE_THRESHOLD: 15_000,        // threshold for "fast site" optimizations
  EMERGENCY_RESTART_DELAY: 2_000,     // delay after an emergency browser restart
  BROWSER_STABILIZE_DELAY: 1_000,     // browser stabilization after restart
  CURL_HANDLER_DELAY: 3_000,          // wait for async curl operations
  PROTOCOL_TIMEOUT: 180_000,          // Chrome DevTools Protocol timeout
  REDIRECT_JS_TIMEOUT: 5_000,         // JavaScript redirect detection timeout
});

const CACHE_LIMITS = Object.freeze({
  DISK_CACHE_SIZE: 1,   // effectively disabled — forcereload clears cache between loads
  MEDIA_CACHE_SIZE: 1,  // effectively disabled — no media caching needed for scanning
  DEFAULT_CACHE_PATH: '.cache',
  DEFAULT_MAX_SIZE: 5_000,
});

const CONCURRENCY_LIMITS = Object.freeze({
  MIN: 1,
  MAX: 50,
  DEFAULT: 6,
  HIGH_CONCURRENCY_THRESHOLD: 12, // aggressive caching is auto-enabled above this
});
// User-agent strings keyed by browser/platform alias. A Map (rather than a
// plain object) is used for the lookups as a V8 optimization.
// Note: Object.freeze() on a Map freezes the object's own properties only;
// it does not stop Map#set / Map#delete from changing the entries.
const USER_AGENTS = Object.freeze(new Map(Object.entries({
  chrome: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36',
  chrome_mac: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36',
  chrome_linux: 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36',
  firefox: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:148.0) Gecko/20100101 Firefox/148.0',
  firefox_mac: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:148.0) Gecko/20100101 Firefox/148.0',
  firefox_linux: 'Mozilla/5.0 (X11; Linux x86_64; rv:148.0) Gecko/20100101 Firefox/148.0',
  safari: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.6 Safari/605.1.15',
})));

// Default number of pages to keep for realtime cleanup.
const REALTIME_CLEANUP_THRESHOLD = 8;
/**
* Detects the installed Puppeteer version dynamically
* @returns {Object} Version info and compatibility settings
*/
function detectPuppeteerVersion() {
try {
const puppeteer = usePuppeteerCore ? require('puppeteer-core') : require('puppeteer');
let versionString = null;
// Try multiple methods to get version
if (puppeteer.version) {
versionString = puppeteer.version;
} else if (puppeteer._version) {
versionString = puppeteer._version;
} else {
// Fallback: try to get from Browser.version() after launch
return { majorVersion: 22, useShellMode: true, detected: false };
}
const majorVersion = parseInt(versionString.split('.')[0]);
const useShellMode = majorVersion >= 22;
return {
version: versionString,
majorVersion,
useShellMode,
detected: true
};
} catch (err) {
if (forceDebug) {
console.log(formatLogMessage('debug', `Could not detect Puppeteer version: ${err.message}`));
}
// Safe fallback - assume newer version
return { majorVersion: 22, useShellMode: true, detected: false };
}
}
// Enhanced redirect handling
const { navigateWithRedirectHandling, handleRedirectTimeout } = require('./lib/redirect');
// Ensure web browser is working correctly
// purgeStaleTrackers removed from import: browserhealth's pageCreationTracker
// and pageUsageTracker are now WeakMaps, so GC reclaims dead-page entries
// automatically — manual purging is no longer needed.
const { monitorBrowserHealth, isBrowserHealthy, isQuicklyResponsive, performGroupWindowCleanup, performRealtimeWindowCleanup, trackPageForRealtime, updatePageUsage, untrackPage, cleanupPageBeforeReload } = require('./lib/browserhealth');
// --- Script Configuration & Constants ---
// NOTE(review): the banner comment at the top of this file says v2.0.51, but
// VERSION (the value printed by --version) is '2.0.33' — confirm which is current.
const VERSION = '2.0.33'; // Script version
// get startTime
// Timestamp (ms since epoch) captured when this module is loaded.
const startTime = Date.now();
// Initialize domain cache helpers with debug logging if enabled
const domainCacheOptions = { enableLogging: false }; // Set to true for cache debug logs
const { isDomainAlreadyDetected, markDomainAsDetected } = createGlobalHelpers(domainCacheOptions);
// Smart cache will be initialized after config is loaded
let smartCache = null;
// --- Command-Line Argument Parsing ---
// Everything after `node <script>`. An empty command line is turned into a
// request for help by injecting '--help'.
const args = process.argv.slice(2);
if (!args.length) args.push('--help');
// --- .nwssconfig support: inject per-config settings into args ---
// If a .nwssconfig JSON file sits next to this script, look up the entry under
// `configs` whose key equals the config filename given on the command line and
// append the matching CLI flags to `args`. A setting is only injected when no
// alias of its flag is already present, so explicit CLI flags win.
const NWSSCONFIG_PATH = path.join(__dirname, '.nwssconfig');
if (fs.existsSync(NWSSCONFIG_PATH)) {
  try {
    const nwssConfig = JSON.parse(fs.readFileSync(NWSSCONFIG_PATH, 'utf-8'));
    // Find which config file is being used (--custom-json <file> or positional .json arg)
    const customJsonIdx = args.findIndex(arg => arg === '--custom-json');
    const positionalJson = (customJsonIdx === -1)
      ? args.find(a => a.endsWith('.json') && !a.startsWith('--'))
      : null;
    const configFilename = (customJsonIdx !== -1 && args[customJsonIdx + 1])
      ? args[customJsonIdx + 1]
      : positionalJson;
    // If a positional .json was used (not --custom-json), wire it to --custom-json
    // so the real config loader picks it up instead of defaulting to config.json
    if (positionalJson && customJsonIdx === -1) {
      args.push('--custom-json', positionalJson);
      process.argv.push('--custom-json', positionalJson);
    }
    if (configFilename && nwssConfig.configs && nwssConfig.configs[configFilename]) {
      const settings = nwssConfig.configs[configFilename];
      // Map settings keys to CLI flags — only inject if not already in args.
      // The LAST alias in each list is the one that gets pushed.
      const settingsMap = {
        output: ['-o', '--output'],
        max_concurrent: ['--max-concurrent'],
        dns_cache: ['--dns-cache'],
        cache_requests: ['--cache-requests'],
        dumpurls: ['--dumpurls'],
        remove_tempfiles: ['--remove-tempfiles'],
        color: ['--color'],
        remove_dupes: ['--remove-dupes', '--remove-dubes'],
        'remove-dupes': ['--remove-dupes', '--remove-dubes'],
        'remove-dubes': ['--remove-dupes', '--remove-dubes'],
        compress_logs: ['--compress-logs'],
        debug: ['--debug'],
        silent: ['--silent'],
        verbose: ['--verbose'],
        headful: ['--headful'],
        keep_open: ['--keep-open'],
        dry_run: ['--dry-run'],
        titles: ['--titles'],
        sub_domains: ['--sub-domains'],
        no_interact: ['--no-interact'],
        ghost_cursor: ['--ghost-cursor'],
        plain: ['--plain'],
        cdp: ['--cdp'],
        dnsmasq: ['--dnsmasq'],
        unbound: ['--unbound'],
        privoxy: ['--privoxy'],
        pihole: ['--pihole'],
        eval_on_doc: ['--eval-on-doc'],
        use_puppeteer_core: ['--use-puppeteer-core'],
        ignore_cache: ['--ignore-cache'],
        clear_cache: ['--clear-cache'],
        block_ads: ['--block-ads'],
        compare: ['--compare'],
        localhost: ['--localhost'],
        append: ['--append']
      };
      // A flag counts as present only when an argument IS the flag or uses the
      // `--flag=value` form. A substring search over the joined argument string
      // would let '-o' match '--keep-open' (or any path containing "-o") and
      // '--dnsmasq' match '--dnsmasq-old', wrongly suppressing config settings.
      // Checking the live `args` array also stops the three remove_dupes key
      // spellings from pushing the same flag more than once.
      const isFlagPresent = (flag) => args.some(a => a === flag || a.startsWith(`${flag}=`));
      // Flags whose value is only parsed from the `--flag=value` form: the
      // --localhost parser reads the IP from the text after '=', never from
      // the following argument.
      const equalsStyleFlags = new Set(['--localhost']);
      for (const [key, flags] of Object.entries(settingsMap)) {
        // Support both underscore and hyphen variants (e.g. dns_cache or dns-cache)
        const value = settings[key] !== undefined ? settings[key]
          : settings[key.replace(/_/g, '-')] !== undefined ? settings[key.replace(/_/g, '-')]
          : settings[key.replace(/-/g, '_')] !== undefined ? settings[key.replace(/-/g, '_')]
          : undefined;
        if (value === undefined) continue;
        // Skip if any variant of the flag is already in CLI args
        if (flags.some(isFlagPresent)) continue;
        const flag = flags[flags.length - 1];
        if (typeof value === 'boolean') {
          // `false` injects nothing: there is no negated form of these flags.
          if (value) args.push(flag);
        } else if (typeof value === 'string' || typeof value === 'number') {
          if (equalsStyleFlags.has(flag)) {
            args.push(`${flag}=${value}`);
          } else {
            args.push(flag, String(value));
          }
        }
      }
    }
  } catch (e) {
    // Best-effort: a broken .nwssconfig only produces a warning.
    console.error(`Warning: Failed to parse .nwssconfig: ${e.message}`);
  }
}
const headfulMode = args.includes('--headful');
// Pages (video/streaming sites especially) may call element.requestFullscreen()
// on load or on click. Under --headful that pushes the real Chrome window into
// true fullscreen and needs a manual ESC, so the Fullscreen API is neutralized
// by default. In headless there is no screen and the API is already inert, so
// keeping the default on makes both paths behave the same.
// --allow-fullscreen restores native behavior.
const allowFullscreen = args.includes('--allow-fullscreen');
const SOURCES_FOLDER = 'sources';

// -o / --output <file>: null when the flag or its value is absent.
const outputIndex = args.findIndex((arg) => ['--output', '-o'].includes(arg));
let outputFile = outputIndex === -1 ? null : (args[outputIndex + 1] || null);
const appendMode = args.includes('--append');

// --compare <file>: null when the flag or its value is absent.
const compareIndex = args.findIndex((arg) => arg === '--compare');
let compareFile = compareIndex === -1 ? null : (args[compareIndex + 1] || null);

const forceVerbose = args.includes('--verbose');
const forceDebug = args.includes('--debug');
const silentMode = args.includes('--silent');
const showTitles = args.includes('--titles');
const dumpUrls = args.includes('--dumpurls');
const subDomainsMode = args.includes('--sub-domains');

// --localhost[=IP]: the IP is the text after '='; bare --localhost means 127.0.0.1.
const localhostIndex = args.findIndex((arg) => arg.startsWith('--localhost'));
let localhostIP = null;
if (localhostIndex !== -1) {
  const localhostParts = args[localhostIndex].split('=');
  localhostIP = localhostParts.length > 1 ? localhostParts[1] : '127.0.0.1';
}
const keepBrowserOpen = args.includes('--keep-open');

// Every "--load-extension <path>" pair contributes one absolute path; a
// following token that starts with "--" is not treated as a path.
const loadExtensionPaths = [];
for (let i = 0; i < args.length; i++) {
  const extensionArg = args[i + 1];
  if (args[i] === '--load-extension' && extensionArg && !extensionArg.startsWith('--')) {
    loadExtensionPaths.push(path.resolve(extensionArg));
  }
}
// Boolean feature flags read straight from the command line.
const disableInteract = args.includes('--no-interact');
const globalGhostCursor = args.includes('--ghost-cursor');
const plainOutput = args.includes('--plain');
const enableCDP = args.includes('--cdp');
// The output-format modes below are declared with `let`, not `const`: the
// output-mode compatibility checks later in this file reset a mode to false
// when it is combined with an incompatible one, and assigning to a `const`
// binding throws "TypeError: Assignment to constant variable".
let dnsmasqMode = args.includes('--dnsmasq');
let dnsmasqOldMode = args.includes('--dnsmasq-old');
let unboundMode = args.includes('--unbound');
const removeDupes = args.includes('--remove-dupes') || args.includes('--remove-dubes');
let privoxyMode = args.includes('--privoxy');
let piholeMode = args.includes('--pihole');
const globalEvalOnDoc = args.includes('--eval-on-doc'); // For Fetch/XHR interception
const dryRunMode = args.includes('--dry-run');
const compressLogs = args.includes('--compress-logs');
const removeTempFiles = args.includes('--remove-tempfiles');
const validateConfig = args.includes('--validate-config');
// `let`: set to true further down when --validate-rules / --clean-rules is
// followed by a file argument.
let validateRules = args.includes('--validate-rules');
const testValidation = args.includes('--test-validation');
let cleanRules = args.includes('--clean-rules');
const clearCache = args.includes('--clear-cache');
const ignoreCache = args.includes('--ignore-cache');
const cacheRequests = args.includes('--cache-requests');
const dnsCacheMode = args.includes('--dns-cache');
if (dnsCacheMode) enableDiskCache();
// DNS pre-check before page.goto() — default-on, --no-dns-precheck disables.
// Filters NXDOMAIN / unresolvable hostnames in <100ms before paying the
// ~5-15s Puppeteer + Cloudflare detection round-trip on each.
const dnsPrecheckEnabled = !args.includes('--no-dns-precheck');
const dnsPrecheckTimeoutMs = 2000;
// Per-scan cache of negative DNS lookups. OS resolvers don't always cache
// NXDOMAIN responses, and a scan can hit the same dead hostname many times
// (different URL paths on the same site). Positive results are left to the
// OS cache; failure-cache avoids repeated lookup latency for known-dead hosts.
// Oldest-write-first eviction at DNS_NEGATIVE_CACHE_MAX so pathological scans
// (thousands of unique dead hosts) can't grow the cache unboundedly.
const dnsNegativeCache = new Map(); // hostname -> { error, timestamp }
const DNS_NEGATIVE_CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
const DNS_NEGATIVE_CACHE_MAX = 1000;
let dnsPrecheckSkips = 0; // URLs skipped because hostname is NXDOMAIN-cached
let dnsPositiveSkips = 0; // URLs skipped because dig/whois cache proves resolution
const dnsPositiveSkippedHosts = new Set(); // unique hostnames that triggered the positive skip path
// c-ares transient codes — read-only, hoisted out of the per-task DNS
// pre-check so we don't allocate a fresh Set per URL.
const DNS_TRANSIENT_ERRORS = new Set(['ETIMEOUT', 'ESERVFAIL', 'EREFUSED', 'ECONNREFUSED']);

/**
 * Records (or refreshes) a failed DNS lookup for a hostname, stamped with the
 * current time.
 * @param {string} hostname - Cache key.
 * @param {*} error - Failure stored alongside the timestamp.
 * @returns {void}
 */
function dnsNegativeCacheSet(hostname, error) {
  // Remove any existing entry first. Map#set on an existing key keeps the
  // key's original insertion position, so without this a refreshed host would
  // (a) evict an unrelated oldest entry even though the size is not growing,
  // and (b) stay at its old position and be evicted early despite being fresh.
  dnsNegativeCache.delete(hostname);
  if (dnsNegativeCache.size >= DNS_NEGATIVE_CACHE_MAX) {
    // Map iterates in insertion order, so the first key is the oldest write.
    dnsNegativeCache.delete(dnsNegativeCache.keys().next().value);
  }
  dnsNegativeCache.set(hostname, { error, timestamp: Date.now() });
}
// --validate-rules [file]: the file is optional; a following token that
// starts with "--" is treated as the next flag, not as a file.
let validateRulesFile = null;
const validateRulesIndex = args.findIndex(arg => arg === '--validate-rules');
if (validateRulesIndex !== -1 && args[validateRulesIndex + 1] && !args[validateRulesIndex + 1].startsWith('--')) {
  validateRulesFile = args[validateRulesIndex + 1];
  validateRules = true; // Override the boolean if file specified
}
// --clean-rules [file]: same optional-file convention as --validate-rules.
let cleanRulesFile = null;
const cleanRulesIndex = args.findIndex(arg => arg === '--clean-rules');
if (cleanRulesIndex !== -1 && args[cleanRulesIndex + 1] && !args[cleanRulesIndex + 1].startsWith('--')) {
  cleanRulesFile = args[cleanRulesIndex + 1];
  cleanRules = true; // Override the boolean if file specified
}
// --max-concurrent <n>: parsed as a base-10 integer (explicit radix so a
// value such as "0x10" is never read as hexadecimal). Stays null when the flag
// is absent; is NaN when the value is not numeric.
let maxConcurrentSites = null;
const maxConcurrentIndex = args.findIndex(arg => arg === '--max-concurrent');
if (maxConcurrentIndex !== -1 && args[maxConcurrentIndex + 1]) {
  maxConcurrentSites = Number.parseInt(args[maxConcurrentIndex + 1], 10);
}
// --cleanup-interval <n>: same parsing rules as --max-concurrent.
let cleanupInterval = null;
const cleanupIntervalIndex = args.findIndex(arg => arg === '--cleanup-interval');
if (cleanupIntervalIndex !== -1 && args[cleanupIntervalIndex + 1]) {
  cleanupInterval = Number.parseInt(args[cleanupIntervalIndex + 1], 10);
}
const enableColors = args.includes('--color') || args.includes('--colour');
let adblockRulesMode = args.includes('--adblock-rules');
// Adblock variables (request blocking)
let adblockEnabled = false;
let adblockMatcher = null;
let adblockStats = { blocked: 0, allowed: 0 };
// Cloudflare scan-wide stats. errorPages counts URLs where the returned page
// was a Cloudflare-served 5xx origin error (522/523/etc.) — no bypass
// possible, useful signal for diagnosing dead-origin scans. Named distinct
// from the local cloudflareStats = getCacheStats() in the debug stats block.
let cloudflareScanStats = { errorPages: 0 };

// Output-format compatibility checks. A format flag combined with an
// incompatible one is switched off (with a debug-only message) rather than
// treated as an error. The checks run in order, so an earlier check that
// switches a mode off changes what the later checks see.
// NOTE(review): every branch assigns to its mode variable, so those variables
// must be declared with `let`; assigning to a `const` binding throws TypeError.

// Validate --adblock-rules usage - ignore if used incorrectly instead of erroring
if (adblockRulesMode) {
  if (!outputFile) {
    if (forceDebug) console.log(formatLogMessage('debug', `--adblock-rules ignored: requires --output (-o) to specify an output file`));
    adblockRulesMode = false;
  } else if (localhostIP || plainOutput || dnsmasqMode || dnsmasqOldMode || unboundMode || privoxyMode || piholeMode) {
    if (forceDebug) console.log(formatLogMessage('debug', `--adblock-rules ignored: incompatible with localhost/plain output modes`));
    adblockRulesMode = false;
  }
}
// Validate --dnsmasq usage
if (dnsmasqMode) {
  if (localhostIP || plainOutput || adblockRulesMode || dnsmasqOldMode || unboundMode || privoxyMode || piholeMode) {
    // Message names the flag actually being ignored here (--dnsmasq).
    if (forceDebug) console.log(formatLogMessage('debug', `--dnsmasq ignored: incompatible with localhost/plain/adblock-rules/dnsmasq-old output modes`));
    dnsmasqMode = false;
  }
}
// Validate --dnsmasq-old usage
if (dnsmasqOldMode) {
  if (localhostIP || plainOutput || adblockRulesMode || dnsmasqMode || unboundMode || privoxyMode || piholeMode) {
    if (forceDebug) console.log(formatLogMessage('debug', `--dnsmasq-old ignored: incompatible with localhost/plain/adblock-rules/dnsmasq output modes`));
    dnsmasqOldMode = false;
  }
}
// Validate --unbound usage
if (unboundMode) {
  if (localhostIP || plainOutput || adblockRulesMode || dnsmasqMode || dnsmasqOldMode || privoxyMode || piholeMode) {
    if (forceDebug) console.log(formatLogMessage('debug', `--unbound ignored: incompatible with localhost/plain/adblock-rules/dnsmasq output modes`));
    unboundMode = false;
  }
}
// Validate --privoxy usage
if (privoxyMode) {
  if (localhostIP || plainOutput || adblockRulesMode || dnsmasqMode || dnsmasqOldMode || unboundMode || piholeMode) {
    if (forceDebug) console.log(formatLogMessage('debug', `--privoxy ignored: incompatible with localhost/plain/adblock-rules/dnsmasq/unbound output modes`));
    privoxyMode = false;
  }
}
// Validate --pihole usage
if (piholeMode) {
  if (localhostIP || plainOutput || adblockRulesMode || dnsmasqMode || dnsmasqOldMode || unboundMode || privoxyMode) {
    if (forceDebug) console.log(formatLogMessage('debug', `--pihole ignored: incompatible with localhost/plain/adblock-rules/dnsmasq/unbound/privoxy output modes`));
    piholeMode = false;
  }
}
// Hard CLI validation: each failed check prints one error line and exits with
// status 1. Checks run in the order written, so the first violated rule wins.

// --compress-logs only makes sense together with --dumpurls
if (!dumpUrls && compressLogs) {
  console.error('❌ --compress-logs can only be used with --dumpurls');
  process.exit(1);
}

// --append needs an output file and excludes --compare / --dry-run
if (appendMode) {
  if (!outputFile) {
    console.error('❌ --append requires --output (-o) to specify an output file');
    process.exit(1);
  }
  if (compareFile || dryRunMode) {
    console.error('❌ --append cannot be used with --compare or --dry-run');
    process.exit(1);
  }
}

// --dry-run excludes --compress-logs / --compare
if (dryRunMode && (compressLogs || compareFile)) {
  console.error('❌ --dry-run cannot be used with --compress-logs or --compare');
  process.exit(1);
}

// --compare needs an output file and an existing comparison file
if (compareFile) {
  if (!outputFile) {
    console.error('❌ --compare requires --output (-o) to specify an output file');
    process.exit(1);
  }
  if (!fs.existsSync(compareFile)) {
    console.error(`❌ Compare file not found: ${compareFile}`);
    process.exit(1);
  }
}

// --version: print the script version and stop
if (args.includes('--version')) {
  console.log(`nwss.js version ${VERSION}`);
  process.exit(0);
}
// --clear-cache is honored before the config file is loaded, so only the
// default cache path is known at this point. Skipped entirely under --dry-run.
if (!dryRunMode && clearCache) {
  const cacheClearOptions = {
    silent: silentMode,
    forceDebug,
    cachePath: CACHE_LIMITS.DEFAULT_CACHE_PATH // Default path, will be updated after config loads if needed
  };
  clearPersistentCache(cacheClearOptions);
  // The Cloudflare detection cache is cleared along with the persistent cache.
  clearDetectionCache();
  if (forceDebug) {
    console.log(formatLogMessage('debug', 'Cleared Cloudflare detection cache'));
  }
}
// Validation-only operations run before the main help handling.
// --test-validation: run the domain validation self-test, report the outcome,
// and exit with 0 on success or 1 on failure.
if (testValidation) {
  console.log(`\n${messageColors.processing('Running domain validation tests...')}`);
  const selfTestPassed = testDomainValidation();
  console.log(
    selfTestPassed
      ? `${messageColors.success('✅ All validation tests passed!')}`
      : `${messageColors.error('❌ Some validation tests failed!')}`
  );
  process.exit(selfTestPassed ? 0 : 1);
}
// Note: --validate-config is handled further down, AFTER the config file is
// loaded and `config`/`sites` are populated. Running it here would fail with
// "Cannot access 'config' before initialization" since those are declared
// later in the module.

// --validate-rules: validate either the explicitly named file or, failing
// that, the --output / --compare files, then exit (0 = all valid, 1 = any
// invalid file or validation failure).
if (validateRules || validateRulesFile) {
  const targets = validateRulesFile
    ? [validateRulesFile]
    : [outputFile, compareFile].filter(Boolean);
  if (targets.length === 0) {
    console.error('❌ --validate-rules requires either a file argument or --output/--compare files to be specified');
    process.exit(1);
  }
  console.log(`\n${messageColors.processing('Validating rule files...')}`);
  let failedFiles = 0;
  for (const target of targets) {
    console.log(`\n${messageColors.info('Validating:')} ${target}`);
    try {
      const report = validateRulesetFile(target, { forceDebug, silentMode, maxErrors: 20 });
      if (!report.isValid) {
        console.log(`${messageColors.error('❌ Invalid:')} ${report.stats.invalid} invalid rules out of ${report.stats.total} total`);
        failedFiles += 1;
        continue;
      }
      console.log(`${messageColors.success('✅ Valid:')} ${report.stats.valid} rules, ${report.stats.comments} comments`);
      // Duplicates are reported as a warning; they do not fail validation.
      if (report.duplicates.length > 0) {
        console.log(`${messageColors.warn('⚠ Duplicates:')} ${report.duplicates.length} duplicate rules found`);
      }
      const formatCounts = Object.entries(report.stats.formats);
      if (formatCounts.length > 0) {
        const formatSummary = formatCounts.map(([f, c]) => `${f}(${c})`).join(', ');
        console.log(`${messageColors.info('Formats:')} ${formatSummary}`);
      }
    } catch (validationErr) {
      // A file that cannot be validated at all counts as a failure.
      console.error(`❌ Failed to validate ${target}: ${validationErr.message}`);
      failedFiles += 1;
    }
  }
  if (failedFiles === 0) {
    console.log(`\n${messageColors.success('✅ All rule files are valid!')}`);
    process.exit(0);
  } else {
    console.log(`\n${messageColors.error('❌ Some rule files have validation errors!')}`);
    process.exit(1);
  }
}
// --adblock-engine=<js|rust> (or "--adblock-engine <js|rust>") picks the
// matcher backend used by --block-ads; 'js' is the default. The rust engine
// requires the optional adblock-rs package. Any other value is fatal.
const adblockEngineIndex = args.findIndex((arg) => arg.startsWith('--adblock-engine'));
let adblockEngineName = 'js';
if (adblockEngineIndex !== -1) {
  const engineToken = args[adblockEngineIndex];
  // Value comes from the text after '=' when present, else from the next argument.
  const engineArg = engineToken.includes('=')
    ? engineToken.split('=')[1]
    : args[adblockEngineIndex + 1];
  if (['rust', 'js'].includes(engineArg)) {
    adblockEngineName = engineArg;
  } else {
    console.log(`Error: --adblock-engine must be 'js' or 'rust' (got: ${engineArg})`);
    process.exit(1);
  }
}
// Parse --block-ads argument for request-level ad blocking (supports comma-separated lists).
// Accepts "--block-ads=<files>" or "--block-ads <files>". Every listed file
// must exist; the selected engine (adblockEngineName) then builds the matcher.
// Any failure here prints an error and exits with status 1.
const blockAdsIndex = args.findIndex(arg => arg.startsWith('--block-ads'));
if (blockAdsIndex !== -1) {
  const blockAdsToken = args[blockAdsIndex];
  const equalsPos = blockAdsToken.indexOf('=');
  // Keep everything after the FIRST '=' so a path that itself contains '='
  // is not truncated.
  const rulesArg = equalsPos !== -1
    ? blockAdsToken.slice(equalsPos + 1)
    : args[blockAdsIndex + 1];
  const rulesFiles = (rulesArg || '').split(',').map(f => f.trim()).filter(Boolean);
  // Covers a missing value as well as a list that is empty after trimming
  // (e.g. "--block-ads=,"), which would otherwise reach the engine with no file.
  if (rulesFiles.length === 0) {
    console.log('Error: No adblock rules file specified');
    process.exit(1);
  }
  for (const file of rulesFiles) {
    if (!fs.existsSync(file)) {
      console.log(`Error: Adblock rules file not found: ${file}`);
      process.exit(1);
    }
  }
  adblockEnabled = true;
  const engine = adblockEngineName === 'rust' ? adblockRust : adblockJs;
  try {
    if (engine === adblockRust) {
      // Rust wrapper accepts an array directly — no temp file needed.
      adblockMatcher = engine.parseAdblockRules(rulesFiles, { enableLogging: forceDebug });
    } else {
      // JS engine takes a single path; concat to a temp file when multiple lists.
      // NOTE(review): the combined temp file is never removed — confirm whether
      // the matcher still needs it after parsing before adding cleanup.
      let rulesFile = rulesFiles[0];
      if (rulesFiles.length > 1) {
        rulesFile = path.join(os.tmpdir(), `nwss-adblock-combined-${Date.now()}.txt`);
        const combined = rulesFiles.map(f => fs.readFileSync(f, 'utf-8')).join('\n');
        fs.writeFileSync(rulesFile, combined);
      }
      adblockMatcher = engine.parseAdblockRules(rulesFile, { enableLogging: forceDebug });
    }
  } catch (err) {
    console.log(`Error: Failed to load adblock engine '${adblockEngineName}': ${err.message}`);
    process.exit(1);
  }
  const stats = adblockMatcher.getStats();
  // `total` may be null/undefined; a generic description is shown in that case.
  const ruleDesc = stats.total != null
    ? `${stats.total} blocking rules`
    : `compiled engine (cached)`;
  if (!silentMode) console.log(messageColors.success(`Adblock enabled (${adblockEngineName}): Loaded ${ruleDesc} from ${rulesFiles.length} list${rulesFiles.length > 1 ? 's' : ''}`));
}
// --help / -h: print the full usage reference and exit with status 0.
// The help text lines sit at column 0 on purpose: they are the literal content of the
// template string, so any indentation added here would appear in the printed output.
if (args.includes('--help') || args.includes('-h')) {
  console.log(`Usage: node nwss.js [options]
Options:
--color, --colour Enable colored console output for status messages
-o, --output <file> Output file for rules. If omitted, prints to console
--compare <file> Remove rules that already exist in this file before output
--append Append new rules to output file instead of overwriting (requires -o)
Output Format Options:
--localhost[=IP] Output as IP domain.com (default: 127.0.0.1)
Examples: --localhost, --localhost=0.0.0.0, --localhost=192.168.1.1
--plain Output just domains (no adblock formatting)
--dnsmasq Output as local=/domain.com/ (dnsmasq format)
--dnsmasq-old Output as server=/domain.com/ (dnsmasq old format)
--unbound Output as local-zone: "domain.com." always_null (unbound format)
--privoxy Output as { +block } .domain.com (Privoxy format)
--pihole Output as (^|\\.)domain\\.com$ (Pi-hole regex format)
--adblock-rules Generate adblock filter rules with resource type modifiers (requires -o)
Request Blocking:
--block-ads=<file> Block ads/trackers using EasyList format rules (||domain.com^, /ads/*, etc)
Works at request-level for maximum performance
Supports comma-separated lists: --block-ads=easylist.txt,easyprivacy.txt
--adblock-engine=<js|rust> Matcher backend for --block-ads (default: js)
'rust' uses Brave's adblock-rs (faster on large lists; needs: npm i adblock-rs)
Per-config settings file (.nwssconfig):
Place a .nwssconfig file in the project root to define per-config settings.
When a config filename matches a key in .nwssconfig, those settings are used.
CLI flags merge with and override .nwssconfig settings.
See README.md for format details.
General Options:
--verbose Force verbose mode globally
--debug Force debug mode globally
--silent Suppress normal console logs
--titles Add ! <url> title before each site's group
--dumpurls Dump matched URLs into matched_urls.log
--dry-run Console output only: show matching regex, titles, whois/dig/searchstring results, and adblock rules
--compress-logs Compress log files with gzip (requires --dumpurls)
--sub-domains Output full subdomains instead of collapsing to root
--no-interact Disable page interactions globally
--ghost-cursor Use ghost-cursor Bezier mouse movements (requires: npm i ghost-cursor)
--custom-json <file> Use a custom config JSON file instead of config.json
--headful Launch browser with GUI (not headless)
--keep-open Keep browser open after scan completes (use with --headful)
--allow-fullscreen Allow sites to use the Fullscreen API. By default it is
neutralized so sites can't hijack the window in --headful
--use-puppeteer-core Use puppeteer-core with system Chrome instead of bundled Chromium
--use-obscura Connect to running Obscura CDP server (ws://127.0.0.1:9222 or OBSCURA_WS env)
Skips fingerprint injection — Obscura provides built-in stealth
--load-extension <path> Load unpacked Chrome extension from directory
--cdp Enable Chrome DevTools Protocol logging (now per-page if enabled)
--remove-dupes Remove duplicate domains from output (only with -o)
--eval-on-doc Globally enable evaluateOnNewDocument() for Fetch/XHR interception
--help, -h Show this help menu
--version Show script version
--max-concurrent <number> Maximum concurrent site processing (1-50, overrides config/default)
--cleanup-interval <number> Browser restart interval in URLs processed (1-1000, overrides config/default)
--remove-tempfiles Remove Chrome/Puppeteer temporary files before exit
Validation Options:
--cache-requests Cache HTTP requests to avoid re-requesting same URLs within scan
--dns-cache Persist dig/whois results to disk between runs (20h TTL, 2000-entry cap each)
--no-dns-precheck Disable per-URL DNS resolution check before page navigation.
By default, URLs whose hostname doesn't resolve are skipped
immediately (saves ~5-15s of Puppeteer time per dead host).
--validate-config Validate config.json file and exit
--validate-rules [file] Validate rule file format (uses --output/--compare files if no file specified)
--clean-rules [file] Clean rule files by removing invalid lines and optionally duplicates (uses --output/--compare files if no file specified)
--test-validation Run domain validation tests and exit
--clear-cache Clear persistent cache before scanning (improves fresh start performance)
--ignore-cache Bypass all smart caching functionality during scanning
Global config.json options:
ignoreDomains: ["domain.com", "*.ads.com"] Domains to completely ignore (supports wildcards)
ignoreDomainsByUrl: ["regex1", "regex2"] Regex patterns; if any request URL matches, the request's root domain is ignored for the rest of the scan
blockDomainsByUrl: ["regex1", "regex2"] Regex patterns; if any request URL matches, ALL subsequent requests on that root domain (and subdomains) are aborted via Puppeteer for the rest of the scan
blocked: ["regex1", "regex2"] Global regex patterns to block requests (combined with per-site blocked)
whois_server_mode: "random" or "cycle" Default server selection mode for all sites (default: random)
ignore_similar: true/false Ignore domains similar to already found domains (default: true)
ignore_similar_threshold: 80 Similarity threshold percentage for ignore_similar (default: 80)
ignore_similar_ignored_domains: true/false Ignore domains similar to ignoreDomains list (default: true)
max_concurrent_sites: 8 Maximum concurrent site processing (1-50, default: 8)
resource_cleanup_interval: 80 Browser restart interval in URLs processed (1-1000, default: 80)
disable_ad_tagging: true/false Disable Chrome AdTagging to prevent ad frame throttling (default: true)
Per-site config.json options:
url: "site" or ["site1", "site2"] Single URL or list of URLs
filterRegex: "regex" or ["regex1", "regex2"] Patterns to match requests
regex_and: true/false Use AND logic for multiple filterRegex patterns (default: false)
When true, ALL regex patterns must match the same URL
Redirect Handling Options:
follow_redirects: true/false Follow redirects to new domains (default: true)
max_redirects: 10 Maximum number of redirects to follow (default: 10)
js_redirect_timeout: 5000 Milliseconds to wait for JavaScript redirects (default: 5000)
detect_js_patterns: true/false Analyze page source for redirect patterns (default: true)
redirect_timeout_multiplier: 1.5 Increase timeout for redirected URLs (default: 1.5)
comments: "text" or ["text1", "text2"] Documentation/notes - ignored by script
searchstring: "text" or ["text1", "text2"] Text to search in response content (requires filterRegex match)
ignore_similar: true/false Override global ignore_similar setting for this site
ignore_similar_threshold: 80 Override global similarity threshold for this site
ignore_similar_ignored_domains: true/false Override global ignore_similar_ignored_domains for this site
searchstring_and: "text" or ["text1", "text2"] Text to search with AND logic - ALL terms must be present (requires filterRegex match)
curl: true/false Use curl to download content for analysis (default: false)
Note: curl respects filterRegex but ignores resourceTypes filtering
grep: true/false Use grep instead of JavaScript for pattern matching (default: false)
Note: requires curl=true, uses system grep command for faster searches
blocked: ["regex"] Regex patterns to block requests
css_blocked: ["#selector", ".class"] CSS selectors to hide elements
resourceTypes: ["script", "stylesheet"] Only process requests of these resource types (default: all types)
interact: true/false Simulate mouse movements/clicks
isBrave: true/false Spoof Brave browser detection
userAgent: "chrome"|"chrome_mac"|"chrome_linux"|"firefox"|"firefox_mac"|"firefox_linux"|"safari" Custom desktop User-Agent
interact_intensity: "low"|"medium"|"high" Interaction simulation intensity (default: medium)
delay: <milliseconds> Delay after load (default: 4000)
reload: <number> Reload page n times after load (default: 1)
forcereload: true/false or ["domain1.com", "domain2.com"] Force cache-clearing reload for all URLs or specific domains
clear_sitedata: true/false Clear all cookies, cache, storage before each load (default: false)
subDomains: 1/0 Output full subdomains (default: 0)
localhost: true/false Force localhost output (127.0.0.1)
localhost_0_0_0_0: true/false Force localhost output (0.0.0.0)
dnsmasq: true/false Force dnsmasq output (local=/domain.com/)
dnsmasq_old: true/false Force dnsmasq old output (server=/domain.com/)
unbound: true/false Force unbound output (local-zone: "domain.com." always_null)
privoxy: true/false Force Privoxy output ({ +block } .domain.com)
pihole: true/false Force Pi-hole regex output ((^|\\.)domain\\.com$)
source: true/false Save page source HTML after load
firstParty: true/false Allow first-party matches (default: false)
thirdParty: true/false Allow third-party matches (default: true)
screenshot: true/false/\"force\" Capture screenshot (true=on failure, \"force\"=always)
headful: true/false Launch browser with GUI for this site
fingerprint_protection: true/false/"random" Enable fingerprint spoofing: true/false/"random"
adblock_rules: true/false Generate adblock filter rules with resource types for this site
even_blocked: true/false Add matching rules even if requests are blocked (default: false)
bypass_cache: true/false Skip all caching for this site's URLs (default: false)
referrer_headers: "url" or ["url1", "url2"] Set referrer header for realistic traffic sources
custom_headers: {"Header": "value"} Add custom HTTP headers to requests
referrer_disable: ["url1", "url2"] Disable referrer headers for specific URLs
Cloudflare Protection Options:
cloudflare_phish: true/false Auto-click through Cloudflare phishing warnings (default: false)
cloudflare_bypass: true/false Auto-solve Cloudflare "Verify you are human" challenges (default: false)
cloudflare_parallel_detection: true/false Use parallel detection for faster Cloudflare checks (default: true)
cloudflare_max_retries: <number> Maximum retry attempts for Cloudflare operations (default: 3)
cloudflare_cache_ttl: <milliseconds> TTL for Cloudflare detection cache (default: 300000 - 5 minutes)
cloudflare_retry_on_error: true/false Enable retry logic for Cloudflare operations (default: true)
Note: Automatically detects and exits on redirect loops to prevent endless loading
FlowProxy Protection Options:
flowproxy_detection: true/false Enable flowProxy protection detection and handling (default: false)
flowproxy_page_timeout: <milliseconds> Page timeout for flowProxy sites (default: 45000)
flowproxy_nav_timeout: <milliseconds> Navigation timeout for flowProxy sites (default: 45000)
flowproxy_js_timeout: <milliseconds> JavaScript challenge timeout (default: 15000)
flowproxy_delay: <milliseconds> Delay for rate limiting (default: 30000)
flowproxy_additional_delay: <milliseconds> Additional processing delay (default: 5000)
Advanced Options:
evaluateOnNewDocument: true/false Inject fetch/XHR interceptor in page (for this site)
cdp: true/false Enable CDP logging for this site
cdp_specific: ["domain1.com", "domain2.com"] Enable CDP logging only for specific domains in the URL list
interact_duration: <milliseconds> Duration of interaction simulation (default: 2000)
interact_scrolling: true/false Enable scrolling simulation (default: true)
interact_clicks: true/false Enable element clicking simulation (default: false)
interact_typing: true/false Enable typing simulation (default: false)
cursor_mode: "ghost" Use ghost-cursor Bezier mouse (requires: npm i ghost-cursor)
ghost_cursor_speed: <number> Ghost-cursor speed multiplier (default: auto)
ghost_cursor_hesitate: <milliseconds> Delay before ghost-cursor clicks (default: 50)
ghost_cursor_overshoot: <pixels> Max ghost-cursor overshoot distance (default: auto)
ghost_cursor_duration: <milliseconds> Ghost-cursor interaction duration (default: interact_duration or 2000)
whois: ["term1", "term2"] Check whois data for ALL specified terms (AND logic)
whois-or: ["term1", "term2"] Check whois data for ANY specified term (OR logic)
whois_server_mode: "random" or "cycle" Server selection mode: random (default) or cycle through list
whois_server: "whois.domain.com" or ["server1", "server2"] Custom whois server(s) - single server or randomized list (default: system default)
whois_max_retries: 2 Maximum retry attempts per domain (default: 2)
whois_timeout_multiplier: 1.5 Timeout increase multiplier per retry (default: 1.5)
whois_use_fallback: true Add TLD-specific fallback servers (default: true)
whois_retry_on_timeout: true Retry on timeout errors (default: true)
whois_retry_on_error: true Retry on connection/other errors (default: true)
whois_delay: <milliseconds> Delay between whois requests for this site (default: global whois_delay)
dig: ["term1", "term2"] Check dig output for ALL specified terms (AND logic)
dig-or: ["term1", "term2"] Check dig output for ANY specified term (OR logic)
goto_options: {"waitUntil": "domcontentloaded"} Custom page.goto() options (default: {"waitUntil": "load"})
dig_subdomain: true/false Use subdomain for dig lookup instead of root domain (default: false)
digRecordType: "A" DNS record type for dig (default: A)
VPN Options (requires sudo, affects system routing — not isolated per-site during concurrent scans):
vpn: "/etc/wireguard/wg0.conf" WireGuard config file path
vpn: { config: "wg-us", interface: "wg0", WireGuard with options: health_check, test_host,
health_check: true, retry: true } retry, max_retries
openvpn: "/path/to/server.ovpn" OpenVPN config file path (uses embedded credentials)
openvpn: { config: "server.ovpn", OpenVPN with options: username, password,
username: "user", auth_file, health_check, test_host, retry,
password: "pass", max_retries, connect_timeout, extra_args
health_check: true,
retry: true,
max_retries: 2,
connect_timeout: 30000 }
window_cleanup: true/false/"realtime"/"all" Window cleanup mode:
true/false - Close extra windows after URL group completes (default: false)
"realtime" - Continuously cleanup oldest pages when threshold exceeded
"all" - Aggressive cleanup of all content pages after group
window_cleanup_threshold: <number> For realtime mode: max pages to keep open (default: 8)
Referrer Header Options:
referrer_headers: "https://google.com" Single referrer URL
referrer_headers: ["url1", "url2"] Random selection from array
referrer_headers: {"mode": "random_search", "search_terms": ["term1"]} Smart search engine traffic
referrer_headers: {"mode": "social_media"} Random social media referrers
referrer_headers: {"mode": "direct_navigation"} No referrer (direct access)
referrer_headers: {"mode": "news_sites"} Random news website referrers
referrer_headers: {"mode": "custom", "url": "https://example.com"} Custom referrer URL
referrer_headers: {"mode": "mixed"} Mixed referrer types for varied traffic
referrer_disable: ["https://example.com/no-ref", "sensitive-site.com"] Disable referrer for specific URLs
custom_headers: {"Header": "Value"} Additional HTTP headers
`);
  process.exit(0);
}
// --- Configuration File Loading ---
// Resolves the config path (--custom-json <file>, otherwise config.json), reads the file
// and parses it as JSON into `config`. Every failure — missing file, read error, invalid
// JSON, or a root value that is not an object — is reported on stderr and exits with 1.
const configPathIndex = args.findIndex(arg => arg === '--custom-json');
const configPath = (configPathIndex !== -1 && args[configPathIndex + 1]) ? args[configPathIndex + 1] : 'config.json';
let config;
try {
  if (!fs.existsSync(configPath)) {
    console.error(`❌ Config file not found: ${configPath}`);
    process.exit(1);
  }
  if (forceDebug && configPath !== 'config.json') {
    console.log(formatLogMessage('debug', `Using custom config file: ${configPath}`));
  }
  const raw = fs.readFileSync(configPath, 'utf8');
  config = JSON.parse(raw);
  // The global settings are destructured from `config` immediately after this block.
  // A root of `null` would throw an unhandled TypeError there, while an array or a
  // primitive would silently produce all defaults, so reject those here instead.
  if (config === null || typeof config !== 'object' || Array.isArray(config)) {
    throw new Error('top-level JSON value must be an object');
  }
} catch (e) {
  console.error(`❌ Failed to load config file (${configPath}):`, e.message);
  process.exit(1);
}
// Extract config values while ignoring 'comments' field at global and site levels
// Each `name = value` below is a destructuring default: it applies only when the key is
// missing from config (or explicitly undefined); any other value, including null, is kept.
const {
sites = [],
ignoreDomains = [],
ignoreDomainsByUrl = [],
blockDomainsByUrl = [],
// The config key is `blocked`; it is bound locally as globalBlocked.
blocked: globalBlocked = [],
whois_delay = 3000,
whois_server_mode = 'random',
ignore_similar = true,
ignore_similar_threshold = 80,
ignore_similar_ignored_domains = true,
disable_ad_tagging = true,
// NOTE(review): the --help text documents "default: 8" for max_concurrent_sites, but the
// fallback here is 6 — confirm which value is intended.
max_concurrent_sites = 6,
resource_cleanup_interval = 80,
// Naming `comments` explicitly keeps it out of the rest object below.
comments: globalComments,
// Every remaining top-level key of config is collected here.
...otherGlobalConfig
} = config;
// --validate-config runs here, after `config` and `sites` are populated.
// Previously this block lived above t