@fanboynz/network-scanner
Version:
A Puppeteer-based network scanner for analyzing web traffic, generating adblock filter rules, and identifying third-party requests. Features include fingerprint spoofing, Cloudflare bypass, content analysis with curl/grep, and multiple output formats.
1,079 lines (958 loc) • 124 kB
JavaScript
// === Network scanner script (nwss.js) v1.0.57 ===
// puppeteer for browser automation, fs for file system operations, psl for domain parsing.
// const pLimit = require('p-limit'); // Will be dynamically imported
const puppeteer = require('puppeteer');
const fs = require('fs');
const psl = require('psl');
const path = require('path');
const { createGrepHandler, validateGrepAvailability } = require('./lib/grep');
const { compressMultipleFiles, formatFileSize } = require('./lib/compress');
const { parseSearchStrings, createResponseHandler, createCurlHandler } = require('./lib/searchstring');
const { applyAllFingerprintSpoofing } = require('./lib/fingerprint');
const { formatRules, handleOutput, getFormatDescription } = require('./lib/output');
// Rule validation
const { validateRulesetFile, validateFullConfig, testDomainValidation, cleanRulesetFile } = require('./lib/validate_rules');
// CF Bypass
const { handleCloudflareProtection } = require('./lib/cloudflare');
// FP Bypass
const { handleFlowProxyProtection, getFlowProxyTimeouts } = require('./lib/flowproxy');
// ignore_similar rules
const { shouldIgnoreSimilarDomain, calculateSimilarity } = require('./lib/ignore_similar');
// Graceful exit
const { handleBrowserExit, cleanupChromeTempFiles } = require('./lib/browserexit');
// Whois & Dig
const { createNetToolsHandler, createEnhancedDryRunCallback, validateWhoisAvailability, validateDigAvailability } = require('./lib/nettools');
// File compare
const { loadComparisonRules, filterUniqueRules } = require('./lib/compare');
// CDP functionality
const { createCDPSession } = require('./lib/cdp');
// Colorize various text when used
const { colorize, colors, messageColors, tags, formatLogMessage } = require('./lib/colorize');
// Enhanced mouse interaction and page simulation
const { performPageInteraction, createInteractionConfig } = require('./lib/interaction');
// Domain detection cache for performance optimization
const { createGlobalHelpers, getTotalDomainsSkipped, getDetectedDomainsCount } = require('./lib/domain-cache');
const { createSmartCache } = require('./lib/smart-cache'); // Smart cache system
// Enhanced redirect handling
const { navigateWithRedirectHandling, handleRedirectTimeout } = require('./lib/redirect');
// Ensure web browser is working correctly
const { monitorBrowserHealth, isBrowserHealthy } = require('./lib/browserhealth');
// --- Script Configuration & Constants ---
// Script version string; this is what the --version flag prints.
const VERSION = '1.0.57'; // Script version
// Timestamp (milliseconds since the Unix epoch) captured when the script starts.
const startTime = Date.now();
// Initialize domain cache helpers with debug logging if enabled
const domainCacheOptions = { enableLogging: false }; // Set to true for cache debug logs
// NOTE(review): presumably these two helpers query/record domains in a shared detection cache —
// their exact semantics live in ./lib/domain-cache; confirm there.
const { isDomainAlreadyDetected, markDomainAsDetected } = createGlobalHelpers(domainCacheOptions);
// Smart cache stays null here; it is assigned via createSmartCache() once the config file has been loaded.
let smartCache = null;
// --- Command-Line Argument Parsing ---
// Everything after `node <script>` on the command line.
const args = process.argv.slice(2);
// Running with no arguments behaves as if --help had been passed.
if (args.length === 0) {
  args.push('--help');
}

const headfulMode = args.includes('--headful');
const SOURCES_FOLDER = 'sources';

// -o / --output <file>
let outputFile = null;
const outputIndex = args.findIndex(arg => arg === '--output' || arg === '-o');
if (outputIndex !== -1 && args[outputIndex + 1]) {
  outputFile = args[outputIndex + 1];
}
const appendMode = args.includes('--append');

// --compare <file>
let compareFile = null;
const compareIndex = args.findIndex(arg => arg === '--compare');
if (compareIndex !== -1 && args[compareIndex + 1]) {
  compareFile = args[compareIndex + 1];
}

// Simple boolean switches
const forceVerbose = args.includes('--verbose');
const forceDebug = args.includes('--debug');
const silentMode = args.includes('--silent');
const showTitles = args.includes('--titles');
const dumpUrls = args.includes('--dumpurls');
const subDomainsMode = args.includes('--sub-domains');
const localhostMode = args.includes('--localhost');
const localhostModeAlt = args.includes('--localhost-0.0.0.0');
const disableInteract = args.includes('--no-interact');
const plainOutput = args.includes('--plain');
const enableCDP = args.includes('--cdp');
// The following output-format flags are declared with `let` because the conflict
// validation further down resets them to false when incompatible flags are combined.
// Declaring them `const` made those assignments throw "Assignment to constant variable".
let dnsmasqMode = args.includes('--dnsmasq');
let dnsmasqOldMode = args.includes('--dnsmasq-old');
let unboundMode = args.includes('--unbound');
// '--remove-dubes' is accepted as an alias of '--remove-dupes'.
const removeDupes = args.includes('--remove-dupes') || args.includes('--remove-dubes');
let privoxyMode = args.includes('--privoxy');
let piholeMode = args.includes('--pihole');
const globalEvalOnDoc = args.includes('--eval-on-doc'); // For Fetch/XHR interception
const dryRunMode = args.includes('--dry-run');
const compressLogs = args.includes('--compress-logs');
const removeTempFiles = args.includes('--remove-tempfiles');
const validateConfig = args.includes('--validate-config');
// `let`: re-set to true below when --validate-rules is followed by a file argument.
let validateRules = args.includes('--validate-rules');
const testValidation = args.includes('--test-validation');
let cleanRules = args.includes('--clean-rules');

// --validate-rules [file]: the next argument is treated as a file unless it looks like another flag.
let validateRulesFile = null;
const validateRulesIndex = args.findIndex(arg => arg === '--validate-rules');
if (validateRulesIndex !== -1 && args[validateRulesIndex + 1] && !args[validateRulesIndex + 1].startsWith('--')) {
  validateRulesFile = args[validateRulesIndex + 1];
  validateRules = true; // Override the boolean if file specified
}

// --clean-rules [file]: same optional-file convention as --validate-rules.
let cleanRulesFile = null;
const cleanRulesIndex = args.findIndex(arg => arg === '--clean-rules');
if (cleanRulesIndex !== -1 && args[cleanRulesIndex + 1] && !args[cleanRulesIndex + 1].startsWith('--')) {
  cleanRulesFile = args[cleanRulesIndex + 1];
  cleanRules = true; // Override the boolean if file specified
}

// --max-concurrent <number>; stays null when absent. A non-numeric value parses to NaN
// and is rejected later by the range check that consumes this setting.
let maxConcurrentSites = null;
const maxConcurrentIndex = args.findIndex(arg => arg === '--max-concurrent');
if (maxConcurrentIndex !== -1 && args[maxConcurrentIndex + 1]) {
  // Explicit radix so inputs such as "0x10" are not read as hexadecimal.
  maxConcurrentSites = parseInt(args[maxConcurrentIndex + 1], 10);
}

// --cleanup-interval <number>; same conventions as --max-concurrent.
let cleanupInterval = null;
const cleanupIntervalIndex = args.findIndex(arg => arg === '--cleanup-interval');
if (cleanupIntervalIndex !== -1 && args[cleanupIntervalIndex + 1]) {
  cleanupInterval = parseInt(args[cleanupIntervalIndex + 1], 10);
}

const enableColors = args.includes('--color') || args.includes('--colour');
// --- Output-format flag conflict resolution ---
// Each output-format flag is mutually exclusive with the others. A conflicting flag is
// switched off (with a note in --debug mode) instead of aborting the run.
// NOTE(review): every flag assigned to below must be a `let` binding at its declaration;
// assigning to a `const` binding throws a TypeError at runtime.
let adblockRulesMode = args.includes('--adblock-rules');

// Validate --adblock-rules usage - ignore if used incorrectly instead of erroring
if (adblockRulesMode) {
  if (!outputFile) {
    if (forceDebug) console.log(formatLogMessage('debug', `--adblock-rules ignored: requires --output (-o) to specify an output file`));
    adblockRulesMode = false;
  } else if (localhostMode || localhostModeAlt || plainOutput || dnsmasqMode || dnsmasqOldMode || unboundMode || privoxyMode || piholeMode) {
    if (forceDebug) console.log(formatLogMessage('debug', `--adblock-rules ignored: incompatible with localhost/plain output modes`));
    adblockRulesMode = false;
  }
}

// Validate --dnsmasq usage
if (dnsmasqMode) {
  if (localhostMode || localhostModeAlt || plainOutput || adblockRulesMode || dnsmasqOldMode || unboundMode || privoxyMode || piholeMode) {
    // Message previously named --dnsmasq-old here (copy/paste slip); it now names the flag actually being dropped.
    if (forceDebug) console.log(formatLogMessage('debug', `--dnsmasq ignored: incompatible with localhost/plain/adblock-rules/dnsmasq-old output modes`));
    dnsmasqMode = false;
  }
}

// Validate --dnsmasq-old usage
if (dnsmasqOldMode) {
  if (localhostMode || localhostModeAlt || plainOutput || adblockRulesMode || dnsmasqMode || unboundMode || privoxyMode || piholeMode) {
    if (forceDebug) console.log(formatLogMessage('debug', `--dnsmasq-old ignored: incompatible with localhost/plain/adblock-rules/dnsmasq output modes`));
    dnsmasqOldMode = false;
  }
}

// Validate --unbound usage
if (unboundMode) {
  if (localhostMode || localhostModeAlt || plainOutput || adblockRulesMode || dnsmasqMode || dnsmasqOldMode || privoxyMode || piholeMode) {
    if (forceDebug) console.log(formatLogMessage('debug', `--unbound ignored: incompatible with localhost/plain/adblock-rules/dnsmasq output modes`));
    unboundMode = false;
  }
}

// Validate --privoxy usage
if (privoxyMode) {
  if (localhostMode || localhostModeAlt || plainOutput || adblockRulesMode || dnsmasqMode || dnsmasqOldMode || unboundMode || piholeMode) {
    if (forceDebug) console.log(formatLogMessage('debug', `--privoxy ignored: incompatible with localhost/plain/adblock-rules/dnsmasq/unbound output modes`));
    privoxyMode = false;
  }
}

// Validate --pihole usage
if (piholeMode) {
  if (localhostMode || localhostModeAlt || plainOutput || adblockRulesMode || dnsmasqMode || dnsmasqOldMode || unboundMode || privoxyMode) {
    if (forceDebug) console.log(formatLogMessage('debug', `--pihole ignored: incompatible with localhost/plain/adblock-rules/dnsmasq/unbound/privoxy output modes`));
    piholeMode = false;
  }
}
// --- Fatal flag-combination checks ---
// Unlike the output-format conflicts, these combinations abort the run with exit code 1.
{
  // Report a usage error on stderr and terminate.
  const abort = (message) => {
    console.error(message);
    process.exit(1);
  };

  // --compress-logs is only meaningful together with --dumpurls
  if (compressLogs && !dumpUrls) {
    abort(`❌ --compress-logs can only be used with --dumpurls`);
  }

  // --append needs an output file and cannot be mixed with --compare / --dry-run
  if (appendMode) {
    if (!outputFile) {
      abort(`❌ --append requires --output (-o) to specify an output file`);
    }
    if (compareFile || dryRunMode) {
      abort(`❌ --append cannot be used with --compare or --dry-run`);
    }
  }

  // --dry-run cannot be mixed with --compress-logs / --compare
  if (dryRunMode && (compressLogs || compareFile)) {
    abort(`❌ --dry-run cannot be used with --compress-logs or --compare`);
  }

  // --compare needs an output file, and the comparison file has to exist
  if (compareFile) {
    if (!outputFile) {
      abort(`❌ --compare requires --output (-o) to specify an output file`);
    }
    if (!fs.existsSync(compareFile)) {
      abort(`❌ Compare file not found: ${compareFile}`);
    }
  }
}

// --version: print the script version and stop
if (args.includes('--version')) {
  console.log(`nwss.js version ${VERSION}`);
  process.exit(0);
}
// Validation-only operations are handled ahead of the main help output.
// --test-validation: run the built-in domain validation tests, report, and exit
// with 0 when they pass or 1 when they do not.
if (testValidation) {
  console.log(`\n${messageColors.processing('Running domain validation tests...')}`);
  const passed = testDomainValidation();
  const verdict = passed
    ? messageColors.success('✅ All validation tests passed!')
    : messageColors.error('❌ Some validation tests failed!');
  console.log(`${verdict}`);
  process.exit(passed ? 0 : 1);
}
// --validate-config: validate the configuration file and exit (0 = valid, 1 = invalid or unreadable).
if (validateConfig) {
  console.log(`\n${messageColors.processing('Validating configuration file...')}`);
  try {
    // The module-level `config` / `sites` bindings are declared further down in this file,
    // so at this point they are still in their temporal dead zone and touching them throws
    // a ReferenceError. Load the configuration locally instead, honouring --custom-json.
    const cfgFlagIndex = args.indexOf('--custom-json');
    const cfgPath = (cfgFlagIndex !== -1 && args[cfgFlagIndex + 1]) ? args[cfgFlagIndex + 1] : 'config.json';
    if (!fs.existsSync(cfgPath)) {
      throw new Error(`Config file not found: ${cfgPath}`);
    }
    // A JSON syntax error is reported through the catch block below.
    const configToValidate = JSON.parse(fs.readFileSync(cfgPath, 'utf8'));
    const sitesToValidate = (configToValidate && Array.isArray(configToValidate.sites)) ? configToValidate.sites : [];

    const validation = validateFullConfig(configToValidate, { forceDebug, silentMode });

    // Validate referrer_headers format: only the object form carries a `mode` to check.
    const validModes = ['random_search', 'social_media', 'direct_navigation', 'custom'];
    for (const site of sitesToValidate) {
      const referrer = site && site.referrer_headers;
      if (referrer && typeof referrer === 'object' && !Array.isArray(referrer)) {
        if (referrer.mode && !validModes.includes(referrer.mode)) {
          console.warn(`⚠ Invalid referrer_headers mode: ${referrer.mode}. Valid modes: ${validModes.join(', ')}`);
        }
      }
    }

    if (validation.isValid) {
      console.log(`${messageColors.success('✅ Configuration is valid!')}`);
      console.log(`${messageColors.info('Summary:')} ${validation.summary.validSites}/${validation.summary.totalSites} sites valid`);
      if (validation.summary.sitesWithWarnings > 0) {
        console.log(`${messageColors.warn('⚠ Warnings:')} ${validation.summary.sitesWithWarnings} sites have warnings`);
      }
      process.exit(0);
    } else {
      console.log(`${messageColors.error('❌ Configuration validation failed!')}`);
      console.log(`${messageColors.error('Errors:')} ${validation.globalErrors.length} global, ${validation.summary.sitesWithErrors} site-specific`);
      process.exit(1);
    }
  } catch (validationErr) {
    console.error(`❌ Validation failed: ${validationErr.message}`);
    process.exit(1);
  }
}
// --validate-rules [file]: check rule file(s) and exit (0 = all valid, 1 = any invalid or unreadable).
if (validateRules || validateRulesFile) {
  // An explicit file argument wins; otherwise use whichever of --output / --compare were supplied.
  const targets = validateRulesFile
    ? [validateRulesFile]
    : [outputFile, compareFile].filter(Boolean);

  if (targets.length === 0) {
    console.error('❌ --validate-rules requires either a file argument or --output/--compare files to be specified');
    process.exit(1);
  }

  console.log(`\n${messageColors.processing('Validating rule files...')}`);

  let failureCount = 0;
  for (const rulesPath of targets) {
    console.log(`\n${messageColors.info('Validating:')} ${rulesPath}`);
    try {
      const report = validateRulesetFile(rulesPath, { forceDebug, silentMode, maxErrors: 20 });

      if (!report.isValid) {
        console.log(`${messageColors.error('❌ Invalid:')} ${report.stats.invalid} invalid rules out of ${report.stats.total} total`);
        failureCount += 1;
        continue;
      }

      console.log(`${messageColors.success('✅ Valid:')} ${report.stats.valid} rules, ${report.stats.comments} comments`);
      if (report.duplicates.length > 0) {
        console.log(`${messageColors.warn('⚠ Duplicates:')} ${report.duplicates.length} duplicate rules found`);
      }
      // Per-format counts, rendered as "name(count)" pairs
      const formatSummary = Object.entries(report.stats.formats).map(([name, count]) => `${name}(${count})`);
      if (formatSummary.length > 0) {
        console.log(`${messageColors.info('Formats:')} ${formatSummary.join(', ')}`);
      }
    } catch (validationErr) {
      console.error(`❌ Failed to validate ${rulesPath}: ${validationErr.message}`);
      failureCount += 1;
    }
  }

  if (failureCount === 0) {
    console.log(`\n${messageColors.success('✅ All rule files are valid!')}`);
    process.exit(0);
  } else {
    console.log(`\n${messageColors.error('❌ Some rule files have validation errors!')}`);
    process.exit(1);
  }
}
// --help / -h: print the usage text and exit with status 0.
// The template literal is printed verbatim, so its lines are deliberately left unindented.
// Fix: the per-site `cdp` entry carried a stray, duplicated "Inject fetch/XHR interceptor in page"
// phrase copied from the evaluateOnNewDocument entry above it.
if (args.includes('--help') || args.includes('-h')) {
  console.log(`Usage: node nwss.js [options]
Options:
--color, --colour Enable colored console output for status messages
-o, --output <file> Output file for rules. If omitted, prints to console
--compare <file> Remove rules that already exist in this file before output
--append Append new rules to output file instead of overwriting (requires -o)
Output Format Options:
--localhost Output as 127.0.0.1 domain.com
--localhost-0.0.0.0 Output as 0.0.0.0 domain.com
--plain Output just domains (no adblock formatting)
--dnsmasq Output as local=/domain.com/ (dnsmasq format)
--dnsmasq-old Output as server=/domain.com/ (dnsmasq old format)
--unbound Output as local-zone: "domain.com." always_null (unbound format)
--privoxy Output as { +block } .domain.com (Privoxy format)
--pihole Output as (^|\\.)domain\\.com$ (Pi-hole regex format)
--adblock-rules Generate adblock filter rules with resource type modifiers (requires -o)
General Options:
--verbose Force verbose mode globally
--debug Force debug mode globally
--silent Suppress normal console logs
--titles Add ! <url> title before each site's group
--dumpurls Dump matched URLs into matched_urls.log
--dry-run Console output only: show matching regex, titles, whois/dig/searchstring results, and adblock rules
--compress-logs Compress log files with gzip (requires --dumpurls)
--sub-domains Output full subdomains instead of collapsing to root
--no-interact Disable page interactions globally
--custom-json <file> Use a custom config JSON file instead of config.json
--headful Launch browser with GUI (not headless)
--cdp Enable Chrome DevTools Protocol logging (now per-page if enabled)
--remove-dupes Remove duplicate domains from output (only with -o)
--eval-on-doc Globally enable evaluateOnNewDocument() for Fetch/XHR interception
--help, -h Show this help menu
--version Show script version
--max-concurrent <number> Maximum concurrent site processing (1-50, overrides config/default)
--cleanup-interval <number> Browser restart interval in URLs processed (1-1000, overrides config/default)
--remove-tempfiles Remove Chrome/Puppeteer temporary files before exit
Validation Options:
--validate-config Validate config.json file and exit
--validate-rules [file] Validate rule file format (uses --output/--compare files if no file specified)
--clean-rules [file] Clean rule files by removing invalid lines and optionally duplicates (uses --output/--compare files if no file specified)
--test-validation Run domain validation tests and exit
Global config.json options:
ignoreDomains: ["domain.com", "*.ads.com"] Domains to completely ignore (supports wildcards)
blocked: ["regex1", "regex2"] Global regex patterns to block requests (combined with per-site blocked)
whois_server_mode: "random" or "cycle" Default server selection mode for all sites (default: random)
ignore_similar: true/false Ignore domains similar to already found domains (default: true)
ignore_similar_threshold: 80 Similarity threshold percentage for ignore_similar (default: 80)
ignore_similar_ignored_domains: true/false Ignore domains similar to ignoreDomains list (default: true)
max_concurrent_sites: 6 Maximum concurrent site processing (1-50, default: 6)
resource_cleanup_interval: 180 Browser restart interval in URLs processed (1-1000, default: 180)
Per-site config.json options:
url: "site" or ["site1", "site2"] Single URL or list of URLs
filterRegex: "regex" or ["regex1", "regex2"] Patterns to match requests
Redirect Handling Options:
follow_redirects: true/false Follow redirects to new domains (default: true)
max_redirects: 10 Maximum number of redirects to follow (default: 10)
js_redirect_timeout: 5000 Milliseconds to wait for JavaScript redirects (default: 5000)
detect_js_patterns: true/false Analyze page source for redirect patterns (default: true)
redirect_timeout_multiplier: 1.5 Increase timeout for redirected URLs (default: 1.5)
comments: "text" or ["text1", "text2"] Documentation/notes - ignored by script
searchstring: "text" or ["text1", "text2"] Text to search in response content (requires filterRegex match)
ignore_similar: true/false Override global ignore_similar setting for this site
ignore_similar_threshold: 80 Override global similarity threshold for this site
ignore_similar_ignored_domains: true/false Override global ignore_similar_ignored_domains for this site
searchstring_and: "text" or ["text1", "text2"] Text to search with AND logic - ALL terms must be present (requires filterRegex match)
curl: true/false Use curl to download content for analysis (default: false)
Note: curl respects filterRegex but ignores resourceTypes filtering
grep: true/false Use grep instead of JavaScript for pattern matching (default: false)
Note: requires curl=true, uses system grep command for faster searches
blocked: ["regex"] Regex patterns to block requests
css_blocked: ["#selector", ".class"] CSS selectors to hide elements
resourceTypes: ["script", "stylesheet"] Only process requests of these resource types (default: all types)
interact: true/false Simulate mouse movements/clicks
isBrave: true/false Spoof Brave browser detection
userAgent: "chrome"|"firefox"|"safari" Custom desktop User-Agent
interact_intensity: "low"|"medium"|"high" Interaction simulation intensity (default: medium)
delay: <milliseconds> Delay after load (default: 4000)
reload: <number> Reload page n times after load (default: 1)
forcereload: true/false Force an additional reload after reloads
clear_sitedata: true/false Clear all cookies, cache, storage before each load (default: false)
subDomains: 1/0 Output full subdomains (default: 0)
localhost: true/false Force localhost output (127.0.0.1)
localhost_0_0_0_0: true/false Force localhost output (0.0.0.0)
dnsmasq: true/false Force dnsmasq output (local=/domain.com/)
dnsmasq_old: true/false Force dnsmasq old output (server=/domain.com/)
unbound: true/false Force unbound output (local-zone: "domain.com." always_null)
privoxy: true/false Force Privoxy output ({ +block } .domain.com)
pihole: true/false Force Pi-hole regex output ((^|\\.)domain\\.com$)
source: true/false Save page source HTML after load
firstParty: true/false Allow first-party matches (default: false)
thirdParty: true/false Allow third-party matches (default: true)
screenshot: true/false Capture screenshot on load failure
headful: true/false Launch browser with GUI for this site
fingerprint_protection: true/false/"random" Enable fingerprint spoofing: true/false/"random"
adblock_rules: true/false Generate adblock filter rules with resource types for this site
even_blocked: true/false Add matching rules even if requests are blocked (default: false)
referrer_headers: "url" or ["url1", "url2"] Set referrer header for realistic traffic sources
custom_headers: {"Header": "value"} Add custom HTTP headers to requests
Cloudflare Protection Options:
cloudflare_phish: true/false Auto-click through Cloudflare phishing warnings (default: false)
cloudflare_bypass: true/false Auto-solve Cloudflare "Verify you are human" challenges (default: false)
FlowProxy Protection Options:
flowproxy_detection: true/false Enable flowProxy protection detection and handling (default: false)
flowproxy_page_timeout: <milliseconds> Page timeout for flowProxy sites (default: 45000)
flowproxy_nav_timeout: <milliseconds> Navigation timeout for flowProxy sites (default: 45000)
flowproxy_js_timeout: <milliseconds> JavaScript challenge timeout (default: 15000)
flowproxy_delay: <milliseconds> Delay for rate limiting (default: 30000)
flowproxy_additional_delay: <milliseconds> Additional processing delay (default: 5000)
Advanced Options:
evaluateOnNewDocument: true/false Inject fetch/XHR interceptor in page (for this site)
cdp: true/false Enable CDP logging for this site
interact_duration: <milliseconds> Duration of interaction simulation (default: 2000)
interact_scrolling: true/false Enable scrolling simulation (default: true)
interact_clicks: true/false Enable element clicking simulation (default: false)
interact_typing: true/false Enable typing simulation (default: false)
whois: ["term1", "term2"] Check whois data for ALL specified terms (AND logic)
whois-or: ["term1", "term2"] Check whois data for ANY specified term (OR logic)
whois_server_mode: "random" or "cycle" Server selection mode: random (default) or cycle through list
whois_server: "whois.domain.com" or ["server1", "server2"] Custom whois server(s) - single server or randomized list (default: system default)
whois_max_retries: 2 Maximum retry attempts per domain (default: 2)
whois_timeout_multiplier: 1.5 Timeout increase multiplier per retry (default: 1.5)
whois_use_fallback: true Add TLD-specific fallback servers (default: true)
whois_retry_on_timeout: true Retry on timeout errors (default: true)
whois_retry_on_error: false Retry on connection/other errors (default: false)
whois_delay: <milliseconds> Delay between whois requests for this site (default: global whois_delay)
dig: ["term1", "term2"] Check dig output for ALL specified terms (AND logic)
dig-or: ["term1", "term2"] Check dig output for ANY specified term (OR logic)
goto_options: {"waitUntil": "domcontentloaded"} Custom page.goto() options (default: {"waitUntil": "load"})
dig_subdomain: true/false Use subdomain for dig lookup instead of root domain (default: false)
digRecordType: "A" DNS record type for dig (default: A)
Referrer Header Options:
referrer_headers: "https://google.com" Single referrer URL
referrer_headers: ["url1", "url2"] Random selection from array
referrer_headers: {"mode": "random_search", "search_terms": ["term1"]} Smart search engine traffic
referrer_headers: {"mode": "social_media"} Random social media referrers
referrer_headers: {"mode": "direct_navigation"} No referrer (direct access)
custom_headers: {"Header": "Value"} Additional HTTP headers
`);
  process.exit(0);
}
// --- Configuration File Loading ---
// The config path comes from --custom-json <file>; without it (or without a value after it)
// the script reads config.json from the current working directory.
const configPathIndex = args.indexOf('--custom-json');
const configPath = (configPathIndex === -1 || !args[configPathIndex + 1])
  ? 'config.json'
  : args[configPathIndex + 1];

// Parsed configuration object; the script exits with status 1 if the file is missing or unreadable.
let config;
try {
  if (!fs.existsSync(configPath)) {
    console.error(`❌ Config file not found: ${configPath}`);
    process.exit(1);
  }
  if (forceDebug && configPath !== 'config.json') {
    console.log(formatLogMessage('debug', `Using custom config file: ${configPath}`));
  }
  config = JSON.parse(fs.readFileSync(configPath, 'utf8'));
} catch (loadErr) {
  // Covers both read failures and JSON syntax errors
  console.error(`❌ Failed to load config file (${configPath}):`, loadErr.message);
  process.exit(1);
}

// Pull the known global settings out of the config, applying defaults for absent keys.
// The 'comments' field is captured separately so it does not end up in otherGlobalConfig.
const {
  sites = [],
  ignoreDomains = [],
  blocked: globalBlocked = [],
  whois_delay = 3000,
  whois_server_mode = 'random',
  ignore_similar = true,
  ignore_similar_threshold = 80,
  ignore_similar_ignored_domains = true,
  max_concurrent_sites = 6,
  resource_cleanup_interval = 180,
  comments: globalComments,
  ...otherGlobalConfig
} = config;
// Resolve the two bounded numeric settings.
// Precedence for each: command line value > config.json value > built-in default.
const [MAX_CONCURRENT_SITES, RESOURCE_CLEANUP_INTERVAL] = (() => {
  /**
   * Pick the effective value of one bounded setting.
   * A value is accepted when it is greater than 0 and no larger than upperBound;
   * rejected values produce a warning and fall through to the next source.
   */
  const resolveSetting = ({ cliValue, cliFlag, configValue, configKey, upperBound, fallback }) => {
    const withinBounds = (candidate) => candidate > 0 && candidate <= upperBound;

    // 1) Command line (null means the flag was not given)
    if (cliValue !== null) {
      if (withinBounds(cliValue)) {
        if (forceDebug) console.log(formatLogMessage('debug', `Using command line ${configKey}: ${cliValue}`));
        return cliValue;
      }
      console.warn(`⚠ Invalid ${cliFlag} value: ${cliValue}. Must be 1-${upperBound}. Using config/default value.`);
    }

    // 2) config.json
    if (typeof configValue === 'number' && withinBounds(configValue)) {
      if (forceDebug) console.log(formatLogMessage('debug', `Using config ${configKey}: ${configValue}`));
      return configValue;
    }
    // Only warn when the config actually supplied something other than the default
    if (configValue !== fallback) {
      console.warn(`⚠ Invalid config ${configKey} value: ${configValue}. Using default: ${fallback}`);
    }

    // 3) Built-in default
    return fallback;
  };

  const concurrency = resolveSetting({
    cliValue: maxConcurrentSites,
    cliFlag: '--max-concurrent',
    configValue: max_concurrent_sites,
    configKey: 'max_concurrent_sites',
    upperBound: 50,
    fallback: 6
  });
  const cleanupEvery = resolveSetting({
    cliValue: cleanupInterval,
    cliFlag: '--cleanup-interval',
    configValue: resource_cleanup_interval,
    configKey: 'resource_cleanup_interval',
    upperBound: 1000,
    fallback: 180
  });
  return [concurrency, cleanupEvery];
})();

// The smart cache can only be created now that the config is available.
// Persistence and autosave stay on unless the config sets them to exactly false;
// a missing or falsy autosave interval / max size falls back to 1 and 5000.
smartCache = createSmartCache({
  ...config,
  forceDebug,
  cache_persistence: config.cache_persistence !== false,
  cache_autosave: config.cache_autosave !== false,
  cache_autosave_minutes: config.cache_autosave_minutes || 1,
  cache_max_size: config.cache_max_size || 5000
});
// --clean-rules [file]: runs after the config is loaded because it needs `sites`
// to decide whether to carry on with a scan afterwards.
if (cleanRules || cleanRulesFile) {
  // An explicit file argument wins; otherwise use whichever of --output / --compare were supplied.
  const targets = cleanRulesFile
    ? [cleanRulesFile]
    : [outputFile, compareFile].filter(Boolean);

  if (targets.length === 0) {
    console.error('❌ --clean-rules requires either a file argument or --output/--compare files to be specified');
    process.exit(1);
  }

  console.log(`\n${messageColors.processing('Cleaning rule files...')}`);

  let hadFailure = false;
  let removedTotal = 0;

  // Truthy when the file the scan will write to is one of the files being cleaned
  const outputIsTarget = outputFile && targets.includes(outputFile);
  if (outputIsTarget && forceDebug) {
    console.log(formatLogMessage('debug', `Output file detected: will clean ${outputFile} first, then continue with scan`));
  }

  for (const rulesPath of targets) {
    console.log(`\n${messageColors.info('Cleaning:')} ${rulesPath}`);
    const isOutputPath = rulesPath === outputFile;

    // A missing output file is fine (the scan creates it); any other missing file is a failure.
    if (!fs.existsSync(rulesPath)) {
      if (isOutputPath) {
        const modeText = appendMode ? 'created (append mode)' : 'created';
        console.log(`${messageColors.info('📄 Note:')} Output file ${rulesPath} doesn't exist yet - will be ${modeText} during scan`);
      } else {
        console.log(`${messageColors.error('❌ Failed:')} File not found: ${rulesPath}`);
        hadFailure = true;
      }
      continue;
    }

    try {
      const outcome = cleanRulesetFile(rulesPath, null, {
        forceDebug,
        silentMode,
        removeDuplicates: removeDupes,
        backupOriginal: true,
        dryRun: dryRunMode
      });

      if (!outcome.success) {
        console.log(`${messageColors.error('❌ Failed:')} ${outcome.error}`);
        hadFailure = true;
      } else if (dryRunMode) {
        // Dry run: only report what would happen
        console.log(outcome.wouldModify
          ? `${messageColors.info('🔍 Dry run:')} Would remove ${outcome.stats.removed} lines (${outcome.stats.invalid} invalid, ${outcome.stats.duplicates} duplicates)`
          : `${messageColors.success('✅ Dry run:')} File is already clean - no changes needed`);
      } else if (!outcome.modified) {
        console.log(`${messageColors.success('✅ Clean:')} File was already valid - no changes needed`);
      } else {
        console.log(`${messageColors.success('✅ Cleaned:')} Removed ${outcome.stats.removed} lines, preserved ${outcome.stats.valid} valid rules`);
        if (outcome.backupCreated) {
          console.log(`${messageColors.info('💾 Backup:')} Original file backed up`);
        }
        removedTotal += outcome.stats.removed;
        if (outputIsTarget && isOutputPath) {
          console.log(`${messageColors.info('📄 Note:')} File cleaned - new rules will be ${appendMode ? 'appended' : 'written'} during scan`);
        }
      }
    } catch (cleanErr) {
      console.error(`❌ Failed to clean ${rulesPath}: ${cleanErr.message}`);
      hadFailure = true;
    }
  }

  // Decide whether to stop here or fall through into the scan
  if (hadFailure) {
    console.log(`\n${messageColors.error('❌ Some rule files failed to clean!')}`);
    process.exit(1);
  } else if (dryRunMode) {
    console.log(`\n${messageColors.info('🔍 Dry run completed successfully!')}`);
    process.exit(0);
  } else {
    console.log(`\n${messageColors.success('✅ All rule files cleaned successfully!')} Total lines removed: ${removedTotal}`);
    // The scan only continues when there are sites to process and the cleaned file is the output file
    const resumeScan = sites && sites.length > 0 && outputFile && outputIsTarget;
    if (resumeScan) {
      const actionText = appendMode ? 'append new rules to' : 'write rules to';
      console.log(`${messageColors.info('📄 Continuing:')} Proceeding with scan to ${actionText} ${outputFile}`);
      // No exit here: execution continues into the scan
    } else {
      process.exit(0);
    }
  }
}
// Cycling position used when picking whois servers in "cycle" mode
let globalWhoisServerIndex = 0;
// Collected dry-run output, kept so it can be written to a file
let dryRunOutput = [];

// --- Log File Setup ---
// All three paths stay null unless --debug and/or --dumpurls asks for them.
let debugLogFile = null;
let matchedUrlsLogFile = null;
let adblockRulesLogFile = null;

if (forceDebug || dumpUrls) {
  const logDir = 'logs';
  // Make sure the logs folder exists before any log path is handed out
  if (!fs.existsSync(logDir)) {
    fs.mkdirSync(logDir, { recursive: true });
    console.log(formatLogMessage('debug', `Created logs folder: ${logDir}`));
  }

  // ISO timestamp made filename-safe: ':' and '.' become '-', 'T' becomes '_',
  // and the trailing five characters (milliseconds plus 'Z') are cut off.
  const stamp = new Date().toISOString().replace(/[:.]/g, '-').replace('T', '_').slice(0, -5);
  const logPath = (prefix, extension) => path.join(logDir, `${prefix}_${stamp}.${extension}`);

  if (forceDebug) {
    debugLogFile = logPath('debug_requests', 'log');
    console.log(formatLogMessage('debug', `Debug requests will be logged to: ${debugLogFile}`));
  }

  if (dumpUrls) {
    matchedUrlsLogFile = logPath('matched_urls', 'log');
    console.log(messageColors.processing('Matched URLs will be logged to:') + ` ${matchedUrlsLogFile}`);
    // The adblock rules file shares the same timestamp as the matched-URLs log
    adblockRulesLogFile = logPath('adblock_rules', 'txt');
    console.log(messageColors.processing('Adblock rules will be saved to:') + ` ${adblockRulesLogFile}`);
  }
}

// In debug mode, echo any global "comments" entries from the config
if (forceDebug && globalComments) {
  const commentList = Array.isArray(globalComments) ? globalComments : [globalComments];
  console.log(formatLogMessage('debug', `Global comments found: ${commentList.length} item(s)`));
  for (const [position, text] of commentList.entries()) {
    console.log(formatLogMessage('debug', ` Comment ${position + 1}: ${text}`));
  }
}
// --- Global CDP Override Logic (historical note; the code it described has been removed) ---
// Previously: if CDP was not already enabled by the --cdp flag, the script checked whether
// any site in config.json had `cdp: true` and, if so, enabled CDP logging for the entire session.
// Analysis suggested CDP should instead be managed per-page for comprehensive logging.
// The code block that implemented this global override has therefore been removed;
// CDP is now handled per-page based on 'enableCDP' and 'siteConfig.cdp'.
/**
 * Resolves the root domain of a URL string with the psl library.
 * Example: 'http://sub.example.com/path' yields 'example.com'.
 *
 * @param {string} url - The URL string to parse.
 * @returns {string} The domain reported by psl; the plain hostname when psl reports
 *   no domain for it; or an empty string when the URL cannot be parsed or anything throws.
 */
function getRootDomain(url) {
  try {
    const host = new URL(url).hostname;
    const { domain } = psl.parse(host);
    // Fall back to the hostname when psl does not supply a domain
    return domain ? domain : host;
  } catch {
    return '';
  }
}
/**
 * Extracts a hostname or root domain from a URL without ever throwing.
 * @param {string} url - The URL string to parse
 * @param {boolean} getFullHostname - true returns the full hostname; false returns the root domain
 * @returns {string} The hostname/domain, or an empty string if the URL is invalid
 */
function safeGetDomain(url, getFullHostname = false) {
  try {
    const { hostname } = new URL(url);
    return getFullHostname ? hostname : getRootDomain(url);
  } catch (urlError) {
    // Malformed URLs are only reported in debug mode
    if (forceDebug) {
      const reason = urlError.message;
      console.log(formatLogMessage('debug', `Malformed URL skipped: ${url} (${reason})`));
    }
    return '';
  }
}
/**
 * Prints the dry run report for one URL to the console and, when outputFile is set,
 * appends the same report (without color formatting) to dryRunOutput for file writing.
 * @param {string} url - The URL being processed
 * @param {Array} matchedItems - Regex matches; each item provides regex, domain, resourceType and fullUrl,
 *   and may carry searchStringMatch ({type, term}) or searchStringChecked
 * @param {Array} netToolsResults - Whois/dig matches; each provides domain, tool, matchType, matchedTerm
 *   and optionally details
 * @param {string} pageTitle - Title of the page (if available)
 */
function outputDryRunResults(url, matchedItems, netToolsResults, pageTitle) {
  const lines = [];
  // Hands the plain-text report, followed by a blank separator line, to the file buffer
  const queueForFile = () => {
    if (outputFile) {
      dryRunOutput.push(...lines);
      dryRunOutput.push('');
    }
  };

  lines.push(`\n=== DRY RUN RESULTS === ${url}`);
  console.log(`\n${messageColors.scanning('=== DRY RUN RESULTS ===')} ${url}`);

  const trimmedTitle = pageTitle && pageTitle.trim();
  if (trimmedTitle) {
    lines.push(`Title: ${trimmedTitle}`);
    console.log(`${messageColors.info('Title:')} ${trimmedTitle}`);
  }

  const regexMatchCount = matchedItems.length;
  const totalMatches = regexMatchCount + netToolsResults.length;

  // Nothing matched: emit the short notice and stop
  if (totalMatches === 0) {
    lines.push(`No matching rules found on ${url}`);
    queueForFile();
    console.log(messageColors.warn(`No matching rules found on ${url}`));
    return;
  }

  lines.push(`Matches found: ${totalMatches}`);
  console.log(`${messageColors.success('Matches found:')} ${totalMatches}`);

  // Regex matches are numbered first
  for (const [index, item] of matchedItems.entries()) {
    const ordinal = index + 1;
    const detailLines = [
      ` Pattern: ${item.regex}`,
      ` Domain: ${item.domain}`,
      ` Resource Type: ${item.resourceType}`,
      ` Full URL: ${item.fullUrl}`,
    ];
    lines.push('', `[${ordinal}] Regex Match:`, ...detailLines);
    console.log(`\n${messageColors.highlight(`[${ordinal}]`)} ${messageColors.match('Regex Match:')}`);
    for (const detail of detailLines) {
      console.log(detail);
    }

    // Searchstring outcome, shown only when a search was performed
    const hit = item.searchStringMatch;
    if (hit) {
      lines.push(` ✓ Searchstring Match: ${hit.type} - "${hit.term}"`);
      console.log(` ${messageColors.success('✓ Searchstring Match:')} ${hit.type} - "${hit.term}"`);
    } else if (item.searchStringChecked) {
      lines.push(` ✗ Searchstring: No matches found in content`);
      console.log(` ${messageColors.warn('✗ Searchstring:')} No matches found in content`);
    }

    // Adblock rule restricted to the matched resource type
    const adblockRule = `||${item.domain}^$${item.resourceType}`;
    lines.push(` Adblock Rule: ${adblockRule}`);
    console.log(` ${messageColors.info('Adblock Rule:')} ${adblockRule}`);
  }

  // NetTools matches continue the numbering after the regex matches
  for (const [index, result] of netToolsResults.entries()) {
    const ordinal = regexMatchCount + index + 1;
    const detailLines = [
      ` Domain: ${result.domain}`,
      ` Tool: ${result.tool.toUpperCase()}`,
    ];
    lines.push(
      '',
      `[${ordinal}] NetTools Match:`,
      ...detailLines,
      ` ✓ Match: ${result.matchType} - "${result.matchedTerm}"`
    );
    if (result.details) {
      lines.push(` Details: ${result.details}`);
    }
    console.log(`\n${messageColors.highlight(`[${ordinal}]`)} ${messageColors.match('NetTools Match:')}`);
    for (const detail of detailLines) {
      console.log(detail);
    }
    console.log(` ${messageColors.success('✓ Match:')} ${result.matchType} - "${result.matchedTerm}"`);
    if (result.details) {
      console.log(` Details: ${result.details}`);
    }

    // Adblock rule for nettools matches carries no resource type
    const adblockRule = `||${result.domain}^`;
    lines.push(` Adblock Rule: ${adblockRule}`);
    console.log(` ${messageColors.info('Adblock Rule:')} ${adblockRule}`);
  }

  queueForFile();
}
/**
 * Checks whether a domain is covered by any ignoreDomains entry; entries may use wildcards.
 *
 * - An entry containing '*' must match the whole domain, with each '*' standing for
 *   any run of characters (including none). Every other character is matched literally.
 * - An entry without '*' matches when the domain ends with it.
 * - Empty or non-string entries never match (an empty entry would otherwise match every domain).
 *
 * @param {string} domain - The domain to test.
 * @param {string[]} ignorePatterns - Ignore entries, optionally containing '*'.
 * @returns {boolean} true when at least one entry matches the domain.
 */
function matchesIgnoreDomain(domain, ignorePatterns) {
  if (!Array.isArray(ignorePatterns)) {
    return false;
  }
  return ignorePatterns.some((pattern) => {
    if (typeof pattern !== 'string' || pattern === '') {
      return false;
    }
    if (!pattern.includes('*')) {
      return domain.endsWith(pattern);
    }
    // Escape every regex metacharacter in the literal pieces (not only dots) so that
    // characters such as '+', '(' or '[' cannot alter or break the generated expression.
    const regexPattern = pattern
      .split('*')
      .map((literal) => literal.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
      .join('.*');
    return new RegExp(`^${regexPattern}$`).test(domain);
  });
}
/**
 * Registers frame lifecycle listeners on a page. The listeners only observe and
 * (in debug mode) log; frames are never navigated manually and load on their own.
 *
 * All three listeners share one definition of a "special" frame URL (empty, about:,
 * data:, blob:, chrome-error://, chrome-extension://), so such URLs are consistently
 * kept out of the attach/navigate/detach debug log lines.
 *
 * @param {object} page - Page-like object exposing on(eventName, handler).
 * @param {boolean} forceDebug - When true, frame events are logged via formatLogMessage('debug', ...).
 * @returns {void}
 */
function setupFrameHandling(page, forceDebug) {
  // URL prefixes of frames that do not point at a regular http(s) document
  const specialUrlPrefixes = ['about:', 'data:', 'blob:', 'chrome-error://', 'chrome-extension://'];

  // True for empty URLs and for any URL starting with one of the special prefixes
  const isSpecialFrameUrl = (frameUrl) =>
    !frameUrl || specialUrlPrefixes.some((prefix) => frameUrl.startsWith(prefix));

  // Emits a debug line only when debug mode is on
  const debugLog = (message) => {
    if (forceDebug) {
      console.log(formatLogMessage('debug', message));
    }
  };

  // Handle frame creation with error suppression. The handler is synchronous and
  // fully wrapped in try/catch so a failing frame accessor cannot escape as an
  // unhandled promise rejection.
  page.on('frameattached', (frame) => {
    try {
      // Only handle child frames, not the main frame
      if (!frame.parentFrame()) {
        return;
      }
      const frameUrl = frame.url();
      debugLog(`New frame attached: ${frameUrl || 'about:blank'}`);

      if (isSpecialFrameUrl(frameUrl)) {
        debugLog(`Skipping frame with invalid/special URL: ${frameUrl}`);
        return;
      }

      // Validate the URL format and keep only http/https frames
      let protocol;
      try {
        ({ protocol } = new URL(frameUrl));
      } catch (urlErr) {
        debugLog(`Skipping frame with malformed URL: ${frameUrl}`);
        return;
      }
      if (protocol !== 'http:' && protocol !== 'https:') {
        debugLog(`Skipping frame with non-http protocol: ${frameUrl}`);
        return;
      }

      // Frames are deliberately not navigated manually (that caused Protocol errors);
      // they are left to load naturally.
      debugLog(`Frame will load naturally: ${frameUrl}`);
    } catch (err) {
      // Keep suppressing the known navigation/protocol noise, log everything else
      const message = err && err.message ? err.message : String(err);
      if (!message.includes('Cannot navigate to invalid URL') &&
          !message.includes('Protocol error')) {
        debugLog(`Frame handling error: ${message}`);
      }
    }
  });

  // Handle frame navigations (monitoring only)
  page.on('framenavigated', (frame) => {
    const frameUrl = frame.url();
    if (forceDebug && !isSpecialFrameUrl(frameUrl)) {
      debugLog(`Frame navigated to: ${frameUrl}`);
    }
  });

  // Handle frame detachment (monitoring only)
  page.on('framedetached', (frame) => {
    if (!forceDebug) {
      return;
    }
    const frameUrl = frame.url();
    if (!isSpecialFrameUrl(frameUrl)) {
      debugLog(`Frame detached: ${frameUrl}`);
    }
  });
}
// --- Main Asynchronous IIFE (Immediately Invoked Function Expression) ---
// This is the main entry point and execution block for the network scanner script.
(async () => {
// Declare userDataDir in outer scope for cleanup access
let userDataDir = null;
/**
 * Launches a browser with the scanner's fixed launch configuration.
 * Prefers a system Chrome/Chromium binary when one exists at a known path and
 * uses a fresh temporary user data directory, which is recorded both in the
 * enclosing scope's userDataDir and on the returned browser for later cleanup.
 * @returns {Promise<import('puppeteer').Browser>} Browser instance
 */
async function createBrowser() {
  // Temporary profile directory that the scanner fully controls and can remove
  const createdAt = Date.now();
  const randomSuffix = Math.random().toString(36).substring(7);
  const tempUserDataDir = `/tmp/puppeteer-${createdAt}-${randomSuffix}`;
  userDataDir = tempUserDataDir; // outer-scope variable, tracked for cleanup

  // Known system Chrome locations, checked in order to avoid Puppeteer downloads
  const systemChromePaths = [
    '/usr/bin/google-chrome-stable',
    '/usr/bin/google-chrome',
    '/usr/bin/chromium-browser',
    '/usr/bin/chromium',
    '/snap/bin/chromium'
  ];
  // First existing candidate wins; null when none is installed
  const executablePath = systemChromePaths.find((candidate) => fs.existsSync(candidate)) || null;
  if (executablePath !== null && forceDebug) {
    console.log(formatLogMessage('debug', `Using system Chrome: ${executablePath}`));
  }

  const launchArgs = [
    // Disk space controls - 50MB cache limits
    '--disk-cache-size=52428800', // 50MB disk cache (50 * 1024 * 1024)
    '--media-cache-size=52428800', // 50MB media cache
    '--disable-application-cache',
    '--disable-offline-load-stale-cache',
    '--disable-background-downloads',
    '--no-first-run',
    '--disable-default-apps',
    '--disable-component-extensions-with-background-pages',
    '--disable-background-networking',
    '--no-sandbox',
    '--disable-setuid-sandbox',
    '--disable-features=SafeBrowsing',
    '--disable-dev-shm-usage',
    '--disable-sync',
    '--disable-gpu',
    '--mute-audio',
    '--disable-translate',
    '--window-size=1920,1080',
    '--disable-extensions',
    '--no-default-browser-check',
    '--safebrowsing-disable-auto-update',
    '--max_old_space_size=1024',
    '--ignore-ssl-errors',
    '--ignore-certificate-errors',
    '--ignore-certificate-errors-spki-list',
    '--ignore-certificate-errors-ca-list',
    '--disable-web-security',
    '--allow-running-insecure-content',
    '--disable-background-timer-throttling',
    '--disable-backgrounding-occluded-windows',
    '--disable-renderer-backgrounding',
    '--disable-features=TranslateUI',
    '--disable-features=VizDisplayCompositor',
    '--run-all-compositor-stages-before-draw',
    '--disable-threaded-animation',
    '--disable-threaded-scrolling',
    '--disable-checker-imaging',
    '--disable-image-animation-resync'
  ];

  const launchOptions = {
    // System Chrome if one was found above
    executablePath,
    // Temporary user data directory for complete cleanup control
    userDataDir: tempUserDataDir,
    args: launchArgs,
    headless: launchHeadless ? 'shell' : false,
    protocolTimeout: 60000 // 60 seconds
  };
  const browser = await puppeteer.launch(launchOptions);

  // Remember the profile directory on the browser object for cleanup
  browser._nwssUserDataDir = tempUserDataDir;
  return browser;
}
const pLimit = (await import('p-limit')).default;
const limit = pLimit(MAX_CONCURRENT_SITES);
const perSiteHeadful = sites.some(site => site.headful === true);
const launchHeadless = !(headfulMode || perSiteHeadful);
// launch with no safe browsing
let browser = await createBrowser();
if (forceDebug) console.log(formatLogMessage('debug', `Launching browser with headless: ${launchHeadless}`));
// Log which headless mode is being used
if (forceDebug && launchHeadless) {
console.log(formatLogMessage('debug', `Using chrome-headless-shell for maximum performance`));
}
// Initial cleanup of any existing Chrome temp files - always comprehensive on startup
if (forceDebug) console.log(formatLogMessage('debug', 'Cleaning up any leftover temp files from previous runs...'));
await cleanupChromeTempFiles({
includeSnapTemp: true, // Always clean snap dirs on startup
forceDebug,
comprehensive: true // Always comprehensive on startup to clean leftovers
});
// Set up cleanup on process termination
process.on('SIGINT', async () => {
if (forceDebug) console.log(formatLogMessage('debug', 'SIGINT received, performing cle