UNPKG

@fanboynz/network-scanner

Version:

A Puppeteer-based network scanner for analyzing web traffic, generating adblock filter rules, and identifying third-party requests. Features include fingerprint spoofing, Cloudflare bypass, content analysis with curl/grep, and multiple output formats.

710 lines (642 loc) 26.5 kB
const fs = require('fs'); const path = require('path'); const { loadComparisonRules, filterUniqueRules } = require('./compare'); const { messageColors, formatLogMessage } = require('./colorize'); const OUTPUT_FILTER_TAG = messageColors.processing('[output-filter]'); // Cache for compiled wildcard regex patterns in matchesIgnoreDomain (capped to prevent memory leak) const wildcardRegexCache = new Map(); const WILDCARD_CACHE_MAX = 500; // Hoisted resource type map — avoid recreating per call const RESOURCE_TYPE_TO_ADBLOCK = { 'script': 'script', 'xhr': 'xmlhttprequest', 'fetch': 'xmlhttprequest', 'stylesheet': 'stylesheet', 'image': 'image', 'font': 'font', 'document': 'document', 'subdocument': 'subdocument', 'iframe': 'subdocument', 'websocket': 'websocket', 'media': 'media', 'ping': 'ping', 'other': null }; /** * Check if domain matches any ignore patterns (supports wildcards) * @param {string} domain - Domain to check * @param {string[]} ignorePatterns - Array of ignore patterns * @returns {boolean} True if domain should be ignored */ function matchesIgnoreDomain(domain, ignorePatterns) { if (!ignorePatterns || !Array.isArray(ignorePatterns) || ignorePatterns.length === 0) { return false; } return ignorePatterns.some(pattern => { if (pattern.includes('*')) { // Enhanced wildcard pattern handling if (pattern.startsWith('*.')) { // Pattern: *.example.com — match exact or any subdomain const suffix = pattern.substring(2); return domain === suffix || domain.endsWith('.' + suffix); } else if (pattern.endsWith('.*')) { // Pattern: example.* const baseDomain = pattern.slice(0, -2); // Remove ".*" return domain.startsWith(baseDomain + '.'); } else { // Complex wildcard pattern (cached). Escape every regex meta-char // EXCEPT '*' first, then expand '*' to '.*'. The old code only // escaped '.', so a pattern containing '+', '(', '[', '?', etc. // would either misbehave (e.g. 'foo+bar.com' would treat '+' as a // quantifier) or throw synchronously (unmatched '(' / '[') and the // exception would propagate out of .some(). Domain names can't // legally contain those chars, but a typo in a user's ignore list // would crash the output stage. if (!wildcardRegexCache.has(pattern)) { if (wildcardRegexCache.size >= WILDCARD_CACHE_MAX) { wildcardRegexCache.delete(wildcardRegexCache.keys().next().value); } const regexPattern = pattern .replace(/[.+?^${}()|[\]\\]/g, '\\$&') // Escape all regex meta-chars except '*' .replace(/\*/g, '.*'); // Now expand '*' to '.*' try { wildcardRegexCache.set(pattern, new RegExp(`^${regexPattern}$`)); } catch (_) { // Defensive: a still-malformed regex (shouldn't happen after the // escape above) becomes a never-match instead of a crash. wildcardRegexCache.set(pattern, /(?!)/); } } return wildcardRegexCache.get(pattern).test(domain); } } else { // Exact pattern matching return domain === pattern || domain.endsWith('.' + pattern); } }); } /** * Extract domain from a formatted rule back to plain domain * @param {string} rule - Formatted rule (e.g., "||domain.com^", "127.0.0.1 domain.com", etc.) * @returns {string|null} Plain domain or null if cannot extract */ function extractDomainFromRule(rule) { if (!rule || rule.startsWith('!')) { return null; // Skip comments } // Handle different output formats if (rule.startsWith('||') && rule.includes('^')) { // Adblock format: ||domain.com^ or ||domain.com^$script return rule.substring(2).split('^')[0]; } else if (rule.match(/^(127\.0\.0\.1|0\.0\.0\.0)\s+/)) { // Localhost format: 127.0.0.1 domain.com or 0.0.0.0 domain.com return rule.split(/\s+/)[1]; } else if (rule.startsWith('local=/') && rule.endsWith('/')) { // DNSmasq format: local=/domain.com/ return rule.substring(6, rule.length - 1); } else if (rule.startsWith('server=/') && rule.endsWith('/')) { // DNSmasq old format: server=/domain.com/ return rule.substring(7, rule.length - 1); } else if (rule.startsWith('local-zone: "') && rule.includes('" always_null')) { // Unbound format: local-zone: "domain.com." always_null const domain = rule.substring(13).split('"')[0]; return domain.endsWith('.') ? domain.slice(0, -1) : domain; } else if (rule.startsWith('{ +block } .')) { // Privoxy format: { +block } .domain.com return rule.substring(12); } else if (rule.match(/^\(\^\|\\\.\)/)) { // Pi-hole regex format: (^|\.)domain\.com$ return rule.replace(/^\(\^\|\\?\.\)/, '').replace(/\\\./g, '.').replace(/\$$/, ''); } // If no format matches, assume it's already a plain domain return rule.includes('.') ? rule : null; } /** * Formats a domain according to the specified output mode * @param {string} domain - The domain to format * @param {object} options - Formatting options * @param {string|null} options.localhostIP - Use custom IP format (e.g., '127.0.0.1', '0.0.0.0') * @param {boolean} options.plain - Use plain domain format (no adblock syntax) * @param {boolean} options.adblockRules - Generate adblock filter rules with resource types * @param {boolean} options.dnsmasq - Use dnsmasq local format * @param {boolean} options.dnsmasqOld - Use dnsmasq old server format * @param {boolean} options.unbound - Use unbound local-zone format * @param {boolean} options.privoxy - Use Privoxy block format * @param {boolean} options.pihole - Use Pi-hole regex format * @param {string} options.resourceType - Resource type for adblock rules (script, xhr, iframe, css, image, etc.) * @returns {string} The formatted domain */ function formatDomain(domain, options = {}) { const { localhostIP = null, plain = false, adblockRules = false, dnsmasq = false, dnsmasqOld = false, unbound = false, privoxy = false, pihole = false, resourceType = null } = options; // Validate domain length and format if (!domain || domain.length <= 6 || !domain.includes('.')) { return null; } // If plain is true, always return just the domain regardless of other options if (plain) { return domain; } // Apply specific format based on output mode if (pihole) { // Escape dots for regex and use Pi-hole format: (^|\.)domain\.com$ const escapedDomain = domain.replace(/\./g, '\\.'); return `(^|\\.)${escapedDomain}$`; } else if (privoxy) { return `{ +block } .${domain}`; } else if (dnsmasq) { return `local=/${domain}/`; } else if (dnsmasqOld) { return `server=/${domain}/`; } else if (unbound) { return `local-zone: "${domain}." always_null`; } else if (localhostIP) { return `${localhostIP} ${domain}`; } else if (adblockRules && resourceType) { // Generate adblock filter rules with resource type modifiers return `||${domain}^${resourceType}`; } else { return `||${domain}^`; } } /** * Maps Puppeteer resource types to adblock filter modifiers * @param {string} resourceType - Puppeteer resource type * @returns {string|null} Adblock filter modifier, or null if should be ignored */ function mapResourceTypeToAdblockModifier(resourceType) { return RESOURCE_TYPE_TO_ADBLOCK[resourceType] || null; } /** * Formats an array of domains according to site and global settings * @param {Set<string>|Map<string, Set<string>>} matchedDomains - Set of matched domains or Map of domain -> resource types * @param {object} siteConfig - Site-specific configuration * @param {object} globalOptions - Global formatting options * @returns {string[]} Array of formatted rules */ function formatRules(matchedDomains, siteConfig = {}, globalOptions = {}) { const { localhostIP = null, plainOutput = false, adblockRulesMode = false, dnsmasqMode = false, dnsmasqOldMode = false, unboundMode = false, privoxyMode = false, piholeMode = false } = globalOptions; // Site-level overrides const siteLocalhostIP = siteConfig.localhost || null; const sitePlainSetting = siteConfig.plain === true; const siteAdblockRules = siteConfig.adblock_rules === true; const siteDnsmasq = siteConfig.dnsmasq === true; const siteDnsmasqOld = siteConfig.dnsmasq_old === true; const siteUnbound = siteConfig.unbound === true; const sitePrivoxy = siteConfig.privoxy === true; const sitePihole = siteConfig.pihole === true; // Validate output format compatibility - silently ignore incompatible combinations const activeFormats = [ dnsmasqMode || siteDnsmasq, dnsmasqOldMode || siteDnsmasqOld, unboundMode || siteUnbound, privoxyMode || sitePrivoxy, piholeMode || sitePihole, adblockRulesMode || siteAdblockRules, (localhostIP || siteLocalhostIP) ? true : false, plainOutput || sitePlainSetting ].filter(Boolean).length; if (activeFormats > 1) { // Multiple formats specified - fall back to standard adblock format const formatOptions = { localhostIP: null, plain: false, adblockRules: false, dnsmasq: false, dnsmasqOld: false, unbound: false, privoxy: false, pihole: false }; const formattedRules = []; const domainsToProcess = matchedDomains instanceof Set ? matchedDomains : new Set(matchedDomains.keys()); domainsToProcess.forEach(domain => { const formatted = formatDomain(domain, formatOptions); if (formatted) { formattedRules.push(formatted); } }); return formattedRules; } // Determine final formatting options const formatOptions = { localhostIP: siteLocalhostIP || localhostIP, plain: plainOutput || sitePlainSetting, adblockRules: adblockRulesMode || siteAdblockRules, dnsmasq: dnsmasqMode || siteDnsmasq, dnsmasqOld: dnsmasqOldMode || siteDnsmasqOld, unbound: unboundMode || siteUnbound, privoxy: privoxyMode || sitePrivoxy, pihole: piholeMode || sitePihole }; const formattedRules = []; if (matchedDomains instanceof Map && formatOptions.adblockRules) { // Handle Map format with resource types for --adblock-rules matchedDomains.forEach((resourceTypes, domain) => { if (resourceTypes.size > 0) { let hasValidResourceType = false; // Generate one rule per resource type found for this domain resourceTypes.forEach(resourceType => { const adblockModifier = mapResourceTypeToAdblockModifier(resourceType); // Skip if modifier is null (e.g., 'other' type) if (adblockModifier) { hasValidResourceType = true; const formatted = formatDomain(domain, { ...formatOptions, resourceType: adblockModifier }); if (formatted) { formattedRules.push(formatted); } } }); // If no valid resource types were found, add a generic rule if (!hasValidResourceType) { const formatted = formatDomain(domain, formatOptions); if (formatted) { formattedRules.push(formatted); } } } else { // Fallback to generic rule if no resource types const formatted = formatDomain(domain, formatOptions); if (formatted) { formattedRules.push(formatted); } } }); } else { // Handle Set format (legacy behavior) or other modes (including privoxy and pihole) const domainsToProcess = matchedDomains instanceof Set ? matchedDomains : new Set(matchedDomains.keys()); domainsToProcess.forEach(domain => { const formatted = formatDomain(domain, formatOptions); if (formatted) { formattedRules.push(formatted); } }); } return formattedRules; } /** * Removes duplicate rules while preserving comments (lines starting with !) * @param {string[]} lines - Array of output lines * @returns {string[]} Array with duplicates removed */ function removeDuplicates(lines) { const uniqueLines = []; const seenRules = new Set(); for (const line of lines) { if (line.startsWith('!') || !seenRules.has(line)) { uniqueLines.push(line); if (!line.startsWith('!')) { seenRules.add(line); } } } return uniqueLines; } /** * Builds the final output lines from processing results * @param {Array} results - Array of processing results from processUrl * @param {object} options - Output options * @param {boolean} options.showTitles - Include URL titles in output * @param {boolean} options.removeDupes - Remove duplicate rules * @param {string[]} options.ignoreDomains - Domains to filter out from final output * @param {boolean} options.forLogFile - Include titles regardless of showTitles (for log files) * @returns {object} Object containing outputLines and outputLinesWithTitles */ function buildOutputLines(results, options = {}) { const { showTitles = false, removeDupes = false, ignoreDomains = [], forLogFile = false } = options; // Consolidate rules from all results, handling multiple results for same URL const consolidatedRules = new Map(); // URL -> { rules: Set, originalUrl, regexes: Set } let successfulPageLoads = 0; results.forEach(result => { if (result) { if (result.success) { successfulPageLoads++; } if (result.rules && result.rules.length > 0) { if (!consolidatedRules.has(result.url)) { consolidatedRules.set(result.url, { rules: new Set(), originalUrl: result.originalUrl || result.url, regexes: new Set() }); } const entry = consolidatedRules.get(result.url); result.rules.forEach(rule => entry.rules.add(rule)); if (Array.isArray(result.matchedRegexes)) { result.matchedRegexes.forEach(rx => entry.regexes.add(rx)); } // Prefer the original URL from any result entry that has one different from final if (result.originalUrl && result.originalUrl !== result.url) { entry.originalUrl = result.originalUrl; } } } }); // Convert consolidated rules back to array format const finalSiteRules = []; consolidatedRules.forEach((entry, url) => { if (entry.rules.size > 0) { finalSiteRules.push({ url: url, originalUrl: entry.originalUrl, regexes: Array.from(entry.regexes), rules: Array.from(entry.rules) }); } }); // Build output lines const outputLines = []; const outputLinesWithTitles = []; let filteredOutCount = 0; for (const { url, originalUrl, regexes, rules } of finalSiteRules) { if (rules.length > 0) { // Build title comments — include redirect source if URL changed and matched regex(es) const titleLines = [`! ${url}`]; if (originalUrl && originalUrl !== url) { titleLines.push(`! Redirected from: ${originalUrl}`); } if (regexes && regexes.length > 0) { titleLines.push(`! Regex: ${regexes.join(', ')}`); } // Regular output (for -o files and console) - only add titles if --titles flag used if (showTitles) { outputLines.push(...titleLines); } // Filter out ignored domains from rules const filteredRules = rules.filter(rule => { const domain = extractDomainFromRule(rule); if (domain && matchesIgnoreDomain(domain, ignoreDomains)) { filteredOutCount++; if (options.forceDebug) { console.log(formatLogMessage('debug', `${OUTPUT_FILTER_TAG} Removed rule matching ignoreDomains: ${rule} (domain: ${domain})`)); } else if (!options.silentMode) { console.log(formatLogMessage('info', `Filtered out: ${domain}`)); } return false; } return true; }); outputLines.push(...filteredRules); // Output with titles (for auto-saved log files) - always add titles outputLinesWithTitles.push(...titleLines); outputLinesWithTitles.push(...filteredRules); } } // Log filtered domains if any were removed if (filteredOutCount > 0) { if (options.forceDebug) { console.log(formatLogMessage('debug', `${OUTPUT_FILTER_TAG} Total: ${filteredOutCount} rules filtered out matching ignoreDomains patterns`)); } else if (!options.silentMode) { console.log(formatLogMessage('info', `${filteredOutCount} domains filtered out by ignoreDomains`)); } } // Remove duplicates if requested const finalOutputLines = removeDupes ? removeDuplicates(outputLines) : outputLines; return { outputLines: finalOutputLines, outputLinesWithTitles, successfulPageLoads, totalRules: finalOutputLines.filter(line => !line.startsWith('!')).length, filteredOutCount }; } /** * Writes output to file or console * @param {string[]} lines - Lines to output * @param {string|null} outputFile - File path to write to, or null for console output * @param {boolean} silentMode - Suppress console messages * @returns {boolean} Success status */ function writeOutput(lines, outputFile = null, silentMode = false) { try { if (outputFile) { // Ensure output directory exists const outputDir = path.dirname(outputFile); if (outputDir !== '.') { fs.mkdirSync(outputDir, { recursive: true }); } fs.writeFileSync(outputFile, lines.join('\n') + '\n'); if (!silentMode) { console.log(`\n${messageColors.success('Rules saved to')} ${outputFile}`); } } else { // Console output if (lines.length > 0 && !silentMode) { console.log(`\n${messageColors.highlight('--- Generated Rules ---')}`); } console.log(lines.join('\n')); } return true; } catch (error) { console.error(`Failed to write output: ${error.message}`); return false; } } /** * Main output handler that combines all output operations * @param {Array} results - Processing results from scanner * @param {object} config - Output configuration * @returns {object} Output statistics and file paths */ function handleOutput(results, config = {}) { const { outputFile = null, compareFile = null, appendMode = false, showTitles = false, removeDupes = false, silentMode = false, dumpUrls = false, adblockRulesLogFile = null, forceDebug = false, ignoreDomains = [] } = config; // Handle append mode if (outputFile && appendMode) { try { // Build output lines first. buildOutputLines already applies // removeDuplicates internally when removeDupes is true, so we don't // need a second pass here. const { outputLines, outputLinesWithTitles, successfulPageLoads, totalRules, filteredOutCount } = buildOutputLines(results, { showTitles, removeDupes, ignoreDomains, forceDebug }); const deduplicatedOutputLines = outputLines; // Read existing file content via a single open() instead of stat+open // (and avoid TOCTOU between an existsSync check and the read). let existingContent = ''; try { existingContent = fs.readFileSync(outputFile, 'utf8'); } catch (readErr) { if (readErr.code !== 'ENOENT') throw readErr; // File doesn't exist - append mode should create it if (forceDebug) console.log(formatLogMessage('debug', `Append mode: Creating new file ${outputFile}`)); } // Parse existing rules for comparison (exclude comments). Hoist the // single .trim() into a local so we don't walk the file content twice. const existingRules = new Set(); const trimmedExisting = existingContent.trim(); if (trimmedExisting) { const lines = trimmedExisting.split('\n'); lines.forEach(line => { const cleanLine = line.trim(); if (cleanLine && !cleanLine.startsWith('!') && !cleanLine.startsWith('#')) { existingRules.add(cleanLine); } }); } // Filter out rules that already exist (exclude comments from filtering) const newRules = deduplicatedOutputLines.filter(rule => { return rule.startsWith('!') || !existingRules.has(rule); }); // Count non-comment rules once and reuse below (was three throwaway // filter-array allocations: success log, else-branch log, return obj). let newRuleCount = 0; for (let i = 0; i < newRules.length; i++) { if (!newRules[i].startsWith('!')) newRuleCount++; } if (newRules.length > 0) { // Prepare content to append let appendContent = ''; // Ensure there's a newline before appending if file has content if (existingContent && !existingContent.endsWith('\n')) { appendContent = '\n'; } // Add new rules appendContent += newRules.join('\n') + '\n'; // Append to file fs.appendFileSync(outputFile, appendContent); if (!silentMode) { console.log(`${messageColors.success('Appended')} ${newRuleCount} new rules to: ${outputFile} (${existingRules.size} rules already existed${removeDupes ? ', duplicates removed' : ''})`); } } else if (!silentMode) { // No new rules — report the dedup'd input count instead. Same loop // pattern as above to avoid filter().length allocating an array. let ruleCount = 0; for (let i = 0; i < deduplicatedOutputLines.length; i++) { if (!deduplicatedOutputLines[i].startsWith('!')) ruleCount++; } console.log(`${messageColors.info('No new rules')} to append - all ${ruleCount} rules already exist in: ${outputFile}`); } // Write log file output if --dumpurls is enabled let logSuccess = true; if (dumpUrls && adblockRulesLogFile) { logSuccess = writeOutput(outputLinesWithTitles, adblockRulesLogFile, silentMode); } return { success: logSuccess, outputFile, adblockRulesLogFile, successfulPageLoads, totalRules: newRuleCount, filteredOutCount, totalLines: newRules.length, outputLines: null, appendedRules: newRuleCount, existingRules: existingRules.size }; } catch (appendErr) { console.error(`Failed to append to ${outputFile}: ${appendErr.message}`); return { success: false }; } } // Build output lines const { outputLines, outputLinesWithTitles, successfulPageLoads, totalRules, filteredOutCount } = buildOutputLines(results, { showTitles, removeDupes, ignoreDomains, forceDebug }); // Apply comparison filtering if compareFile is specified let filteredOutputLines = outputLines; if (compareFile && outputLines.length > 0) { try { const comparisonRules = loadComparisonRules(compareFile, forceDebug); // Count non-comment lines once each side instead of building filter // arrays just to read .length (was three allocations per log line). let originalCount = 0; for (let i = 0; i < outputLines.length; i++) { if (!outputLines[i].startsWith('!')) originalCount++; } filteredOutputLines = filterUniqueRules(outputLines, comparisonRules, forceDebug); if (!silentMode) { let uniqueCount = 0; for (let i = 0; i < filteredOutputLines.length; i++) { if (!filteredOutputLines[i].startsWith('!')) uniqueCount++; } console.log(formatLogMessage('compare', `Filtered ${originalCount - uniqueCount} existing rules, ${uniqueCount} unique rules remaining`)); } } catch (compareError) { console.error(messageColors.error('Compare operation failed:') + ` ${compareError.message}`); return { success: false, totalRules: 0, successfulPageLoads: 0 }; } } // Write main output const mainSuccess = writeOutput(filteredOutputLines, outputFile, silentMode); // Write log file output if --dumpurls is enabled let logSuccess = true; if (dumpUrls && adblockRulesLogFile) { logSuccess = writeOutput(outputLinesWithTitles, adblockRulesLogFile, silentMode); } // Count non-comment lines once (used by totalRules below). Doing this with a // single loop avoids the .filter().length pattern that allocates a throwaway // array. Callers that want totalDomainsSkipped should call // getTotalDomainsSkipped() from ./domain-cache directly. let finalRuleCount = 0; for (let i = 0; i < filteredOutputLines.length; i++) { if (!filteredOutputLines[i].startsWith('!')) finalRuleCount++; } return { success: mainSuccess && logSuccess, outputFile, adblockRulesLogFile, successfulPageLoads, totalRules: finalRuleCount, filteredOutCount, totalLines: filteredOutputLines.length, outputLines: outputFile ? null : filteredOutputLines // Only return lines if not written to file }; } /** * Get output format description for debugging/logging * @param {object} options - Format options * @returns {string} Human-readable format description */ function getFormatDescription(options = {}) { const { localhostIP = null, plain = false, adblockRules = false, dnsmasq = false, dnsmasqOld = false, unbound = false, privoxy = false, pihole = false } = options; // Plain always takes precedence if (plain) { return 'Plain domains only'; } if (pihole) { return 'Pi-hole regex format ((^|\\.)domain\\.com$)'; } else if (privoxy) { return 'Privoxy format ({ +block } .domain.com)'; } else if (dnsmasq) { return 'DNSmasq format (local=/domain.com/)'; } else if (dnsmasqOld) { return 'DNSmasq old format (server=/domain.com/)'; } else if (unbound) { return 'Unbound format (local-zone: "domain.com." always_null)'; } else if (adblockRules) { return 'Adblock filter rules with resource type modifiers (||domain.com^$script)'; } else if (localhostIP) { return `Localhost format (${localhostIP} domain.com)`; } else { return 'Adblock format (||domain.com^)'; } } module.exports = { formatDomain, formatRules, removeDuplicates, buildOutputLines, writeOutput, handleOutput, getFormatDescription, mapResourceTypeToAdblockModifier, matchesIgnoreDomain, extractDomainFromRule };