@fanboynz/network-scanner
Version:
A Puppeteer-based network scanner for analyzing web traffic, generating adblock filter rules, and identifying third-party requests. Features include fingerprint spoofing, Cloudflare bypass, content analysis with curl/grep, and multiple output formats.
710 lines (642 loc) • 26.5 kB
JavaScript
const fs = require('fs');
const path = require('path');
const { loadComparisonRules, filterUniqueRules } = require('./compare');
const { messageColors, formatLogMessage } = require('./colorize');
const OUTPUT_FILTER_TAG = messageColors.processing('[output-filter]');
// Cache for compiled wildcard regex patterns in matchesIgnoreDomain (capped to prevent memory leak)
const wildcardRegexCache = new Map();
const WILDCARD_CACHE_MAX = 500;
// Hoisted resource type map — avoid recreating per call
const RESOURCE_TYPE_TO_ADBLOCK = {
'script': 'script', 'xhr': 'xmlhttprequest', 'fetch': 'xmlhttprequest',
'stylesheet': 'stylesheet', 'image': 'image', 'font': 'font',
'document': 'document', 'subdocument': 'subdocument', 'iframe': 'subdocument',
'websocket': 'websocket', 'media': 'media', 'ping': 'ping', 'other': null
};
/**
* Check if domain matches any ignore patterns (supports wildcards)
* @param {string} domain - Domain to check
* @param {string[]} ignorePatterns - Array of ignore patterns
* @returns {boolean} True if domain should be ignored
*/
function matchesIgnoreDomain(domain, ignorePatterns) {
if (!ignorePatterns || !Array.isArray(ignorePatterns) || ignorePatterns.length === 0) {
return false;
}
return ignorePatterns.some(pattern => {
if (pattern.includes('*')) {
// Enhanced wildcard pattern handling
if (pattern.startsWith('*.')) {
// Pattern: *.example.com — match exact or any subdomain
const suffix = pattern.substring(2);
return domain === suffix || domain.endsWith('.' + suffix);
} else if (pattern.endsWith('.*')) {
// Pattern: example.*
const baseDomain = pattern.slice(0, -2); // Remove ".*"
return domain.startsWith(baseDomain + '.');
} else {
// Complex wildcard pattern (cached). Escape every regex meta-char
// EXCEPT '*' first, then expand '*' to '.*'. The old code only
// escaped '.', so a pattern containing '+', '(', '[', '?', etc.
// would either misbehave (e.g. 'foo+bar.com' would treat '+' as a
// quantifier) or throw synchronously (unmatched '(' / '[') and the
// exception would propagate out of .some(). Domain names can't
// legally contain those chars, but a typo in a user's ignore list
// would crash the output stage.
if (!wildcardRegexCache.has(pattern)) {
if (wildcardRegexCache.size >= WILDCARD_CACHE_MAX) {
wildcardRegexCache.delete(wildcardRegexCache.keys().next().value);
}
const regexPattern = pattern
.replace(/[.+?^${}()|[\]\\]/g, '\\$&') // Escape all regex meta-chars except '*'
.replace(/\*/g, '.*'); // Now expand '*' to '.*'
try {
wildcardRegexCache.set(pattern, new RegExp(`^${regexPattern}$`));
} catch (_) {
// Defensive: a still-malformed regex (shouldn't happen after the
// escape above) becomes a never-match instead of a crash.
wildcardRegexCache.set(pattern, /(?!)/);
}
}
return wildcardRegexCache.get(pattern).test(domain);
}
} else {
// Exact pattern matching
return domain === pattern || domain.endsWith('.' + pattern);
}
});
}
/**
* Extract domain from a formatted rule back to plain domain
* @param {string} rule - Formatted rule (e.g., "||domain.com^", "127.0.0.1 domain.com", etc.)
* @returns {string|null} Plain domain or null if cannot extract
*/
function extractDomainFromRule(rule) {
if (!rule || rule.startsWith('!')) {
return null; // Skip comments
}
// Handle different output formats
if (rule.startsWith('||') && rule.includes('^')) {
// Adblock format: ||domain.com^ or ||domain.com^$script
return rule.substring(2).split('^')[0];
} else if (rule.match(/^(127\.0\.0\.1|0\.0\.0\.0)\s+/)) {
// Localhost format: 127.0.0.1 domain.com or 0.0.0.0 domain.com
return rule.split(/\s+/)[1];
} else if (rule.startsWith('local=/') && rule.endsWith('/')) {
// DNSmasq format: local=/domain.com/
return rule.substring(6, rule.length - 1);
} else if (rule.startsWith('server=/') && rule.endsWith('/')) {
// DNSmasq old format: server=/domain.com/
return rule.substring(7, rule.length - 1);
} else if (rule.startsWith('local-zone: "') && rule.includes('" always_null')) {
// Unbound format: local-zone: "domain.com." always_null
const domain = rule.substring(13).split('"')[0];
return domain.endsWith('.') ? domain.slice(0, -1) : domain;
} else if (rule.startsWith('{ +block } .')) {
// Privoxy format: { +block } .domain.com
return rule.substring(12);
} else if (rule.match(/^\(\^\|\\\.\)/)) {
// Pi-hole regex format: (^|\.)domain\.com$
return rule.replace(/^\(\^\|\\?\.\)/, '').replace(/\\\./g, '.').replace(/\$$/, '');
}
// If no format matches, assume it's already a plain domain
return rule.includes('.') ? rule : null;
}
/**
* Formats a domain according to the specified output mode
* @param {string} domain - The domain to format
* @param {object} options - Formatting options
* @param {string|null} options.localhostIP - Use custom IP format (e.g., '127.0.0.1', '0.0.0.0')
* @param {boolean} options.plain - Use plain domain format (no adblock syntax)
* @param {boolean} options.adblockRules - Generate adblock filter rules with resource types
* @param {boolean} options.dnsmasq - Use dnsmasq local format
* @param {boolean} options.dnsmasqOld - Use dnsmasq old server format
* @param {boolean} options.unbound - Use unbound local-zone format
* @param {boolean} options.privoxy - Use Privoxy block format
* @param {boolean} options.pihole - Use Pi-hole regex format
* @param {string} options.resourceType - Resource type for adblock rules (script, xhr, iframe, css, image, etc.)
* @returns {string} The formatted domain
*/
function formatDomain(domain, options = {}) {
const { localhostIP = null, plain = false, adblockRules = false, dnsmasq = false, dnsmasqOld = false, unbound = false, privoxy = false, pihole = false, resourceType = null } = options;
// Validate domain length and format
if (!domain || domain.length <= 6 || !domain.includes('.')) {
return null;
}
// If plain is true, always return just the domain regardless of other options
if (plain) {
return domain;
}
// Apply specific format based on output mode
if (pihole) {
// Escape dots for regex and use Pi-hole format: (^|\.)domain\.com$
const escapedDomain = domain.replace(/\./g, '\\.');
return `(^|\\.)${escapedDomain}$`;
} else if (privoxy) {
return `{ +block } .${domain}`;
} else if (dnsmasq) {
return `local=/${domain}/`;
} else if (dnsmasqOld) {
return `server=/${domain}/`;
} else if (unbound) {
return `local-zone: "${domain}." always_null`;
} else if (localhostIP) {
return `${localhostIP} ${domain}`;
} else if (adblockRules && resourceType) {
// Generate adblock filter rules with resource type modifiers
return `||${domain}^${resourceType}`;
} else {
return `||${domain}^`;
}
}
/**
* Maps Puppeteer resource types to adblock filter modifiers
* @param {string} resourceType - Puppeteer resource type
* @returns {string|null} Adblock filter modifier, or null if should be ignored
*/
function mapResourceTypeToAdblockModifier(resourceType) {
return RESOURCE_TYPE_TO_ADBLOCK[resourceType] || null;
}
/**
* Formats an array of domains according to site and global settings
* @param {Set<string>|Map<string, Set<string>>} matchedDomains - Set of matched domains or Map of domain -> resource types
* @param {object} siteConfig - Site-specific configuration
* @param {object} globalOptions - Global formatting options
* @returns {string[]} Array of formatted rules
*/
function formatRules(matchedDomains, siteConfig = {}, globalOptions = {}) {
const {
localhostIP = null,
plainOutput = false,
adblockRulesMode = false,
dnsmasqMode = false,
dnsmasqOldMode = false,
unboundMode = false,
privoxyMode = false,
piholeMode = false
} = globalOptions;
// Site-level overrides
const siteLocalhostIP = siteConfig.localhost || null;
const sitePlainSetting = siteConfig.plain === true;
const siteAdblockRules = siteConfig.adblock_rules === true;
const siteDnsmasq = siteConfig.dnsmasq === true;
const siteDnsmasqOld = siteConfig.dnsmasq_old === true;
const siteUnbound = siteConfig.unbound === true;
const sitePrivoxy = siteConfig.privoxy === true;
const sitePihole = siteConfig.pihole === true;
// Validate output format compatibility - silently ignore incompatible combinations
const activeFormats = [
dnsmasqMode || siteDnsmasq,
dnsmasqOldMode || siteDnsmasqOld,
unboundMode || siteUnbound,
privoxyMode || sitePrivoxy,
piholeMode || sitePihole,
adblockRulesMode || siteAdblockRules,
(localhostIP || siteLocalhostIP) ? true : false,
plainOutput || sitePlainSetting
].filter(Boolean).length;
if (activeFormats > 1) {
// Multiple formats specified - fall back to standard adblock format
const formatOptions = {
localhostIP: null,
plain: false,
adblockRules: false,
dnsmasq: false,
dnsmasqOld: false,
unbound: false,
privoxy: false,
pihole: false
};
const formattedRules = [];
const domainsToProcess = matchedDomains instanceof Set ? matchedDomains : new Set(matchedDomains.keys());
domainsToProcess.forEach(domain => {
const formatted = formatDomain(domain, formatOptions);
if (formatted) {
formattedRules.push(formatted);
}
});
return formattedRules;
}
// Determine final formatting options
const formatOptions = {
localhostIP: siteLocalhostIP || localhostIP,
plain: plainOutput || sitePlainSetting,
adblockRules: adblockRulesMode || siteAdblockRules,
dnsmasq: dnsmasqMode || siteDnsmasq,
dnsmasqOld: dnsmasqOldMode || siteDnsmasqOld,
unbound: unboundMode || siteUnbound,
privoxy: privoxyMode || sitePrivoxy,
pihole: piholeMode || sitePihole
};
const formattedRules = [];
if (matchedDomains instanceof Map && formatOptions.adblockRules) {
// Handle Map format with resource types for --adblock-rules
matchedDomains.forEach((resourceTypes, domain) => {
if (resourceTypes.size > 0) {
let hasValidResourceType = false;
// Generate one rule per resource type found for this domain
resourceTypes.forEach(resourceType => {
const adblockModifier = mapResourceTypeToAdblockModifier(resourceType);
// Skip if modifier is null (e.g., 'other' type)
if (adblockModifier) {
hasValidResourceType = true;
const formatted = formatDomain(domain, {
...formatOptions,
resourceType: adblockModifier
});
if (formatted) {
formattedRules.push(formatted);
}
}
});
// If no valid resource types were found, add a generic rule
if (!hasValidResourceType) {
const formatted = formatDomain(domain, formatOptions);
if (formatted) {
formattedRules.push(formatted);
}
}
} else {
// Fallback to generic rule if no resource types
const formatted = formatDomain(domain, formatOptions);
if (formatted) {
formattedRules.push(formatted);
}
}
});
} else {
// Handle Set format (legacy behavior) or other modes (including privoxy and pihole)
const domainsToProcess = matchedDomains instanceof Set ? matchedDomains : new Set(matchedDomains.keys());
domainsToProcess.forEach(domain => {
const formatted = formatDomain(domain, formatOptions);
if (formatted) {
formattedRules.push(formatted);
}
});
}
return formattedRules;
}
/**
* Removes duplicate rules while preserving comments (lines starting with !)
* @param {string[]} lines - Array of output lines
* @returns {string[]} Array with duplicates removed
*/
function removeDuplicates(lines) {
const uniqueLines = [];
const seenRules = new Set();
for (const line of lines) {
if (line.startsWith('!') || !seenRules.has(line)) {
uniqueLines.push(line);
if (!line.startsWith('!')) {
seenRules.add(line);
}
}
}
return uniqueLines;
}
/**
* Builds the final output lines from processing results
* @param {Array} results - Array of processing results from processUrl
* @param {object} options - Output options
* @param {boolean} options.showTitles - Include URL titles in output
* @param {boolean} options.removeDupes - Remove duplicate rules
* @param {string[]} options.ignoreDomains - Domains to filter out from final output
* @param {boolean} options.forLogFile - Include titles regardless of showTitles (for log files)
* @returns {object} Object containing outputLines and outputLinesWithTitles
*/
function buildOutputLines(results, options = {}) {
const { showTitles = false, removeDupes = false, ignoreDomains = [], forLogFile = false } = options;
// Consolidate rules from all results, handling multiple results for same URL
const consolidatedRules = new Map(); // URL -> { rules: Set, originalUrl, regexes: Set }
let successfulPageLoads = 0;
results.forEach(result => {
if (result) {
if (result.success) {
successfulPageLoads++;
}
if (result.rules && result.rules.length > 0) {
if (!consolidatedRules.has(result.url)) {
consolidatedRules.set(result.url, { rules: new Set(), originalUrl: result.originalUrl || result.url, regexes: new Set() });
}
const entry = consolidatedRules.get(result.url);
result.rules.forEach(rule => entry.rules.add(rule));
if (Array.isArray(result.matchedRegexes)) {
result.matchedRegexes.forEach(rx => entry.regexes.add(rx));
}
// Prefer the original URL from any result entry that has one different from final
if (result.originalUrl && result.originalUrl !== result.url) {
entry.originalUrl = result.originalUrl;
}
}
}
});
// Convert consolidated rules back to array format
const finalSiteRules = [];
consolidatedRules.forEach((entry, url) => {
if (entry.rules.size > 0) {
finalSiteRules.push({
url: url,
originalUrl: entry.originalUrl,
regexes: Array.from(entry.regexes),
rules: Array.from(entry.rules)
});
}
});
// Build output lines
const outputLines = [];
const outputLinesWithTitles = [];
let filteredOutCount = 0;
for (const { url, originalUrl, regexes, rules } of finalSiteRules) {
if (rules.length > 0) {
// Build title comments — include redirect source if URL changed and matched regex(es)
const titleLines = [`! ${url}`];
if (originalUrl && originalUrl !== url) {
titleLines.push(`! Redirected from: ${originalUrl}`);
}
if (regexes && regexes.length > 0) {
titleLines.push(`! Regex: ${regexes.join(', ')}`);
}
// Regular output (for -o files and console) - only add titles if --titles flag used
if (showTitles) {
outputLines.push(...titleLines);
}
// Filter out ignored domains from rules
const filteredRules = rules.filter(rule => {
const domain = extractDomainFromRule(rule);
if (domain && matchesIgnoreDomain(domain, ignoreDomains)) {
filteredOutCount++;
if (options.forceDebug) {
console.log(formatLogMessage('debug', `${OUTPUT_FILTER_TAG} Removed rule matching ignoreDomains: ${rule} (domain: ${domain})`));
} else if (!options.silentMode) {
console.log(formatLogMessage('info', `Filtered out: ${domain}`));
}
return false;
}
return true;
});
outputLines.push(...filteredRules);
// Output with titles (for auto-saved log files) - always add titles
outputLinesWithTitles.push(...titleLines);
outputLinesWithTitles.push(...filteredRules);
}
}
// Log filtered domains if any were removed
if (filteredOutCount > 0) {
if (options.forceDebug) {
console.log(formatLogMessage('debug', `${OUTPUT_FILTER_TAG} Total: ${filteredOutCount} rules filtered out matching ignoreDomains patterns`));
} else if (!options.silentMode) {
console.log(formatLogMessage('info', `${filteredOutCount} domains filtered out by ignoreDomains`));
}
}
// Remove duplicates if requested
const finalOutputLines = removeDupes ? removeDuplicates(outputLines) : outputLines;
return {
outputLines: finalOutputLines,
outputLinesWithTitles,
successfulPageLoads,
totalRules: finalOutputLines.filter(line => !line.startsWith('!')).length,
filteredOutCount
};
}
/**
* Writes output to file or console
* @param {string[]} lines - Lines to output
* @param {string|null} outputFile - File path to write to, or null for console output
* @param {boolean} silentMode - Suppress console messages
* @returns {boolean} Success status
*/
function writeOutput(lines, outputFile = null, silentMode = false) {
try {
if (outputFile) {
// Ensure output directory exists
const outputDir = path.dirname(outputFile);
if (outputDir !== '.') {
fs.mkdirSync(outputDir, { recursive: true });
}
fs.writeFileSync(outputFile, lines.join('\n') + '\n');
if (!silentMode) {
console.log(`\n${messageColors.success('Rules saved to')} ${outputFile}`);
}
} else {
// Console output
if (lines.length > 0 && !silentMode) {
console.log(`\n${messageColors.highlight('--- Generated Rules ---')}`);
}
console.log(lines.join('\n'));
}
return true;
} catch (error) {
console.error(`Failed to write output: ${error.message}`);
return false;
}
}
/**
* Main output handler that combines all output operations
* @param {Array} results - Processing results from scanner
* @param {object} config - Output configuration
* @returns {object} Output statistics and file paths
*/
function handleOutput(results, config = {}) {
const {
outputFile = null,
compareFile = null,
appendMode = false,
showTitles = false,
removeDupes = false,
silentMode = false,
dumpUrls = false,
adblockRulesLogFile = null,
forceDebug = false,
ignoreDomains = []
} = config;
// Handle append mode
if (outputFile && appendMode) {
try {
// Build output lines first. buildOutputLines already applies
// removeDuplicates internally when removeDupes is true, so we don't
// need a second pass here.
const {
outputLines,
outputLinesWithTitles,
successfulPageLoads,
totalRules,
filteredOutCount
} = buildOutputLines(results, { showTitles, removeDupes, ignoreDomains, forceDebug });
const deduplicatedOutputLines = outputLines;
// Read existing file content via a single open() instead of stat+open
// (and avoid TOCTOU between an existsSync check and the read).
let existingContent = '';
try {
existingContent = fs.readFileSync(outputFile, 'utf8');
} catch (readErr) {
if (readErr.code !== 'ENOENT') throw readErr;
// File doesn't exist - append mode should create it
if (forceDebug) console.log(formatLogMessage('debug', `Append mode: Creating new file ${outputFile}`));
}
// Parse existing rules for comparison (exclude comments). Hoist the
// single .trim() into a local so we don't walk the file content twice.
const existingRules = new Set();
const trimmedExisting = existingContent.trim();
if (trimmedExisting) {
const lines = trimmedExisting.split('\n');
lines.forEach(line => {
const cleanLine = line.trim();
if (cleanLine && !cleanLine.startsWith('!') && !cleanLine.startsWith('#')) {
existingRules.add(cleanLine);
}
});
}
// Filter out rules that already exist (exclude comments from filtering)
const newRules = deduplicatedOutputLines.filter(rule => {
return rule.startsWith('!') || !existingRules.has(rule);
});
// Count non-comment rules once and reuse below (was three throwaway
// filter-array allocations: success log, else-branch log, return obj).
let newRuleCount = 0;
for (let i = 0; i < newRules.length; i++) {
if (!newRules[i].startsWith('!')) newRuleCount++;
}
if (newRules.length > 0) {
// Prepare content to append
let appendContent = '';
// Ensure there's a newline before appending if file has content
if (existingContent && !existingContent.endsWith('\n')) {
appendContent = '\n';
}
// Add new rules
appendContent += newRules.join('\n') + '\n';
// Append to file
fs.appendFileSync(outputFile, appendContent);
if (!silentMode) {
console.log(`${messageColors.success('Appended')} ${newRuleCount} new rules to: ${outputFile} (${existingRules.size} rules already existed${removeDupes ? ', duplicates removed' : ''})`);
}
} else if (!silentMode) {
// No new rules — report the dedup'd input count instead. Same loop
// pattern as above to avoid filter().length allocating an array.
let ruleCount = 0;
for (let i = 0; i < deduplicatedOutputLines.length; i++) {
if (!deduplicatedOutputLines[i].startsWith('!')) ruleCount++;
}
console.log(`${messageColors.info('No new rules')} to append - all ${ruleCount} rules already exist in: ${outputFile}`);
}
// Write log file output if --dumpurls is enabled
let logSuccess = true;
if (dumpUrls && adblockRulesLogFile) {
logSuccess = writeOutput(outputLinesWithTitles, adblockRulesLogFile, silentMode);
}
return {
success: logSuccess,
outputFile,
adblockRulesLogFile,
successfulPageLoads,
totalRules: newRuleCount,
filteredOutCount,
totalLines: newRules.length,
outputLines: null,
appendedRules: newRuleCount,
existingRules: existingRules.size
};
} catch (appendErr) {
console.error(`Failed to append to ${outputFile}: ${appendErr.message}`);
return { success: false };
}
}
// Build output lines
const {
outputLines,
outputLinesWithTitles,
successfulPageLoads,
totalRules,
filteredOutCount
} = buildOutputLines(results, { showTitles, removeDupes, ignoreDomains, forceDebug });
// Apply comparison filtering if compareFile is specified
let filteredOutputLines = outputLines;
if (compareFile && outputLines.length > 0) {
try {
const comparisonRules = loadComparisonRules(compareFile, forceDebug);
// Count non-comment lines once each side instead of building filter
// arrays just to read .length (was three allocations per log line).
let originalCount = 0;
for (let i = 0; i < outputLines.length; i++) {
if (!outputLines[i].startsWith('!')) originalCount++;
}
filteredOutputLines = filterUniqueRules(outputLines, comparisonRules, forceDebug);
if (!silentMode) {
let uniqueCount = 0;
for (let i = 0; i < filteredOutputLines.length; i++) {
if (!filteredOutputLines[i].startsWith('!')) uniqueCount++;
}
console.log(formatLogMessage('compare', `Filtered ${originalCount - uniqueCount} existing rules, ${uniqueCount} unique rules remaining`));
}
} catch (compareError) {
console.error(messageColors.error('Compare operation failed:') + ` ${compareError.message}`);
return { success: false, totalRules: 0, successfulPageLoads: 0 };
}
}
// Write main output
const mainSuccess = writeOutput(filteredOutputLines, outputFile, silentMode);
// Write log file output if --dumpurls is enabled
let logSuccess = true;
if (dumpUrls && adblockRulesLogFile) {
logSuccess = writeOutput(outputLinesWithTitles, adblockRulesLogFile, silentMode);
}
// Count non-comment lines once (used by totalRules below). Doing this with a
// single loop avoids the .filter().length pattern that allocates a throwaway
// array. Callers that want totalDomainsSkipped should call
// getTotalDomainsSkipped() from ./domain-cache directly.
let finalRuleCount = 0;
for (let i = 0; i < filteredOutputLines.length; i++) {
if (!filteredOutputLines[i].startsWith('!')) finalRuleCount++;
}
return {
success: mainSuccess && logSuccess,
outputFile,
adblockRulesLogFile,
successfulPageLoads,
totalRules: finalRuleCount,
filteredOutCount,
totalLines: filteredOutputLines.length,
outputLines: outputFile ? null : filteredOutputLines // Only return lines if not written to file
};
}
/**
* Get output format description for debugging/logging
* @param {object} options - Format options
* @returns {string} Human-readable format description
*/
function getFormatDescription(options = {}) {
const { localhostIP = null, plain = false, adblockRules = false, dnsmasq = false, dnsmasqOld = false, unbound = false, privoxy = false, pihole = false } = options;
// Plain always takes precedence
if (plain) {
return 'Plain domains only';
}
if (pihole) {
return 'Pi-hole regex format ((^|\\.)domain\\.com$)';
} else if (privoxy) {
return 'Privoxy format ({ +block } .domain.com)';
} else if (dnsmasq) {
return 'DNSmasq format (local=/domain.com/)';
} else if (dnsmasqOld) {
return 'DNSmasq old format (server=/domain.com/)';
} else if (unbound) {
return 'Unbound format (local-zone: "domain.com." always_null)';
} else if (adblockRules) {
return 'Adblock filter rules with resource type modifiers (||domain.com^$script)';
} else if (localhostIP) {
return `Localhost format (${localhostIP} domain.com)`;
} else {
return 'Adblock format (||domain.com^)';
}
}
module.exports = {
formatDomain,
formatRules,
removeDuplicates,
buildOutputLines,
writeOutput,
handleOutput,
getFormatDescription,
mapResourceTypeToAdblockModifier,
matchesIgnoreDomain,
extractDomainFromRule
};