@fanboynz/network-scanner
Version:
A Puppeteer-based network scanner for analyzing web traffic, generating adblock filter rules, and identifying third-party requests. Features include fingerprint spoofing, Cloudflare bypass, content analysis with curl/grep, and multiple output formats.
295 lines (268 loc) • 12.5 kB
JavaScript
// === Chrome DevTools Protocol (CDP) Module ===
// Handles CDP session management and network request logging for enhanced browser monitoring
//
// INTEGRATION GUIDE FOR OTHER APPLICATIONS:
// This module provides a clean interface for Chrome DevTools Protocol integration with Puppeteer.
// It can be easily integrated into any Node.js application that uses Puppeteer for browser automation.
//
// BASIC USAGE:
// const { createCDPSession } = require('./lib/cdp');
// const cdpManager = await createCDPSession(page, url, options);
// // ... do your work ...
// await cdpManager.cleanup(); // Always cleanup when done
//
// DEPENDENCIES:
// - Puppeteer (any recent version)
// - ./colorize module (for logging) - can be replaced with console.log if needed
//
// PERFORMANCE CONSIDERATIONS:
// - CDP adds ~10-20% overhead to page processing
// - Use selectively on complex sites that need deep network visibility
// - Avoid on high-volume batch processing unless debugging
//
// COMPATIBILITY:
// - Works with Chrome/Chromium browsers
// - Compatible with headless and headful modes
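// - Tested with Puppeteer 13+. createCDPSession() calls page.createCDPSession() directly,
//   so Puppeteer versions that do not provide that method are not supported as-is.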
const { formatLogMessage, messageColors } = require('./colorize');
// Colored '[cdp]' subsystem prefix, computed once at module load and reused
// by every log line in this file. It is interpolated into the message text
// because formatLogMessage only colors the [severity] tag, so without this
// the '[cdp]' marker would be printed uncolored.
const CDP_TAG = messageColors.processing('[cdp]');
/**
 * Await an operation, but give up with an error if it takes longer than `ms`.
 *
 * The watchdog timer is always cancelled once the race is decided, whichever
 * side wins, so no pending setTimeout handle (and the closure it captures)
 * outlives the call.
 *
 * Note that losing the race does not cancel `promise`; the underlying
 * operation keeps running and its eventual result is ignored.
 *
 * @param {Promise} promise - The operation to race
 * @param {number} ms - Timeout in milliseconds
 * @param {string} message - Message of the Error raised when the timeout wins
 * @returns {Promise} Settles with the operation's own result or rejection, or
 *   rejects with `new Error(message)` when the timeout fires first
 */
async function raceWithTimeout(promise, ms, message) {
  let watchdogTimer;
  const watchdog = new Promise((_resolve, reject) => {
    watchdogTimer = setTimeout(() => {
      reject(new Error(message));
    }, ms);
  });

  try {
    return await Promise.race([promise, watchdog]);
  } finally {
    // Runs on success, on operation failure and on timeout alike.
    clearTimeout(watchdogTimer);
  }
}
// Single shared do-nothing cleanup function. It is the default `cleanup` of
// createSessionResult(), so every "CDP not used" / "CDP failed" result reuses
// this one function object instead of creating a new closure per call.
// Calling it returns a promise that resolves to undefined.
const NOOP_CLEANUP = () => Promise.resolve();
/**
 * Return the hostname part of a URL string without ever throwing.
 *
 * Intended for log and error text, where a placeholder is acceptable when the
 * input cannot be parsed as a URL.
 *
 * @param {string} url - URL to parse
 * @param {string} [fallback='unknown'] - Value returned when `url` is not parseable
 * @returns {string} The parsed hostname, or `fallback` on parse failure
 */
function safeHostname(url, fallback = 'unknown') {
  let parsedUrl;
  try {
    parsedUrl = new URL(url);
  } catch {
    return fallback;
  }
  return parsedUrl.hostname;
}
// Substrings of an error message that indicate the browser / CDP connection
// itself is broken (as opposed to a failure limited to one operation).
const CRITICAL_CDP_ERROR_MARKERS = [
  'Network.enable timed out',
  'Protocol error',
  'ProtocolError',
  'Session closed',
  'Target closed',
  'Browser has been closed'
];

/**
 * Decide whether an error message describes a broken browser that needs a
 * restart rather than a recoverable, per-operation failure.
 *
 * This is the single source of truth for that classification; both
 * setRequestInterceptionWithTimeout and createCDPSession call it so their
 * notion of "critical" cannot drift apart.
 *
 * @param {string} message - Error message to classify
 * @returns {boolean} true when the message contains any critical marker;
 *   false for an empty or missing message
 */
function isCriticalCDPError(message) {
  if (!message) {
    return false;
  }
  return CRITICAL_CDP_ERROR_MARKERS.some((marker) => message.includes(marker));
}
/**
 * Build the result record returned by every createCDPSession() code path.
 *
 * The record always carries the same three keys, assigned in the same order,
 * so callers can treat the "CDP active", "CDP skipped" and "CDP failed"
 * outcomes uniformly.
 *
 * @param {object|null} [session=null] - CDP session, or null when none is attached
 * @param {Function} [cleanup=NOOP_CLEANUP] - Function the caller invokes when finished
 * @param {boolean} [isEnhanced=false] - Whether enhanced features are active
 * @returns {{session: (object|null), cleanup: Function, isEnhanced: boolean}}
 *   Standardized session object
 */
function createSessionResult(session = null, cleanup = NOOP_CLEANUP, isEnhanced = false) {
  const sessionResult = { session, cleanup, isEnhanced };
  return sessionResult;
}
/**
 * Creates a new page with timeout protection to prevent CDP hangs.
 *
 * `browser.newPage()` is raced against a watchdog. The watchdog cannot cancel
 * the underlying request, so when this function fails the page may still be
 * created afterwards; in that case it is closed in the background so a
 * timed-out call does not leave an orphaned tab that nobody holds a
 * reference to.
 *
 * @param {import('puppeteer').Browser} browser - Browser instance
 * @param {number} timeout - Timeout in milliseconds (default: 30000)
 * @returns {Promise<import('puppeteer').Page>} Page instance
 * @throws {Error} 'Page creation timeout - browser may be unresponsive' when
 *   the watchdog fires first; otherwise whatever `browser.newPage()` rejects with
 */
async function createPageWithTimeout(browser, timeout = 30000) {
  const pendingPage = browser.newPage();
  try {
    return await raceWithTimeout(pendingPage, timeout, 'Page creation timeout - browser may be unresponsive');
  } catch (pageErr) {
    // Best-effort release of a page that shows up after we gave up. If
    // newPage() itself rejected there is nothing to close, and a failing
    // close() on a dead browser is expected; neither may mask pageErr.
    void Promise.resolve(pendingPage)
      .then((latePage) => latePage.close())
      .catch(() => {});
    throw pageErr;
  }
}
/**
 * Sets request interception with timeout protection.
 *
 * Makes one attempt bounded by `timeout`. If that attempt fails with a
 * non-critical error (which includes the watchdog timeout itself), makes a
 * single retry bounded by `timeout * 2`. A critical error, as classified by
 * isCriticalCDPError, is never retried.
 *
 * Errors wrapped with a CRITICAL_* prefix carry the original error as
 * `cause` so its stack is not lost.
 *
 * @param {import('puppeteer').Page} page - Page instance
 * @param {number} timeout - Timeout in milliseconds for the first attempt;
 *   the retry gets twice this budget (default: 15000)
 * @returns {Promise<void>}
 * @throws {Error} 'CRITICAL_BROWSER_ERROR: …' when the first attempt fails
 *   critically, 'CRITICAL_NETWORK_ERROR: …' when the retry fails critically,
 *   or the retry's own error unchanged when it fails non-critically
 */
async function setRequestInterceptionWithTimeout(page, timeout = 15000) {
  try {
    await raceWithTimeout(page.setRequestInterception(true), timeout, 'Request interception timeout - first attempt');
    return;
  } catch (firstError) {
    // Optional chaining: a null/undefined rejection must not raise a
    // TypeError here and hide the real failure.
    const firstMessage = firstError?.message;
    // Don't retry if the browser/session is already gone — escalate immediately.
    if (isCriticalCDPError(firstMessage)) {
      throw new Error(`CRITICAL_BROWSER_ERROR: ${firstMessage}`, { cause: firstError });
    }
  }

  // First attempt failed non-critically: retry once with an extended timeout.
  try {
    await raceWithTimeout(page.setRequestInterception(true), timeout * 2, 'Request interception timeout - retry failed');
  } catch (retryError) {
    const retryMessage = retryError?.message;
    if (isCriticalCDPError(retryMessage)) {
      throw new Error(`CRITICAL_NETWORK_ERROR: ${retryMessage}`, { cause: retryError });
    }
    throw retryError;
  }
}
/**
 * Creates and manages a CDP session used for debug logging of network requests.
 *
 * A session is only set up when CDP is requested (`enableCDP` truthy or
 * `siteSpecificCDP === true`) AND `forceDebug` is truthy: the session's only
 * consumer is the debug-level Network.requestWillBeSent logger installed
 * below, so without debug output attaching CDP would cost protocol traffic
 * and produce nothing. In every other case the default (empty) result of
 * createSessionResult() is returned immediately.
 *
 * INTEGRATION EXAMPLE:
 * const cdpManager = await createCDPSession(page, 'https://example.com', {
 *   enableCDP: true,        // Global CDP flag
 *   siteSpecificCDP: true,  // Site-specific CDP flag
 *   forceDebug: true        // When true, install the Network.requestWillBeSent log listener
 * });
 *
 * // Your page automation code here...
 * await page.goto('https://example.com');
 *
 * // Always cleanup when done
 * await cdpManager.cleanup();
 *
 * WHAT IT LOGS (one debug line per Network.requestWillBeSent event):
 * - Request method and URL
 * - Initiator type ('unknown' when the event carries none)
 * - The page hostname, plus the request hostname when the two differ
 *
 * ERROR HANDLING:
 * - Session creation is bounded by a 20s watchdog, Network.enable by 15s
 * - A session that was created before the failure is detached; one that
 *   arrives after a creation timeout is detached in the background
 * - Critical errors (see isCriticalCDPError) are rethrown so the caller can
 *   restart the browser; the original error is attached as `cause`
 * - Non-critical errors are logged as a warning and the empty result is returned
 * - The returned cleanup function never throws
 *
 * @param {import('puppeteer').Page} page - The Puppeteer page instance
 * @param {string} currentUrl - The URL being processed (used for logging context)
 * @param {object} options - Configuration options
 * @param {boolean} options.enableCDP - Global CDP flag (from --cdp command line)
 * @param {boolean} options.siteSpecificCDP - Site-specific CDP flag (from config)
 * @param {boolean} options.forceDebug - Debug logging flag; CDP is skipped entirely when falsy
 * @returns {Promise<object>} `{ session, cleanup, isEnhanced }`; `session` is
 *   null when CDP was skipped or failed non-critically, and `isEnhanced` is
 *   false on every path
 * @throws {Error} 'Browser protocol broken (…): …' on a critical CDP error
 */
async function createCDPSession(page, currentUrl, options = {}) {
  const { enableCDP, siteSpecificCDP, forceDebug } = options;

  // Skip CDP altogether unless it was requested AND debug logging is on.
  const cdpLoggingNeeded = (enableCDP || siteSpecificCDP === true) && forceDebug;
  if (!cdpLoggingNeeded) {
    return createSessionResult();
  }

  // Parsed once; reused by the mode log line and by the per-request listener.
  const currentHostname = safeHostname(currentUrl);

  // Log which CDP mode is being used
  if (enableCDP) {
    console.log(formatLogMessage('debug', `${CDP_TAG} Global CDP enabled by --cdp flag for ${currentHostname}`));
  } else if (siteSpecificCDP === true) {
    console.log(formatLogMessage('debug', `${CDP_TAG} Site-specific CDP enabled for ${currentHostname} (via cdp: true or cdp_specific domain match)`));
  }

  let cdpSession = null;
  // Kept separately from cdpSession so a session that is delivered only
  // after the creation watchdog fired can still be released.
  let pendingSession = null;

  try {
    // Create the CDP session via page.createCDPSession(), bounded by a watchdog.
    pendingSession = page.createCDPSession();
    cdpSession = await raceWithTimeout(pendingSession, 20000, 'CDP session creation timeout');

    // Enable network domain — required for network event monitoring. The
    // message below is one of the markers isCriticalCDPError looks for, so a
    // hang here is reported to the caller as a critical error.
    await raceWithTimeout(
      cdpSession.send('Network.enable'),
      15000,
      'Network.enable timed out'
    );

    // Log every request the session reports. forceDebug is known to be
    // truthy here, so no additional check is needed inside the handler.
    cdpSession.on('Network.requestWillBeSent', (params) => {
      const { url: requestUrl, method } = params.request;
      const initiator = params.initiator?.type ?? 'unknown';
      let hostnameForLog = currentHostname;
      try {
        const requestHostname = new URL(requestUrl).hostname;
        if (currentHostname !== requestHostname) {
          // NOTE(review): the '?' separator may be a mis-encoded arrow
          // character; left untouched because it is runtime log output.
          hostnameForLog = `${currentHostname}?${requestHostname}`;
        }
      } catch (_) {
        // Unparseable request URL: keep the page hostname as the log label.
      }
      console.log(formatLogMessage('debug', `${CDP_TAG}[${hostnameForLog}] ${method} ${requestUrl} (initiator: ${initiator})`));
    });

    console.log(formatLogMessage('debug', `${CDP_TAG} CDP session created successfully for ${currentUrl}`));

    return createSessionResult(
      cdpSession,
      async () => {
        // Safe cleanup that never throws errors
        if (cdpSession) {
          try {
            await cdpSession.detach();
            console.log(formatLogMessage('debug', `${CDP_TAG} CDP session detached for ${currentUrl}`));
          } catch (cdpCleanupErr) {
            // Log cleanup errors but don't throw - cleanup should never fail the calling code
            console.log(formatLogMessage('debug', `${CDP_TAG} Failed to detach CDP session for ${currentUrl}: ${cdpCleanupErr.message}`));
          }
        }
      },
      false
    );
  } catch (cdpErr) {
    if (cdpSession) {
      // The session exists but a later step failed: detach it so a
      // half-configured session is not leaked. A failed detach is logged
      // (debug output is on) rather than silently swallowed.
      try {
        await cdpSession.detach();
      } catch (partialDetachErr) {
        console.log(formatLogMessage('debug', `${CDP_TAG} Partial-session detach failed for ${currentUrl}: ${partialDetachErr.message}`));
      }
      cdpSession = null;
    } else if (pendingSession) {
      // Creation timed out or rejected. The watchdog does not cancel the
      // request, so release a session that is delivered late. Deliberately
      // not awaited: the caller must not wait on an unresponsive browser.
      void Promise.resolve(pendingSession)
        .then(
          (lateSession) => lateSession.detach(),
          () => {} // creation itself failed — nothing to release
        )
        .catch((lateDetachErr) => {
          console.log(formatLogMessage('debug', `${CDP_TAG} Late-session detach failed for ${currentUrl}: ${lateDetachErr?.message}`));
        });
    }

    // Coerce defensively: a non-Error rejection or a non-string currentUrl
    // must not raise a TypeError here and mask the real failure.
    const errMessage = String(cdpErr?.message ?? cdpErr);
    const rawUrl = String(currentUrl);
    const truncatedUrl = rawUrl.length > 50 ? `${rawUrl.substring(0, 50)}...` : rawUrl;
    // Hostname for context; falls back to the (possibly truncated) raw URL.
    const urlContext = safeHostname(currentUrl, truncatedUrl);

    // Critical errors: browser is broken, propagate so the caller can restart.
    if (isCriticalCDPError(errMessage)) {
      throw new Error(`Browser protocol broken (${urlContext}): ${errMessage}`, { cause: cdpErr });
    }

    // NON-CRITICAL ERROR: CDP failed but the browser is still usable.
    // Warn, then hand back the empty result (null session, no-op cleanup) so
    // the caller's code path is the same as when CDP is disabled.
    console.warn(formatLogMessage('warn', `${CDP_TAG} Failed to attach CDP session for ${urlContext}: ${errMessage}`));
    return createSessionResult();
  }
}
// EXPORT INTERFACE FOR OTHER APPLICATIONS:
// This module provides a clean, reusable interface for CDP integration.
// Simply require this module and use the exported functions.
//
// CUSTOMIZATION TIPS:
// 1. Replace './colorize' import with your own logging system
// 2. Modify the request logging format in the Network.requestWillBeSent handler
// 3. Add additional CDP domain subscriptions in createCDPSession
// 4. Customize error categorization in isCriticalCDPError, which is shared by
//    setRequestInterceptionWithTimeout and createCDPSession
//
// TROUBLESHOOTING:
// - If you get "Protocol error" frequently, the browser may be overloaded
// - Timeout errors usually indicate the browser needs to be restarted
// - "Target closed" means the page was closed while CDP was active
// ("Protocol error" and "Target closed" are both treated as critical by
//  isCriticalCDPError, so they are escalated to the caller instead of retried.)
//
// BROWSER COMPATIBILITY:
// - Chrome/Chromium 60+ (older versions may have limited CDP support)
// - Works in both headless and headed modes
// - Some features may not work in --no-sandbox mode
//
// Public API: the three entry points below. The remaining helpers in this
// file (raceWithTimeout, safeHostname, isCriticalCDPError,
// createSessionResult) are intentionally not exported.
module.exports = {
createCDPSession,
createPageWithTimeout,
setRequestInterceptionWithTimeout
};