UNPKG

@fanboynz/network-scanner

Version:

A Puppeteer-based network scanner for analyzing web traffic, generating adblock filter rules, and identifying third-party requests. Features include fingerprint spoofing, Cloudflare bypass, content analysis with curl/grep, and multiple output formats.

1,201 lines (1,065 loc) 98.7 kB
/**
 * Cloudflare bypass and challenge handling module - Optimized with smart detection and adaptive timeouts
 * Version: 2.7.0 - Major fixes and performance overhaul
 *   - Fix: Challenge solvers had empty if-blocks (JS/Turnstile/Legacy never executed in non-debug mode)
 *   - Fix: a[href*="continue"] false positive removed (matched nearly every website)
 *   - Perf: Domain-level detection cache (was per-URL, now per-hostname)
 *   - Perf: Timeout outcome caching (domain times out once -> all subsequent URLs skip instantly)
 *   - Perf: Short-circuit quick detection (title/URL -> fast selectors -> slow text, early return at each stage)
 *   - Perf: Eliminated body.textContent in quick detection (was extracting entire DOM text tree)
 *   - Perf: Capped body.textContent to 2KB in analyzeCloudflareChallenge
 *   - Perf: No-indicator pages skip immediately regardless of config (was 10-15s wasted)
 *   - Perf: Quick detection timeout 4s->2s, retries 2->1
 *   - Perf: PAGE_EVALUATION timeout 12s->5s, detached frame delay 3s->1s
 *   - Perf: Inner timeouts tightened to fit within outer adaptive timeouts
 *   - Perf: CHALLENGE_SOLVING 30s->12s, TURNSTILE_COMPLETION 20s->10s, JS_CHALLENGE_BUFFER 26s->12s
 *   - Perf: MAX_RETRIES 3->2, baseDelay 1000->800ms, maxDelay 8000->5000ms
 *   - Perf: Parallel detection gated behind cloudflare config (was running on every URL)
 * Version: 2.6.3 - Fixes Cannot read properties of undefined (reading 'hasIndicators')
 * Version: 2.6.2 - Further detached Frame fixes
 * Version: 2.6.1 - timeoutId is not defined & race condition fix
 * Version: 2.6.0 - Memory leak fixes and timeout cleanup
 * Version: 2.5.0 - Fix Frame Lifecycle issue, Timing and Race condition
 * Version: 2.4.1 - Bump timeout values
 * Version: 2.4.0 - Fix possible endless loops with retry logic and loop detection
 * Version: 2.3.1 - Colorize CF
 * Version: 2.3.0 - Support CF iframe challenges, and better error handling
 * Version: 2.2.0 - Enhanced with retry logic, caching, and improved error handling
 * Version: 2.1.0 - Enhanced with quick detection, adaptive timeouts, and comprehensive debug logging
 * Handles phishing warnings, Turnstile challenges, and modern Cloudflare protections
 */

// Import color utilities
const { formatLogMessage, messageColors } = require('./colorize');

// Pre-rendered log prefixes so every log line carries a consistent coloured tag.
const URL_VALIDATION_TAG = messageColors.processing('[url-validation]');
const CLOUDFLARE_TAG = messageColors.cloudflare('[cloudflare]');

/**
 * Module version information
 */
const CLOUDFLARE_MODULE_VERSION = '2.7.0';

/**
 * Timeout constants for various operations (in milliseconds)
 * Optimized timeout constants for Puppeteer 22.x performance (in milliseconds)
 * All values tuned for maximum scanning speed while maintaining functionality
 */
const TIMEOUTS = {
  PAGE_EVALUATION: 5000,                        // Standard page evaluation timeout (DOM queries are instant)
  PAGE_EVALUATION_SAFE: 5000,                   // Safe page evaluation with extra buffer
  PHISHING_CLICK: 3000,                         // Timeout for clicking phishing continue button
  PHISHING_NAVIGATION: 8000,                    // Wait for navigation after phishing bypass
  JS_CHALLENGE_BUFFER: 12000,                   // JS challenge -- must fit within 15s adaptive outer timeout
  TURNSTILE_COMPLETION: 10000,                  // Turnstile completion check -- fits within adaptive timeout
  TURNSTILE_COMPLETION_BUFFER: 12000,           // Turnstile completion with buffer
  CLICK_TIMEOUT: 5000,                          // Standard click operation timeout
  CLICK_TIMEOUT_BUFFER: 1000,                   // Click timeout safety buffer
  NAVIGATION_TIMEOUT: 15000,                    // Standard navigation timeout
  NAVIGATION_TIMEOUT_BUFFER: 2000,              // Navigation timeout safety buffer
  // Adaptive timeouts are only consulted AFTER the no-indicators early
  // return in handleCloudflareProtection, so the WITHOUT_INDICATORS
  // variants were unreachable and have been removed.
  ADAPTIVE_TIMEOUT_WITH_INDICATORS: 25000,      // Indicators present + explicit config
  ADAPTIVE_TIMEOUT_AUTO_WITH_INDICATORS: 15000, // Indicators present, auto-detected only
  // Removed: RETRY_DELAY, CHALLENGE_POLL_INTERVAL, CHALLENGE_MAX_POLLS --
  // defined but never read. Backoff uses RETRY_CONFIG.baseDelay +
  // getRetryDelay(); challenges aren't polled via fixed interval.
  MAX_RETRIES: 2                                // Maximum retry attempts (only 2 fit within 25s outer timeout)
};

// Fast timeout constants - optimized for speed
const FAST_TIMEOUTS = {
  QUICK_DETECTION: 2000,          // Fast Cloudflare detection (DOM check, instant on loaded pages)
  PHISHING_WAIT: 1000,            // Fast phishing check
  CHALLENGE_WAIT: 500,            // Fast challenge detection
  ELEMENT_INTERACTION_DELAY: 250, // Fast element interactions
  SELECTOR_WAIT: 3000,            // Fast selector waits
  // Removed: TURNSTILE_OPERATION -- defined but never read. The
  // turnstileTimeout local var that referenced it was also dead.
  JS_CHALLENGE: 10000,            // Fast JS challenge completion
  CHALLENGE_SOLVING: 12000,       // Overall challenge solving -- fits within 15s adaptive outer
  CHALLENGE_COMPLETION: 8000      // Fast completion check
};

/**
 * Finds and clicks an element inside shadow DOM trees.
 *
 * Strategy, in order:
 *   1. For each selector, wait up to `waitMs` for Puppeteer's `pierce/` query
 *      handler to match, then click through the returned element handle.
 *   2. If no selector produced a result, walk open shadow roots manually
 *      inside `context.evaluate` and call `el.click()` in-page.
 *
 * The first selector that matches decides the outcome: an element that is
 * found but has a zero-size box returns `clicked: false` immediately, without
 * trying the remaining selectors.
 *
 * @param {Object} context - Object exposing waitForSelector() and evaluate()
 *   (presumably a Puppeteer Page or Frame -- confirm against callers)
 * @param {string[]} selectors - CSS selectors to try, in priority order
 * @param {boolean} [forceDebug=false] - Emit debug log lines
 * @param {number} [waitMs=1500] - Per-selector wait for the pierce/ lookup
 * @returns {Promise<{found: boolean, clicked: boolean, selector: (string|null), x: number, y: number}>}
 *   x/y are the centre of the clicked element so callers can fall back to
 *   mouse.click; they are 0 when nothing was clicked.
 */
async function clickInShadowDOM(context, selectors, forceDebug = false, waitMs = 1500) {
  // Try Puppeteer's pierce/ selector first -- handles CLOSED shadow roots via CDP
  for (const selector of selectors) {
    const start = Date.now();
    let element = null;

    try {
      // Wait for element to appear (handles delayed rendering)
      element = await context.waitForSelector(`pierce/${selector}`, { timeout: waitMs });
    } catch (e) {
      if (forceDebug) console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} pierce/${selector} timeout after ${waitMs}ms`));
      continue;
    }
    if (!element) continue;

    try {
      const box = await element.boundingBox();
      if (box && box.width > 0 && box.height > 0) {
        if (forceDebug) console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} pierce/${selector} matched in ${Date.now() - start}ms -- box: ${box.width}x${box.height} at (${box.x},${box.y})`));
        await element.click();
        return { found: true, clicked: true, selector, x: box.x + box.width / 2, y: box.y + box.height / 2 };
      }
      if (forceDebug) console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} pierce/${selector} found but not visible (0x0)`));
      // Element found but not visible
      return { found: true, clicked: false, selector, x: 0, y: 0 };
    } catch (e) {
      // boundingBox()/click() failed (e.g. the node detached mid-interaction).
      // Logged separately from the lookup timeout above so the debug output
      // does not mislabel it; then fall through to the next selector.
      if (forceDebug) console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} pierce/${selector} interaction failed: ${e.message}`));
      continue;
    } finally {
      // Always release the handle -- previously it leaked whenever
      // boundingBox() or click() threw. A failing dispose() must not undo a
      // click that already happened, so its error is deliberately ignored.
      try {
        await element.dispose();
      } catch {
        // handle already released or context gone -- nothing left to free
      }
    }
  }

  // Fallback: manual traversal for open shadow roots
  const result = await context.evaluate((sels) => {
    // Depth-first search through the light DOM and every open shadowRoot.
    function deepQuery(root, selector) {
      // Try direct query first
      const el = root.querySelector(selector);
      if (el) return el;
      // Traverse shadow roots
      const allElements = root.querySelectorAll('*');
      for (const node of allElements) {
        if (node.shadowRoot) {
          const found = deepQuery(node.shadowRoot, selector);
          if (found) return found;
        }
      }
      return null;
    }

    for (const selector of sels) {
      const el = deepQuery(document, selector);
      if (el) {
        const rect = el.getBoundingClientRect();
        if (rect.width > 0 && rect.height > 0) {
          el.click();
          return { found: true, clicked: true, selector, x: rect.x + rect.width / 2, y: rect.y + rect.height / 2 };
        }
        return { found: true, clicked: false, selector, x: 0, y: 0 };
      }
    }
    return { found: false, clicked: false, selector: null, x: 0, y: 0 };
  }, selectors);

  return result;
}

/**
 * Error categories for better handling
 */
const ERROR_TYPES = {
  NETWORK: 'network',
  TIMEOUT: 'timeout',
  ELEMENT_NOT_FOUND: 'element_not_found',
  EVALUATION_FAILED: 'evaluation_failed',
  NAVIGATION_FAILED: 'navigation_failed',
  DETACHED_FRAME: 'detached_frame',
  UNKNOWN: 'unknown'
};
/**
 * Retry configuration with exponential backoff.
 * Defined before getRetryConfig so the reference order is structurally
 * sound — previously getRetryConfig was hoisted above this const and only
 * worked because the function was never called during module load.
 */
const RETRY_CONFIG = {
  maxAttempts: 2,       // Only 2 attempts fit within 25s outer timeout
  baseDelay: 800,       // Slightly faster retry delay
  maxDelay: 5000,       // Lower max delay cap
  backoffMultiplier: 2,
  retryableErrors: [ERROR_TYPES.NETWORK, ERROR_TYPES.TIMEOUT, ERROR_TYPES.ELEMENT_NOT_FOUND, ERROR_TYPES.DETACHED_FRAME]
};

/**
 * Gets the retry configuration for a site, merging site-specific and global settings
 * @param {Object} [siteConfig] - Site configuration object. A missing/null
 *   config is treated as "no overrides" instead of throwing.
 * @returns {Object} Merged retry configuration
 */
function getRetryConfig(siteConfig) {
  return {
    // `||` (not `??`) on purpose: a configured 0 falls back to the default,
    // otherwise retry loops driven by maxAttempts would never run at all.
    maxAttempts: siteConfig?.cloudflare_max_retries || RETRY_CONFIG.maxAttempts,
    baseDelay: RETRY_CONFIG.baseDelay,
    maxDelay: RETRY_CONFIG.maxDelay,
    backoffMultiplier: RETRY_CONFIG.backoffMultiplier,
    retryableErrors: RETRY_CONFIG.retryableErrors,
    retryOnError: siteConfig?.cloudflare_retry_on_error !== false // Default to true
  };
}

/**
 * Detects if we're in a challenge redirect loop by checking URL patterns
 * @param {string} url - Current page URL
 * @param {string[]} [previousUrls=[]] - URLs seen on earlier attempts
 * @returns {boolean} True when the current challenge URL repeats an earlier one
 */
function detectChallengeLoop(url, previousUrls = []) {
  // Check if current URL contains challenge indicators and we've seen similar URLs
  const isChallengeUrl = url.includes('/cdn-cgi/challenge-platform/') ||
    url.includes('challenges.cloudflare.com') ||
    url.includes('cf-ray');
  if (!isChallengeUrl) return false;

  // Two loop signals with different sensitivities:
  //
  //   exactMatches  — page reloaded between retries but came back to the
  //                   identical URL. Strong signal: the reload didn't
  //                   advance the challenge state. Trips on a single
  //                   prior visit, which means it actually fires under
  //                   the default RETRY_CONFIG.maxAttempts = 2 (where
  //                   you only ever have one prior URL to compare to).
  //                   Previously the threshold was a flat >= 2 which
  //                   silently never fired with default config.
  //
  //   cdnCgiMatches — both URLs are cdn-cgi challenge URLs (different
  //                   ray IDs). Weaker signal: a reload that yields a
  //                   fresh challenge is normal retry behavior, not a
  //                   loop. Keep the original >= 2 threshold so this
  //                   only trips with custom cloudflare_max_retries set
  //                   to 3+ (i.e. you've seen 2 fresh challenges and
  //                   the 3rd is still a challenge -- genuinely stuck).
  const urlIsCdnCgi = url.includes('/cdn-cgi/challenge-platform/');
  let exactMatches = 0;
  let cdnCgiMatches = 0;
  for (const prevUrl of previousUrls) {
    if (prevUrl === url) exactMatches++;
    else if (urlIsCdnCgi && prevUrl.includes('/cdn-cgi/challenge-platform/')) cdnCgiMatches++;
  }
  return exactMatches >= 1 || cdnCgiMatches >= 2;
}

/**
 * Performance cache for detection results
 * Stores detection results per domain to avoid redundant checks
 */
class CloudflareDetectionCache {
  /**
   * @param {number} [ttl=300000] - Entry lifetime in milliseconds (5 minutes by default)
   */
  constructor(ttl = 300000) {
    this.cache = new Map();
    // Outcomes live in a separate Map so the 1000-entry eviction on the
    // detection cache doesn't randomly drop "this domain timed out" entries
    // and re-permit expensive retries. Same TTL applies to both.
    this.outcomes = new Map();
    this.ttl = ttl;
    this.hits = 0;
    this.misses = 0;
    // Prevent memory buildup in long-running processes. unref() so the
    // interval never prevents the Node process from exiting on its own —
    // nwss.js calls cleanup() explicitly on scan completion, but any other
    // consumer of this module that forgets to is still safe.
    this.cleanupInterval = setInterval(() => this.cleanupExpired(), ttl / 10);
    this.cleanupInterval.unref();
  }

  /**
   * Maps a URL to its cache key. Unparseable input is used verbatim as the key.
   * @param {string} url
   * @returns {string}
   */
  getCacheKey(url) {
    try {
      const urlObj = new URL(url);
      return urlObj.hostname; // Domain-level caching: all URLs from same host share one entry
    } catch {
      return url;
    }
  }

  /**
   * Returns the cached detection result for the URL's host, or null when
   * absent or expired. Updates the hit/miss counters.
   */
  get(url) {
    const key = this.getCacheKey(url);
    const cached = this.cache.get(key);
    if (cached && Date.now() - cached.timestamp < this.ttl) {
      this.hits++;
      return cached.data;
    }
    if (cached) {
      this.cache.delete(key); // Remove expired entry
    }
    this.misses++;
    return null;
  }

  /** Stores a detection result for the URL's host. */
  set(url, data) {
    const key = this.getCacheKey(url);
    this.cache.set(key, { data, timestamp: Date.now() });
    // Prevent cache from growing too large: drop the oldest-inserted key
    // (Map iterates in insertion order).
    if (this.cache.size > 1000) {
      const firstKey = this.cache.keys().next().value;
      this.cache.delete(firstKey);
    }
  }

  /**
   * Per-domain handling-outcome cache. Used to skip subsequent URLs on a
   * domain that already timed out, without polluting the detection cache.
   * Returns the cached outcome data or null (TTL-checked).
   */
  getOutcome(url) {
    const key = this.getCacheKey(url);
    const entry = this.outcomes.get(key);
    if (entry && Date.now() - entry.timestamp < this.ttl) {
      return entry.data;
    }
    if (entry) this.outcomes.delete(key);
    return null;
  }

  /** Stores a handling outcome for the URL's host (same 1000-entry cap as set()). */
  setOutcome(url, data) {
    const key = this.getCacheKey(url);
    this.outcomes.set(key, { data, timestamp: Date.now() });
    if (this.outcomes.size > 1000) {
      this.outcomes.delete(this.outcomes.keys().next().value);
    }
  }

  /** Drops every entry older than the TTL from both maps. */
  cleanupExpired() {
    const now = Date.now();
    for (const [key, value] of this.cache.entries()) {
      if (now - value.timestamp >= this.ttl) {
        this.cache.delete(key);
      }
    }
    for (const [key, value] of this.outcomes.entries()) {
      if (now - value.timestamp >= this.ttl) {
        this.outcomes.delete(key);
      }
    }
  }

  /** Stops the periodic cleanup timer and empties the cache. Safe to call twice. */
  destroy() {
    if (this.cleanupInterval) {
      clearInterval(this.cleanupInterval);
      this.cleanupInterval = null;
    }
    this.clear();
  }

  /** Empties both maps and resets the hit/miss counters. */
  clear() {
    this.cache.clear();
    this.outcomes.clear();
    this.hits = 0;
    this.misses = 0;
  }

  /**
   * @returns {{hits: number, misses: number, hitRate: string, size: number, outcomes: number}}
   *   hitRate is a percentage string with two decimals ('0%' before any lookup).
   */
  getStats() {
    const total = this.hits + this.misses;
    return {
      hits: this.hits,
      misses: this.misses,
      hitRate: total > 0 ? (this.hits / total * 100).toFixed(2) + '%' : '0%',
      size: this.cache.size,
      outcomes: this.outcomes.size
    };
  }
}

// Initialize cache singleton
const detectionCache = new CloudflareDetectionCache();

// One-shot flag for the per-process module-version banner. Was previously
// logged once per URL in handleCloudflareProtection's debug header, which
// produces N=URL-count copies for no useful signal beyond the first.
let _moduleVersionLogged = false;

// Per-scan aggregate stats. Updated on every handleCloudflareProtection
// completion regardless of debug mode so nwss.js can print an end-of-scan
// summary ("Of 200 URLs: 47 challenged, 31 solved via JS, 12 via Turnstile,
// 4 timed out") without needing to thread the per-URL results back into the
// orchestration layer. Reset via resetAggregateStats() or implicitly by
// cleanup().
const aggregateStats = {
  total: 0,
  byOutcome: Object.create(null),     // 'ok' -> N, 'solved(turnstile)' -> N, etc.
  bySolveMethod: Object.create(null), // Includes BOTH verification-challenge
                                      // methods ('js_challenge_wait',
                                      // 'turnstile', 'legacy_checkbox') and
                                      // the phishing-bypass method
                                      // ('phishing_continue').
  totalDurationMs: 0,
  maxDurationMs: 0,                   // Cheap to track; surfaces the
                                      // worst-case URL when avg gets
                                      // dominated by timeouts.
  failures: 0,                        // !overallSuccess count
  timedOut: 0                         // adaptive-timeout count (subset of failures)
};

/**
 * Folds one per-URL handling result into aggregateStats.
 * @param {string} outcome - Outcome tag used as the byOutcome key
 * @param {Object} result - Per-URL result; reads overallSuccess, timedOut,
 *   verificationChallenge.method and phishingWarning.{attempted,success}
 * @param {number} durationMs - Handling duration for this URL
 */
function bumpAggregate(outcome, result, durationMs) {
  aggregateStats.total++;
  aggregateStats.byOutcome[outcome] = (aggregateStats.byOutcome[outcome] || 0) + 1;
  aggregateStats.totalDurationMs += durationMs;
  if (durationMs > aggregateStats.maxDurationMs) aggregateStats.maxDurationMs = durationMs;
  if (!result.overallSuccess) aggregateStats.failures++;
  if (result.timedOut) aggregateStats.timedOut++;

  // Method-of-resolution tracking. Mirrors buildOutcomeString's branch
  // order: prefer the verification-challenge method, fall back to the
  // phishing-continue path. A URL where both succeeded gets counted under
  // the challenge method (matches `solved(turnstile)` etc. in byOutcome).
  // NOTE(review): the method is counted whenever it is set, without checking
  // verificationChallenge.success -- presumably solvers only set `method` on
  // success; confirm against the challenge handlers.
  const vMethod = result.verificationChallenge && result.verificationChallenge.method;
  if (vMethod) {
    aggregateStats.bySolveMethod[vMethod] = (aggregateStats.bySolveMethod[vMethod] || 0) + 1;
  } else if (result.phishingWarning && result.phishingWarning.attempted && result.phishingWarning.success) {
    aggregateStats.bySolveMethod['phishing_continue'] = (aggregateStats.bySolveMethod['phishing_continue'] || 0) + 1;
  }
}
/**
 * Returns a snapshot of per-scan aggregate stats. nwss.js can call this at
 * scan end to print a summary. Pass {reset:true} to atomically read+reset
 * so multi-scan processes don't accumulate across runs.
 *
 * @param {Object} [options]
 * @param {boolean} [options.reset=false] - Reset the counters after taking the snapshot
 * @returns {{total: number, failures: number, timedOut: number, byOutcome: Object,
 *   bySolveMethod: Object, avgDurationMs: number, maxDurationMs: number}}
 *   byOutcome/bySolveMethod are shallow copies, so callers may mutate them
 *   freely. avgDurationMs is rounded and is 0 when nothing was recorded.
 */
function getAggregateStats({ reset = false } = {}) {
  const snap = {
    total: aggregateStats.total,
    failures: aggregateStats.failures,
    timedOut: aggregateStats.timedOut,
    byOutcome: { ...aggregateStats.byOutcome },
    bySolveMethod: { ...aggregateStats.bySolveMethod },
    avgDurationMs: aggregateStats.total > 0
      ? Math.round(aggregateStats.totalDurationMs / aggregateStats.total)
      : 0,
    maxDurationMs: aggregateStats.maxDurationMs
  };
  if (reset) resetAggregateStats();
  return snap;
}

/**
 * Zeroes every aggregate counter and replaces the two tally maps with fresh
 * prototype-less objects.
 */
function resetAggregateStats() {
  aggregateStats.total = 0;
  aggregateStats.failures = 0;
  aggregateStats.timedOut = 0;
  aggregateStats.byOutcome = Object.create(null);
  aggregateStats.bySolveMethod = Object.create(null);
  aggregateStats.totalDurationMs = 0;
  aggregateStats.maxDurationMs = 0;
}

// Note: getModuleInfo() helper was removed -- had zero callers internal
// or external. CLOUDFLARE_MODULE_VERSION stays as it's read by the
// once-per-process version banner in handleCloudflareProtection.

/**
 * Validates if a URL should be processed by Cloudflare protection
 * Only allows HTTP/HTTPS URLs, skips browser-internal and special protocols
 * @param {string} url - URL to validate
 * @param {boolean} forceDebug - Debug logging flag
 * @returns {boolean} True if URL should be processed
 */
// Single precompiled regex anchored to URL start. Matches any of the
// browser-internal / special protocols we want to skip, plus succeeds on
// http(s):// for the inverse check below. Faster than running 13 sequential
// startsWith comparisons per URL.
// Schemes that are never handled: browser-internal pages, extension
// resources, inline data and local/FTP resources. Anchored and
// case-insensitive; the matched prefix is only used for the debug message.
const SKIP_PROTO_RE = /^(?:about|chrome|chrome-extension|chrome-error|chrome-search|devtools|edge|moz-extension|safari-extension|webkit|data|blob|javascript|vbscript|file|ftp|ftps):/i;

// Positive check: anything that is not http:// or https:// is rejected even
// when its scheme is not listed in SKIP_PROTO_RE.
const HTTP_PROTO_RE = /^https?:\/\//i;

/**
 * Decides whether a URL is eligible for Cloudflare handling.
 *
 * @param {string} url - URL to validate
 * @param {boolean} [forceDebug=false] - Log the reason whenever a URL is skipped
 * @returns {boolean} True only for non-empty strings starting with
 *   http:// or https:// (case-insensitive)
 */
function shouldProcessUrl(url, forceDebug = false) {
  if (!url || typeof url !== 'string') {
    if (forceDebug) console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} ${URL_VALIDATION_TAG} Skipping invalid URL: ${url}`));
    return false;
  }

  const skipMatch = url.match(SKIP_PROTO_RE);
  if (skipMatch) {
    if (forceDebug) {
      // URLs are truncated to 100 chars in logs -- data: URLs can be huge.
      console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} ${URL_VALIDATION_TAG} Skipping ${skipMatch[0].toLowerCase()} URL: ${url.substring(0, 100)}${url.length > 100 ? '...' : ''}`));
    }
    return false;
  }

  if (!HTTP_PROTO_RE.test(url)) {
    if (forceDebug) {
      console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} ${URL_VALIDATION_TAG} Skipping non-HTTP(S) URL: ${url.substring(0, 100)}${url.length > 100 ? '...' : ''}`));
    }
    return false;
  }

  return true;
}

/**
 * Fast timeout helper for Puppeteer 22.x compatibility. Replaces deprecated
 * page.waitForTimeout() with a standard Promise-based delay. The `page` arg
 * used to be required for the deprecated API; it's been dropped now that
 * every call site is just sleeping. Renamed from waitForTimeout to fastTimeout
 * to match the CLAUDE.md convention used across the codebase.
 *
 * @param {number} ms - Delay in milliseconds
 * @returns {Promise<void>} Resolves (with undefined) after the delay
 */
function fastTimeout(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}
/**
 * Captures whether the page currently has Cloudflare's two key cookies.
 * cf_clearance is the post-challenge clearance token — its presence is the
 * single most reliable "did the bypass actually succeed" signal, beating
 * any DOM-side completion check. __cf_bm is the bot-mitigation cookie
 * (typically set on every request that goes through CF's edge).
 * Errors swallowed: cookie read failures should not affect bypass logic.
 *
 * @param {Object} page - Object exposing an async cookies() method
 * @returns {Promise<{cf_clearance: boolean, cf_bm: boolean}>}
 */
async function getCfCookieState(page) {
  try {
    const cookies = await page.cookies();
    let cf_clearance = false;
    let cf_bm = false;
    for (const c of cookies) {
      if (c.name === 'cf_clearance') cf_clearance = true;
      else if (c.name === '__cf_bm') cf_bm = true;
    }
    return { cf_clearance, cf_bm };
  } catch {
    return { cf_clearance: false, cf_bm: false };
  }
}

/**
 * Maps a handleCloudflareProtection result back to a short outcome tag
 * for the per-URL summary log. The tag is grep-friendly (no spaces) so
 * users can post-process scan logs by outcome category.
 *
 * Branch order is significant: the first matching condition wins.
 *
 * @param {Object|null} result - Per-URL handling result
 * @param {number|string} [errorCode] - Error code for error_page(...); '5xx' when absent
 * @returns {string} Outcome tag
 */
function buildOutcomeString(result, errorCode) {
  if (!result) return 'unknown';
  if (result.skippedInvalidUrl) return 'skipped(non-http)';
  if (result.quickDetectionFailed) return 'detection_failed';
  if (result.cloudflareErrorPage) return `error_page(${errorCode || '5xx'})`;
  if (result.timedOut) return 'timeout';
  if (result.verificationChallenge?.requiresHuman) return 'captcha_required';
  if (result.verificationChallenge?.attempted && result.verificationChallenge?.success) {
    return `solved(${result.verificationChallenge.method || 'unknown'})`;
  }
  if (result.phishingWarning?.attempted && result.phishingWarning?.success) {
    return 'solved(phishing_continue)';
  }
  if (result.skippedNoIndicators) return 'no_indicators';
  if (!result.overallSuccess) return 'failed';
  return 'ok';
}

/**
 * Categorizes errors for better handling
 *
 * Matching is by message substring and order-dependent: e.g. a message
 * containing both "Navigation" and "timeout" is classified as TIMEOUT.
 *
 * @param {Error|null|undefined} error
 * @returns {string} One of the ERROR_TYPES values
 */
function categorizeError(error) {
  // Guard against null/undefined error so callers using categorizeError in
  // safe-defaults return paths (e.g. safePageEvaluate's final fallback when
  // lastError was never assigned) don't blow up reading .message.
  if (!error) return ERROR_TYPES.UNKNOWN;

  const errorMessage = error.message || '';

  if (errorMessage.includes('detached Frame') || errorMessage.includes('Attempted to use detached')) {
    return ERROR_TYPES.DETACHED_FRAME;
  }
  if (errorMessage.includes('timeout') || errorMessage.includes('Timeout')) {
    return ERROR_TYPES.TIMEOUT;
  }
  if (errorMessage.includes('Protocol error') || errorMessage.includes('Target closed')) {
    return ERROR_TYPES.NETWORK;
  }
  if (errorMessage.includes('evaluation') || errorMessage.includes('Evaluation')) {
    return ERROR_TYPES.EVALUATION_FAILED;
  }
  if (errorMessage.includes('navigation') || errorMessage.includes('Navigation')) {
    return ERROR_TYPES.NAVIGATION_FAILED;
  }
  return ERROR_TYPES.UNKNOWN;
}

/**
 * Implements exponential backoff delay
 * @param {number} attempt - 1-based attempt number that just failed
 * @returns {number} baseDelay * backoffMultiplier^(attempt-1), capped at maxDelay (ms)
 */
function getRetryDelay(attempt) {
  const delay = Math.min(
    RETRY_CONFIG.baseDelay * Math.pow(RETRY_CONFIG.backoffMultiplier, attempt - 1),
    RETRY_CONFIG.maxDelay
  );
  return delay;
}

/**
 * Enhanced safe page evaluation with retry logic and better error handling
 *
 * Never throws: when every attempt fails it resolves with a "nothing
 * detected" object carrying `error`, `errorType` and `attempts`.
 *
 * @param {Object} page - Object exposing isClosed(), url() and evaluate()
 * @param {Function} func - Function handed to page.evaluate (no arguments are passed)
 * @param {number} [timeout=TIMEOUTS.PAGE_EVALUATION_SAFE] - Per-attempt timeout in ms
 * @param {Object} [options]
 * @param {number} [options.maxRetries=RETRY_CONFIG.maxAttempts] - Maximum attempts
 * @param {boolean} [options.forceDebug=false] - Emit debug/warn log lines
 * @returns {Promise<*>} The evaluation result, or the safe-default object on failure
 */
async function safePageEvaluate(page, func, timeout = TIMEOUTS.PAGE_EVALUATION_SAFE, options = {}) {
  const { maxRetries = RETRY_CONFIG.maxAttempts, forceDebug = false } = options;
  let lastError = null;
  // Attempts actually made; the fallback used to report maxRetries even when
  // the loop gave up after the first non-retryable error.
  let attemptsMade = 0;

  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    attemptsMade = attempt;
    let timeoutId = null;

    try {
      // Multi-layered page state validation
      if (page.isClosed()) {
        throw new Error('Page is closed or invalid');
      }

      // Check if page is still navigating or has valid context
      let currentUrl;
      try {
        currentUrl = await page.url();
      } catch (urlError) {
        throw new Error('Page URL access failed - likely detached');
      }
      // Checked outside the try above: previously this error was thrown
      // inside it, caught by its own handler and always reported as
      // "likely detached", hiding the real reason.
      if (!currentUrl || currentUrl === 'about:blank') {
        throw new Error('Page URL is invalid or blank');
      }

      const result = await Promise.race([
        page.evaluate(func),
        new Promise((_, reject) => {
          timeoutId = setTimeout(() => reject(new Error('Page evaluation timeout')), timeout);
        })
      ]);

      // Clear timeout if evaluation completed first
      if (timeoutId) {
        clearTimeout(timeoutId);
      }

      if (forceDebug && attempt > 1) {
        console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} Page evaluation succeeded on attempt ${attempt}`));
      }
      return result;
    } catch (error) {
      // Ensure timeout is cleared on any error
      if (timeoutId) {
        clearTimeout(timeoutId);
      }

      lastError = error;
      const errorType = categorizeError(error);
      const isLastAttempt = attempt === maxRetries;

      if (forceDebug) {
        console.warn(formatLogMessage('warn', `${CLOUDFLARE_TAG} Page evaluation failed (attempt ${attempt}/${maxRetries}): ${error.message} [${errorType}]`));
      }

      // Handle detached frame errors specifically
      if (errorType === ERROR_TYPES.DETACHED_FRAME) {
        if (forceDebug) {
          console.warn(formatLogMessage('warn', `${CLOUDFLARE_TAG} Detached frame detected on attempt ${attempt}/${maxRetries} - using longer delay`));
        }
        // For detached frames, only retry once more. Decide BEFORE sleeping:
        // the old code waited a full second even when no retry followed,
        // which burned half of quick detection's 2s budget (maxRetries: 1).
        if (attempt >= 2 || isLastAttempt) {
          break;
        }
        // For detached frames, brief delay before retry
        await new Promise(resolve => setTimeout(resolve, 1000));
        continue;
      }

      // Don't retry if error type is not retryable or if it's the last attempt
      if (!RETRY_CONFIG.retryableErrors.includes(errorType) || isLastAttempt) {
        break;
      }

      // Wait before retrying with exponential backoff
      await new Promise(resolve => setTimeout(resolve, getRetryDelay(attempt)));
    }
  }

  // Return safe defaults if all retries failed
  return {
    isChallengePresent: false,
    isPhishingWarning: false,
    isTurnstile: false,
    isJSChallenge: false,
    isChallengeCompleted: false,
    error: lastError?.message || 'Unknown error',
    errorType: categorizeError(lastError),
    attempts: attemptsMade
  };
}

/**
 * Safe element clicking with timeout protection
 *
 * @param {Object} page - Object exposing click(selector, options)
 * @param {string} selector - Selector to click
 * @param {number} [timeout=TIMEOUTS.CLICK_TIMEOUT] - Timeout passed to page.click;
 *   an outer guard fires CLICK_TIMEOUT_BUFFER ms later in case page.click
 *   does not settle on its own
 * @returns {Promise<*>} Whatever page.click resolves with
 * @throws {Error} "Click failed: <reason>" with the original error as `cause`
 */
async function safeClick(page, selector, timeout = TIMEOUTS.CLICK_TIMEOUT) {
  let timeoutId;
  try {
    return await Promise.race([
      page.click(selector, { timeout: timeout }),
      new Promise((_, reject) => {
        timeoutId = setTimeout(() => reject(new Error('Click timeout')), timeout + TIMEOUTS.CLICK_TIMEOUT_BUFFER);
      })
    ]);
  } catch (error) {
    // Keep the original error reachable for callers that want its stack/type.
    throw new Error(`Click failed: ${error.message}`, { cause: error });
  } finally {
    if (timeoutId) clearTimeout(timeoutId);
  }
}

/**
 * Safe navigation waiting with timeout protection. The warn on timeout is
 * forceDebug-gated to match the convention of the other warn sites in this
 * file -- previously it fired unconditionally, which spammed stderr on every
 * phishing-bypass click that didn't trigger a clean redirect.
 *
 * Best-effort by design: failures are swallowed and the function resolves
 * with undefined.
 *
 * @param {Object} page - Object exposing waitForNavigation(options)
 * @param {number} [timeout=TIMEOUTS.NAVIGATION_TIMEOUT] - Navigation timeout in ms
 * @param {boolean} [forceDebug=false] - Log the failure reason
 * @returns {Promise<*>} The waitForNavigation result, or undefined on failure
 */
async function safeWaitForNavigation(page, timeout = TIMEOUTS.NAVIGATION_TIMEOUT, forceDebug = false) {
  let timeoutId;
  try {
    return await Promise.race([
      page.waitForNavigation({ waitUntil: 'domcontentloaded', timeout: timeout }),
      new Promise((_, reject) => {
        timeoutId = setTimeout(() => reject(new Error('Navigation timeout')), timeout + TIMEOUTS.NAVIGATION_TIMEOUT_BUFFER);
      })
    ]);
  } catch (error) {
    if (forceDebug) console.warn(formatLogMessage('warn', `${CLOUDFLARE_TAG} Navigation wait failed: ${error.message}`));
  } finally {
    if (timeoutId) clearTimeout(timeoutId);
  }
}

/**
 * Quick Cloudflare detection with caching for performance
 *
 * @param {Object} page - Object exposing url(), isClosed() and evaluate()
 * @param {boolean} [forceDebug=false] - Emit debug log lines
 * @returns {Promise<Object>} Detection result. Successful detections carry
 *   hasIndicators/title/url/bodySnippet (plus hasErrorPage/errorCode for
 *   Cloudflare 5xx pages); cached results are tagged `_fromCache: true`;
 *   non-HTTP(S) pages return `{ hasIndicators: false, skippedInvalidUrl: true }`;
 *   a failed evaluation returns safePageEvaluate's safe-default object.
 */
async function quickCloudflareDetection(page, forceDebug = false) {
  try {
    // Get current page URL and validate it
    const currentPageUrl = await page.url();

    if (!shouldProcessUrl(currentPageUrl, forceDebug)) {
      if (forceDebug) {
        console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} Quick detection skipping non-HTTP(S) page: ${currentPageUrl}`));
      }
      return { hasIndicators: false, skippedInvalidUrl: true };
    }

    // Check cache first
    const cachedResult = detectionCache.get(currentPageUrl);
    if (cachedResult !== null) {
      if (forceDebug) {
        const stats = detectionCache.getStats();
        console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} Using cached detection result (cache hit rate: ${stats.hitRate})`));
      }
      // Return a fresh shallow copy tagged _fromCache so the handler's
      // logging can say "[cached]" instead of presenting cached title/body
      // details as if they were fresh.
      return { ...cachedResult, _fromCache: true };
    }

    // Perform actual detection with enhanced error handling.
    // The callback is handed to page.evaluate, so it must stay
    // self-contained: no references to anything outside its own body.
    const quickCheck = await safePageEvaluate(page, () => {
      const title = document.title || '';
      const url = window.location.href;

      // Cloudflare-served 5xx origin-error pages (522/523/524/525/526/527/530).
      // Title format is reliable: "<domain> | 5xx: <reason>". These are NOT
      // bypass-able challenges — the origin is unreachable. Mark as
      // recognized (hasErrorPage) but NOT as a bypass target (hasIndicators
      // stays false) so the early-skip path still fires and the log can say
      // "Cloudflare error page" instead of the misleading "No Cloudflare
      // indicators found". errorCode is the captured 5xx digit so outcome
      // logs can grep by specific error type.
      const titleErrorMatch = title.match(/\|\s*(5\d\d):/);
      if (titleErrorMatch) {
        return {
          hasIndicators: false,
          hasErrorPage: true,
          errorCode: parseInt(titleErrorMatch[1], 10),
          title,
          url,
          bodySnippet: ''
        };
      }

      // FAST PATH: Check title + URL first (string ops, no DOM traversal)
      const titleMatch = title.includes('Just a moment') ||
        title.includes('Checking your browser') ||
        title.includes('Attention Required') ||
        title.includes('Security check');
      const urlMatch = url.includes('/cdn-cgi/challenge-platform/') ||
        url.includes('cloudflare.com');
      if (titleMatch || urlMatch) {
        return { hasIndicators: true, title, url, bodySnippet: '' };
      }

      // MEDIUM PATH: Combine fast-path selectors into one query — one DOM
      // walk for all 7 alternatives instead of up to 7 separate walks.
      const selectorMatch = document.querySelector(
        '[data-ray], [data-cf-challenge], .cf-challenge-running, .cf-turnstile, .cf-managed-challenge, [data-cf-managed], script[src*="/cdn-cgi/challenge-platform/"]'
      );
      if (selectorMatch) {
        return { hasIndicators: true, title, url, bodySnippet: '' };
      }

      // SLOW PATH: Extract limited body text only if fast checks failed.
      // Reads textContent of a known challenge container (or, failing that,
      // of the body's first child element), capped at 300 characters, rather
      // than the full body text.
      let bodyText = '';
      if (document.body) {
        const el = document.body.querySelector('.main-wrapper, .main-content, #challenge-body-text, .cf-challenge-container');
        bodyText = el
          ? el.textContent.substring(0, 300)
          : (document.body.firstElementChild ? document.body.firstElementChild.textContent.substring(0, 300) : '');
      }

      const textMatch = bodyText.includes('Cloudflare') ||
        bodyText.includes('cf-ray') ||
        bodyText.includes('Verify you are human') ||
        bodyText.includes('This website has been reported for potential phishing') ||
        bodyText.includes('Please wait while we verify') ||
        bodyText.includes('Checking if the site connection is secure');

      // Remaining slower selectors — combined into one query for the same reason.
      const slowSelectorMatch = document.querySelector(
        '.cf-challenge-container, .ctp-checkbox-container, iframe[src*="challenges.cloudflare.com"], iframe[title*="Cloudflare security challenge"]'
      );

      // Body-text fallback for error pages with non-standard titles.
      // Same rationale as the early title check: recognize but don't bypass.
      const bodyErrorMatch = bodyText.match(/Error code (5\d\d)/);
      if (bodyErrorMatch && !textMatch && !slowSelectorMatch) {
        return {
          hasIndicators: false,
          hasErrorPage: true,
          errorCode: parseInt(bodyErrorMatch[1], 10),
          title,
          url,
          bodySnippet: bodyText.substring(0, 200)
        };
      }

      return {
        hasIndicators: !!(textMatch || slowSelectorMatch),
        title,
        url,
        bodySnippet: bodyText.substring(0, 200)
      };
    }, FAST_TIMEOUTS.QUICK_DETECTION, { maxRetries: 1, forceDebug });

    // Cache the result -- but only a real detection. safePageEvaluate reports
    // failure (timeout, detached frame, ...) through an `error` field, and
    // the cache is keyed per hostname: storing that object would make every
    // URL on the domain look indicator-free until the TTL expires.
    if (!quickCheck || !quickCheck.error) {
      detectionCache.set(currentPageUrl, quickCheck);
    }

    if (forceDebug) {
      if (quickCheck.hasIndicators) {
        console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} Quick detection found Cloudflare indicators on ${quickCheck.url}`));
      }
      // hasErrorPage and no-indicators cases are deliberately silent here —
      // handleCloudflareProtection prints a clearer per-action line right
      // after ("Cloudflare error page detected..." or "No Cloudflare
      // indicators found, skipping protection handling..."), so logging
      // here would just duplicate it.
      if (quickCheck.attempts && quickCheck.attempts > 1) {
        console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} Detection required ${quickCheck.attempts} attempts`));
      }
    }

    return quickCheck;
  } catch (error) {
    if (forceDebug) console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} Quick detection failed: ${error.message}`));
    return { hasIndicators: false, error: error.message };
  }
}

/**
 * Analyzes the current page to detect Cloudflare challenges - Enhanced with timeout protection and detailed debug logging
 *
 * Combines a frame-URL check (page.frames()) with an in-page DOM/text scan.
 * Never throws: on failure it resolves with all-false flags plus `error`.
 *
 * @param {Object} page - Object exposing frames(), isClosed(), url() and evaluate()
 * @returns {Promise<Object>} Challenge flags (isChallengePresent, isTurnstile,
 *   isJSChallenge, isPhishingWarning, isChallengeCompleted, ...) plus title,
 *   url and a 200-character bodySnippet
 */
async function analyzeCloudflareChallenge(page) {
  try {
    // CDP-level frame check -- bypasses closed shadow roots
    const frames = page.frames();
    const hasChallengeFrame = frames.some(f => {
      const url = f.url();
      return url.includes('challenges.cloudflare.com') || url.includes('/cdn-cgi/challenge-platform/');
    });

    // Self-contained callback: it is handed to page.evaluate.
    const result = await safePageEvaluate(page, () => {
      const title = document.title || '';
      // Cap text extraction -- on content-heavy pages body.textContent can be megabytes
      const bodyText = document.body ? document.body.textContent.substring(0, 2000) : '';

      // Updated selectors for 2025 Cloudflare challenges. Each category groups
      // its alternatives into a single comma-separated selector so the browser
      // walks the DOM once per category instead of once per alternative.
      const hasTurnstileIframe = !!document.querySelector(
        'iframe[title*="Cloudflare security challenge"], iframe[src*="challenges.cloudflare.com"], iframe[title*="Widget containing a Cloudflare"]'
      );
      const hasTurnstileContainer = !!document.querySelector(
        '.cf-turnstile, .ctp-checkbox-container, .ctp-checkbox-label'
      );
      const hasTurnstileCheckbox = !!document.querySelector(
        'input[type="checkbox"].ctp-checkbox, .ctp-checkbox'
      );
      const hasLegacyCheckbox = !!document.querySelector(
        'input[type="checkbox"]#challenge-form, input[type="checkbox"][name="cf_captcha_kind"]'
      );
      const hasChallengeRunning = !!document.querySelector(
        '.cf-challenge-running, .cf-challenge-container, .challenge-stage, .challenge-form'
      );
      const hasDataRay = !!document.querySelector('[data-ray], [data-cf-challenge]');

      // Managed challenges (cf-managed). parallelChallengeDetection and the
      // quick-detection slow path both look for these, but the main analyzer
      // used to ignore them — a managed-challenge-only page would then slip
      // past isChallengePresent. Now folded in below.
      const hasManagedChallenge = !!document.querySelector(
        '.cf-managed-challenge, [data-cf-managed]'
      );

      const hasCaptcha = bodyText.includes('CAPTCHA') ||
        bodyText.includes('captcha') ||
        bodyText.includes('hCaptcha') ||
        bodyText.includes('reCAPTCHA');

      const hasJSChallenge = document.querySelector('script[src*="/cdn-cgi/challenge-platform/"]') !== null ||
        bodyText.includes('Checking your browser') ||
        bodyText.includes('Please wait while we verify');

      const hasPhishingWarning = bodyText.includes('This website has been reported for potential phishing') ||
        title.includes('Attention Required');

      // Cache the element once -- isChallengeCompleted used to re-query the
      // same selector after hasTurnstileResponse had already located it.
      const turnstileInput = document.querySelector('input[name="cf-turnstile-response"]');
      const hasTurnstileResponse = turnstileInput !== null;
      const isChallengeCompleted = hasTurnstileResponse && !!turnstileInput.value;

      const isChallengePresent = title.includes('Just a moment') ||
        title.includes('Checking your browser') ||
        bodyText.includes('Verify you are human') ||
        hasLegacyCheckbox ||
        hasChallengeRunning ||
        hasDataRay ||
        hasTurnstileIframe ||
        hasTurnstileContainer ||
        hasJSChallenge ||
        hasManagedChallenge;

      return {
        isChallengePresent,
        isPhishingWarning: hasPhishingWarning,
        isTurnstile: hasTurnstileIframe || hasTurnstileContainer || hasTurnstileCheckbox,
        isJSChallenge: hasJSChallenge,
        hasManagedChallenge,
        isChallengeCompleted,
        title,
        hasLegacyCheckbox,
        hasTurnstileIframe,
        hasTurnstileContainer,
        hasTurnstileCheckbox,
        hasChallengeRunning,
        hasDataRay,
        hasCaptcha,
        hasTurnstileResponse,
        url: window.location.href,
        bodySnippet: bodyText.substring(0, 200)
      };
    }, TIMEOUTS.PAGE_EVALUATION);

    // Merge CDP frame detection -- catches iframes behind closed shadow roots
    if (hasChallengeFrame && !result.hasTurnstileIframe) {
      result.hasTurnstileIframe = true;
      result.isTurnstile = true;
      result.isChallengePresent = true;
    }

    return result;
  } catch (error) {
    return {
      isChallengePresent: false,
      isPhishingWarning: false,
      isTurnstile: false,
      isJSChallenge: false,
      isChallengeCompleted: false,
      error: error.message
    };
  }
}

/**
 * Handles Cloudflare phishing warnings with timeout protection and enhanced debug logging
 *
 * @param {Object} page - Puppeteer page instance
 * @param {string} currentUrl - URL being processed
 * @param {boolean} forceDebug - Debug logging flag
 * @returns {Promise<Object>} Phishing warning result:
 * {
 *   success: boolean,     // True if no warning found OR successfully bypassed
 *   attempted: boolean,   // True if warning was detected and bypass attempted
 *   error: string|null,   // Error message if bypass failed
 *   details: object|null  // Analysis details from analyzeCloudflareChallenge()
 */
* } */ async function handlePhishingWarning(page, currentUrl, forceDebug = false) { const result = { success: false, attempted: false, error: null, details: null }; try { if (forceDebug) console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} Checking for phishing warning on ${currentUrl}`)); // Shorter wait with timeout protection await fastTimeout(FAST_TIMEOUTS.PHISHING_WAIT); const challengeInfo = await analyzeCloudflareChallenge(page); if (challengeInfo.isPhishingWarning) { result.attempted = true; result.details = challengeInfo; if (forceDebug) { // One structured line; matches the collapsed Challenge-detected log. console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} Phishing warning detected on ${currentUrl}: title="${challengeInfo.title}" url=${challengeInfo.url}`)); console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} Body snippet: ${challengeInfo.bodySnippet}`)); } try { // Use safe click with shorter timeout await safeClick(page, 'a[href*="continue"]', TIMEOUTS.PHISHING_CLICK); await safeWaitForNavigation(page, TIMEOUTS.PHISHING_NAVIGATION, forceDebug); result.success = true; if (forceDebug) console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} Successfully bypassed phishing warning for ${currentUrl}`)); } catch (clickError) { result.error = `Failed to click continue button: ${clickError.message}`; if (forceDebug) console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} Failed to bypass phishing warning: ${clickError.message}`)); } } else { if (forceDebug) console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} No phishing warning detected on ${currentUrl}`)); result.success = true; // No warning to handle } } catch (error) { result.error = error.message; if (forceDebug) console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} Phishing warning check failed for ${currentUrl}: ${error.message}`)); } return result; } /** * Attempts to solve Cloudflare challenges with timeout protection and enhanced debug logging * * @param {Object} page - 
Puppeteer page instance * @param {string} currentUrl - URL being processed * @param {boolean} forceDebug - Debug logging flag * @returns {Promise<Object>} Challenge verification result: * { * success: boolean, // True if no challenge found OR successfully solved * attempted: boolean, // True if challenge was detected and solving attempted * error: string|null, // Error message if solving failed * requiresHuman: boolean, // True if CAPTCHA detected (requires manual intervention) * method: string|null, // Method that succeeded: 'js_challenge_wait', 'turnstile', 'legacy_checkbox' * details: object|null // Analysis details from analyzeCloudflareChallenge() * } */ async function handleVerificationChallenge(page, currentUrl, forceDebug = false) { const result = { success: false, attempted: false, error: null, details: null, requiresHuman: false, method: null }; try { if (forceDebug) console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} Checking for verification challenge on ${currentUrl}`)); // Reduced wait time await fastTimeout(FAST_TIMEOUTS.CHALLENGE_WAIT); const challengeInfo = await analyzeCloudflareChallenge(page); result.details = challengeInfo; if (challengeInfo.isChallengePresent) { result.attempted = true; if (forceDebug) { // One structured line instead of 14 separate log calls. Flags use // single-letter shorthand (t/f) to keep the line scannable; full // bodySnippet stays on its own line because it's the only field // that's worth more than a column-width of attention. const f = (v) => v ? 
't' : 'f'; const ci = challengeInfo; console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} Challenge detected on ${currentUrl}: title="${ci.title}" url=${ci.url} turnstile=${f(ci.isTurnstile)} js=${f(ci.isJSChallenge)} legacy=${f(ci.hasLegacyCheckbox)} iframe=${f(ci.hasTurnstileIframe)} container=${f(ci.hasTurnstileContainer)} checkbox=${f(ci.hasTurnstileCheckbox)} captcha=${f(ci.hasCaptcha)} running=${f(ci.hasChallengeRunning)} dataRay=${f(ci.hasDataRay)} tsResponse=${f(ci.hasTurnstileResponse)}`)); console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} Body snippet: ${ci.bodySnippet}`)); } // Check for CAPTCHA that requires human intervention if (challengeInfo.hasCaptcha) { result.requiresHuman = true; result.error = 'CAPTCHA detected - requires human intervention'; if (forceDebug) console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} Skipping automatic bypass due to CAPTCHA requirement`)); return result; } // Attempt to solve the challenge with timeout protection const solveResult = await attemptChallengeSolveWithTimeout(page, currentUrl, challengeInfo, forceDebug); result.success = solveResult.success; result.error = solveResult.error; result.method = solveResult.method; } else { if (forceDebug) console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} No verification challenge detected on ${currentUrl}`)); result.success = true; } } catch (error) { result.error = error.message; if (forceDebug) console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} Challenge check failed for ${currentUrl}: ${error.message}`)); } return result; } /** * Enhanced challenge handling with retry logic and loop detection */ /** * Generic retry harness shared by the verification-challenge and * phishing-warning paths (was ~150 lines of duplicated try/catch/backoff * before extraction). Resolves with the inner result + bookkeeping fields * (attempts, optional maxRetriesExceeded, optional errorType). 
Never * rejects — the inner attemptFn's exceptions are categorized and either * retried or bundled into a failure-result return. * * @param {object} cfg * @param {string} cfg.label - Human label for logs ("Challenge" / "Phishing warning") * @param {object} cfg.retryConfig - From getRetryConfig(siteConfig) * @param {boolean} cfg.forceDebug * @param {(attempt:number) => Promise<object>} cfg.attemptFn * @param {object} [cfg.failureShape] - Extra fields merged into the * error/exhaustion return objects (e.g. {requiresHuman:false,method:null} * for the challenge path so its callers always see those keys). * @param {(attempt:number) => Promise<object|null>} [cfg.preIteration] * Optional hook fired before each attempt. Return a result object to * short-circuit the harness (e.g. challenge loop-detected); return null * to proceed with the attempt. * @param {(attempt:number) => Promise<void>} [cfg.betweenAttempts] * Optional hook fired after a failed attempt but before the next one * (e.g. page.reload() between challenge retries). */ async function runWithRetries(cfg) { const { label, retryConfig, forceDebug, attemptFn, failureShape = {}, preIteration, betweenAttempts } = cfg; let lastError = null; for (let attempt = 1; attempt <= retryConfig.maxAttempts; attempt++) { try { if (preIteration) { const earlyReturn = await preIteration(attempt); if (earlyReturn) return earlyReturn; } if (forceDebug && attempt > 1) { console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} ${label} attempt ${attempt}/${retryConfig.maxAttempts}`)); } const result = await attemptFn(attempt); if (result.success || result.requiresHuman || !retryConfig.retryOnError) { if (forceDebug && attempt > 1) { console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} ${label} ${result.success ? 
'succeeded' : 'failed'} on attempt ${attempt}`)); } return { ...result, attempts: attempt }; } if (attempt < retryConfig.maxAttempts) { const delay = getRetryDelay(attempt); if (forceDebug) { console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} ${label} attempt ${attempt} failed, retrying in ${delay}ms: ${result.error}`)); } await new Promise(resolve => setTimeout(resolve, delay)); if (betweenAttempts) await betweenAttempts(attempt); } lastError = result.error; } catch (error) { lastError = error.message; const errorType = categorizeError(error); if (forceDebug) { console.warn(formatLogMessage('warn', `${CLOUDFLARE_TAG} ${label} attempt ${attempt}/${retryConfig.maxAttempts} failed: ${error.message} [${errorType}]`)); } if (!retryConfig.retryableErrors.includes(errorType) || attempt === retryConfig.maxAttempts) { return { success: false, attempted: true, error: lastError, details: null, attempts: attempt, errorType, ...failureShape }; } if (attempt < retryConfig.maxAttempts) { await new Promise(resolve => setTimeout(resolve, getRetryDelay(attempt))); } } } return { success: false, attempted: true, error: `All ${retryConfig.maxAttempts} ${label.toLowerCase()} attempts failed. Last error: ${lastError}`, details: null, attempts: retryConfig.maxAttempts, maxRetriesExceeded: true, ...failureShape }; } async function handleVerificationChallengeWithRetries(page, currentUrl, siteConfig, forceDebug = false) { const retryConfig = getRetryConfig(siteConfig); const visitedUrls = []; // Track URLs to detect redirect loops if (forceDebug) { console.log(formatLogMessage('debug', `${CLOUDFLARE_TAG} Starting verification challenge with max ${retryConfig.maxAttempts} attempts`)); } return runWithRetries({ label: 'Challenge', retryConfig, forceDebug,