UNPKG

@fanboynz/network-scanner

Version:

A Puppeteer-based network scanner for analyzing web traffic, generating adblock filter rules, and identifying third-party requests. Features include fingerprint spoofing, Cloudflare bypass, content analysis with curl/grep, and multiple output formats.

1,258 lines (1,105 loc) 85.1 kB
/** * Cloudflare bypass and challenge handling module - Optimized with smart detection and adaptive timeouts * Version: 2.7.0 - Major fixes and performance overhaul * - Fix: Challenge solvers had empty if-blocks (JS/Turnstile/Legacy never executed in non-debug mode) * - Fix: a[href*="continue"] false positive removed (matched nearly every website) * - Perf: Domain-level detection cache (was per-URL, now per-hostname) * - Perf: Timeout outcome caching (domain times out once -> all subsequent URLs skip instantly) * - Perf: Short-circuit quick detection (title/URL -> fast selectors -> slow text, early return at each stage) * - Perf: Eliminated body.textContent in quick detection (was extracting entire DOM text tree) * - Perf: Capped body.textContent to 2KB in analyzeCloudflareChallenge * - Perf: No-indicator pages skip immediately regardless of config (was 10-15s wasted) * - Perf: Quick detection timeout 4s->2s, retries 2->1 * - Perf: PAGE_EVALUATION timeout 12s->5s, detached frame delay 3s->1s * - Perf: Inner timeouts tightened to fit within outer adaptive timeouts * - Perf: CHALLENGE_SOLVING 30s->12s, TURNSTILE_COMPLETION 20s->10s, JS_CHALLENGE_BUFFER 26s->12s * - Perf: MAX_RETRIES 3->2, baseDelay 1000->800ms, maxDelay 8000->5000ms * - Perf: Parallel detection gated behind cloudflare config (was running on every URL) * Version: 2.6.3 - Fixes Cannot read properties of undefined (reading 'hasIndicators') * Version: 2.6.2 - Further detached Frame fixes * Version: 2.6.1 - timeoutId is not defined & race condition fix * Version: 2.6.0 - Memory leak fixes and timeout cleanup * Version: 2.5.0 - Fix Frame Lifecycle issue, Timing and Race condition * Version: 2.4.1 - Bump timeout values * Version: 2.4.0 - Fix possible endless loops with retry logic and loop detection * Version: 2.3.1 - Colorize CF * Version: 2.3.0 - Support CF iframe challenges, and better error handling * Version: 2.2.0 - Enhanced with retry logic, caching, and improved error handling * Version: 2.1.0 - Enhanced with quick detection, adaptive timeouts, and comprehensive debug logging * Handles phishing warnings, Turnstile challenges, and modern Cloudflare protections */ // Import color utilities const { formatLogMessage } = require('./colorize'); /** * Module version information */ const CLOUDFLARE_MODULE_VERSION = '2.7.0'; /** * Timeout constants for various operations (in milliseconds) * Optimized timeout constants for Puppeteer 22.x performance (in milliseconds) * All values tuned for maximum scanning speed while maintaining functionality */ const TIMEOUTS = { PAGE_EVALUATION: 5000, // Standard page evaluation timeout (DOM queries are instant) PAGE_EVALUATION_SAFE: 5000, // Safe page evaluation with extra buffer PHISHING_CLICK: 3000, // Timeout for clicking phishing continue button PHISHING_NAVIGATION: 8000, // Wait for navigation after phishing bypass JS_CHALLENGE_BUFFER: 12000, // JS challenge -- must fit within 15s adaptive outer timeout TURNSTILE_COMPLETION: 10000, // Turnstile completion check -- fits within adaptive timeout TURNSTILE_COMPLETION_BUFFER: 12000, // Turnstile completion with buffer CLICK_TIMEOUT: 5000, // Standard click operation timeout CLICK_TIMEOUT_BUFFER: 1000, // Click timeout safety buffer NAVIGATION_TIMEOUT: 15000, // Standard navigation timeout NAVIGATION_TIMEOUT_BUFFER: 2000, // Navigation timeout safety buffer ADAPTIVE_TIMEOUT_WITH_INDICATORS: 25000, // Adaptive timeout when indicators found + explicit config ADAPTIVE_TIMEOUT_WITHOUT_INDICATORS: 20000, // Adaptive timeout with explicit config only ADAPTIVE_TIMEOUT_AUTO_WITH_INDICATORS: 15000, // Adaptive timeout for auto-detected with indicators ADAPTIVE_TIMEOUT_AUTO_WITHOUT_INDICATORS: 10000, // Adaptive timeout for auto-detected without indicators // New timeouts for enhanced functionality RETRY_DELAY: 1000, // Delay between retry attempts MAX_RETRIES: 2, // Maximum retry attempts (only 2 fit within 25s outer timeout) CHALLENGE_POLL_INTERVAL: 500, // Interval for polling challenge completion CHALLENGE_MAX_POLLS: 20 // Maximum polling attempts }; // Fast timeout constants - optimized for speed const FAST_TIMEOUTS = { QUICK_DETECTION: 2000, // Fast Cloudflare detection (DOM check, instant on loaded pages) PHISHING_WAIT: 1000, // Fast phishing check CHALLENGE_WAIT: 500, // Fast challenge detection ELEMENT_INTERACTION_DELAY: 250, // Fast element interactions SELECTOR_WAIT: 3000, // Fast selector waits TURNSTILE_OPERATION: 6000, // Fast Turnstile operations JS_CHALLENGE: 10000, // Fast JS challenge completion CHALLENGE_SOLVING: 12000, // Overall challenge solving -- fits within 15s adaptive outer CHALLENGE_COMPLETION: 8000 // Fast completion check }; /** * Finds and clicks an element inside shadow DOM trees via page.evaluate * Returns {found, clicked, x, y} - coordinates allow fallback mouse.click */ async function clickInShadowDOM(context, selectors, forceDebug = false, waitMs = 1500) { // Try Puppeteer's pierce/ selector first -- handles CLOSED shadow roots via CDP for (const selector of selectors) { try { // Wait for element to appear (handles delayed rendering) const start = Date.now(); const element = await context.waitForSelector(`pierce/${selector}`, { timeout: waitMs }); if (element) { const box = await element.boundingBox(); if (box && box.width > 0 && box.height > 0) { if (forceDebug) console.log(formatLogMessage('cloudflare', `pierce/${selector} matched in ${Date.now() - start}ms -- box: ${box.width}x${box.height} at (${box.x},${box.y})`)); await element.click(); await element.dispose(); return { found: true, clicked: true, selector, x: box.x + box.width / 2, y: box.y + box.height / 2 }; } if (forceDebug) console.log(formatLogMessage('cloudflare', `pierce/${selector} found but not visible (0x0)`)); await element.dispose(); // Element found but not visible return { found: true, clicked: false, selector, x: 0, y: 0 }; } } catch (e) { if (forceDebug) console.log(formatLogMessage('cloudflare', `pierce/${selector} timeout after ${waitMs}ms`)); continue; } } // Fallback: manual traversal for open shadow roots const result = await context.evaluate((sels) => { function deepQuery(root, selector) { // Try direct query first const el = root.querySelector(selector); if (el) return el; // Traverse shadow roots const allElements = root.querySelectorAll('*'); for (const node of allElements) { if (node.shadowRoot) { const found = deepQuery(node.shadowRoot, selector); if (found) return found; } } return null; } for (const selector of sels) { const el = deepQuery(document, selector); if (el) { const rect = el.getBoundingClientRect(); if (rect.width > 0 && rect.height > 0) { el.click(); return { found: true, clicked: true, selector, x: rect.x + rect.width / 2, y: rect.y + rect.height / 2 }; } return { found: true, clicked: false, selector, x: 0, y: 0 }; } } return { found: false, clicked: false, selector: null, x: 0, y: 0 }; }, selectors); return result; } /** * Error categories for better handling */ const ERROR_TYPES = { NETWORK: 'network', TIMEOUT: 'timeout', ELEMENT_NOT_FOUND: 'element_not_found', EVALUATION_FAILED: 'evaluation_failed', NAVIGATION_FAILED: 'navigation_failed', DETACHED_FRAME: 'detached_frame', UNKNOWN: 'unknown' }; /** * Retry configuration with exponential backoff. * Defined before getRetryConfig so the reference order is structurally * sound — previously getRetryConfig was hoisted above this const and only * worked because the function was never called during module load. */ const RETRY_CONFIG = { maxAttempts: 2, // Only 2 attempts fit within 25s outer timeout baseDelay: 800, // Slightly faster retry delay maxDelay: 5000, // Lower max delay cap backoffMultiplier: 2, retryableErrors: [ERROR_TYPES.NETWORK, ERROR_TYPES.TIMEOUT, ERROR_TYPES.ELEMENT_NOT_FOUND, ERROR_TYPES.DETACHED_FRAME] }; /** * Gets the retry configuration for a site, merging site-specific and global settings * @param {Object} siteConfig - Site configuration object * @returns {Object} Merged retry configuration */ function getRetryConfig(siteConfig) { return { maxAttempts: siteConfig.cloudflare_max_retries || RETRY_CONFIG.maxAttempts, baseDelay: RETRY_CONFIG.baseDelay, maxDelay: RETRY_CONFIG.maxDelay, backoffMultiplier: RETRY_CONFIG.backoffMultiplier, retryableErrors: RETRY_CONFIG.retryableErrors, retryOnError: siteConfig.cloudflare_retry_on_error !== false // Default to true }; } /** * Detects if we're in a challenge redirect loop by checking URL patterns */ function detectChallengeLoop(url, previousUrls = []) { // Check if current URL contains challenge indicators and we've seen similar URLs const isChallengeUrl = url.includes('/cdn-cgi/challenge-platform/') || url.includes('challenges.cloudflare.com') || url.includes('cf-ray'); if (!isChallengeUrl) return false; // Check if we've seen this exact URL or very similar challenge URLs const similarUrls = previousUrls.filter(prevUrl => { if (prevUrl === url) return true; // Exact match // Check for similar challenge URLs with different ray IDs if (prevUrl.includes('/cdn-cgi/challenge-platform/') && url.includes('/cdn-cgi/challenge-platform/')) { return true; } return false; }); return similarUrls.length >= 2; // Loop detected if we've seen similar URLs 2+ times } /** * Performance cache for detection results * Stores detection results per domain to avoid redundant checks */ class CloudflareDetectionCache { constructor(ttl = 300000) { // 5 minutes TTL by default this.cache = new Map(); this.ttl = ttl; this.hits = 0; this.misses = 0; // Prevent memory buildup in long-running processes. unref() so the // interval never prevents the Node process from exiting on its own — // nwss.js calls cleanup() explicitly on scan completion, but any other // consumer of this module that forgets to is still safe. this.cleanupInterval = setInterval(() => this.cleanupExpired(), ttl / 10); this.cleanupInterval.unref(); } getCacheKey(url) { try { const urlObj = new URL(url); return urlObj.hostname; // Domain-level caching: all URLs from same host share one entry } catch { return url; } } get(url) { const key = this.getCacheKey(url); const cached = this.cache.get(key); if (cached && Date.now() - cached.timestamp < this.ttl) { this.hits++; return cached.data; } if (cached) { this.cache.delete(key); // Remove expired entry } this.misses++; return null; } set(url, data) { const key = this.getCacheKey(url); this.cache.set(key, { data, timestamp: Date.now() }); // Prevent cache from growing too large if (this.cache.size > 1000) { const firstKey = this.cache.keys().next().value; this.cache.delete(firstKey); } } cleanupExpired() { const now = Date.now(); for (const [key, value] of this.cache.entries()) { if (now - value.timestamp >= this.ttl) { this.cache.delete(key); } } } destroy() { if (this.cleanupInterval) clearInterval(this.cleanupInterval); this.clear(); } clear() { this.cache.clear(); this.hits = 0; this.misses = 0; } getStats() { const total = this.hits + this.misses; return { hits: this.hits, misses: this.misses, hitRate: total > 0 ? (this.hits / total * 100).toFixed(2) + '%' : '0%', size: this.cache.size }; } } // Initialize cache singleton const detectionCache = new CloudflareDetectionCache(); // One-shot flag for the per-process module-version banner. Was previously // logged once per URL in handleCloudflareProtection's debug header, which // produces N=URL-count copies for no useful signal beyond the first. let _moduleVersionLogged = false; /** * Gets module version information * @returns {object} Version information object */ function getModuleInfo() { return { version: CLOUDFLARE_MODULE_VERSION, name: 'Cloudflare Protection Handler' }; } /** * Validates if a URL should be processed by Cloudflare protection * Only allows HTTP/HTTPS URLs, skips browser-internal and special protocols * @param {string} url - URL to validate * @param {boolean} forceDebug - Debug logging flag * @returns {boolean} True if URL should be processed */ // Single precompiled regex anchored to URL start. Matches any of the // browser-internal / special protocols we want to skip, plus succeeds on // http(s):// for the inverse check below. Faster than running 13 sequential // startsWith comparisons per URL. const SKIP_PROTO_RE = /^(?:about|chrome|chrome-extension|chrome-error|chrome-search|devtools|edge|moz-extension|safari-extension|webkit|data|blob|javascript|vbscript|file|ftp|ftps):/i; const HTTP_PROTO_RE = /^https?:\/\//i; function shouldProcessUrl(url, forceDebug = false) { if (!url || typeof url !== 'string') { if (forceDebug) console.log(formatLogMessage('cloudflare', `[url-validation] Skipping invalid URL: ${url}`)); return false; } const skipMatch = url.match(SKIP_PROTO_RE); if (skipMatch) { if (forceDebug) { console.log(formatLogMessage('cloudflare', `[url-validation] Skipping ${skipMatch[0].toLowerCase()} URL: ${url.substring(0, 100)}${url.length > 100 ? '...' : ''}`)); } return false; } if (!HTTP_PROTO_RE.test(url)) { if (forceDebug) { console.log(formatLogMessage('cloudflare', `[url-validation] Skipping non-HTTP(S) URL: ${url.substring(0, 100)}${url.length > 100 ? '...' : ''}`)); } return false; } return true; } /** * Fast timeout helper for Puppeteer 22.x compatibility * Replaces deprecated page.waitForTimeout() with standard Promise-based approach */ async function waitForTimeout(page, timeout) { // Use fast Promise-based timeout for Puppeteer 22.x compatibility // This eliminates the deprecated API dependency and improves performance return new Promise(resolve => setTimeout(resolve, timeout)); } /** * Captures whether the page currently has Cloudflare's two key cookies. * cf_clearance is the post-challenge clearance token — its presence is the * single most reliable "did the bypass actually succeed" signal, beating * any DOM-side completion check. __cf_bm is the bot-mitigation cookie * (typically set on every request that goes through CF's edge). * Errors swallowed: cookie read failures should not affect bypass logic. */ async function getCfCookieState(page) { try { const cookies = await page.cookies(); let cf_clearance = false; let cf_bm = false; for (const c of cookies) { if (c.name === 'cf_clearance') cf_clearance = true; else if (c.name === '__cf_bm') cf_bm = true; } return { cf_clearance, cf_bm }; } catch { return { cf_clearance: false, cf_bm: false }; } } /** * Maps a handleCloudflareProtection result back to a short outcome tag * for the per-URL summary log. The tag is grep-friendly (no spaces) so * users can post-process scan logs by outcome category. */ function buildOutcomeString(result, errorCode) { if (!result) return 'unknown'; if (result.skippedInvalidUrl) return 'skipped(non-http)'; if (result.quickDetectionFailed) return 'detection_failed'; if (result.cloudflareErrorPage) return `error_page(${errorCode || '5xx'})`; if (result.timedOut) return 'timeout'; if (result.verificationChallenge?.requiresHuman) return 'captcha_required'; if (result.verificationChallenge?.attempted && result.verificationChallenge?.success) { return `solved(${result.verificationChallenge.method || 'unknown'})`; } if (result.phishingWarning?.attempted && result.phishingWarning?.success) { return 'solved(phishing_continue)'; } if (result.skippedNoIndicators) return 'no_indicators'; if (!result.overallSuccess) return 'failed'; return 'ok'; } /** * Categorizes errors for better handling */ function categorizeError(error) { // Guard against null/undefined error so callers using categorizeError in // safe-defaults return paths (e.g. safePageEvaluate's final fallback when // lastError was never assigned) don't blow up reading .message. if (!error) return ERROR_TYPES.UNKNOWN; const errorMessage = error.message || ''; if (errorMessage.includes('detached Frame') || errorMessage.includes('Attempted to use detached')) { return ERROR_TYPES.DETACHED_FRAME; } if (errorMessage.includes('timeout') || errorMessage.includes('Timeout')) { return ERROR_TYPES.TIMEOUT; } if (errorMessage.includes('Protocol error') || errorMessage.includes('Target closed')) { return ERROR_TYPES.NETWORK; } if (errorMessage.includes('evaluation') || errorMessage.includes('Evaluation')) { return ERROR_TYPES.EVALUATION_FAILED; } if (errorMessage.includes('navigation') || errorMessage.includes('Navigation')) { return ERROR_TYPES.NAVIGATION_FAILED; } return ERROR_TYPES.UNKNOWN; } /** * Implements exponential backoff delay */ function getRetryDelay(attempt) { const delay = Math.min( RETRY_CONFIG.baseDelay * Math.pow(RETRY_CONFIG.backoffMultiplier, attempt - 1), RETRY_CONFIG.maxDelay ); return delay; } /** * Enhanced safe page evaluation with retry logic and better error handling */ async function safePageEvaluate(page, func, timeout = TIMEOUTS.PAGE_EVALUATION_SAFE, options = {}) { const { maxRetries = RETRY_CONFIG.maxAttempts, forceDebug = false } = options; let lastError = null; for (let attempt = 1; attempt <= maxRetries; attempt++) { let timeoutId = null; try { // Multi-layered page state validation if (page.isClosed()) { throw new Error('Page is closed or invalid'); } // Check if page is still navigating or has valid context let currentUrl; try { currentUrl = await page.url(); if (!currentUrl || currentUrl === 'about:blank') { throw new Error('Page URL is invalid or blank'); } } catch (urlError) { throw new Error('Page URL access failed - likely detached'); } const result = await Promise.race([ page.evaluate(func), new Promise((_, reject) => { timeoutId = setTimeout(() => reject(new Error('Page evaluation timeout')), timeout); }) ]); // Clear timeout if evaluation completed first if (timeoutId) { clearTimeout(timeoutId); } if (forceDebug && attempt > 1) { console.log(formatLogMessage('cloudflare', `Page evaluation succeeded on attempt ${attempt}`)); } return result; } catch (error) { // Ensure timeout is cleared on any error if (timeoutId) { clearTimeout(timeoutId); } lastError = error; const errorType = categorizeError(error); if (forceDebug) { console.warn(formatLogMessage('cloudflare', `Page evaluation failed (attempt ${attempt}/${maxRetries}): ${error.message} [${errorType}]`)); } // Handle detached frame errors specifically if (errorType === ERROR_TYPES.DETACHED_FRAME) { if (forceDebug) { console.warn(formatLogMessage('cloudflare', `Detached frame detected on attempt ${attempt}/${maxRetries} - using longer delay`)); } // For detached frames, brief delay before retry await new Promise(resolve => setTimeout(resolve, 1000)); // For detached frames, only retry once more if (attempt >= 2) { break; } continue; } // Don't retry if error type is not retryable or if it's the last attempt if (!RETRY_CONFIG.retryableErrors.includes(errorType) || attempt === maxRetries) { break; } // Wait before retrying with exponential backoff await new Promise(resolve => setTimeout(resolve, getRetryDelay(attempt))); } } // Return safe defaults if all retries failed return { isChallengePresent: false, isPhishingWarning: false, isTurnstile: false, isJSChallenge: false, isChallengeCompleted: false, error: lastError?.message || 'Unknown error', errorType: categorizeError(lastError), attempts: maxRetries }; } /** * Safe element clicking with timeout protection */ async function safeClick(page, selector, timeout = TIMEOUTS.CLICK_TIMEOUT) { let timeoutId; try { return await Promise.race([ page.click(selector, { timeout: timeout }), new Promise((_, reject) => { timeoutId = setTimeout(() => reject(new Error('Click timeout')), timeout + TIMEOUTS.CLICK_TIMEOUT_BUFFER); }) ]); } catch (error) { throw new Error(`Click failed: ${error.message}`); } finally { if (timeoutId) clearTimeout(timeoutId); } } /** * Safe navigation waiting with timeout protection */ async function safeWaitForNavigation(page, timeout = TIMEOUTS.NAVIGATION_TIMEOUT) { let timeoutId; try { return await Promise.race([ page.waitForNavigation({ waitUntil: 'domcontentloaded', timeout: timeout }), new Promise((_, reject) => { timeoutId = setTimeout(() => reject(new Error('Navigation timeout')), timeout + TIMEOUTS.NAVIGATION_TIMEOUT_BUFFER); }) ]); } catch (error) { console.warn(formatLogMessage('cloudflare', `Navigation wait failed: ${error.message}`)); } finally { if (timeoutId) clearTimeout(timeoutId); } } /** * Quick Cloudflare detection with caching for performance */ async function quickCloudflareDetection(page, forceDebug = false) { try { // Get current page URL and validate it const currentPageUrl = await page.url(); if (!shouldProcessUrl(currentPageUrl, forceDebug)) { if (forceDebug) { console.log(formatLogMessage('cloudflare', `Quick detection skipping non-HTTP(S) page: ${currentPageUrl}`)); } return { hasIndicators: false, skippedInvalidUrl: true }; } // Check cache first const cachedResult = detectionCache.get(currentPageUrl); if (cachedResult !== null) { if (forceDebug) { const stats = detectionCache.getStats(); console.log(formatLogMessage('cloudflare', `Using cached detection result (cache hit rate: ${stats.hitRate})`)); } // Return a fresh shallow copy tagged _fromCache so the handler's // logging can say "[cached]" instead of presenting cached title/body // details as if they were fresh. return { ...cachedResult, _fromCache: true }; } // Perform actual detection with enhanced error handling const quickCheck = await safePageEvaluate(page, () => { const title = document.title || ''; const url = window.location.href; // Cloudflare-served 5xx origin-error pages (522/523/524/525/526/527/530). // Title format is reliable: "<domain> | 5xx: <reason>". These are NOT // bypass-able challenges — the origin is unreachable. Mark as // recognized (hasErrorPage) but NOT as a bypass target (hasIndicators // stays false) so the early-skip path still fires and the log can say // "Cloudflare error page" instead of the misleading "No Cloudflare // indicators found". errorCode is the captured 5xx digit so outcome // logs can grep by specific error type. const titleErrorMatch = title.match(/\|\s*(5\d\d):/); if (titleErrorMatch) { return { hasIndicators: false, hasErrorPage: true, errorCode: parseInt(titleErrorMatch[1], 10), title, url, bodySnippet: '' }; } // FAST PATH: Check title + URL first (string ops, no DOM traversal) const titleMatch = title.includes('Just a moment') || title.includes('Checking your browser') || title.includes('Attention Required') || title.includes('Security check'); const urlMatch = url.includes('/cdn-cgi/challenge-platform/') || url.includes('cloudflare.com'); if (titleMatch || urlMatch) { return { hasIndicators: true, title, url, bodySnippet: '' }; } // MEDIUM PATH: Combine fast-path selectors into one query — one DOM // walk for all 7 alternatives instead of up to 7 separate walks. const selectorMatch = document.querySelector( '[data-ray], [data-cf-challenge], .cf-challenge-running, .cf-turnstile, .cf-managed-challenge, [data-cf-managed], script[src*="/cdn-cgi/challenge-platform/"]' ); if (selectorMatch) { return { hasIndicators: true, title, url, bodySnippet: '' }; } // SLOW PATH: Extract limited body text only if fast checks failed // Use body.innerText capped to first child nodes instead of full textContent let bodyText = ''; if (document.body) { const el = document.body.querySelector('.main-wrapper, .main-content, #challenge-body-text, .cf-challenge-container'); bodyText = el ? el.textContent.substring(0, 300) : (document.body.firstElementChild ? document.body.firstElementChild.textContent.substring(0, 300) : ''); } const textMatch = bodyText.includes('Cloudflare') || bodyText.includes('cf-ray') || bodyText.includes('Verify you are human') || bodyText.includes('This website has been reported for potential phishing') || bodyText.includes('Please wait while we verify') || bodyText.includes('Checking if the site connection is secure'); // Remaining slower selectors — combined into one query for the same reason. const slowSelectorMatch = document.querySelector( '.cf-challenge-container, .ctp-checkbox-container, iframe[src*="challenges.cloudflare.com"], iframe[title*="Cloudflare security challenge"]' ); // Body-text fallback for error pages with non-standard titles. // Same rationale as the early title check: recognize but don't bypass. const bodyErrorMatch = bodyText.match(/Error code (5\d\d)/); if (bodyErrorMatch && !textMatch && !slowSelectorMatch) { return { hasIndicators: false, hasErrorPage: true, errorCode: parseInt(bodyErrorMatch[1], 10), title, url, bodySnippet: bodyText.substring(0, 200) }; } return { hasIndicators: !!(textMatch || slowSelectorMatch), title, url, bodySnippet: bodyText.substring(0, 200) }; }, FAST_TIMEOUTS.QUICK_DETECTION, { maxRetries: 1, forceDebug }); // Cache the result detectionCache.set(currentPageUrl, quickCheck); if (forceDebug) { if (quickCheck.hasIndicators) { console.log(formatLogMessage('cloudflare', `Quick detection found Cloudflare indicators on ${quickCheck.url}`)); } // hasErrorPage and no-indicators cases are deliberately silent here — // handleCloudflareProtection prints a clearer per-action line right // after ("Cloudflare error page detected..." or "No Cloudflare // indicators found, skipping protection handling..."), so logging // here would just duplicate it. if (quickCheck.attempts && quickCheck.attempts > 1) { console.log(formatLogMessage('cloudflare', `Detection required ${quickCheck.attempts} attempts`)); } } return quickCheck; } catch (error) { if (forceDebug) console.log(formatLogMessage('cloudflare', `Quick detection failed: ${error.message}`)); return { hasIndicators: false, error: error.message }; } } /** * Analyzes the current page to detect Cloudflare challenges - Enhanced with timeout protection and detailed debug logging */ async function analyzeCloudflareChallenge(page) { try { // CDP-level frame check -- bypasses closed shadow roots const frames = page.frames(); const hasChallengeFrame = frames.some(f => { const url = f.url(); return url.includes('challenges.cloudflare.com') || url.includes('/cdn-cgi/challenge-platform/'); }); const result = await safePageEvaluate(page, () => { const title = document.title || ''; // Cap text extraction -- on content-heavy pages body.textContent can be megabytes const bodyText = document.body ? document.body.textContent.substring(0, 2000) : ''; // Updated selectors for 2025 Cloudflare challenges. Each category groups // its alternatives into a single comma-separated selector so the browser // walks the DOM once per category instead of once per alternative. const hasTurnstileIframe = !!document.querySelector( 'iframe[title*="Cloudflare security challenge"], iframe[src*="challenges.cloudflare.com"], iframe[title*="Widget containing a Cloudflare"]' ); const hasTurnstileContainer = !!document.querySelector( '.cf-turnstile, .ctp-checkbox-container, .ctp-checkbox-label' ); const hasTurnstileCheckbox = !!document.querySelector( 'input[type="checkbox"].ctp-checkbox, .ctp-checkbox' ); const hasLegacyCheckbox = !!document.querySelector( 'input[type="checkbox"]#challenge-form, input[type="checkbox"][name="cf_captcha_kind"]' ); const hasChallengeRunning = !!document.querySelector( '.cf-challenge-running, .cf-challenge-container, .challenge-stage, .challenge-form' ); const hasDataRay = !!document.querySelector('[data-ray], [data-cf-challenge]'); const hasCaptcha = bodyText.includes('CAPTCHA') || bodyText.includes('captcha') || bodyText.includes('hCaptcha') || bodyText.includes('reCAPTCHA'); const hasJSChallenge = document.querySelector('script[src*="/cdn-cgi/challenge-platform/"]') !== null || bodyText.includes('Checking your browser') || bodyText.includes('Please wait while we verify'); const hasPhishingWarning = bodyText.includes('This website has been reported for potential phishing') || title.includes('Attention Required'); const hasTurnstileResponse = document.querySelector('input[name="cf-turnstile-response"]') !== null; const isChallengeCompleted = hasTurnstileResponse && document.querySelector('input[name="cf-turnstile-response"]')?.value; const isChallengePresent = title.includes('Just a moment') || title.includes('Checking your browser') || bodyText.includes('Verify you are human') || hasLegacyCheckbox || hasChallengeRunning || hasDataRay || hasTurnstileIframe || hasTurnstileContainer || hasJSChallenge; return { isChallengePresent, isPhishingWarning: hasPhishingWarning, isTurnstile: hasTurnstileIframe || hasTurnstileContainer || hasTurnstileCheckbox, isJSChallenge: hasJSChallenge, isChallengeCompleted, title, hasLegacyCheckbox, hasTurnstileIframe, hasTurnstileContainer, hasTurnstileCheckbox, hasChallengeRunning, hasDataRay, hasCaptcha, hasTurnstileResponse, url: window.location.href, bodySnippet: bodyText.substring(0, 200) }; }, TIMEOUTS.PAGE_EVALUATION); // Merge CDP frame detection -- catches iframes behind closed shadow roots if (hasChallengeFrame && !result.hasTurnstileIframe) { result.hasTurnstileIframe = true; result.isTurnstile = true; result.isChallengePresent = true; } return result; } catch (error) { return { isChallengePresent: false, isPhishingWarning: false, isTurnstile: false, isJSChallenge: false, isChallengeCompleted: false, error: error.message }; } } /** * Handles Cloudflare phishing warnings with timeout protection and enhanced debug logging * * @param {Object} page - Puppeteer page instance * @param {string} currentUrl - URL being processed * @param {boolean} forceDebug - Debug logging flag * @returns {Promise<Object>} Phishing warning result: * { * success: boolean, // True if no warning found OR successfully bypassed * attempted: boolean, // True if warning was detected and bypass attempted * error: string|null, // Error message if bypass failed * details: object|null // Analysis details from analyzeCloudflareChallenge() * } */ async function handlePhishingWarning(page, currentUrl, forceDebug = false) { const result = { success: false, attempted: false, error: null, details: null }; try { if (forceDebug) console.log(formatLogMessage('cloudflare', `Checking for phishing warning on ${currentUrl}`)); // Shorter wait with timeout protection await waitForTimeout(page, FAST_TIMEOUTS.PHISHING_WAIT); const challengeInfo = await analyzeCloudflareChallenge(page); if (challengeInfo.isPhishingWarning) { result.attempted = true; result.details = challengeInfo; if (forceDebug) { console.log(formatLogMessage('cloudflare', `Phishing warning detected on ${currentUrl}:`)); console.log(formatLogMessage('cloudflare', ` Page Title: "${challengeInfo.title}"`)); console.log(formatLogMessage('cloudflare', ` Current URL: ${challengeInfo.url}`)); console.log(formatLogMessage('cloudflare', ` Body snippet: ${challengeInfo.bodySnippet}`)); } try { // Use safe click with shorter timeout await safeClick(page, 'a[href*="continue"]', TIMEOUTS.PHISHING_CLICK); await safeWaitForNavigation(page, TIMEOUTS.PHISHING_NAVIGATION); result.success = true; if (forceDebug) console.log(formatLogMessage('cloudflare', `Successfully bypassed phishing warning for ${currentUrl}`)); } catch (clickError) { result.error = `Failed to click continue button: ${clickError.message}`; if (forceDebug) console.log(formatLogMessage('cloudflare', `Failed to bypass phishing warning: ${clickError.message}`)); } } else { if (forceDebug) console.log(formatLogMessage('cloudflare', `No phishing warning detected on ${currentUrl}`)); result.success = true; // No warning to handle } } catch (error) { result.error = error.message; if (forceDebug) console.log(formatLogMessage('cloudflare', `Phishing warning check failed for ${currentUrl}: ${error.message}`)); } return result; } /** * Attempts to solve Cloudflare challenges with timeout protection and enhanced debug logging * * @param {Object} page - Puppeteer page instance * @param {string} currentUrl - URL being processed * @param {boolean} forceDebug - Debug logging flag * @returns {Promise<Object>} Challenge verification result: * { * success: boolean, // True if no challenge found OR successfully solved * attempted: boolean, // True if challenge was detected and solving attempted * error: string|null, // Error message if solving failed * requiresHuman: boolean, // True if CAPTCHA detected (requires manual intervention) * method: string|null, // Method that succeeded: 'js_challenge_wait', 'turnstile', 'legacy_checkbox' * details: object|null // Analysis details from analyzeCloudflareChallenge() * } */ async function handleVerificationChallenge(page, currentUrl, forceDebug = false) { const result = { success: false, attempted: false, error: null, details: null, requiresHuman: false, method: null }; try { if (forceDebug) console.log(formatLogMessage('cloudflare', `Checking for verification challenge on ${currentUrl}`)); // Reduced wait time await waitForTimeout(page, FAST_TIMEOUTS.CHALLENGE_WAIT); const challengeInfo = await analyzeCloudflareChallenge(page); result.details = challengeInfo; if (challengeInfo.isChallengePresent) { result.attempted = true; if (forceDebug) { console.log(formatLogMessage('cloudflare', `Challenge detected on ${currentUrl}:`)); console.log(formatLogMessage('cloudflare', ` Page Title: "${challengeInfo.title}"`)); console.log(formatLogMessage('cloudflare', ` Current URL: ${challengeInfo.url}`)); console.log(formatLogMessage('cloudflare', ` Is Turnstile: ${challengeInfo.isTurnstile}`)); console.log(formatLogMessage('cloudflare', ` Is JS Challenge: ${challengeInfo.isJSChallenge}`)); console.log(formatLogMessage('cloudflare', ` Has Legacy Checkbox: ${challengeInfo.hasLegacyCheckbox}`)); console.log(formatLogMessage('cloudflare', ` Has Turnstile Iframe: ${challengeInfo.hasTurnstileIframe}`)); console.log(formatLogMessage('cloudflare', ` Has Turnstile Container: ${challengeInfo.hasTurnstileContainer}`)); console.log(formatLogMessage('cloudflare', ` Has Turnstile Checkbox: ${challengeInfo.hasTurnstileCheckbox}`)); console.log(formatLogMessage('cloudflare', ` Has CAPTCHA: ${challengeInfo.hasCaptcha}`)); console.log(formatLogMessage('cloudflare', ` Has Challenge Running: ${challengeInfo.hasChallengeRunning}`)); console.log(formatLogMessage('cloudflare', ` Has Data Ray: ${challengeInfo.hasDataRay}`)); console.log(formatLogMessage('cloudflare', ` Has Turnstile Response: ${challengeInfo.hasTurnstileResponse}`)); console.log(formatLogMessage('cloudflare', ` Body snippet: ${challengeInfo.bodySnippet}`)); } // Check for CAPTCHA that requires human intervention if (challengeInfo.hasCaptcha) { result.requiresHuman = true; result.error = 'CAPTCHA detected - requires human intervention'; if (forceDebug) console.log(formatLogMessage('cloudflare', `Skipping automatic bypass due to CAPTCHA requirement`)); return result; } // Attempt to solve the challenge with timeout protection const solveResult = await attemptChallengeSolveWithTimeout(page, currentUrl, challengeInfo, forceDebug); result.success = solveResult.success; result.error = solveResult.error; result.method = solveResult.method; } else { if (forceDebug) console.log(formatLogMessage('cloudflare', `No verification challenge detected on ${currentUrl}`)); result.success = true; } } catch (error) { result.error = error.message; if (forceDebug) console.log(formatLogMessage('cloudflare', `Challenge check failed for ${currentUrl}: ${error.message}`)); } return result; } /** * Enhanced challenge handling with retry logic and loop detection */ async function handleVerificationChallengeWithRetries(page, currentUrl, siteConfig, forceDebug = false) { const retryConfig = getRetryConfig(siteConfig); const visitedUrls = []; // Track URLs to detect redirect loops let lastError = null; if (forceDebug) { console.log(formatLogMessage('cloudflare', `Starting verification challenge with max ${retryConfig.maxAttempts} attempts`)); } for (let attempt = 1; attempt <= retryConfig.maxAttempts; attempt++) { try { const currentPageUrl = await page.url(); visitedUrls.push(currentPageUrl); // Check for redirect loops if (detectChallengeLoop(currentPageUrl, visitedUrls)) { const error = `Challenge redirect loop detected after ${attempt} attempts. URLs: ${visitedUrls.slice(-3).join(' -> ')}`; if (forceDebug) { console.log(formatLogMessage('cloudflare', error)); } return { success: false, attempted: true, error: error, details: null, requiresHuman: false, method: null, attempts: attempt, loopDetected: true }; } if (forceDebug && attempt > 1) { console.log(formatLogMessage('cloudflare', `Challenge attempt ${attempt}/${retryConfig.maxAttempts} for ${currentUrl}`)); } const result = await handleVerificationChallenge(page, currentUrl, forceDebug); if (result.success || result.requiresHuman || !retryConfig.retryOnError) { if (forceDebug && attempt > 1) { console.log(`[debug][cloudflare] Challenge ${result.success ? 'succeeded' : 'failed'} on attempt ${attempt}`); } return { ...result, attempts: attempt }; } // If this wasn't the last attempt, wait before retrying if (attempt < retryConfig.maxAttempts) { const delay = getRetryDelay(attempt); if (forceDebug) { console.log(formatLogMessage('cloudflare', `Challenge attempt ${attempt} failed, retrying in ${delay}ms: ${result.error}`)); } await new Promise(resolve => setTimeout(resolve, delay)); // Refresh the page to get a fresh challenge try { await page.reload({ waitUntil: 'domcontentloaded', timeout: 10000 }); await waitForTimeout(page, 2000); // Give challenge time to load } catch (reloadErr) { if (forceDebug) { console.log(formatLogMessage('cloudflare', `Page reload failed on attempt ${attempt}: ${reloadErr.message}`)); } } } lastError = result.error; } catch (error) { lastError = error.message; const errorType = categorizeError(error); if (forceDebug) { console.warn(formatLogMessage('cloudflare', `Challenge attempt ${attempt}/${retryConfig.maxAttempts} failed: ${error.message} [${errorType}]`)); } // Don't retry if error type is not retryable or if it's the last attempt if (!retryConfig.retryableErrors.includes(errorType) || attempt === retryConfig.maxAttempts) { return { success: false, attempted: true, error: lastError, details: null, requiresHuman: false, method: null, attempts: attempt, errorType: errorType }; } // Wait before retrying with exponential backoff if (attempt < retryConfig.maxAttempts) { await new Promise(resolve => setTimeout(resolve, getRetryDelay(attempt))); } } } return { success: false, attempted: true, error: `All ${retryConfig.maxAttempts} challenge attempts failed. Last error: ${lastError}`, details: null, requiresHuman: false, method: null, attempts: retryConfig.maxAttempts, maxRetriesExceeded: true }; } /** * Enhanced phishing warning handling with retry logic */ async function handlePhishingWarningWithRetries(page, currentUrl, siteConfig, forceDebug = false) { const retryConfig = getRetryConfig(siteConfig); let lastError = null; for (let attempt = 1; attempt <= retryConfig.maxAttempts; attempt++) { try { if (forceDebug && attempt > 1) { console.log(formatLogMessage('cloudflare', `Phishing warning attempt ${attempt}/${retryConfig.maxAttempts} for ${currentUrl}`)); } const result = await handlePhishingWarning(page, currentUrl, forceDebug); if (result.success || !retryConfig.retryOnError) { if (forceDebug && attempt > 1) { console.log(`[debug][cloudflare] Phishing warning ${result.success ? 'succeeded' : 'failed'} on attempt ${attempt}`); } return { ...result, attempts: attempt }; } // If this wasn't the last attempt, wait before retrying if (attempt < retryConfig.maxAttempts) { const delay = getRetryDelay(attempt); if (forceDebug) { console.log(formatLogMessage('cloudflare', `Phishing warning attempt ${attempt} failed, retrying in ${delay}ms: ${result.error}`)); } await new Promise(resolve => setTimeout(resolve, delay)); } lastError = result.error; } catch (error) { lastError = error.message; const errorType = categorizeError(error); if (forceDebug) { console.warn(formatLogMessage('cloudflare', `Phishing warning attempt ${attempt}/${retryConfig.maxAttempts} failed: ${error.message} [${errorType}]`)); } // Don't retry if error type is not retryable or if it's the last attempt if (!retryConfig.retryableErrors.includes(errorType) || attempt === retryConfig.maxAttempts) { return { success: false, attempted: true, error: lastError, details: null, attempts: attempt, errorType: errorType }; } // Wait before retrying with exponential backoff if (attempt < retryConfig.maxAttempts) { await new Promise(resolve => setTimeout(resolve, getRetryDelay(attempt))); } } } return { success: false, attempted: true, error: `All ${retryConfig.maxAttempts} phishing warning attempts failed. Last error: ${lastError}`, details: null, attempts: retryConfig.maxAttempts, maxRetriesExceeded: true }; } /** * Challenge solving with overall timeout protection */ async function attemptChallengeSolveWithTimeout(page, currentUrl, challengeInfo, forceDebug = false) { const result = { success: false, error: null, method: null }; let timeoutId = null; try { const timeoutPromise = new Promise((_, reject) => { timeoutId = setTimeout(() => reject(new Error('Challenge solving timeout')), FAST_TIMEOUTS.CHALLENGE_SOLVING); }); // Reduced timeout for challenge solving const finalResult = await Promise.race([ attemptChallengeSolve(page, currentUrl, challengeInfo, forceDebug), timeoutPromise ]); // Clear timeout if operation completed first if (timeoutId) { clearTimeout(timeoutId); } return finalResult; } catch (error) { // Clear timeout on error if (timeoutId) { clearTimeout(timeoutId); } result.error = `Challenge solving timed out: ${error.message}`; if (forceDebug) console.log(formatLogMessage('cloudflare', `Challenge solving timeout for ${currentUrl}`)); return result; } } /** * Attempts to solve a Cloudflare challenge with modern techniques and enhanced debug logging */ async function attemptChallengeSolve(page, currentUrl, challengeInfo, forceDebug = false) { const result = { success: false, error: null, method: null }; // Method 1: Handle JS challenges (wait for automatic completion) - Most reliable if (challengeInfo.isJSChallenge) { try { if (forceDebug) console.log(formatLogMessage('cloudflare', `Attempting JS challenge wait for ${currentUrl}`)); const jsResult = await waitForJSChallengeCompletion(page, forceDebug); if (jsResult.success) { // Wait for redirect after challenge completion try { await page.waitForNavigation({ waitUntil: 'domcontentloaded', timeout: 10000 }); if (forceDebug) console.log(formatLogMessage('cloudflare', `Post-challenge redirect completed for ${currentUrl}`)); } catch (navErr) { if (forceDebug) console.log(formatLogMessage('cloudflare', `Post-challenge redirect timeout (may already be on target page): ${navErr.message}`)); } result.success = true; result.method = 'js_challenge_wait'; if (forceDebug) console.log(formatLogMessage('cloudflare', `JS challenge completed successfully for ${currentUrl}`)); return result; } } catch (jsError) { if (forceDebug) console.log(formatLogMessage('cloudflare', `JS challenge wait failed for ${currentUrl}: ${jsError.message}`)); } } else if (forceDebug) { console.log(formatLogMessage('cloudflare', `Skipping JS challenge method (not detected)`)); } // Method 2: Handle Turnstile challenges (interactive) if (challengeInfo.isTurnstile) { try { if (forceDebug) console.log(formatLogMessage('cloudflare', `Attempting Turnstile method for ${currentUrl}`)); const turnstileResult = await handleTurnstileChallenge(page, forceDebug); if (turnstileResult.success) { result.success = true; result.method = 'turnstile'; if (forceDebug) console.log(formatLogMessage('cloudflare', `Turnstile challenge solved successfully for ${currentUrl}`)); return result; } } catch (turnstileError) { if (forceDebug) console.log(formatLogMessage('cloudflare', `Turnstile method failed for ${currentUrl}: ${turnstileError.message}`)); } } else if (forceDebug) { console.log(formatLogMessage('cloudflare', `Skipping Turnstile method (not detected)`)); } // Method 3: Legacy checkbox interaction (fallback) if (challengeInfo.hasLegacyCheckbox) { try { if (forceDebug) console.log(formatLogMessage('cloudflare', `Attempting legacy checkbox method for ${currentUrl}`)); const legacyResult = await handleLegacyCheckbox(page, forceDebug); if (legacyResult.success) { result.success = true; result.method = 'legacy_checkbox'; if (forceDebug) console.log(formatLogMessage('cloudflare', `Legacy checkbox method succeeded for ${currentUrl}`)); return result; } } catch