UNPKG

@fanboynz/network-scanner

Version:

A Puppeteer-based network scanner for analyzing web traffic, generating adblock filter rules, and identifying third-party requests. Features include fingerprint spoofing, Cloudflare bypass, content analysis with curl/grep, and multiple output formats.

509 lines (463 loc) 19.7 kB
// === curl.js - Curl-based Content Download Module === // Handles HTTP content downloading using curl for searchstring analysis const fs = require('fs'); // spawnSync only kept for validateCurlAvailability (runs once at // startup). Production curl downloads go through runProcess (async). const { spawnSync } = require('child_process'); const { runProcess } = require('./spawn-async'); const { messageColors, formatLogMessage } = require('./colorize'); const { getReferrerForUrl } = require('./referrer'); const CURL_TAG = messageColors.processing('[curl]'); // === Constants === const CURL_DEFAULTS = { TIMEOUT_SECONDS: 30, MAX_REDIRECTS: 5, // 50MB to match lib/searchstring.js's downloadWithCurl cap — the two // modules previously had different defaults (10MB vs 50MB) so the same // URL could succeed or fail depending on which code path fetched it. MAX_SIZE_BYTES: 50 * 1024 * 1024, VALIDATION_TIMEOUT: 5000, CURL_SUCCESS_STATUS: 0, VERSION_LINE_INDEX: 0 }; // Module-level so downloadWithCurl doesn't reallocate this closure on // every call. No state captured — pure factory. function errResult(msg) { return { content: '', httpCode: 0, contentType: 'unknown', downloadSize: 0, success: false, error: msg }; } /** * Downloads content using curl with browser-like headers * @param {string} url - The URL to download * @param {string} userAgent - User agent string to use * @param {object} options - Download options * @returns {Promise<object>} Object with content, status, and metadata */ async function downloadWithCurl(url, userAgent = '', options = {}) { const { timeout = CURL_DEFAULTS.TIMEOUT_SECONDS, maxRedirects = CURL_DEFAULTS.MAX_REDIRECTS, maxSize = CURL_DEFAULTS.MAX_SIZE_BYTES, followRedirects = true, customHeaders = {} } = options; const curlArgs = [ '-s', '--max-time', timeout.toString(), '--max-redirs', maxRedirects.toString(), '--fail-with-body', '--compressed', // Leading '\n' guarantees the metadata sits on its own line even // when content has no trailing newline (older format had no // separator and concatenated metadata with the last content byte). '--write-out', '\n%{http_code}|%{content_type}|%{size_download}' ]; if (followRedirects) curlArgs.push('-L'); if (userAgent) curlArgs.push('-H', `User-Agent: ${userAgent}`); curlArgs.push( '-H', 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', '-H', 'Accept-Language: en-US,en;q=0.5', '-H', 'Accept-Encoding: gzip, deflate, br', '-H', 'Connection: keep-alive', '-H', 'Upgrade-Insecure-Requests: 1', '-H', 'Sec-Fetch-Dest: document', '-H', 'Sec-Fetch-Mode: navigate', '-H', 'Sec-Fetch-Site: none', '-H', 'Cache-Control: no-cache' ); Object.entries(customHeaders).forEach(([key, value]) => { curlArgs.push('-H', `${key}: ${value}`); }); curlArgs.push(url); // Shared async-spawn helper handles streaming/cap/timeout/kill plumbing. const result = await runProcess('curl', curlArgs, { timeout: timeout * 1000, maxStdout: maxSize }); if (result.error) return errResult(result.error); if (result.truncated) return errResult(`Output exceeded ${maxSize} bytes`); if (result.signal) return errResult(`Killed by signal ${result.signal}`); if (result.code !== CURL_DEFAULTS.CURL_SUCCESS_STATUS) { return errResult(`Curl exited with status ${result.code}: ${result.stderr.toString('utf8')}`); } const output = result.stdout.toString('utf8'); // lastIndexOf('\n') is a single O(n) scan from the end vs the old // split('\n') + slice(0,-1) + join('\n') which was three full passes // plus two intermediate array allocations. const sepIdx = output.lastIndexOf('\n'); if (sepIdx === -1) return errResult('No metadata separator in curl output'); const content = output.slice(0, sepIdx); const metadata = output.slice(sepIdx + 1); // Split on first/last pipe so the middle (content-type) can legitimately // contain pipes — naive split('|') with parts-count check would drop the // whole response with 'Invalid metadata format' for such content-types. const firstPipe = metadata.indexOf('|'); const lastPipe = metadata.lastIndexOf('|'); if (firstPipe === -1 || firstPipe === lastPipe) { return errResult(`Invalid metadata format: missing pipes in "${metadata}"`); } const httpCode = metadata.slice(0, firstPipe); const contentType = metadata.slice(firstPipe + 1, lastPipe); const downloadSize = metadata.slice(lastPipe + 1); return { content, httpCode: parseInt(httpCode, 10) || 0, contentType: contentType || 'unknown', downloadSize: parseInt(downloadSize, 10) || content.length, success: true }; } /** * Searches content for patterns using JavaScript (case-insensitive) * @param {string} content - Content to search * @param {Array<string>} searchStrings - OR patterns (any can match) * @param {Array<string>} searchStringsAnd - AND patterns (all must match) * @param {boolean} hasSearchStringAnd - Whether AND logic is being used * @returns {object} Search result with found status and matched pattern */ function searchContent(content, searchStrings = [], searchStringsAnd = [], hasSearchStringAnd = false) { if (!content || content.length === 0) { return { found: false, matchedPattern: null, matchType: null }; } const lowerContent = content.toLowerCase(); // Handle AND logic searchstring_and (all patterns must be present). // Short-circuits on first missing pattern — the old code walked the // entire list to build a full missingPatterns array that's only used // by a debug log. Now we early-exit and report the first miss (the // debug log's missingPatterns.join(', ') still works with one entry). if (hasSearchStringAnd && searchStringsAnd.length > 0) { // Pre-lower patterns once — was per-iteration toLowerCase before. // For a 20-pattern AND check the difference is small per call but // the pattern itself never changes between iterations of the loop. const lowered = searchStringsAnd.map(p => p.toLowerCase()); for (let i = 0; i < searchStringsAnd.length; i++) { if (!lowerContent.includes(lowered[i])) { return { found: false, matchedPattern: null, matchType: 'AND', foundPatterns: searchStringsAnd.slice(0, i), missingPatterns: [searchStringsAnd[i]] }; } } return { found: true, matchedPattern: searchStringsAnd.join(' AND '), matchType: 'AND', foundPatterns: searchStringsAnd, missingPatterns: [] }; } // Handle OR logic searchstring (any pattern can match). Same pre-lower // optimization, though OR usually short-circuits early so the savings // are smaller. if (searchStrings.length > 0) { for (let i = 0; i < searchStrings.length; i++) { if (lowerContent.includes(searchStrings[i].toLowerCase())) { return { found: true, matchedPattern: searchStrings[i], matchType: 'OR' }; } } } return { found: false, matchedPattern: null, matchType: null }; } /** * Emits a match for a curl-fetched URL to both the verbose console * (when siteConfig.verbose === 1) and the matched-URLs log file * (when dumpUrls is true). Single source of truth for the format — * both no-searchstring and with-searchstring match paths funnel * through here so partyType / resourceInfo / timestamp / format * don't drift between the two branches. * * @param {object} opts * @param {string} opts.simplifiedUrl * @param {string} opts.requestUrl * @param {boolean} opts.isFirstParty * @param {string|null} opts.resourceType * @param {string|null} opts.matchInfo - null for "matched regex only" * (no searchstring), a string like * 'pattern: "X"' or 'patterns: 2/3' * for searchstring matches * @param {number|undefined} opts.verbose * @param {boolean} opts.dumpUrls * @param {string} opts.matchedUrlsLogFile */ function logMatchedRequest({ simplifiedUrl, requestUrl, isFirstParty, resourceType, matchInfo, verbose, dumpUrls, matchedUrlsLogFile }) { const partyType = isFirstParty ? 'first-party' : 'third-party'; const resourceInfo = resourceType ? ` (${resourceType})` : ''; if (verbose === 1) { const verboseSuffix = matchInfo ? ` contains ${matchInfo}` : ' matched regex'; console.log(formatLogMessage('match', `[${simplifiedUrl}] ${requestUrl} (${partyType}, curl)${verboseSuffix}${resourceInfo}`)); } if (dumpUrls && matchedUrlsLogFile) { const timestamp = new Date().toISOString(); // matchInfo goes INSIDE the (party, curl, ...) parens to mirror the // pre-refactor file format. const fileExtra = matchInfo ? `, ${matchInfo}` : ''; try { fs.appendFileSync(matchedUrlsLogFile, `${timestamp} [match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl${fileExtra})${resourceInfo}\n`); } catch (logErr) { console.warn(formatLogMessage('warn', `Failed to write to matched URLs log: ${logErr.message}`)); } } } /** * Creates a curl-based URL handler for downloading and searching content * @param {object} config - Configuration object containing all necessary parameters * @returns {Function} URL handler function for curl-based content analysis */ function createCurlHandler(config) { const { searchStrings, searchStringsAnd, hasSearchStringAnd, regexes, // matchedDomains intentionally not destructured — only addMatchedDomain // is called; the underlying collection is opaque to this handler. addMatchedDomain, isDomainAlreadyDetected, onContentFetched, currentUrl, perSiteSubDomains, ignoreDomains, matchesIgnoreDomain, getRootDomain, siteConfig, dumpUrls, matchedUrlsLogFile, forceDebug, userAgent, resourceType, hasSearchString } = config; // Hoisted: currentUrl doesn't change for this handler's lifetime, so // parsing its root domain once at handler-creation eliminates the // per-request parse + getRootDomain call. let currentRootDomain = ''; try { currentRootDomain = getRootDomain(currentUrl); } catch (_) {} return async function curlHandler(requestUrl) { try { // Regex check FIRST — cheap filter that skips ~99% of requests. // Previously this ran AFTER a URL parse + domain-cache lookup, // paying for parses on requests we then immediately drop. const matchesRegex = regexes.some(re => re.test(requestUrl)); if (!matchesRegex) { if (forceDebug) { console.log(formatLogMessage('debug', `${CURL_TAG} URL ${requestUrl} doesn't match any regex patterns`)); } return; } // Parse requestUrl ONCE and reuse. The prior structure parsed it // 4-6 times: two `new URL().hostname` calls, two dead-var // hostname computations that were never read, plus the // getRootDomain calls. Single parse + the cache key (fullSubdomain) // + first-party root-domain comparison all come from this one URL // object now. let requestHostname; try { requestHostname = new URL(requestUrl).hostname; } catch (_) { return; } const fullSubdomain = requestHostname; // always the full subdomain // Compute requestRootDomain ONCE — derive respDomain from it when // perSiteSubDomains is false, and reuse it for the first-party // check. Previously getRootDomain(requestUrl) was called twice in // that path. const requestRootDomain = getRootDomain(requestUrl); const respDomain = perSiteSubDomains ? requestHostname : requestRootDomain; // Skip if already detected to avoid duplicates if (isDomainAlreadyDetected(fullSubdomain)) { if (forceDebug) { console.log(formatLogMessage('debug', `${CURL_TAG} Skipping already detected subdomain: ${fullSubdomain}`)); } return; } // First-party = same registrable root domain. Same definition the // main request handler uses; matches what searchstring.js's // responseHandler does too (post the cross-module unification). const isFirstParty = currentRootDomain === requestRootDomain; // Apply first-party/third-party filtering. `=== false` only (no // `|| === 0`) — matches lib/searchstring.js and the main request // handler, which all treat these as boolean flags. Accepting 0 as // "disabled" here but not elsewhere would silently disagree if a // user ever set "firstParty": 0 in JSON config. if (isFirstParty && siteConfig.firstParty === false) { if (forceDebug) { console.log(formatLogMessage('debug', `${CURL_TAG} Skipping first-party request (firstParty disabled): ${requestUrl}`)); } return; } if (!isFirstParty && siteConfig.thirdParty === false) { if (forceDebug) { console.log(formatLogMessage('debug', `${CURL_TAG} Skipping third-party request (thirdParty disabled): ${requestUrl}`)); } return; } if (forceDebug) { console.log(formatLogMessage('debug', `${CURL_TAG} Processing ${isFirstParty ? 'first-party' : 'third-party'} request: ${requestUrl}`)); } // If NO searchstring is defined, match immediately (like browser // behavior). Simplified from the prior convoluted condition // (hasSearchString being true while both arrays are empty is // impossible given parseSearchStrings, so the OR was redundant). if (!hasSearchString && !hasSearchStringAnd) { if (!respDomain || matchesIgnoreDomain(respDomain, ignoreDomains)) { if (forceDebug) { console.log(formatLogMessage('debug', `${CURL_TAG} Domain ${respDomain} is in ignore list`)); } return; } addMatchedDomain(respDomain, resourceType, fullSubdomain); logMatchedRequest({ simplifiedUrl: currentRootDomain, requestUrl, isFirstParty, resourceType, matchInfo: null, // no searchstring — log says "matched regex" verbose: siteConfig.verbose, dumpUrls, matchedUrlsLogFile }); return; } // If searchstring IS defined, download and search content if ((hasSearchString || hasSearchStringAnd) && forceDebug) { console.log(formatLogMessage('debug', `${CURL_TAG} Downloading content for pattern matching: ${requestUrl}`)); } // Prepare custom headers from site config. SHALLOW-COPY so the // Referer assignment below doesn't mutate the underlying siteConfig // object — the old `siteConfig.custom_headers || {}` was a reference // (when present), so setting customHeaders['Referer'] persisted the // first URL's random-mode referrer onto siteConfig.custom_headers, // and every subsequent URL inherited that pinned value. Silent // breakage of {mode:'random_search'} variation across a site's URLs. // // Uses getReferrerForUrl so ALL referrer modes work — the old // inline string/array logic dropped object modes silently. const customHeaders = { ...(siteConfig.custom_headers || {}) }; if (siteConfig.referrer_headers) { const referrerUrl = getReferrerForUrl( requestUrl, siteConfig.referrer_headers, siteConfig.referrer_disable, forceDebug ); if (referrerUrl) customHeaders['Referer'] = referrerUrl; } const downloadResult = await downloadWithCurl(requestUrl, userAgent, { timeout: CURL_DEFAULTS.TIMEOUT_SECONDS, maxRedirects: CURL_DEFAULTS.MAX_REDIRECTS, customHeaders }); if (!downloadResult.success) { if (forceDebug) { console.log(formatLogMessage('debug', `${CURL_TAG} Failed to download ${requestUrl}: ${downloadResult.error}`)); } return; } // Cache the fetched content if callback provided if (onContentFetched) { try { onContentFetched(requestUrl, downloadResult.content); } catch (cacheErr) { if (forceDebug) { console.log(formatLogMessage('debug', `${CURL_TAG} Content caching failed: ${cacheErr.message}`)); } } } // Search content for patterns const searchResult = searchContent( downloadResult.content, searchStrings, searchStringsAnd, hasSearchStringAnd ); if (searchResult.found) { if (!respDomain || matchesIgnoreDomain(respDomain, ignoreDomains)) { if (forceDebug) { console.log(formatLogMessage('debug', `${CURL_TAG} Domain ${respDomain} matches but is in ignore list`)); } return; } addMatchedDomain(respDomain, resourceType, fullSubdomain); const matchInfo = searchResult.matchType === 'AND' ? `patterns: ${searchResult.foundPatterns.length}/${searchStringsAnd.length}` : `pattern: "${searchResult.matchedPattern}"`; logMatchedRequest({ simplifiedUrl: currentRootDomain, requestUrl, isFirstParty, resourceType, matchInfo, verbose: siteConfig.verbose, dumpUrls, matchedUrlsLogFile }); } else { if (forceDebug) { const partyType = isFirstParty ? 'first-party' : 'third-party'; if (searchResult.matchType === 'AND' && searchResult.missingPatterns) { console.log(formatLogMessage('debug', `${CURL_TAG} ${requestUrl} (${partyType}) matched regex but missing AND patterns: ${searchResult.missingPatterns.join(', ')}`)); } else { console.log(formatLogMessage('debug', `${CURL_TAG} ${requestUrl} (${partyType}) matched regex but no search patterns found`)); } } } } catch (err) { if (forceDebug) { console.log(formatLogMessage('debug', `${CURL_TAG} Handler failed for ${requestUrl}: ${err.message}`)); } } }; } /** * Validates that curl is available on the system * @returns {object} Validation result with isAvailable boolean and version info */ function validateCurlAvailability() { try { const result = spawnSync('curl', ['--version'], { encoding: 'utf8', timeout: CURL_DEFAULTS.VALIDATION_TIMEOUT }); if (result.status === CURL_DEFAULTS.CURL_SUCCESS_STATUS) { const version = result.stdout.split('\n')[CURL_DEFAULTS.VERSION_LINE_INDEX] || 'Unknown version'; return { isAvailable: true, version: version.trim(), error: null }; } else { return { isAvailable: false, version: null, error: 'curl command failed' }; } } catch (error) { return { isAvailable: false, version: null, error: `curl not found: ${error.message}` }; } } // Public surface used by nwss.js (createCurlHandler + validateCurlAvailability). // downloadWithCurl and searchContent are module-internal helpers — no external // caller imports them from here. lib/searchstring.js has its own independently- // defined functions of the same names, which is why a naive grep showed // false-positive 'external uses'. module.exports = { createCurlHandler, validateCurlAvailability };