@fanboynz/network-scanner
Version:
A Puppeteer-based network scanner for analyzing web traffic, generating adblock filter rules, and identifying third-party requests. Features include fingerprint spoofing, Cloudflare bypass, content analysis with curl/grep, and multiple output formats.
509 lines (463 loc) • 19.7 kB
JavaScript
// === curl.js - Curl-based Content Download Module ===
// Handles HTTP content downloading using curl for searchstring analysis
const fs = require('fs');
// spawnSync only kept for validateCurlAvailability (runs once at
// startup). Production curl downloads go through runProcess (async).
const { spawnSync } = require('child_process');
const { runProcess } = require('./spawn-async');
const { messageColors, formatLogMessage } = require('./colorize');
const { getReferrerForUrl } = require('./referrer');
const CURL_TAG = messageColors.processing('[curl]');
// === Constants ===
const CURL_DEFAULTS = {
  // Per-download time limit in seconds (passed to curl's --max-time and,
  // multiplied by 1000, to the process-level timeout).
  TIMEOUT_SECONDS: 30,

  // Maximum number of redirect hops curl may follow (--max-redirs).
  MAX_REDIRECTS: 5,

  // Output cap of 50MB, kept equal to the cap in lib/searchstring.js's
  // downloadWithCurl so the same URL cannot succeed through one code path
  // and fail through the other because of differing limits.
  MAX_SIZE_BYTES: 50 * 1024 * 1024,

  // Timeout handed to spawnSync for the one-off `curl --version` probe.
  VALIDATION_TIMEOUT: 5000,

  // Exit status that counts as a successful curl run.
  CURL_SUCCESS_STATUS: 0,

  // Index of the line in `curl --version` output used as the version string.
  VERSION_LINE_INDEX: 0
};
/**
 * Builds the failure-shaped result object returned by downloadWithCurl.
 * Lives at module level (it captures no state) so the download function
 * does not need to create a fresh closure on every call.
 * @param {string} msg - Human-readable failure reason
 * @returns {object} Result with empty content, zeroed metadata,
 *                   success=false and the given error message
 */
function errResult(msg) {
  const failure = {
    content: '',
    httpCode: 0,
    contentType: 'unknown',
    downloadSize: 0,
    success: false,
    error: msg
  };
  return failure;
}
/**
 * Downloads content using curl with browser-like headers.
 *
 * Never throws for download failures: every failure path returns the
 * errResult() shape (success=false plus an `error` string).
 *
 * @param {string} url - The URL to download
 * @param {string} userAgent - User agent string to use (omitted when empty)
 * @param {object} options - Download options
 * @param {number} [options.timeout] - Time limit in seconds
 * @param {number} [options.maxRedirects] - Maximum redirect hops
 * @param {number} [options.maxSize] - Cap on captured stdout, in bytes
 * @param {boolean} [options.followRedirects=true] - Pass -L to curl
 * @param {object} [options.customHeaders] - Extra request headers (name -> value)
 * @returns {Promise<object>} Object with content, httpCode, contentType,
 *                            downloadSize and success (plus error on failure)
 */
async function downloadWithCurl(url, userAgent = '', options = {}) {
  const {
    timeout = CURL_DEFAULTS.TIMEOUT_SECONDS,
    maxRedirects = CURL_DEFAULTS.MAX_REDIRECTS,
    maxSize = CURL_DEFAULTS.MAX_SIZE_BYTES,
    followRedirects = true,
    customHeaders = {}
  } = options;

  const curlArgs = [
    // -s hides the progress meter. On its own it also hides curl's error
    // text, which left the "Curl exited with status" message below without
    // a reason; -S re-enables the error text on stderr.
    '-s',
    '-S',
    '--max-time', timeout.toString(),
    '--max-redirs', maxRedirects.toString(),
    '--fail-with-body',
    '--compressed',
    // Leading '\n' guarantees the metadata sits on its own line even
    // when the content has no trailing newline.
    '--write-out', '\n%{http_code}|%{content_type}|%{size_download}'
  ];

  if (followRedirects) curlArgs.push('-L');
  if (userAgent) curlArgs.push('-H', `User-Agent: ${userAgent}`);

  // NOTE(review): the explicit Accept-Encoding below advertises `br`;
  // presumably the installed curl build can decode brotli — confirm.
  curlArgs.push(
    '-H', 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    '-H', 'Accept-Language: en-US,en;q=0.5',
    '-H', 'Accept-Encoding: gzip, deflate, br',
    '-H', 'Connection: keep-alive',
    '-H', 'Upgrade-Insecure-Requests: 1',
    '-H', 'Sec-Fetch-Dest: document',
    '-H', 'Sec-Fetch-Mode: navigate',
    '-H', 'Sec-Fetch-Site: none',
    '-H', 'Cache-Control: no-cache'
  );

  Object.entries(customHeaders).forEach(([key, value]) => {
    curlArgs.push('-H', `${key}: ${value}`);
  });

  curlArgs.push(url);

  // Shared async-spawn helper handles streaming/cap/timeout/kill plumbing.
  const result = await runProcess('curl', curlArgs, {
    timeout: timeout * 1000,
    maxStdout: maxSize
  });

  if (result.error) return errResult(result.error);
  if (result.truncated) return errResult(`Output exceeded ${maxSize} bytes`);
  if (result.signal) return errResult(`Killed by signal ${result.signal}`);
  if (result.code !== CURL_DEFAULTS.CURL_SUCCESS_STATUS) {
    // Tolerate a missing stderr so a failed run is reported as a failed
    // run rather than as a TypeError from this line.
    const stderrText = result.stderr ? result.stderr.toString('utf8').trim() : '';
    const reason = stderrText ? `: ${stderrText}` : '';
    return errResult(`Curl exited with status ${result.code}${reason}`);
  }

  const output = result.stdout.toString('utf8');

  // The --write-out block is the last line of stdout, so a single scan
  // from the end finds the boundary between body and metadata.
  const sepIdx = output.lastIndexOf('\n');
  if (sepIdx === -1) return errResult('No metadata separator in curl output');

  const content = output.slice(0, sepIdx);
  const metadata = output.slice(sepIdx + 1);

  // Split on the first and last pipe so the middle field (content-type)
  // may itself contain pipes without invalidating the whole response.
  const firstPipe = metadata.indexOf('|');
  const lastPipe = metadata.lastIndexOf('|');
  if (firstPipe === -1 || firstPipe === lastPipe) {
    return errResult(`Invalid metadata format: missing pipes in "${metadata}"`);
  }

  const httpCode = metadata.slice(0, firstPipe);
  const contentType = metadata.slice(firstPipe + 1, lastPipe);
  const downloadSize = metadata.slice(lastPipe + 1);

  return {
    content,
    httpCode: parseInt(httpCode, 10) || 0,
    contentType: contentType || 'unknown',
    // Falls back to the decoded string length when curl reports 0/garbage.
    downloadSize: parseInt(downloadSize, 10) || content.length,
    success: true
  };
}
/**
 * Case-insensitive substring search over downloaded content.
 *
 * AND mode (hasSearchStringAnd true and a non-empty AND list) takes
 * precedence: every AND pattern must occur. Otherwise the OR list is
 * consulted and the first pattern that occurs wins.
 *
 * @param {string} content - Content to search
 * @param {Array<string>} searchStrings - OR patterns (any can match)
 * @param {Array<string>} searchStringsAnd - AND patterns (all must match)
 * @param {boolean} hasSearchStringAnd - Whether AND logic is being used
 * @returns {object} Search result with found status and matched pattern;
 *                   AND results also carry foundPatterns/missingPatterns
 */
function searchContent(content, searchStrings = [], searchStringsAnd = [], hasSearchStringAnd = false) {
  const noMatch = () => ({ found: false, matchedPattern: null, matchType: null });

  if (!content || content.length === 0) return noMatch();

  const haystack = content.toLowerCase();

  if (hasSearchStringAnd && searchStringsAnd.length > 0) {
    // Lower-case every AND pattern once up front, then stop at the first
    // one that is absent. Only that first miss is reported; patterns
    // before it are the ones known to be present.
    const needles = searchStringsAnd.map(pattern => pattern.toLowerCase());
    const firstMissing = needles.findIndex(needle => !haystack.includes(needle));

    if (firstMissing !== -1) {
      return {
        found: false,
        matchedPattern: null,
        matchType: 'AND',
        foundPatterns: searchStringsAnd.slice(0, firstMissing),
        missingPatterns: [searchStringsAnd[firstMissing]]
      };
    }

    return {
      found: true,
      matchedPattern: searchStringsAnd.join(' AND '),
      matchType: 'AND',
      foundPatterns: searchStringsAnd,
      missingPatterns: []
    };
  }

  // OR mode: patterns are lower-cased lazily, one at a time, because the
  // loop returns on the first hit.
  for (const pattern of searchStrings) {
    if (haystack.includes(pattern.toLowerCase())) {
      return { found: true, matchedPattern: pattern, matchType: 'OR' };
    }
  }

  return noMatch();
}
/**
 * Reports a match for a curl-fetched URL. Writes to the console when
 * `verbose === 1` and appends to the matched-URLs log file when
 * `dumpUrls` is truthy and a log file path is given. Both match paths
 * (with and without searchstring) go through here so the party label,
 * resource-type suffix and line layout stay the same for each.
 *
 * @param {object} opts
 * @param {string} opts.simplifiedUrl
 * @param {string} opts.requestUrl
 * @param {boolean} opts.isFirstParty
 * @param {string|null} opts.resourceType
 * @param {string|null} opts.matchInfo - null for a regex-only match (no
 *                                       searchstring); otherwise a string
 *                                       such as 'pattern: "X"' or
 *                                       'patterns: 2/3'
 * @param {number|undefined} opts.verbose
 * @param {boolean} opts.dumpUrls
 * @param {string} opts.matchedUrlsLogFile
 */
function logMatchedRequest({
  simplifiedUrl, requestUrl, isFirstParty, resourceType,
  matchInfo, verbose, dumpUrls, matchedUrlsLogFile
}) {
  const party = isFirstParty ? 'first-party' : 'third-party';
  const typeSuffix = resourceType ? ` (${resourceType})` : '';

  if (verbose === 1) {
    let tail = ' matched regex';
    if (matchInfo) tail = ` contains ${matchInfo}`;
    const consoleLine = `[${simplifiedUrl}] ${requestUrl} (${party}, curl)${tail}${typeSuffix}`;
    console.log(formatLogMessage('match', consoleLine));
  }

  if (!dumpUrls || !matchedUrlsLogFile) return;

  const stamp = new Date().toISOString();
  // In the file format the match info lives INSIDE the "(party, curl, ...)"
  // parentheses, unlike the console line where it follows them.
  const insideParens = matchInfo ? `, ${matchInfo}` : '';
  try {
    const entry = `${stamp} [match][${simplifiedUrl}] ${requestUrl} (${party}, curl${insideParens})${typeSuffix}\n`;
    fs.appendFileSync(matchedUrlsLogFile, entry);
  } catch (logErr) {
    // Logging is best-effort: a failed write is reported, not rethrown.
    console.warn(formatLogMessage('warn', `Failed to write to matched URLs log: ${logErr.message}`));
  }
}
/**
 * Creates a curl-based URL handler for downloading and searching content.
 *
 * The returned async handler takes a request URL and, in order:
 *   1. drops it unless at least one of `regexes` matches,
 *   2. drops it if its hostname was already detected,
 *   3. applies the firstParty/thirdParty site flags,
 *   4. with no searchstring configured, records the match immediately;
 *      otherwise downloads the URL with curl, searches the body and
 *      records the match only when the search succeeds.
 * The handler never rejects: any error is caught and, when forceDebug is
 * set, logged.
 *
 * @param {object} config - Configuration object containing all necessary parameters
 * @returns {Function} URL handler function for curl-based content analysis
 */
function createCurlHandler(config) {
  const {
    searchStrings,
    searchStringsAnd,
    hasSearchStringAnd,
    regexes,
    // matchedDomains intentionally not destructured — only addMatchedDomain
    // is called; the underlying collection is opaque to this handler.
    addMatchedDomain,
    isDomainAlreadyDetected,
    onContentFetched,
    currentUrl,
    perSiteSubDomains,
    ignoreDomains,
    matchesIgnoreDomain,
    getRootDomain,
    siteConfig,
    dumpUrls,
    matchedUrlsLogFile,
    forceDebug,
    userAgent,
    resourceType,
    hasSearchString
  } = config;

  // currentUrl is fixed for this handler's lifetime, so its root domain is
  // resolved once here instead of on every request. A failure leaves it
  // as '' (best-effort).
  let currentRootDomain = '';
  try { currentRootDomain = getRootDomain(currentUrl); } catch (_) {}

  return async function curlHandler(requestUrl) {
    try {
      // Regex check FIRST — it needs no URL parsing, so non-matching
      // requests are dropped before any other work is done.
      const matchesRegex = regexes.some((re) => {
        // test() on a global or sticky regex resumes from re.lastIndex and
        // leaves it advanced after a hit. The same regex objects are
        // reused for every request, so without this reset a match on one
        // URL could make the next URL fail to match.
        if (re.global || re.sticky) re.lastIndex = 0;
        return re.test(requestUrl);
      });
      if (!matchesRegex) {
        if (forceDebug) {
          console.log(formatLogMessage('debug', `${CURL_TAG} URL ${requestUrl} doesn't match any regex patterns`));
        }
        return;
      }

      // Parse requestUrl once; an unparseable URL is silently ignored.
      let requestHostname;
      try { requestHostname = new URL(requestUrl).hostname; } catch (_) { return; }
      const fullSubdomain = requestHostname; // always the full subdomain

      // The root domain is computed once and serves both as respDomain
      // (when perSiteSubDomains is off) and for the first-party check.
      const requestRootDomain = getRootDomain(requestUrl);
      const respDomain = perSiteSubDomains ? requestHostname : requestRootDomain;

      // Skip if already detected to avoid duplicates
      if (isDomainAlreadyDetected(fullSubdomain)) {
        if (forceDebug) {
          console.log(formatLogMessage('debug', `${CURL_TAG} Skipping already detected subdomain: ${fullSubdomain}`));
        }
        return;
      }

      // First-party = same registrable root domain as the page being scanned.
      const isFirstParty = currentRootDomain === requestRootDomain;

      // Party filtering. Only an explicit `false` disables a party; other
      // falsy values such as 0 or undefined leave it enabled.
      if (isFirstParty && siteConfig.firstParty === false) {
        if (forceDebug) {
          console.log(formatLogMessage('debug', `${CURL_TAG} Skipping first-party request (firstParty disabled): ${requestUrl}`));
        }
        return;
      }
      if (!isFirstParty && siteConfig.thirdParty === false) {
        if (forceDebug) {
          console.log(formatLogMessage('debug', `${CURL_TAG} Skipping third-party request (thirdParty disabled): ${requestUrl}`));
        }
        return;
      }

      if (forceDebug) {
        console.log(formatLogMessage('debug', `${CURL_TAG} Processing ${isFirstParty ? 'first-party' : 'third-party'} request: ${requestUrl}`));
      }

      // If NO searchstring is defined, the regex hit alone is the match.
      if (!hasSearchString && !hasSearchStringAnd) {
        if (!respDomain || matchesIgnoreDomain(respDomain, ignoreDomains)) {
          if (forceDebug) {
            console.log(formatLogMessage('debug', `${CURL_TAG} Domain ${respDomain} is in ignore list`));
          }
          return;
        }

        addMatchedDomain(respDomain, resourceType, fullSubdomain);
        logMatchedRequest({
          simplifiedUrl: currentRootDomain,
          requestUrl,
          isFirstParty,
          resourceType,
          matchInfo: null, // no searchstring — log says "matched regex"
          verbose: siteConfig.verbose,
          dumpUrls,
          matchedUrlsLogFile
        });
        return;
      }

      // From here on a searchstring IS defined (the branch above returned
      // otherwise), so the content must be downloaded and searched.
      if (forceDebug) {
        console.log(formatLogMessage('debug', `${CURL_TAG} Downloading content for pattern matching: ${requestUrl}`));
      }

      // Custom headers come from site config. The SHALLOW COPY matters:
      // the Referer assignment below must not write into
      // siteConfig.custom_headers, or the first URL's referrer would be
      // pinned for every later URL handled with the same siteConfig.
      const customHeaders = { ...(siteConfig.custom_headers || {}) };
      if (siteConfig.referrer_headers) {
        const referrerUrl = getReferrerForUrl(
          requestUrl,
          siteConfig.referrer_headers,
          siteConfig.referrer_disable,
          forceDebug
        );
        if (referrerUrl) customHeaders['Referer'] = referrerUrl;
      }

      const downloadResult = await downloadWithCurl(requestUrl, userAgent, {
        timeout: CURL_DEFAULTS.TIMEOUT_SECONDS,
        maxRedirects: CURL_DEFAULTS.MAX_REDIRECTS,
        customHeaders
      });

      if (!downloadResult.success) {
        if (forceDebug) {
          console.log(formatLogMessage('debug', `${CURL_TAG} Failed to download ${requestUrl}: ${downloadResult.error}`));
        }
        return;
      }

      // Hand the fetched content to the optional cache callback. A failing
      // callback must not abort the search, so its error is only logged.
      if (onContentFetched) {
        try {
          onContentFetched(requestUrl, downloadResult.content);
        } catch (cacheErr) {
          if (forceDebug) {
            console.log(formatLogMessage('debug', `${CURL_TAG} Content caching failed: ${cacheErr.message}`));
          }
        }
      }

      // Search content for patterns
      const searchResult = searchContent(
        downloadResult.content,
        searchStrings,
        searchStringsAnd,
        hasSearchStringAnd
      );

      if (searchResult.found) {
        if (!respDomain || matchesIgnoreDomain(respDomain, ignoreDomains)) {
          if (forceDebug) {
            console.log(formatLogMessage('debug', `${CURL_TAG} Domain ${respDomain} matches but is in ignore list`));
          }
          return;
        }

        addMatchedDomain(respDomain, resourceType, fullSubdomain);

        const matchInfo = searchResult.matchType === 'AND'
          ? `patterns: ${searchResult.foundPatterns.length}/${searchStringsAnd.length}`
          : `pattern: "${searchResult.matchedPattern}"`;
        logMatchedRequest({
          simplifiedUrl: currentRootDomain,
          requestUrl,
          isFirstParty,
          resourceType,
          matchInfo,
          verbose: siteConfig.verbose,
          dumpUrls,
          matchedUrlsLogFile
        });
      } else {
        if (forceDebug) {
          const partyType = isFirstParty ? 'first-party' : 'third-party';
          if (searchResult.matchType === 'AND' && searchResult.missingPatterns) {
            console.log(formatLogMessage('debug',
              `${CURL_TAG} ${requestUrl} (${partyType}) matched regex but missing AND patterns: ${searchResult.missingPatterns.join(', ')}`));
          } else {
            console.log(formatLogMessage('debug',
              `${CURL_TAG} ${requestUrl} (${partyType}) matched regex but no search patterns found`));
          }
        }
      }
    } catch (err) {
      // Deliberately best-effort: one bad request must not break the scan.
      if (forceDebug) {
        console.log(formatLogMessage('debug', `${CURL_TAG} Handler failed for ${requestUrl}: ${err.message}`));
      }
    }
  };
}
/**
 * Validates that curl is available on the system by running
 * `curl --version` synchronously.
 *
 * @returns {object} Validation result: `isAvailable` boolean, `version`
 *                   (trimmed first line of the version output, or null)
 *                   and `error` (message string, or null on success)
 */
function validateCurlAvailability() {
  const unavailable = (message) => ({
    isAvailable: false,
    version: null,
    error: message
  });

  try {
    const result = spawnSync('curl', ['--version'], {
      encoding: 'utf8',
      timeout: CURL_DEFAULTS.VALIDATION_TIMEOUT
    });

    // spawnSync does not throw when the child cannot be started or times
    // out; it reports that through result.error. Checking it here lets a
    // missing binary be reported as such rather than as a generic failure.
    if (result.error) {
      const reason = result.error.code === 'ENOENT'
        ? 'curl not found'
        : 'curl could not be executed';
      return unavailable(`${reason}: ${result.error.message}`);
    }

    if (result.status !== CURL_DEFAULTS.CURL_SUCCESS_STATUS) {
      return unavailable('curl command failed');
    }

    const versionLine = (result.stdout || '').split('\n')[CURL_DEFAULTS.VERSION_LINE_INDEX] || 'Unknown version';
    return {
      isAvailable: true,
      version: versionLine.trim(),
      error: null
    };
  } catch (error) {
    // Kept as a safety net for anything spawnSync itself throws
    // (for example invalid arguments).
    return unavailable(`curl not found: ${error.message}`);
  }
}
// Public surface of this module, used by nwss.js: createCurlHandler and
// validateCurlAvailability only.
// downloadWithCurl and searchContent are deliberately NOT exported — they
// are module-internal helpers. lib/searchstring.js defines its own,
// independent functions with the same names, so a plain grep for those
// names turns up matches that are not uses of the helpers in this file.
module.exports = {
createCurlHandler,
validateCurlAvailability
};