@fanboynz/network-scanner
Version:
A Puppeteer-based network scanner for analyzing web traffic, generating adblock filter rules, and identifying third-party requests. Features include fingerprint spoofing, Cloudflare bypass, content analysis with curl/grep, and multiple output formats.
577 lines (501 loc) • 21.1 kB
JavaScript
// === searchstring.js - Content Search Module ===
// Handles response content analysis for searchstring functionality
const fs = require('fs');
const { spawnSync } = require('child_process');
/**
* Parses searchstring configuration into a normalized format
* @param {string|Array<string>|undefined} searchstring - The searchstring config value (OR logic)
* @param {string|Array<string>|undefined} searchstringAnd - The searchstring_and config value (AND logic)
* @returns {object} Object with searchStrings array, searchStringsAnd array, hasSearchString boolean, and hasSearchStringAnd boolean
*/
function parseSearchStrings(searchstring, searchstringAnd) {
let searchStrings = Array.isArray(searchstring)
? searchstring
: searchstring
? [searchstring]
: [];
let searchStringsAnd = Array.isArray(searchstringAnd)
? searchstringAnd
: searchstringAnd
? [searchstringAnd]
: [];
// Filter out empty strings to prevent matching everything
searchStrings = searchStrings.filter(str => str && str.trim().length > 0);
searchStringsAnd = searchStringsAnd.filter(str => str && str.trim().length > 0);
const hasSearchString = searchStrings.length > 0;
const hasSearchStringAnd = searchStringsAnd.length > 0;
return {
searchStrings,
searchStringsAnd,
hasSearchString,
hasSearchStringAnd
};
}
/**
* Helper function to add domain to matched collection (handles both Set and Map)
* @param {Set|Map} matchedDomains - The matched domains collection
* @param {Function} addMatchedDomain - Optional helper function for adding domains
* @param {string} domain - Domain to add
* @param {string} resourceType - Resource type (for --adblock-rules mode)
*/
function addDomainToCollection(matchedDomains, addMatchedDomain, domain, resourceType = null) {
// Use helper function if provided (preferred method)
if (typeof addMatchedDomain === 'function') {
addMatchedDomain(domain, resourceType);
return;
}
// Fallback: handle different collection types directly
if (matchedDomains instanceof Set) {
matchedDomains.add(domain);
} else if (matchedDomains instanceof Map) {
if (!matchedDomains.has(domain)) {
matchedDomains.set(domain, new Set());
}
if (resourceType) {
matchedDomains.get(domain).add(resourceType);
}
} else {
console.warn('[warn] Unknown matchedDomains type, skipping domain addition');
}
}
/**
* Downloads content using curl with appropriate headers and timeout
* @param {string} url - The URL to download
* @param {string} userAgent - User agent string to use
* @param {number} timeout - Timeout in seconds (default: 30)
* @returns {Promise<string>} The downloaded content
*/
async function downloadWithCurl(url, userAgent = '', timeout = 30) {
return new Promise((resolve, reject) => {
try {
const curlArgs = [
'-s', // Silent mode
'-L', // Follow redirects
'--max-time', timeout.toString(),
'--max-redirs', '5',
'--fail-with-body', // Return body even on HTTP errors
'--compressed', // Accept compressed responses
];
if (userAgent) {
curlArgs.push('-H', `User-Agent: ${userAgent}`);
}
// Add common headers to appear more browser-like
curlArgs.push(
'-H', 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'-H', 'Accept-Language: en-US,en;q=0.5',
'-H', 'Accept-Encoding: gzip, deflate',
'-H', 'Connection: keep-alive',
'-H', 'Upgrade-Insecure-Requests: 1'
);
curlArgs.push(url);
// Use spawnSync with proper argument separation
const result = spawnSync('curl', curlArgs, {
encoding: 'utf8',
timeout: timeout * 1000,
maxBuffer: 10 * 1024 * 1024 // 10MB max buffer
});
if (result.error) {
throw result.error;
}
if (result.status !== 0) {
throw new Error(`Curl exited with status ${result.status}: ${result.stderr}`);
}
resolve(result.stdout);
} catch (error) {
reject(new Error(`Curl failed for ${url}: ${error.message}`));
}
});
}
/**
* Checks if response content contains any of the search strings (OR logic)
* or all of the AND search strings (AND logic)
* Handles both raw text search and basic XML content extraction
* @param {string} content - The response content to search
* @param {Array<string>} searchStrings - Array of strings to search for (OR logic)
* @param {Array<string>} searchStringsAnd - Array of strings that must all be present (AND logic)
* @param {string} contentType - Content type for specialized handling
* @returns {object} Object with found boolean, matchedString/matchedStrings, allMatches array, and logic type
*/
function searchContent(content, searchStrings, searchStringsAnd = [], contentType = '') {
let searchableContent = content;
// For XML content, also search decoded entities and stripped tags for better matching
if (contentType.includes('xml')) {
// Decode common XML entities
const decodedContent = content
.replace(/</g, '<')
.replace(/>/g, '>')
.replace(/&/g, '&')
.replace(/"/g, '"')
.replace(/'/g, "'");
// Create version with XML tags stripped for text content search
const strippedContent = decodedContent.replace(/<[^>]*>/g, ' ');
// Search in: original + decoded + stripped content
searchableContent = content + '\n' + decodedContent + '\n' + strippedContent;
}
// Check AND logic first (more restrictive)
if (searchStringsAnd && searchStringsAnd.length > 0) {
const lowerContent = searchableContent.toLowerCase();
const foundAndStrings = [];
for (const searchStr of searchStringsAnd) {
if (lowerContent.includes(searchStr.toLowerCase())) {
foundAndStrings.push(searchStr);
}
}
// AND logic: ALL strings must be found
if (foundAndStrings.length === searchStringsAnd.length) {
return {
found: true,
matchedString: foundAndStrings.join(' AND '), // Show all matched strings
matchedStrings: foundAndStrings,
allMatches: foundAndStrings,
logicType: 'AND'
};
}
}
// Fall back to OR logic if AND logic didn't match or wasn't specified
const lowerContent = searchableContent.toLowerCase();
const allMatches = [];
let firstMatch = null;
for (const searchStr of searchStrings) {
if (lowerContent.includes(searchStr.toLowerCase())) {
allMatches.push(searchStr);
if (!firstMatch) {
firstMatch = searchStr;
}
}
}
return {
found: allMatches.length > 0,
matchedString: firstMatch,
matchedStrings: allMatches,
allMatches: allMatches,
logicType: 'OR'
};
}
/**
* Determines if a content type should be analyzed for search strings
* @param {string} contentType - The response content-type header
* @returns {boolean} True if content should be analyzed
*/
function shouldAnalyzeContentType(contentType) {
if (!contentType) return false;
const textTypes = [
'text/', // text/html, text/plain, text/xml, etc.
'application/json',
'application/javascript',
'application/xml', // Standard XML
'application/x-javascript',
'application/soap+xml', // SOAP XML
'application/rss+xml', // RSS feeds
'application/atom+xml', // Atom feeds
'application/xhtml+xml' // XHTML
];
return textTypes.some(type => contentType.includes(type));
}
/**
* Creates a curl-based URL handler for downloading and optionally searching content
* @param {object} config - Configuration object containing all necessary parameters
* @returns {Function} URL handler function for curl-based content analysis
*/
function createCurlHandler(config) {
const {
searchStrings,
searchStringsAnd,
hasSearchStringAnd,
regexes,
matchedDomains,
addMatchedDomain, // Helper function for adding domains
currentUrl,
perSiteSubDomains,
ignoreDomains,
matchesIgnoreDomain,
getRootDomain,
siteConfig,
dumpUrls,
matchedUrlsLogFile,
forceDebug,
userAgent,
resourceType, // Resource type from request
hasSearchString
} = config;
return async function curlHandler(requestUrl) {
// Only process URLs that match our regex patterns
const matchesRegex = regexes.some(re => re.test(requestUrl));
if (!matchesRegex) return;
// Extract domain and check if already detected (skip expensive operations)
const reqDomain = perSiteSubDomains ? (new URL(requestUrl)).hostname : getRootDomain(requestUrl);
if (typeof config.isDomainAlreadyDetected === 'function' && config.isDomainAlreadyDetected(reqDomain)) {
if (forceDebug) {
console.log(`[debug][curl] Skipping already detected domain: ${reqDomain}`);
}
return;
}
// Check if this is a first-party request (same domain as the URL being scanned)
const currentUrlHostname = new URL(currentUrl).hostname;
const requestHostname = new URL(requestUrl).hostname;
const isFirstParty = currentUrlHostname === requestHostname;
// Apply first-party/third-party filtering
if (isFirstParty && siteConfig.firstParty === false) {
if (forceDebug) {
console.log(`[debug][curl] Skipping first-party request (firstParty=false): ${requestUrl}`);
}
return;
}
if (!isFirstParty && siteConfig.thirdParty === false) {
if (forceDebug) {
console.log(`[debug][curl] Skipping third-party request (thirdParty=false): ${requestUrl}`);
}
return;
}
try {
if (forceDebug) {
console.log(`[debug][curl] Downloading content from: ${requestUrl}`);
}
// If NO searchstring is defined, match immediately (like browser behavior)
if (!hasSearchString && !hasSearchStringAnd) {
if (!reqDomain || matchesIgnoreDomain(reqDomain, ignoreDomains)) {
return;
}
addDomainToCollection(matchedDomains, addMatchedDomain, reqDomain, resourceType);
const simplifiedUrl = getRootDomain(currentUrl);
if (siteConfig.verbose === 1) {
const partyType = isFirstParty ? 'first-party' : 'third-party';
const resourceInfo = resourceType ? ` (${resourceType})` : '';
console.log(`[match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl) matched regex${resourceInfo}`);
}
if (dumpUrls) {
const timestamp = new Date().toISOString();
const partyType = isFirstParty ? 'first-party' : 'third-party';
const resourceInfo = resourceType ? ` (${resourceType})` : '';
try {
fs.appendFileSync(matchedUrlsLogFile,
`${timestamp} [match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl)${resourceInfo}\n`);
} catch (logErr) {
console.warn(`[warn] Failed to write to matched URLs log: ${logErr.message}`);
}
}
return;
}
// If searchstring IS defined, download and search content
const content = await downloadWithCurl(requestUrl, userAgent, 30);
// Check if content contains search strings (OR or AND logic)
const { found, matchedString, logicType } = searchContent(content, searchStrings, searchStringsAnd, '');
if (found) {
if (!reqDomain || matchesIgnoreDomain(reqDomain, ignoreDomains)) {
return;
}
addDomainToCollection(matchedDomains, addMatchedDomain, reqDomain, resourceType);
const simplifiedUrl = getRootDomain(currentUrl);
if (siteConfig.verbose === 1) {
const partyType = isFirstParty ? 'first-party' : 'third-party';
const resourceInfo = resourceType ? ` (${resourceType})` : '';
console.log(`[match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl) contains searchstring (${logicType}): "${matchedString}"${resourceInfo}`);
}
if (dumpUrls) {
const timestamp = new Date().toISOString();
const partyType = isFirstParty ? 'first-party' : 'third-party';
const resourceInfo = resourceType ? ` (${resourceType})` : '';
try {
fs.appendFileSync(matchedUrlsLogFile,
`${timestamp} [match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl, searchstring (${logicType}): "${matchedString}")${resourceInfo}\n`);
} catch (logErr) {
console.warn(`[warn] Failed to write to matched URLs log: ${logErr.message}`);
}
}
} else if (forceDebug) {
const partyType = isFirstParty ? 'first-party' : 'third-party';
console.log(`[debug][curl] ${requestUrl} (${partyType}) matched regex but no searchstring found`);
}
} catch (err) {
if (forceDebug) {
console.log(`[debug][curl] Failed to download content for ${requestUrl}: ${err.message}`);
}
}
};
}
/**
* Creates a response handler function for the given configuration
* @param {object} config - Configuration object containing all necessary parameters
* @returns {Function} Response handler function for page.on('response', handler)
*/
function createResponseHandler(config) {
const {
searchStrings,
searchStringsAnd,
hasSearchStringAnd,
regexes,
matchedDomains,
addMatchedDomain, // Helper function for adding domains
currentUrl,
perSiteSubDomains,
ignoreDomains,
matchesIgnoreDomain,
getRootDomain,
siteConfig,
dumpUrls,
matchedUrlsLogFile,
forceDebug,
resourceType // Will be null for response handler
} = config;
return async function responseHandler(response) {
const respUrl = response.url();
const respDomain = perSiteSubDomains ? (new URL(respUrl)).hostname : getRootDomain(respUrl);
// Only process responses that match our regex patterns
const matchesRegex = regexes.some(re => re.test(respUrl));
if (!matchesRegex) return;
// Extract domain and check if already detected (skip expensive operations)
if (typeof config.isDomainAlreadyDetected === 'function' && config.isDomainAlreadyDetected(respDomain)) {
if (forceDebug) {
console.log(`[debug] Skipping response analysis for already detected domain: ${respDomain}`);
}
return;
}
// Check if this is a first-party response (same domain as the URL being scanned)
const currentUrlHostname = new URL(currentUrl).hostname;
const responseHostname = new URL(respUrl).hostname;
const isFirstParty = currentUrlHostname === responseHostname;
// The main request handler already filtered first-party/third-party requests
// This response handler only runs for requests that passed that filter
// However, we need to apply the same first-party/third-party logic here for searchstring analysis
// because the response handler analyzes content, not just URLs
// Apply first-party/third-party filtering for searchstring analysis
// Use the exact same logic as the main request handler
if (isFirstParty && siteConfig.firstParty === false) {
if (forceDebug) {
console.log(`[debug] Skipping first-party response for searchstring analysis (firstParty=false): ${respUrl}`);
}
return;
}
if (!isFirstParty && siteConfig.thirdParty === false) {
if (forceDebug) {
console.log(`[debug] Skipping third-party response for searchstring analysis (thirdParty=false): ${respUrl}`);
}
return;
}
try {
// Only capture appropriate content types to avoid binary data
const contentType = response.headers()['content-type'] || '';
if (!shouldAnalyzeContentType(contentType)) {
if (forceDebug) {
console.log(`[debug] Skipping content analysis for ${respUrl} (content-type: ${contentType})`);
}
return;
}
const content = await response.text();
// Check if content contains search strings (OR or AND logic)
const { found, matchedString, logicType } = searchContent(content, searchStrings, searchStringsAnd, contentType);
if (found) {
if (!respDomain || matchesIgnoreDomain(respDomain, ignoreDomains)) {
return;
}
// Response handler doesn't have access to specific resource type
addDomainToCollection(matchedDomains, addMatchedDomain, respDomain, null);
const simplifiedUrl = getRootDomain(currentUrl);
if (siteConfig.verbose === 1) {
const partyType = isFirstParty ? 'first-party' : 'third-party';
console.log(`[match][${simplifiedUrl}] ${respUrl} (${partyType}) contains searchstring (${logicType}): "${matchedString}"`);
}
if (dumpUrls) {
const timestamp = new Date().toISOString();
const partyType = isFirstParty ? 'first-party' : 'third-party';
try {
fs.appendFileSync(matchedUrlsLogFile,
`${timestamp} [match][${simplifiedUrl}] ${respUrl} (${partyType}, searchstring (${logicType}): "${matchedString}")\n`);
} catch (logErr) {
console.warn(`[warn] Failed to write to matched URLs log: ${logErr.message}`);
}
}
} else if (forceDebug) {
const partyType = isFirstParty ? 'first-party' : 'third-party';
console.log(`[debug] ${respUrl} (${partyType}) matched regex but no searchstring found`);
}
} catch (err) {
if (forceDebug) {
console.log(`[debug] Failed to read response content for ${respUrl}: ${err.message}`);
}
}
};
}
/**
* Validates searchstring configuration
* @param {any} searchstring - The searchstring value to validate
* @param {any} searchstringAnd - The searchstring_and value to validate
* @returns {object} Validation result with isValid boolean and error message
*/
function validateSearchString(searchstring, searchstringAnd) {
if (searchstring === undefined || searchstring === null) {
return { isValid: true, error: null };
}
if (typeof searchstring === 'string') {
if (searchstring.length === 0) {
return { isValid: false, error: 'searchstring cannot be empty string' };
}
return { isValid: true, error: null };
}
if (Array.isArray(searchstring)) {
if (searchstring.length === 0) {
return { isValid: false, error: 'searchstring array cannot be empty' };
}
for (let i = 0; i < searchstring.length; i++) {
if (typeof searchstring[i] !== 'string') {
return { isValid: false, error: `searchstring[${i}] must be a string` };
}
if (searchstring[i].length === 0) {
return { isValid: false, error: `searchstring[${i}] cannot be empty string` };
}
}
return { isValid: true, error: null };
}
// Validate searchstring_and
if (searchstringAnd !== undefined && searchstringAnd !== null) {
if (typeof searchstringAnd === 'string') {
if (searchstringAnd.length === 0) {
return { isValid: false, error: 'searchstring_and cannot be empty string' };
}
} else if (Array.isArray(searchstringAnd)) {
if (searchstringAnd.length === 0) {
return { isValid: false, error: 'searchstring_and array cannot be empty' };
}
for (let i = 0; i < searchstringAnd.length; i++) {
if (typeof searchstringAnd[i] !== 'string') {
return { isValid: false, error: `searchstring_and[${i}] must be a string` };
}
if (searchstringAnd[i].length === 0) {
return { isValid: false, error: `searchstring_and[${i}] cannot be empty string` };
}
}
} else {
return { isValid: false, error: 'searchstring_and must be string or array of strings' };
}
}
// Check that both searchstring and searchstring_and aren't defined simultaneously
if (searchstring && searchstringAnd) {
return { isValid: false, error: 'Cannot use both searchstring (OR) and searchstring_and (AND) simultaneously. Choose one logic type.' };
}
return { isValid: false, error: 'searchstring must be string or array of strings' };
}
/**
* Gets statistics about search string matches
* @param {Set|Map} matchedDomains - Set or Map of matched domains
* @param {Array<string>} searchStrings - Array of search strings used
* @returns {object} Statistics object
*/
function getSearchStats(matchedDomains, searchStrings) {
const totalMatches = matchedDomains instanceof Map ? matchedDomains.size : matchedDomains.size;
return {
totalMatches,
searchStringCount: searchStrings.length,
searchStrings: [...searchStrings]
};
}
module.exports = {
parseSearchStrings,
searchContent,
shouldAnalyzeContentType,
createResponseHandler,
createCurlHandler,
downloadWithCurl,
validateSearchString,
getSearchStats,
addDomainToCollection
};