UNPKG

@fanboynz/network-scanner

Version:

A Puppeteer-based network scanner for analyzing web traffic, generating adblock filter rules, and identifying third-party requests. Features include fingerprint spoofing, Cloudflare bypass, content analysis with curl/grep, and multiple output formats.

862 lines (757 loc) 32.1 kB
// === searchstring.js - Content Search Module === // Handles response content analysis for searchstring functionality const fs = require('fs'); const { spawnSync } = require('child_process'); const { grepContent } = require('./grep'); // Configuration constants for search logic const SEARCH_CONFIG = { MAX_CONTENT_SIZE: 50 * 1024 * 1024, // 50MB max content size MAX_SEARCH_STRING_LENGTH: 1000, XML_ENTITY_TIMEOUT: 5000 // 5 second timeout for XML processing }; /** * Parses searchstring configuration into a normalized format * @param {string|Array<string>|undefined} searchstring - The searchstring config value (OR logic) * @param {string|Array<string>|undefined} searchstringAnd - The searchstring_and config value (AND logic) * @returns {object} Object with searchStrings array, searchStringsAnd array, hasSearchString boolean, and hasSearchStringAnd boolean */ function parseSearchStrings(searchstring, searchstringAnd) { let searchStrings = Array.isArray(searchstring) ? searchstring : searchstring ? [searchstring] : []; let searchStringsAnd = Array.isArray(searchstringAnd) ? searchstringAnd : searchstringAnd ? [searchstringAnd] : []; // Filter out empty strings to prevent matching everything searchStrings = searchStrings.filter(str => str && str.trim().length > 0); searchStringsAnd = searchStringsAnd.filter(str => str && str.trim().length > 0); const hasSearchString = searchStrings.length > 0; const hasSearchStringAnd = searchStringsAnd.length > 0; return { searchStrings, searchStringsAnd, hasSearchString, hasSearchStringAnd }; } /** * Helper function to add domain to matched collection (handles both Set and Map) * @param {Set|Map} matchedDomains - The matched domains collection * @param {Function} addMatchedDomain - Optional helper function for adding domains * @param {string} domain - Domain to add * @param {string} resourceType - Resource type (for --adblock-rules mode) * @param {string} fullSubdomain - Full subdomain for cache tracking (optional) */ function addDomainToCollection(matchedDomains, addMatchedDomain, domain, resourceType = null, fullSubdomain = null) { // Use helper function if provided (preferred method) if (typeof addMatchedDomain === 'function') { addMatchedDomain(domain, resourceType, fullSubdomain); return; } // Fallback: handle different collection types directly if (matchedDomains instanceof Set) { matchedDomains.add(domain); } else if (matchedDomains instanceof Map) { if (!matchedDomains.has(domain)) { matchedDomains.set(domain, new Set()); } if (resourceType) { matchedDomains.get(domain).add(resourceType); } } else { console.warn('[warn] Unknown matchedDomains type, skipping domain addition'); } } /** * Downloads content using curl with appropriate headers and timeout * @param {string} url - The URL to download * @param {string} userAgent - User agent string to use * @param {number} timeout - Timeout in seconds (default: 30) * @returns {Promise<string>} The downloaded content */ async function downloadWithCurl(url, userAgent = '', timeout = 30) { return new Promise((resolve, reject) => { try { const curlArgs = [ '-s', // Silent mode '-L', // Follow redirects '--max-time', timeout.toString(), '--max-redirs', '5', '--fail-with-body', // Return body even on HTTP errors '--max-filesize', '52428800', // 50MB limit '--range', '0-52428799', // Limit download size '--compressed', // Accept compressed responses ]; if (userAgent) { curlArgs.push('-H', `User-Agent: ${userAgent}`); } // Add common headers to appear more browser-like curlArgs.push( '-H', 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', '-H', 'Accept-Language: en-US,en;q=0.5', '-H', 'Accept-Encoding: gzip, deflate', '-H', 'Connection: keep-alive', '-H', 'Upgrade-Insecure-Requests: 1' ); curlArgs.push(url); // Use spawnSync with proper argument separation const result = spawnSync('curl', curlArgs, { encoding: 'utf8', timeout: timeout * 1000, maxBuffer: 10 * 1024 * 1024, // 10MB max buffer killSignal: 'SIGTERM' }); if (result.error) { throw result.error; } if (result.status !== 0) { throw new Error(`Curl exited with status ${result.status}: ${result.stderr}`); } resolve(result.stdout); } catch (error) { reject(new Error(`Curl failed for ${url}: ${error.message}`)); } }); } /** * Downloads content with retry logic for transient failures * @param {string} url - The URL to download * @param {string} userAgent - User agent string to use * @param {number} timeout - Timeout in seconds * @param {number} retries - Number of retry attempts (default: 2) * @returns {Promise<string>} The downloaded content */ async function downloadWithRetry(url, userAgent = '', timeout = 30, retries = 2) { for (let attempt = 0; attempt <= retries; attempt++) { try { return await downloadWithCurl(url, userAgent, timeout); } catch (err) { // Don't retry on final attempt if (attempt === retries) throw err; // Only retry on specific transient errors const shouldRetry = err.message.includes('timeout') || err.message.includes('Connection refused') || err.message.includes('502') || err.message.includes('503') || err.message.includes('Connection reset'); if (!shouldRetry) throw err; // Exponential backoff: 1s, 2s, 4s... await new Promise(resolve => setTimeout(resolve, 1000 * Math.pow(2, attempt))); } } } /** * Safely decodes XML entities with timeout protection * @param {string} content - Content to decode * @returns {string} Decoded content or original if processing fails */ function safeDecodeXmlEntities(content) { const startTime = Date.now(); try { let decoded = content .replace(/&lt;/g, '<') .replace(/&gt;/g, '>') .replace(/&amp;/g, '&') .replace(/&quot;/g, '"') .replace(/&#39;/g, "'") .replace(/&apos;/g, "'"); // Check timeout before expensive regex operations if (Date.now() - startTime > SEARCH_CONFIG.XML_ENTITY_TIMEOUT) { console.warn('[warn] XML entity decoding timeout, using partial result'); return decoded; } // Decode numeric entities (decimal) decoded = decoded.replace(/&#(\d+);/g, (match, dec) => { const num = parseInt(dec, 10); // Validate range for safety (valid Unicode range) if (num >= 0 && num <= 0x10FFFF) { return String.fromCharCode(num); } return match; // Keep original if invalid }); // Check timeout again if (Date.now() - startTime > SEARCH_CONFIG.XML_ENTITY_TIMEOUT) { console.warn('[warn] XML entity decoding timeout, using partial result'); return decoded; } // Decode numeric entities (hexadecimal) decoded = decoded.replace(/&#x([0-9a-f]+);/gi, (match, hex) => { const num = parseInt(hex, 16); // Validate range for safety (valid Unicode range) if (num >= 0 && num <= 0x10FFFF) { return String.fromCharCode(num); } return match; // Keep original if invalid }); return decoded; } catch (xmlErr) { console.warn(`[warn] XML entity decoding failed: ${xmlErr.message}`); return content; // Return original content if decoding fails } } /** * Safely strips XML/HTML tags with size limits * @param {string} content - Content to strip tags from * @returns {string} Content with tags removed */ function safeStripTags(content) { try { // Limit content size for tag stripping to prevent excessive memory usage const limitedContent = content.length > SEARCH_CONFIG.MAX_CONTENT_SIZE ? content.substring(0, SEARCH_CONFIG.MAX_CONTENT_SIZE) : content; // Replace tags with spaces to preserve word boundaries return limitedContent.replace(/<[^>]*>/g, ' ').replace(/\s+/g, ' '); } catch (stripErr) { console.warn(`[warn] Tag stripping failed: ${stripErr.message}`); return content; } } /** * Checks if response content contains any of the search strings (OR logic) * or all of the AND search strings (AND logic) * Handles both raw text search and basic XML content extraction * @param {string} content - The response content to search * @param {Array<string>} searchStrings - Array of strings to search for (OR logic) * @param {Array<string>} searchStringsAnd - Array of strings that must all be present (AND logic) * @param {string} contentType - Content type for specialized handling * @param {string} url - URL for debugging context (optional) * @returns {object} Object with found boolean, matchedString/matchedStrings, allMatches array, and logic type */ function searchContent(content, searchStrings, searchStringsAnd = [], contentType = '', url = '') { // Input validation if (!content || typeof content !== 'string') { return { found: false, matchedString: null, matchedStrings: [], allMatches: [], logicType: 'NONE', error: 'Invalid or empty content' }; } // Size check and truncation with warning const originalLength = content.length; if (originalLength > SEARCH_CONFIG.MAX_CONTENT_SIZE) { content = content.substring(0, SEARCH_CONFIG.MAX_CONTENT_SIZE); console.warn(`[warn] Content truncated from ${originalLength} to ${SEARCH_CONFIG.MAX_CONTENT_SIZE} chars for ${url || 'unknown URL'}`); } let searchableContent = content; const isXmlContent = contentType.toLowerCase().includes('xml') || contentType.toLowerCase().includes('html'); if (isXmlContent) { try { // Safely decode XML entities const decodedContent = safeDecodeXmlEntities(content); // Safely strip tags to extract text content const strippedContent = safeStripTags(decodedContent); // Search in: original + decoded + stripped content // Use newlines as separators to prevent false matches across content types searchableContent = [content, decodedContent, strippedContent].join('\n'); } catch (xmlProcessingErr) { console.warn(`[warn] XML processing failed for ${url || 'unknown URL'}: ${xmlProcessingErr.message}`); // Fall back to original content searchableContent = content; } } // Input validation for search strings const validSearchStrings = searchStrings.filter(str => str && typeof str === 'string' && str.length > 0 && str.length <= SEARCH_CONFIG.MAX_SEARCH_STRING_LENGTH ); const validSearchStringsAnd = searchStringsAnd.filter(str => str && typeof str === 'string' && str.length > 0 && str.length <= SEARCH_CONFIG.MAX_SEARCH_STRING_LENGTH ); // Warn about filtered search strings if (validSearchStrings.length !== searchStrings.length) { console.warn(`[warn] Filtered ${searchStrings.length - validSearchStrings.length} invalid search strings`); } if (validSearchStringsAnd.length !== searchStringsAnd.length) { console.warn(`[warn] Filtered ${searchStringsAnd.length - validSearchStringsAnd.length} invalid AND search strings`); } // Early return if no valid search strings if (validSearchStrings.length === 0 && validSearchStringsAnd.length === 0) { return { found: false, matchedString: null, matchedStrings: [], allMatches: [], logicType: 'NONE', error: 'No valid search strings provided' }; } // Pre-compute lowercase content once for better performance const lowerContent = searchableContent.toLowerCase(); // Check AND logic first (more restrictive) - ALL strings must be present if (validSearchStringsAnd && validSearchStringsAnd.length > 0) { const foundAndStrings = []; for (const searchStr of validSearchStringsAnd) { const lowerSearchStr = searchStr.toLowerCase(); if (lowerContent.includes(lowerSearchStr)) { foundAndStrings.push(searchStr); } else { // Early exit if any AND string is not found break; } } // AND logic: ALL valid strings must be found if (foundAndStrings.length === validSearchStringsAnd.length) { return { found: true, matchedString: foundAndStrings.join(' AND '), matchedStrings: foundAndStrings, allMatches: foundAndStrings, logicType: 'AND', contentSize: originalLength, searchableSize: searchableContent.length }; } } // OR logic: ANY string can match const allMatches = []; let firstMatch = null; for (const searchStr of validSearchStrings) { const lowerSearchStr = searchStr.toLowerCase(); if (lowerContent.includes(lowerSearchStr)) { allMatches.push(searchStr); if (!firstMatch) { firstMatch = searchStr; } } } return { found: allMatches.length > 0, matchedString: firstMatch, matchedStrings: allMatches, allMatches: allMatches, logicType: validSearchStrings.length > 0 ? 'OR' : 'NONE', contentSize: originalLength, searchableSize: searchableContent.length, processedAsXml: isXmlContent }; } /** * Determines if a content type should be analyzed for search strings * @param {string} contentType - The response content-type header * @returns {boolean} True if content should be analyzed */ function shouldAnalyzeContentType(contentType) { if (!contentType) return false; // Normalize content type (remove charset and other parameters) const normalizedType = contentType.toLowerCase().split(';')[0].trim(); const textTypes = [ 'text/', // text/html, text/plain, text/xml, etc. 'application/json', 'application/javascript', 'application/xml', // Standard XML 'application/x-javascript', 'application/soap+xml', // SOAP XML 'application/rss+xml', // RSS feeds 'application/atom+xml', // Atom feeds 'application/xhtml+xml', // XHTML 'application/ld+json', // JSON-LD structured data 'application/manifest+json', // Web App Manifest 'application/feed+xml', // Generic XML feeds 'application/vnd.api+json', // JSON API specification 'application/hal+json', // HAL (Hypertext Application Language) 'application/problem+json' // Problem Details for HTTP APIs ]; return textTypes.some(type => normalizedType.startsWith(type)); } /** * Creates a curl-based URL handler for downloading and optionally searching content * @param {object} config - Configuration object containing all necessary parameters * @returns {Function} URL handler function for curl-based content analysis */ function createCurlHandler(config) { const { searchStrings, searchStringsAnd, hasSearchStringAnd, regexes, matchedDomains, addMatchedDomain, // Helper function for adding domains currentUrl, perSiteSubDomains, ignoreDomains, matchesIgnoreDomain, getRootDomain, siteConfig, dumpUrls, matchedUrlsLogFile, forceDebug, userAgent, resourceType, // Resource type from request hasSearchString } = config; return async function curlHandler(requestUrl) { // Only process URLs that match our regex patterns const matchesRegex = regexes.some(re => re.test(requestUrl)); if (!matchesRegex) return; // Extract domain and check if already detected (skip expensive operations) const reqDomain = perSiteSubDomains ? (new URL(requestUrl)).hostname : getRootDomain(requestUrl); if (typeof config.isDomainAlreadyDetected === 'function' && config.isDomainAlreadyDetected(reqDomain)) { if (forceDebug) { console.log(`[debug][curl] Skipping already detected domain: ${reqDomain}`); } return; } // Check if this is a first-party request (same domain as the URL being scanned) const currentUrlHostname = new URL(currentUrl).hostname; const requestHostname = new URL(requestUrl).hostname; const isFirstParty = currentUrlHostname === requestHostname; // Apply first-party/third-party filtering if (isFirstParty && siteConfig.firstParty === false) { if (forceDebug) { console.log(`[debug][curl] Skipping first-party request (firstParty=false): ${requestUrl}`); } return; } if (!isFirstParty && siteConfig.thirdParty === false) { if (forceDebug) { console.log(`[debug][curl] Skipping third-party request (thirdParty=false): ${requestUrl}`); } return; } try { if (forceDebug) { console.log(`[debug][curl] Downloading content from: ${requestUrl}`); } // If NO searchstring is defined, match immediately (like browser behavior) if (!hasSearchString && !hasSearchStringAnd) { if (!reqDomain || matchesIgnoreDomain(reqDomain, ignoreDomains)) { return; } addDomainToCollection(matchedDomains, addMatchedDomain, reqDomain, resourceType); const simplifiedUrl = getRootDomain(currentUrl); if (siteConfig.verbose === 1) { const partyType = isFirstParty ? 'first-party' : 'third-party'; const resourceInfo = resourceType ? ` (${resourceType})` : ''; console.log(`[match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl) matched regex${resourceInfo}`); } if (dumpUrls) { const timestamp = new Date().toISOString(); const partyType = isFirstParty ? 'first-party' : 'third-party'; const resourceInfo = resourceType ? ` (${resourceType})` : ''; try { fs.appendFileSync(matchedUrlsLogFile, `${timestamp} [match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl)${resourceInfo}\n`); } catch (logErr) { console.warn(`[warn] Failed to write to matched URLs log: ${logErr.message}`); } } return; } // If searchstring IS defined, download and search content const content = await downloadWithRetry(requestUrl, userAgent, 30); // Check if content contains search strings (OR or AND logic) const { found, matchedString, logicType, error } = searchContent(content, searchStrings, searchStringsAnd, '', requestUrl); if (found) { if (!reqDomain || matchesIgnoreDomain(reqDomain, ignoreDomains)) { return; } addDomainToCollection(matchedDomains, addMatchedDomain, reqDomain, resourceType); const simplifiedUrl = getRootDomain(currentUrl); if (siteConfig.verbose === 1) { const partyType = isFirstParty ? 'first-party' : 'third-party'; const resourceInfo = resourceType ? ` (${resourceType})` : ''; console.log(`[match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl) contains searchstring (${logicType}): "${matchedString}"${resourceInfo}`); } if (dumpUrls) { const timestamp = new Date().toISOString(); const partyType = isFirstParty ? 'first-party' : 'third-party'; const resourceInfo = resourceType ? ` (${resourceType})` : ''; try { fs.appendFileSync(matchedUrlsLogFile, `${timestamp} [match][${simplifiedUrl}] ${requestUrl} (${partyType}, curl, searchstring (${logicType}): "${matchedString}")${resourceInfo}\n`); } catch (logErr) { console.warn(`[warn] Failed to write to matched URLs log: ${logErr.message}`); } } } else if (forceDebug) { const partyType = isFirstParty ? 'first-party' : 'third-party'; console.log(`[debug][curl] ${requestUrl} (${partyType}) matched regex but no searchstring found`); if (error) { console.log(`[debug][curl] Search error: ${error}`); } } } catch (err) { if (forceDebug) { console.log(`[debug][curl] Failed to download content for ${requestUrl}: ${err.message}`); } } }; } /** * Creates a response handler function for the given configuration * @param {object} config - Configuration object containing all necessary parameters * @returns {Function} Response handler function for page.on('response', handler) */ function createResponseHandler(config) { const { searchStrings, searchStringsAnd, hasSearchStringAnd, regexes, matchedDomains, addMatchedDomain, // Helper function for adding domains currentUrl, perSiteSubDomains, ignoreDomains, matchesIgnoreDomain, getRootDomain, siteConfig, dumpUrls, matchedUrlsLogFile, useGrep = false, forceDebug, resourceType // Will be null for response handler } = config; return async function responseHandler(response) { const respUrl = response.url(); const respDomain = perSiteSubDomains ? (new URL(respUrl)).hostname : getRootDomain(respUrl); // Only process responses that match our regex patterns const fullSubdomain = (new URL(respUrl)).hostname; // Always get full subdomain for cache tracking // Skip if already detected to avoid duplicates if (typeof config.isDomainAlreadyDetected === 'function' && config.isDomainAlreadyDetected(fullSubdomain)) { return; } const matchesRegex = regexes.some(re => re.test(respUrl)); if (!matchesRegex) return; // Extract domain and check if already detected (skip expensive operations) // The main request handler already filtered first-party/third-party requests // This response handler only runs for requests that passed that filter // However, we need to apply the same first-party/third-party logic here for searchstring analysis // because the response handler analyzes content, not just URLs // Apply first-party/third-party filtering for searchstring analysis // Use the exact same logic as the main request handler const currentUrlHostname = new URL(currentUrl).hostname; const responseHostname = new URL(respUrl).hostname; const isFirstParty = currentUrlHostname === responseHostname; if (isFirstParty && siteConfig.firstParty === false) { if (forceDebug) { console.log(`[debug] Skipping first-party response for searchstring analysis (firstParty=false): ${respUrl}`); } return; } if (!isFirstParty && siteConfig.thirdParty === false) { if (forceDebug) { console.log(`[debug] Skipping third-party response for searchstring analysis (thirdParty=false): ${respUrl}`); } return; } try { // Only capture appropriate content types to avoid binary data const contentType = response.headers()['content-type'] || ''; if (!shouldAnalyzeContentType(contentType)) { if (forceDebug) { console.log(`[debug] Skipping content analysis for ${respUrl} (content-type: ${contentType})`); } return; } const content = await response.text(); // Cache the fetched content if callback provided if (config.onContentFetched) { try { config.onContentFetched(respUrl, content); } catch (cacheErr) { if (forceDebug) { console.log(`[debug] Content caching failed: ${cacheErr.message}`); } } } // Check if content contains search strings (OR or AND logic) let searchResult; if (useGrep && (searchStrings.length > 0 || searchStringsAnd.length > 0)) { // Use grep for pattern matching try { const allPatterns = [...(searchStrings || []), ...(searchStringsAnd || [])]; const grepResult = await grepContent(content, allPatterns, { ignoreCase: true, wholeWord: false, regex: false }); if (hasSearchStringAnd && searchStringsAnd.length > 0) { // For AND logic, check that all patterns were found const foundPatterns = grepResult.allMatches.map(match => match.pattern); const allFound = searchStringsAnd.every(pattern => foundPatterns.includes(pattern)); searchResult = { found: allFound, matchedString: allFound ? foundPatterns.join(' AND ') : null, logicType: 'AND' }; } else { // For OR logic, any match is sufficient searchResult = { found: grepResult.found, matchedString: grepResult.matchedPattern, logicType: 'OR' }; } } catch (grepErr) { if (forceDebug) { console.log(`[debug] Grep failed for ${respUrl}, falling back to JavaScript: ${grepErr.message}`); } // Fallback to JavaScript search searchResult = searchContent(content, searchStrings, searchStringsAnd, contentType, respUrl); } } else { // Use JavaScript search searchResult = searchContent(content, searchStrings, searchStringsAnd, contentType, respUrl); } const { found, matchedString, logicType, error } = searchResult; if (found) { if (!respDomain || matchesIgnoreDomain(respDomain, ignoreDomains)) { return; } // Response handler doesn't have access to specific resource type // Use the addMatchedDomain helper which handles fullSubdomain properly addMatchedDomain(respDomain, null, fullSubdomain); const simplifiedUrl = getRootDomain(currentUrl); if (siteConfig.verbose === 1) { const partyType = isFirstParty ? 'first-party' : 'third-party'; const searchMethod = useGrep ? 'grep' : 'js'; console.log(`[match][${simplifiedUrl}] ${respUrl} (${partyType}, ${searchMethod}) contains searchstring (${logicType}): "${matchedString}"`); } if (dumpUrls) { const timestamp = new Date().toISOString(); const partyType = isFirstParty ? 'first-party' : 'third-party'; const searchMethod = useGrep ? 'grep' : 'js'; try { fs.appendFileSync(matchedUrlsLogFile, `${timestamp} [match][${simplifiedUrl}] ${respUrl} (${partyType}, ${searchMethod}, searchstring (${logicType}): "${matchedString}")\n`); } catch (logErr) { console.warn(`[warn] Failed to write to matched URLs log: ${logErr.message}`); } } } else if (forceDebug) { const partyType = isFirstParty ? 'first-party' : 'third-party'; const searchMethod = useGrep ? 'grep' : 'js'; console.log(`[debug] ${respUrl} (${partyType}, ${searchMethod}) matched regex but no searchstring found`); if (error) { console.log(`[debug] Search error: ${error}`); } } } catch (err) { if (forceDebug) { console.log(`[debug] Failed to read response content for ${respUrl}: ${err.message}`); } } }; } /** * Validates searchstring configuration * @param {any} searchstring - The searchstring value to validate * @param {any} searchstringAnd - The searchstring_and value to validate * @returns {object} Validation result with isValid boolean and error message */ function validateSearchString(searchstring, searchstringAnd) { if (searchstring === undefined || searchstring === null) { return { isValid: true, error: null }; } if (typeof searchstring === 'string') { if (searchstring.length === 0) { return { isValid: false, error: 'searchstring cannot be empty string' }; } return { isValid: true, error: null }; } if (Array.isArray(searchstring)) { if (searchstring.length === 0) { return { isValid: false, error: 'searchstring array cannot be empty' }; } for (let i = 0; i < searchstring.length; i++) { if (typeof searchstring[i] !== 'string') { return { isValid: false, error: `searchstring[${i}] must be a string` }; } if (searchstring[i].length === 0) { return { isValid: false, error: `searchstring[${i}] cannot be empty string` }; } } return { isValid: true, error: null }; } // Validate searchstring_and if (searchstringAnd !== undefined && searchstringAnd !== null) { if (typeof searchstringAnd === 'string') { if (searchstringAnd.length === 0) { return { isValid: false, error: 'searchstring_and cannot be empty string' }; } } else if (Array.isArray(searchstringAnd)) { if (searchstringAnd.length === 0) { return { isValid: false, error: 'searchstring_and array cannot be empty' }; } for (let i = 0; i < searchstringAnd.length; i++) { if (typeof searchstringAnd[i] !== 'string') { return { isValid: false, error: `searchstring_and[${i}] must be a string` }; } if (searchstringAnd[i].length === 0) { return { isValid: false, error: `searchstring_and[${i}] cannot be empty string` }; } } } else { return { isValid: false, error: 'searchstring_and must be string or array of strings' }; } } // Check that both searchstring and searchstring_and aren't defined simultaneously if ((searchstring !== undefined && searchstring !== null) && (searchstringAnd !== undefined && searchstringAnd !== null)) { return { isValid: false, error: 'Cannot use both searchstring (OR) and searchstring_and (AND) simultaneously. Choose one logic type.' }; } // Additional validation for search string length limits const validateStringLength = (str, fieldName) => { if (str.length > SEARCH_CONFIG.MAX_SEARCH_STRING_LENGTH) { return { isValid: false, error: `${fieldName} too long (max ${SEARCH_CONFIG.MAX_SEARCH_STRING_LENGTH} chars)` }; } return { isValid: true }; }; // Validate search string lengths if (typeof searchstring === 'string') { const lengthCheck = validateStringLength(searchstring, 'searchstring'); if (!lengthCheck.isValid) return lengthCheck; } else if (Array.isArray(searchstring)) { for (let i = 0; i < searchstring.length; i++) { const lengthCheck = validateStringLength(searchstring[i], `searchstring[${i}]`); if (!lengthCheck.isValid) return lengthCheck; } } // Validate AND search string lengths if (typeof searchstringAnd === 'string') { const lengthCheck = validateStringLength(searchstringAnd, 'searchstring_and'); if (!lengthCheck.isValid) return lengthCheck; } else if (Array.isArray(searchstringAnd)) { for (let i = 0; i < searchstringAnd.length; i++) { const lengthCheck = validateStringLength(searchstringAnd[i], `searchstring_and[${i}]`); if (!lengthCheck.isValid) return lengthCheck; } } return { isValid: false, error: 'searchstring must be string or array of strings' }; } /** * Gets statistics about search string matches * @param {Set|Map} matchedDomains - Set or Map of matched domains * @param {Array<string>} searchStrings - Array of search strings used * @returns {object} Statistics object */ function getSearchStats(matchedDomains, searchStrings) { const totalMatches = matchedDomains instanceof Map ? matchedDomains.size : matchedDomains.size; return { totalMatches, searchStringCount: searchStrings.length, searchStrings: [...searchStrings] }; } module.exports = { parseSearchStrings, searchContent, safeDecodeXmlEntities, shouldAnalyzeContentType, createResponseHandler, createCurlHandler, downloadWithCurl, validateSearchString, getSearchStats, addDomainToCollection, downloadWithRetry };