/*
 * @fanboynz/network-scanner
 * Version:
 * A Puppeteer-based network scanner for analyzing web traffic, generating adblock filter rules, and identifying third-party requests. Features include fingerprint spoofing, Cloudflare bypass, content analysis with curl/grep, and multiple output formats.
 * 377 lines (328 loc) • 13.9 kB
 * JavaScript
 */
const { formatLogMessage } = require('./colorize');
/**
* IGNORE_SIMILAR MODULE
*
* This module implements domain similarity detection to prevent collecting
* domains that are too similar to ones already found. It uses the Levenshtein
* distance algorithm to calculate the similarity between domain base names.
*
* Main use case: When scanning for ad/tracker domains, prevent collecting
* both "googleads.com" and "googlevds.com" since they're 89% similar.
*
* Performance consideration: This runs on every potential domain match,
* so the algorithms need to be efficient for high-volume scanning.
*/
/**
 * Two-label public suffixes ("co.uk", "com.au", ...) recognised by
 * getBaseDomainName.
 *
 * Built once at module load and stored in a Set so that the lookup performed
 * for every scanned domain is O(1) instead of re-allocating and linearly
 * scanning the list on each call.
 */
const MULTI_PART_TLDS = new Set([
  // Common Anglo countries
  'co.uk', 'co.nz', 'com.au', 'co.za', 'co.in', 'co.jp', 'co.kr',
  // Latin America
  'com.br', 'com.mx', 'com.ar', 'com.co', 'com.pe', 'com.ve',
  // Asia-Pacific
  'co.th', 'co.id', 'co.il', 'co.ke', 'co.tz', 'co.zw', 'co.bw',
  'com.sg', 'com.my', 'com.hk', 'com.tw', 'com.ph', 'com.vn',
  // Central America & Africa
  'co.cr', 'co.ug', 'co.zm', 'co.ao', 'co.mz', 'co.ls',
  // Europe extensions
  'org.uk', 'me.uk', 'ltd.uk', 'plc.uk', 'gov.uk', 'ac.uk', 'sch.uk',
  'com.de', 'org.de', 'com.fr', 'org.fr', 'com.es', 'org.es',
  'com.it', 'org.it', 'com.pl', 'org.pl', 'com.nl', 'org.nl',
  'com.ru', 'org.ru', 'com.ua', 'org.ua', 'com.tr', 'org.tr',
  // Asia-Pacific extensions detailed
  'or.jp', 'ne.jp', 'ac.jp', 'ed.jp', 'go.jp',
  'or.kr', 'ne.kr', 'com.cn', 'org.cn', 'net.cn', 'edu.cn', 'gov.cn',
  'org.in', 'net.in', 'org.au', 'net.au', 'edu.au', 'gov.au',
  'org.nz', 'net.nz', 'org.il', 'net.il', 'org.za', 'net.za',
  // Americas extensions detailed
  'org.br', 'net.br', 'edu.br', 'gov.br', 'org.ar', 'org.mx',
  'org.co', 'org.pe', 'com.cl', 'org.cl', 'com.uy', 'org.uy',
  'org.ve', 'com.do', 'org.do', 'com.pr', 'org.pr',
  // Central America & Caribbean
  'com.gt', 'org.gt', 'com.pa', 'org.pa', 'com.sv', 'org.sv',
  'com.ni', 'org.ni', 'com.hn', 'org.hn', 'org.cr',
  // Middle East & Africa extensions
  'com.eg', 'org.eg', 'or.ke'
]);

/** Three-label suffixes that are only checked when no two-label suffix matched. */
const THREE_PART_TLDS = new Set(['com.au.com', 'co.uk.com']);

/** Dotted-quad IPv4 literal; such hosts have no "brand" label to extract. */
const IPV4_PATTERN = /^\d{1,3}(?:\.\d{1,3}){3}$/;

/**
 * Extracts the base domain name (the label directly left of the TLD) so that
 * domains can be compared by brand name rather than by subdomain or TLD.
 *
 * Examples:
 * - "ads.google.com"                       -> "google"
 * - "tracker.facebook.co.uk"               -> "facebook"
 * - "https://www.example.org/a/b.min.js"   -> "example"
 * - "192.168.1.1"                          -> "192.168.1.1" (returned as-is)
 * - "localhost"                            -> "localhost"   (returned as-is)
 *
 * Normalisation applied before extraction: surrounding whitespace is trimmed,
 * the value is lower-cased (host names are case-insensitive), an http(s)
 * scheme, any path/query/fragment, a numeric port, a trailing root dot and a
 * leading "www." are removed.
 *
 * @param {string} domain - Domain, host, or http(s) URL to process
 * @returns {string} The base domain name, or '' for empty / non-string input
 */
function getBaseDomainName(domain) {
  if (!domain || typeof domain !== 'string') {
    return '';
  }

  // Work on a local copy instead of reassigning the parameter.
  let host = domain.trim().toLowerCase();

  // Strip the scheme, then everything after the host. Without this a dotted
  // path such as "/app.min.js" would be split as if it were part of the host.
  host = host.replace(/^https?:\/\//, '');
  host = host.split(/[\/?#]/, 1)[0];

  // Drop a numeric port. Only applied when the host has a single colon so
  // IPv6 literals such as "::1" are left untouched.
  host = host.replace(/^([^:]+):\d+$/, '$1');

  // "example.com." (fully-qualified form) is the same host as "example.com".
  host = host.replace(/\.$/, '');

  // Remove www prefix (standardize domain format)
  host = host.replace(/^www\./, '');

  // IPv4 addresses contain dots but have no TLD; treat them as opaque.
  if (IPV4_PATTERN.test(host)) {
    return host;
  }

  const labels = host.split('.');
  if (labels.length < 2) {
    return host; // Single label (e.g. "localhost"), return as-is
  }

  // Two-label suffix, e.g. "google.co.uk" -> ["google", "co", "uk"] -> "google".
  // A bare suffix such as "co.uk" has no registrable label, so fall back to
  // the first label.
  const lastTwoLabels = labels.slice(-2).join('.');
  if (MULTI_PART_TLDS.has(lastTwoLabels)) {
    return labels.length >= 3 ? labels[labels.length - 3] : labels[0];
  }

  // Rare three-label suffixes, e.g. "shop.example.co.uk.com" -> "example".
  if (labels.length >= 4 && THREE_PART_TLDS.has(labels.slice(-3).join('.'))) {
    return labels[labels.length - 4];
  }

  // Standard TLD: "google.com" -> ["google", "com"] -> "google"
  return labels[labels.length - 2];
}
/**
 * Calculates how similar two domain base names are, as a percentage derived
 * from their Levenshtein distance.
 *
 * The Levenshtein distance is the minimum number of single-character edits
 * (insertions, deletions, substitutions) needed to turn one string into the
 * other. It is converted to a percentage relative to the longer string:
 *
 *   similarity = round((maxLength - distance) / maxLength * 100)
 *
 * Examples:
 * - "google"   vs "googl"    = 83% (1 deletion,     5/6)
 * - "facebook" vs "facebo0k" = 88% (1 substitution, 7/8 = 87.5, rounded)
 * - "amazon"   vs "amaz0n"   = 83% (1 substitution, 5/6)
 *
 * Why this matters: look-alike domains typically differ from the original by
 * a single substituted, inserted, or deleted character.
 *
 * @param {string} domain1 - First domain base name
 * @param {string} domain2 - Second domain base name
 * @returns {number} Similarity percentage (0-100); 100 when the arguments are
 *   strictly equal, 0 when exactly one of them is empty/falsy
 */
function calculateSimilarity(domain1, domain2) {
  // Strictly equal inputs (including two empty strings) are 100% similar.
  if (domain1 === domain2) return 100;

  // From here on the inputs differ, so an empty/falsy side means no overlap.
  // This guard also guarantees maxLength > 0 below.
  if (!domain1 || !domain2) return 0;

  const maxLength = Math.max(domain1.length, domain2.length);

  // Edit distance is symmetric, so argument order does not matter.
  const distance = levenshteinDistance(domain1, domain2);

  // Higher percentage = more similar.
  return Math.round(((maxLength - distance) / maxLength) * 100);
}
/**
 * Calculates the Levenshtein (edit) distance between two strings.
 *
 * Uses the standard dynamic-programming recurrence, but keeps only two rows
 * of the table at a time instead of the full matrix, because each cell
 * depends solely on the row above and the cell to its left:
 *
 * - characters match:   cell = diagonal               (no edit needed)
 * - characters differ:  cell = 1 + min(diagonal,      (substitution)
 *                                      left,          (insertion)
 *                                      above)         (deletion)
 *
 * Time complexity:  O(m*n) where m and n are the string lengths
 * Space complexity: O(min(m, n)) - the rows are sized to the shorter string
 *
 * Comparison is per UTF-16 code unit, which is exact for ASCII/punycode
 * host names.
 *
 * @param {string} str1 - First string
 * @param {string} str2 - Second string
 * @returns {number} Minimum number of single-character edits between the strings
 */
function levenshteinDistance(str1, str2) {
  // Trivial cases: nothing to edit, or one side must be fully inserted.
  if (str1 === str2) return 0;
  if (str1.length === 0) return str2.length;
  if (str2.length === 0) return str1.length;

  // The distance is symmetric, so lay the shorter string along the row to
  // keep the two working rows as small as possible.
  const [rowText, columnText] = str1.length <= str2.length ? [str1, str2] : [str2, str1];
  const rowLength = rowText.length;

  // previousRow[j] = distance between an empty prefix and the first j chars.
  let previousRow = Array.from({ length: rowLength + 1 }, (_, index) => index);
  let currentRow = new Array(rowLength + 1);

  for (let i = 1; i <= columnText.length; i++) {
    // First column: deleting the first i characters of columnText.
    currentRow[0] = i;
    const columnChar = columnText.charCodeAt(i - 1);

    for (let j = 1; j <= rowLength; j++) {
      if (rowText.charCodeAt(j - 1) === columnChar) {
        currentRow[j] = previousRow[j - 1];
      } else {
        currentRow[j] = 1 + Math.min(
          previousRow[j - 1], // substitution
          currentRow[j - 1],  // insertion
          previousRow[j]      // deletion
        );
      }
    }

    // Reuse the old row's storage for the next iteration.
    [previousRow, currentRow] = [currentRow, previousRow];
  }

  // After the final swap the completed row lives in previousRow.
  return previousRow[rowLength];
}
/**
 * Main function: decides whether a newly found domain should be ignored
 * because its base name is too similar to the base name of a domain that has
 * already been collected.
 *
 * This is called for every potential domain match during scanning, so it
 * returns as early as possible and iterates the existing collection in place
 * rather than copying it.
 *
 * Usage workflow (default threshold of 80):
 * 1. New domain found: "go0gleads.com"
 * 2. Extract base: "go0gleads"
 * 3. Compare to existing: ["googleads.com", "facebook.com", "amazon.com"]
 * 4. "googleads" is 89% similar (1 substitution in 9 chars, >= 80)
 * 5. Return shouldIgnore: true
 *
 * Entries are skipped (never reported as similar) when they are empty, equal
 * to newDomain, or share exactly the same base name as newDomain - so
 * "google.com" does not cause "google.net" to be ignored.
 *
 * @param {string} newDomain - The domain to check for similarity
 * @param {Set|Array} existingDomains - Collection of already found domains;
 *   null/undefined is treated as an empty collection
 * @param {object} options - Configuration options
 * @param {boolean} options.enabled - Whether similarity checking is enabled (default true)
 * @param {number} options.threshold - Similarity percentage (0-100) at or above
 *   which the domain is ignored (default 80)
 * @param {boolean} options.forceDebug - Whether to log each match (default false)
 * @returns {object} Always has `shouldIgnore` (boolean) and `reason` (string).
 *   When shouldIgnore is true it also has `similarity`, `similarDomain`,
 *   `newBaseDomain` and `existingBaseDomain`.
 */
function shouldIgnoreSimilarDomain(newDomain, existingDomains, options = {}) {
  const {
    enabled = true,
    threshold = 80, // Default: ignore domains that are 80%+ similar
    forceDebug = false
  } = options;

  // Quick exit if feature is disabled (performance optimization)
  if (!enabled) {
    return { shouldIgnore: false, reason: 'ignore_similar disabled' };
  }

  // Validate input domain
  if (!newDomain) {
    return { shouldIgnore: false, reason: 'invalid domain' };
  }

  // Extract base domain name for comparison
  const newBaseDomain = getBaseDomainName(newDomain);
  if (!newBaseDomain) {
    return { shouldIgnore: false, reason: 'could not extract base domain' };
  }

  // Pick something that can be iterated without throwing:
  // - null/undefined    -> nothing to compare against
  // - Array/Set/iterable -> iterate directly, avoiding a per-call copy
  // - array-like object  -> fall back to Array.from
  let candidates;
  if (existingDomains == null) {
    candidates = [];
  } else if (typeof existingDomains[Symbol.iterator] === 'function') {
    candidates = existingDomains;
  } else {
    candidates = Array.from(existingDomains);
  }

  // Compare against each existing domain
  for (const existingDomain of candidates) {
    // Skip invalid, empty, or identical domains (optimization)
    if (!existingDomain || existingDomain === newDomain) {
      continue;
    }

    // Skip if extraction failed or both domains share the same base name
    const existingBaseDomain = getBaseDomainName(existingDomain);
    if (!existingBaseDomain || existingBaseDomain === newBaseDomain) {
      continue;
    }

    const similarity = calculateSimilarity(newBaseDomain, existingBaseDomain);
    if (similarity < threshold) {
      continue;
    }

    // Debug logging for similarity matches (helps tune thresholds)
    if (forceDebug) {
      console.log(formatLogMessage('debug',
        `[ignore_similar] ${newDomain} (${newBaseDomain}) is ${similarity}% similar to ${existingDomain} (${existingBaseDomain}) - ignoring`
      ));
    }

    // First match wins; return the details for debugging/analysis
    return {
      shouldIgnore: true,
      reason: `${similarity}% similar to ${existingDomain}`,
      similarity,
      similarDomain: existingDomain,
      newBaseDomain,
      existingBaseDomain
    };
  }

  // No similar domains found - safe to add this domain
  return { shouldIgnore: false, reason: 'no similar domains found' };
}
/**
 * Utility function: removes near-duplicate domains from a list.
 *
 * Domains are visited in order. Each one is checked with
 * shouldIgnoreSimilarDomain against the domains accepted so far; it is either
 * accepted itself or recorded as removed together with the reason and the
 * accepted domain it resembled. Earlier entries therefore take precedence
 * over later look-alikes.
 *
 * Use case: clean up an existing blocklist.
 * Example (default threshold of 80):
 *   ["googleads.com", "go0gleads.com", "facebook.com"]
 *   -> filtered: ["googleads.com", "facebook.com"]
 *      removed:  [{ domain: "go0gleads.com", similarTo: "googleads.com", ... }]
 *
 * @param {Array} domains - Array of domains to filter
 * @param {object} options - Filtering options (same as shouldIgnoreSimilarDomain)
 * @returns {object} `{ filtered, removed }`. When filtering is disabled or
 *   `domains` is not an array, `filtered` is the input value itself and
 *   `removed` is empty.
 */
function filterSimilarDomains(domains, options = {}) {
  const { enabled = true, threshold = 80, forceDebug = false } = options;

  // Nothing to do when the feature is off or the input is not a list.
  if (!(enabled && Array.isArray(domains))) {
    return { filtered: domains, removed: [] };
  }

  const kept = [];    // accepted domains, in input order
  const dropped = []; // rejected domains plus why they were rejected

  for (let index = 0; index < domains.length; index += 1) {
    const candidate = domains[index];
    const verdict = shouldIgnoreSimilarDomain(candidate, kept, { enabled, threshold, forceDebug });

    if (!verdict.shouldIgnore) {
      kept.push(candidate);
      continue;
    }

    dropped.push({
      domain: candidate,
      reason: verdict.reason,
      similarTo: verdict.similarDomain
    });
  }

  // Summarise the run when debugging and something was actually removed.
  const hasRemovals = dropped.length > 0;
  if (forceDebug && hasRemovals) {
    console.log(formatLogMessage('debug',
      `[ignore_similar] Filtered out ${dropped.length} similar domains`
    ));
  }

  return { filtered: kept, removed: dropped };
}
/**
 * MODULE EXPORTS
 *
 * Public API for the ignore_similar module (CommonJS):
 * - getBaseDomainName: Extract base domain from full domain
 * - calculateSimilarity: Get similarity percentage between two domains
 * - shouldIgnoreSimilarDomain: Main function for real-time similarity checking
 * - filterSimilarDomains: Batch processing function for existing lists
 *
 * levenshteinDistance is not exported; it is only reachable through
 * calculateSimilarity.
 */
module.exports = {
getBaseDomainName,
calculateSimilarity,
shouldIgnoreSimilarDomain,
filterSimilarDomains
};