@fanboynz/network-scanner
Version:
A Puppeteer-based network scanner for analyzing web traffic, generating adblock filter rules, and identifying third-party requests. Features include fingerprint spoofing, Cloudflare bypass, content analysis with curl/grep, and multiple output formats.
246 lines (204 loc) • 7.65 kB
JavaScript
const { formatLogMessage } = require('./colorize');
// URL-prefix patterns, compiled once at module load and reused by getBaseDomainName.
const REGEX_PROTOCOL = /^https?:\/\//;
const REGEX_WWW = /^www\./;

// Two-label suffixes ("co.uk", "com.au", ...) that must be treated as a single TLD.
// Laid out one line per country code, sorted by country code. The Set is only
// queried with .has(), so the order of entries carries no meaning.
const MULTI_PART_TLDS = new Set([
  'co.ao',
  'com.ar', 'org.ar',
  'com.au', 'org.au', 'net.au', 'edu.au', 'gov.au',
  'com.br', 'org.br', 'net.br', 'edu.br', 'gov.br',
  'co.bw',
  'com.cl', 'org.cl',
  'com.cn', 'org.cn', 'net.cn', 'edu.cn', 'gov.cn',
  'com.co', 'org.co',
  'co.cr', 'org.cr',
  'com.de', 'org.de',
  'com.do', 'org.do',
  'com.eg', 'org.eg',
  'com.es', 'org.es',
  'com.fr', 'org.fr',
  'com.gt', 'org.gt',
  'com.hk',
  'com.hn', 'org.hn',
  'co.id',
  'co.il', 'org.il', 'net.il',
  'co.in', 'org.in', 'net.in',
  'com.it', 'org.it',
  'co.jp', 'or.jp', 'ne.jp', 'ac.jp', 'ed.jp', 'go.jp',
  'co.ke', 'or.ke',
  'co.kr', 'or.kr', 'ne.kr',
  'co.ls',
  'com.mx', 'org.mx',
  'com.my',
  'co.mz',
  'com.ni', 'org.ni',
  'com.nl', 'org.nl',
  'co.nz', 'org.nz', 'net.nz',
  'com.pa', 'org.pa',
  'com.pe', 'org.pe',
  'com.ph',
  'com.pl', 'org.pl',
  'com.pr', 'org.pr',
  'com.ru', 'org.ru',
  'com.sg',
  'com.sv', 'org.sv',
  'co.th',
  'com.tr', 'org.tr',
  'com.tw',
  'co.tz',
  'com.ua', 'org.ua',
  'co.ug',
  'co.uk', 'org.uk', 'me.uk', 'ltd.uk', 'plc.uk', 'gov.uk', 'ac.uk', 'sch.uk',
  'com.uy', 'org.uy',
  'com.ve', 'org.ve',
  'com.vn',
  'co.za', 'org.za', 'net.za',
  'co.zm',
  'co.zw',
]);

// Three-label suffixes, checked only when the two-label lookup finds nothing.
const THREE_PART_TLDS = new Set(['com.au.com', 'co.uk.com']);
// Patterns used to reduce a URL-like string to its bare host (compiled once).
const REGEX_URL_TAIL = /[/?#][\s\S]*$/;  // path, query string or fragment after the host
const REGEX_PORT = /^([^:]*):\d+$/;      // trailing ":port"; skipped when the host has other colons
const REGEX_TRAILING_DOT = /\.$/;        // root dot of a fully-qualified name

/**
 * Extracts the base domain name without TLD for similarity comparison.
 *
 * Accepts a bare hostname or a URL-like string. Before the labels are examined the
 * http(s) scheme, everything from the first "/", "?" or "#", a trailing ":port",
 * a leading "www." and one trailing dot are removed, so
 * "https://www.example.co.uk:8443/a?b#c" is handled like "example.co.uk".
 *
 * The returned label is the one directly left of the TLD. A TLD is two labels when
 * the last two labels are listed in MULTI_PART_TLDS, three labels when the last
 * three are listed in THREE_PART_TLDS, and a single label otherwise. The lookup is
 * case-insensitive; the returned label keeps the caller's casing.
 *
 * @param {string} domain - The domain (or URL) to process
 * @returns {string} The base domain name; '' for empty or non-string input; the
 *   cleaned host itself when it contains no dot
 */
function getBaseDomainName(domain) {
  if (!domain || typeof domain !== 'string') {
    return '';
  }

  // Work on a local copy instead of reassigning the parameter.
  const host = domain
    .replace(REGEX_PROTOCOL, '')
    .replace(REGEX_URL_TAIL, '')
    .replace(REGEX_PORT, '$1')
    .replace(REGEX_WWW, '')
    .replace(REGEX_TRAILING_DOT, '');

  const parts = host.split('.');
  const count = parts.length;
  if (count < 2) {
    return host;
  }

  // The suffix tables hold lower-case entries, so normalise only the lookup key.
  const lastTwoParts = `${parts[count - 2]}.${parts[count - 1]}`.toLowerCase();
  if (MULTI_PART_TLDS.has(lastTwoParts)) {
    // When the input is nothing but the suffix itself ("co.uk"), fall back to its first label.
    return count >= 3 ? parts[count - 3] : parts[0];
  }

  // Handle rare 3-part TLDs
  if (count >= 4) {
    const lastThreeParts = `${parts[count - 3].toLowerCase()}.${lastTwoParts}`;
    if (THREE_PART_TLDS.has(lastThreeParts)) {
      return parts[count - 4];
    }
  }

  return parts[count - 2];
}
/**
 * Calculates similarity between two domain base names using Levenshtein distance.
 *
 * The score is the share of the longer name that does not need editing:
 * round((maxLength - distance) / maxLength * 100).
 *
 * @param {string} domain1 - First domain base name
 * @param {string} domain2 - Second domain base name
 * @returns {number} Similarity percentage (0-100); 100 when both values are
 *   strictly equal, 0 when either one is empty/falsy
 */
function calculateSimilarity(domain1, domain2) {
  if (domain1 === domain2) return 100;
  if (!domain1 || !domain2) return 0;

  // Both names are non-empty here, so maxLength is at least 1 and the division is safe.
  const maxLength = Math.max(domain1.length, domain2.length);
  // Edit distance is symmetric, so the argument order does not matter.
  const distance = levenshteinDistance(domain1, domain2);

  return Math.round(((maxLength - distance) / maxLength) * 100);
}
/**
 * Computes the Levenshtein (edit) distance between two strings.
 * Only two rows of the dynamic-programming table are kept alive, each sized by
 * the shorter string, so memory use is O(min(m,n)) rather than O(m*n).
 * @param {string} str1 - First string
 * @param {string} str2 - Second string
 * @returns {number} Edit distance
 */
function levenshteinDistance(str1, str2) {
  // Put the longer string on the row axis so the row buffers follow the shorter one.
  const swapNeeded = str1.length < str2.length;
  const longer = swapNeeded ? str2 : str1;
  const shorter = swapNeeded ? str1 : str2;
  const width = shorter.length;

  // above = distances for the previous row; current = row being filled in.
  let above = new Array(width + 1);
  let current = new Array(width + 1);
  for (let col = 0; col <= width; col += 1) {
    above[col] = col;
  }

  for (let row = 1; row <= longer.length; row += 1) {
    const rowChar = longer[row - 1];
    current[0] = row;

    for (let col = 1; col <= width; col += 1) {
      current[col] = rowChar === shorter[col - 1]
        ? above[col - 1]
        : Math.min(above[col - 1], current[col - 1], above[col]) + 1;
    }

    // The finished row becomes "above"; the old buffer is recycled.
    [above, current] = [current, above];
  }

  return above[width];
}
/**
 * Main function: Checks if a domain should be ignored based on similarity to existing domains.
 *
 * The base name of newDomain is compared with the base name of every entry in
 * existingDomains. Entries that are falsy, equal to newDomain, or share the exact
 * same base name are skipped; the first remaining entry whose similarity reaches
 * the threshold makes the domain ignorable.
 *
 * @param {string} newDomain - The domain to check for similarity
 * @param {Set|Array} existingDomains - Collection of already found domains;
 *   null/undefined is treated as an empty collection
 * @param {object} options - Configuration options
 * @param {boolean} [options.enabled=true] - When false, nothing is ever ignored
 * @param {number} [options.threshold=80] - Minimum similarity percentage that counts as "similar"
 * @param {boolean} [options.forceDebug=false] - Log a debug line when a match is found
 * @returns {object} Result object with shouldIgnore boolean and metadata: always
 *   { shouldIgnore, reason }; on a match also similarity, similarDomain,
 *   newBaseDomain and existingBaseDomain
 */
function shouldIgnoreSimilarDomain(newDomain, existingDomains, options = {}) {
  const {
    enabled = true,
    threshold = 80,
    forceDebug = false
  } = options;

  if (!enabled) {
    return { shouldIgnore: false, reason: 'ignore_similar disabled' };
  }
  if (!newDomain) {
    return { shouldIgnore: false, reason: 'invalid domain' };
  }

  const newBaseDomain = getBaseDomainName(newDomain);
  if (!newBaseDomain) {
    return { shouldIgnore: false, reason: 'could not extract base domain' };
  }

  // Array.from(null) / Array.from(undefined) throw a TypeError, so a missing
  // collection is treated as empty. Iterables (Array, Set, ...) are walked in
  // place instead of being copied on every call; anything else (array-likes,
  // plain objects) still goes through Array.from as before.
  let candidates;
  if (existingDomains == null) {
    candidates = [];
  } else if (typeof existingDomains[Symbol.iterator] === 'function') {
    candidates = existingDomains;
  } else {
    candidates = Array.from(existingDomains);
  }

  for (const existingDomain of candidates) {
    if (!existingDomain || existingDomain === newDomain) {
      continue;
    }

    const existingBaseDomain = getBaseDomainName(existingDomain);
    // An identical base name is skipped here, not reported as "similar".
    if (!existingBaseDomain || existingBaseDomain === newBaseDomain) {
      continue;
    }

    const similarity = calculateSimilarity(newBaseDomain, existingBaseDomain);
    if (similarity >= threshold) {
      if (forceDebug) {
        console.log(formatLogMessage('debug',
          `[ignore_similar] ${newDomain} (${newBaseDomain}) is ${similarity}% similar to ${existingDomain} (${existingBaseDomain}) - ignoring`
        ));
      }
      return {
        shouldIgnore: true,
        reason: `${similarity}% similar to ${existingDomain}`,
        similarity,
        similarDomain: existingDomain,
        newBaseDomain,
        existingBaseDomain
      };
    }
  }

  return { shouldIgnore: false, reason: 'no similar domains found' };
}
/**
 * Utility function: walks a list of domains in order and drops every domain that
 * shouldIgnoreSimilarDomain reports as similar to one that was already kept.
 * When filtering is disabled, or the input is not an array, the input is handed
 * back untouched as `filtered`.
 * @param {Array} domains - Array of domains to filter
 * @param {object} options - Filtering options (enabled, threshold, forceDebug)
 * @returns {object} Result with filtered domains and removed domains; each removed
 *   entry has the shape { domain, reason, similarTo }
 */
function filterSimilarDomains(domains, options = {}) {
  const { enabled = true, threshold = 80, forceDebug = false } = options;

  if (!(enabled && Array.isArray(domains))) {
    return { filtered: domains, removed: [] };
  }

  const kept = [];
  const dropped = [];

  for (let index = 0; index < domains.length; index += 1) {
    const candidate = domains[index];
    // Each candidate is compared only against what has been kept so far.
    const verdict = shouldIgnoreSimilarDomain(candidate, kept, { enabled, threshold, forceDebug });

    if (!verdict.shouldIgnore) {
      kept.push(candidate);
      continue;
    }

    dropped.push({
      domain: candidate,
      reason: verdict.reason,
      similarTo: verdict.similarDomain
    });
  }

  const droppedCount = dropped.length;
  if (forceDebug && droppedCount > 0) {
    console.log(formatLogMessage('debug',
      `[ignore_similar] Filtered out ${droppedCount} similar domains`
    ));
  }

  return { filtered: kept, removed: dropped };
}
// Public API of this module (CommonJS). levenshteinDistance is not exported;
// from outside it is only reachable through calculateSimilarity.
module.exports = {
getBaseDomainName,
calculateSimilarity,
shouldIgnoreSimilarDomain,
filterSimilarDomains
};