@fanboynz/network-scanner
Version:
A Puppeteer-based network scanner for analyzing web traffic, generating adblock filter rules, and identifying third-party requests. Features include fingerprint spoofing, Cloudflare bypass, content analysis with curl/grep, and multiple output formats.
192 lines (167 loc) • 7.04 kB
JavaScript
const psl = require('psl');
const { formatLogMessage, messageColors } = require('./colorize');
const IGNORE_SIMILAR_TAG = messageColors.processing('[ignore_similar]');
// Strip protocol before handing to psl.parse, which expects a bare
// hostname per Public Suffix List semantics. psl handles 'www.' as a
// subdomain naturally (no need for a separate strip).
const REGEX_PROTOCOL = /^https?:\/\//;
/**
* Extracts the base domain name (sld) without TLD for similarity comparison.
*
* Uses the project's `psl` dependency — the canonical Public Suffix List
* parser, maintained against the live Mozilla list. Replaces a hand-curated
* ~80-entry MULTI_PART_TLDS Set that went stale as PSL changed, plus a
* THREE_PART_TLDS set that only listed two entries (both vanity domains
* 'com.au.com'/'co.uk.com', not real public suffixes). The rest of the
* codebase already uses psl (nwss.js, lib/post-processing.js, etc.) — this
* brings ignore_similar in line.
*
* @param {string} domain - The domain to process
* @returns {string} The base domain name (sld), e.g. 'example' for
* 'www.example.co.uk'. Returns '' for invalid input; falls back to
* second-to-last token for hostnames psl can't parse (IPs, single-token
* hosts, unlisted TLDs).
*/
function getBaseDomainName(domain) {
if (!domain || typeof domain !== 'string') {
return '';
}
const hostname = domain.replace(REGEX_PROTOCOL, '');
const parsed = psl.parse(hostname);
if (parsed && parsed.sld) {
return parsed.sld;
}
// Fallback for IPs / single-token / unparseable: best-effort
// second-to-last token (the old behavior's default branch).
const parts = hostname.split('.');
return parts.length >= 2 ? parts[parts.length - 2] : hostname;
}
/**
* Calculates similarity between two domain base names using Levenshtein distance
* @param {string} domain1 - First domain base name
* @param {string} domain2 - Second domain base name
* @returns {number} Similarity percentage (0-100)
*/
function calculateSimilarity(domain1, domain2) {
if (domain1 === domain2) return 100;
if (!domain1 || !domain2) return 0;
// Both inputs are non-empty different strings at this point — the
// `''` cases are handled by the two guards above. (Used to have an
// `if (longer.length === 0) return 100` here but it was unreachable.)
const longer = domain1.length > domain2.length ? domain1 : domain2;
const shorter = domain1.length > domain2.length ? domain2 : domain1;
const distance = levenshteinDistance(longer, shorter);
return Math.round(((longer.length - distance) / longer.length) * 100);
}
/**
* Calculates Levenshtein distance using two-row approach
* Same results as original, but O(min(m,n)) space instead of O(m*n)
* @param {string} str1 - First string
* @param {string} str2 - Second string
* @returns {number} Edit distance
*/
function levenshteinDistance(str1, str2) {
// Ensure str1 is the longer one so the inner-loop dimension (n)
// stays small. Inline swap instead of recursive re-entry — the old
// `if (m < n) return levenshteinDistance(str2, str1)` paid a stack
// frame + re-validation for what's really just a variable rename.
let a = str1, b = str2;
if (a.length < b.length) { const t = a; a = b; b = t; }
const m = a.length;
const n = b.length;
// Two rows instead of full matrix — O(n) space instead of O(m*n).
let prevRow = new Array(n + 1);
let currRow = new Array(n + 1);
for (let j = 0; j <= n; j++) {
prevRow[j] = j;
}
for (let i = 1; i <= m; i++) {
currRow[0] = i;
const ch1 = a[i - 1];
for (let j = 1; j <= n; j++) {
if (ch1 === b[j - 1]) {
currRow[j] = prevRow[j - 1];
} else {
const sub = prevRow[j - 1];
const ins = currRow[j - 1];
const del = prevRow[j];
currRow[j] = (sub < ins ? (sub < del ? sub : del) : (ins < del ? ins : del)) + 1;
}
}
// Swap rows
const temp = prevRow;
prevRow = currRow;
currRow = temp;
}
return prevRow[n];
}
/**
* Main function: Checks if a domain should be ignored based on similarity to existing domains
* @param {string} newDomain - The domain to check for similarity
* @param {Set|Array} existingDomains - Collection of already found domains
* @param {object} options - Configuration options
* @returns {object} Result object with shouldIgnore boolean and metadata
*/
function shouldIgnoreSimilarDomain(newDomain, existingDomains, options = {}) {
const {
enabled = true,
threshold = 80,
forceDebug = false
} = options;
if (!enabled) {
return { shouldIgnore: false, reason: 'ignore_similar disabled' };
}
if (!newDomain) {
return { shouldIgnore: false, reason: 'invalid domain' };
}
const newBaseDomain = getBaseDomainName(newDomain);
if (!newBaseDomain) {
return { shouldIgnore: false, reason: 'could not extract base domain' };
}
// KEEP original guard exactly as-is: Array.from handles undefined/null/objects safely
const domainsArray = Array.isArray(existingDomains) ? existingDomains : Array.from(existingDomains);
for (const existingDomain of domainsArray) {
if (!existingDomain || existingDomain === newDomain) {
continue;
}
const existingBaseDomain = getBaseDomainName(existingDomain);
if (!existingBaseDomain) {
continue;
}
// BEHAVIOR NOTE: identical base names (e.g. google.com vs google.net)
// now count as 100% similar — calculateSimilarity returns 100 for
// matching strings, which is above any reasonable threshold. The old
// `existingBaseDomain === newBaseDomain` skip silently exempted
// same-base-different-TLD pairs, defeating the dedup purpose for the
// most common variant case (brand registrations across multiple TLDs).
// Both call sites in nwss.js (matched-dedup at ~2833, ignoreDomains
// expansion at ~2849) want this stricter behavior. Set a lower
// threshold or disable ignore_similar entirely if you actually want
// to keep brand variants.
const similarity = calculateSimilarity(newBaseDomain, existingBaseDomain);
if (similarity >= threshold) {
if (forceDebug) {
console.log(formatLogMessage('debug',
`${IGNORE_SIMILAR_TAG} ${newDomain} (${newBaseDomain}) is ${similarity}% similar to ${existingDomain} (${existingBaseDomain}) - ignoring`
));
}
return {
shouldIgnore: true,
reason: `${similarity}% similar to ${existingDomain}`,
similarity,
similarDomain: existingDomain,
newBaseDomain,
existingBaseDomain
};
}
}
return { shouldIgnore: false, reason: 'no similar domains found' };
}
// Public surface used by nwss.js. getBaseDomainName + (deleted)
// filterSimilarDomains had zero external callers — getBaseDomainName
// stays as an internal helper, filterSimilarDomains is gone entirely
// (no internal callers either).
module.exports = {
calculateSimilarity,
shouldIgnoreSimilarDomain
};