@fanboynz/network-scanner
Version:
A Puppeteer-based network scanner for analyzing web traffic, generating adblock filter rules, and identifying third-party requests. Features include fingerprint spoofing, Cloudflare bypass, content analysis with curl/grep, and multiple output formats.
374 lines (350 loc) • 15.2 kB
JavaScript
// === Adblock Rust Engine Wrapper (adblock-rust.js) ===
// Drop-in replacement for ./lib/adblock that delegates matching to Brave's
// adblock-rust engine (npm: adblock-rs) for higher throughput on large lists.
//
// Exposes the same parseAdblockRules(filePath, options) factory and the same
// matcher shape ({ shouldBlock, getStats, rules }) so nwss.js can switch
// engines with a single require() swap.
const fs = require('fs');
const path = require('path');
const os = require('os');
const crypto = require('crypto');
const { formatLogMessage, messageColors } = require('./colorize');
// Subsystem tag matches the project convention used by other modules
// (lib/adblock.js, flowproxy, cloudflare, curl, grep, etc.).
const ADBLOCK_RUST_TAG = messageColors.processing('[adblock-rust]');
let adblockRust = null;
let adblockRustVersion = null;
function loadAdblockRust() {
if (adblockRust) return adblockRust;
try {
adblockRust = require('adblock-rs');
// Read once for the cache key — serialized engine format is not promised
// stable across versions, so partitioning cache files by version means
// upgrades cleanly invalidate without producing confusing deserialize
// failures on the warm path.
adblockRustVersion = require('adblock-rs/package.json').version;
} catch (err) {
throw new Error(
"adblock-rs is not installed. Install with: npm install adblock-rs " +
"(requires Rust toolchain for native build). Original error: " + err.message
);
}
return adblockRust;
}
// Best-effort cleanup of stale serialized engines. Filter lists change roughly
// monthly; cache files older than this are unlikely to be reused and only cost
// disk space. Runs once per cold parse and swallows all errors — cleanup
// failure must never block a scan.
function pruneOldCacheFiles(cacheDir, maxAgeMs) {
try {
const cutoff = Date.now() - maxAgeMs;
const files = fs.readdirSync(cacheDir);
for (const name of files) {
// Only touch our own files; `.tmp` covers stray writes from killed
// processes. Skip anything else (in case the dir is shared).
if (!name.endsWith('.bin') && !name.endsWith('.tmp')) continue;
const full = path.join(cacheDir, name);
try {
if (fs.statSync(full).mtimeMs < cutoff) fs.unlinkSync(full);
} catch (_) { /* file vanished mid-walk — fine */ }
}
} catch (_) { /* dir doesn't exist or unreadable — fine */ }
}
// Map Puppeteer/CDP resource type names to adblock-rust request types.
// Uses a null-prototype object so lookups skip the prototype chain — small but
// free win on a hot-path lookup that runs once per network request.
const RESOURCE_TYPE_MAP = Object.assign(Object.create(null), {
'document': 'main_frame',
'subdocument': 'sub_frame',
'stylesheet': 'stylesheet',
'script': 'script',
'image': 'image',
'font': 'font',
'media': 'media',
'texttrack': 'media',
'xhr': 'xmlhttprequest',
'fetch': 'xmlhttprequest',
'xmlhttprequest': 'xmlhttprequest',
'eventsource': 'other',
'websocket': 'websocket',
'manifest': 'other',
'signedexchange': 'other',
'ping': 'ping',
'cspviolationreport': 'other',
'preflight': 'other',
'other': 'other',
'': ''
});
// Removed: normalizeResourceType() helper. The hot path in shouldBlock
// inlines the (RESOURCE_TYPE_MAP[rt] || 'other') lookup directly to skip
// the function-call frame; the standalone helper had zero callers.
// Small FIFO cache keyed on (url \0 sourceUrl \0 resourceType). Eviction
// is insertion-order — `get()` does not promote. For this workload
// (per-page request bursts whose working set fits in maxSize) FIFO and
// true LRU produce the same evictions, so the simpler path wins. If
// cache effectiveness becomes a concern with larger working sets,
// promote on hit by re-inserting (delete + set). Renamed from ResultLRU
// since the previous name lied about the eviction policy — matches
// the FIFOCache rename in lib/adblock.js.
class FIFOCache {
constructor(maxSize) {
this.cache = new Map();
this.maxSize = maxSize;
}
get(k) { return this.cache.get(k); }
set(k, v) {
if (this.cache.size >= this.maxSize) {
this.cache.delete(this.cache.keys().next().value);
}
this.cache.set(k, v);
}
}
/**
* Build a request-blocking matcher backed by Brave's adblock-rs engine.
*
* @param {string|string[]} filePathOrArray - One filter list path, or an array
* of paths to load in order. Order is significant: it affects rule
* precedence and the cache key.
* @param {object} [options]
* @param {boolean} [options.enableLogging=false] - Print parse + cache events.
* @param {number} [options.resultCacheSize=32000] - Max entries in the
* per-matcher result cache (FIFO eviction).
* @param {boolean} [options.useDiskCache=true] - Persist the compiled engine
* to disk and reload on next run with the same input lists + library version.
* @param {string} [options.cacheDir] - Directory for compiled-engine cache
* files. Defaults to a folder under the OS temp dir.
* @param {number} [options.cacheTtlMs=2592000000] - Files in cacheDir older
* than this are pruned during cold parse. Default 30 days.
* @returns {{shouldBlock: Function, getStats: Function, rules: object}}
*/
function parseAdblockRules(filePathOrArray, options = {}) {
const {
enableLogging = false,
resultCacheSize = 32000,
useDiskCache = true,
cacheDir = path.join(os.tmpdir(), 'nwss-adblock-rs-cache'),
cacheTtlMs = 30 * 24 * 60 * 60 * 1000
} = options;
const rust = loadAdblockRust();
// Accept a single path or an array of paths — caller no longer needs to
// materialize a temp concatenation file for multi-list scans.
const filePaths = Array.isArray(filePathOrArray) ? filePathOrArray : [filePathOrArray];
// Read all files up front; hash the raw bytes so the disk cache key reflects
// both content changes and list-order changes. Mix in the adblock-rs version
// so a library upgrade (which may change the serialized format) doesn't try
// to deserialize an incompatible blob.
const buffers = [];
const hash = crypto.createHash('sha256');
hash.update('adblock-rs:' + adblockRustVersion + '\0');
let totalBytes = 0;
for (const fp of filePaths) {
let buf;
try {
buf = fs.readFileSync(fp);
} catch (err) {
throw new Error(`Adblock rules file not found: ${fp}`);
}
buffers.push(buf);
hash.update(buf);
hash.update('\0');
totalBytes += buf.length;
}
const cacheKey = hash.digest('hex');
const cachePath = path.join(cacheDir, cacheKey + '.bin');
let engine = null;
let ruleCount = 0;
let cacheHit = false;
// Fast path: deserialize a previously-compiled engine if available.
// Skip the existsSync/readFileSync double-syscall pattern — let readFileSync
// throw ENOENT and treat it as a clean cache-miss. Avoids a redundant stat()
// and the TOCTOU race where the cache file could be removed between the
// exists check and the read.
if (useDiskCache) {
let compiled;
try {
compiled = fs.readFileSync(cachePath);
} catch (err) {
if (err.code !== 'ENOENT' && enableLogging) {
console.log(formatLogMessage('debug', `${ADBLOCK_RUST_TAG} Cache read failed (${err.message}); reparsing`));
}
}
if (compiled) {
try {
engine = new rust.Engine(new rust.FilterSet(enableLogging), true);
// Avoid copying the ~10MB serialized engine when the underlying
// ArrayBuffer is exclusively ours (true for any read above Node's
// ~4KB Buffer pool threshold — i.e. always for compiled engines).
// Fall back to slicing only when the Buffer is a view into a pooled
// backing store, which would otherwise leak unrelated data.
const ab = (compiled.byteOffset === 0 &&
compiled.byteLength === compiled.buffer.byteLength)
? compiled.buffer
: compiled.buffer.slice(
compiled.byteOffset,
compiled.byteOffset + compiled.byteLength
);
engine.deserialize(ab);
cacheHit = true;
} catch (err) {
// Corrupt cache or version mismatch — fall through to a fresh parse.
engine = null;
if (enableLogging) {
console.log(formatLogMessage('debug', `${ADBLOCK_RUST_TAG} Cache deserialize failed (${err.message}); reparsing`));
}
}
}
}
if (!engine) {
// Slow path: parse every list. Use addFilters per-file so a single bad
// line in one list does not blast the whole input, and so the per-list
// line count is correct. Release each buffer's reference as soon as it
// is consumed so GC can reclaim the file bytes mid-loop instead of
// holding all input files (~3-5MB combined for easylist+easyprivacy)
// alive until the function returns.
const filterSet = new rust.FilterSet(enableLogging);
for (let i = 0; i < buffers.length; i++) {
const buf = buffers[i];
buffers[i] = null;
const lines = buf.toString('utf-8').split('\n');
for (let j = 0; j < lines.length; j++) {
const line = lines[j];
if (line.length === 0) continue;
if (line.charCodeAt(0) === 0x21) continue;
ruleCount++;
}
filterSet.addFilters(lines);
}
engine = new rust.Engine(filterSet, true);
if (useDiskCache) {
try {
fs.mkdirSync(cacheDir, { recursive: true });
const serialized = engine.serialize();
// Atomic write: writeFileSync to a per-pid tmp path then rename. If
// the process is killed mid-write we leave a stray .tmp file (cleaned
// up by the TTL prune on a future run) but the final cachePath is
// either complete or absent — never half-written.
const tmpPath = cachePath + '.' + process.pid + '.tmp';
fs.writeFileSync(tmpPath, Buffer.from(serialized));
fs.renameSync(tmpPath, cachePath);
// Best-effort prune of stale cache files. Done after our own write so
// we never delete the entry we just created.
pruneOldCacheFiles(cacheDir, cacheTtlMs);
} catch (err) {
if (enableLogging) {
console.log(formatLogMessage('warn', `${ADBLOCK_RUST_TAG} Cache write failed (${err.message}); continuing`));
}
}
}
}
const stats = {
// When deserialized from cache we don't see the rules; report bytes instead
// so the startup banner remains informative.
total: cacheHit ? null : ruleCount,
bytes: totalBytes,
engine: 'adblock-rust',
fromDiskCache: cacheHit,
listCount: filePaths.length,
blocked: 0,
allowed: 0,
exceptions: 0,
errors: 0,
cacheHits: 0,
cacheMisses: 0
};
const resultCache = new FIFOCache(resultCacheSize);
// Hot-path optimization: shared "no_match" object — most checks return this,
// skip per-call object allocation. Safe because callers only read fields.
const NO_MATCH = Object.freeze({ blocked: false, rule: null, reason: 'no_match' });
// Bind once: skips the prototype property lookup for `engine.check` on every
// call. The adblock-rs forwarder still does an internal name concat per
// invocation; bypassing that further would require reaching into the native
// binding (engine.boxed + blocker.Engine_check), which is brittle across
// library versions.
const engineCheck = engine.check.bind(engine);
if (enableLogging) {
if (cacheHit) {
console.log(formatLogMessage('debug', `${ADBLOCK_RUST_TAG} Restored compiled engine from ${cachePath} (${(totalBytes/1024/1024).toFixed(2)}MB source, ${filePaths.length} list${filePaths.length>1?'s':''})`));
} else {
console.log(formatLogMessage('debug', `${ADBLOCK_RUST_TAG} Compiled ${ruleCount} rules from ${filePaths.length} list${filePaths.length>1?'s':''} (${(totalBytes/1024/1024).toFixed(2)}MB)`));
}
}
return {
rules: { stats },
shouldBlock(url, sourceUrl, resourceType) {
// Avoid default-parameter syntax in the hot path — explicit null/undefined
// checks are slightly cheaper for V8's argument adaptor.
const src = sourceUrl || '';
const rt = resourceType || '';
// Single null-proto object lookup; falls back to 'other' for unknown types.
const normType = rt ? (RESOURCE_TYPE_MAP[rt] || 'other') : '';
const key = url + '\0' + src + '\0' + normType;
const cached = resultCache.get(key);
if (cached !== undefined) {
stats.cacheHits++;
return cached;
}
stats.cacheMisses++;
// Narrow try/catch to the native call only — keeps the rest of the
// function on TurboFan's fast path and avoids exception-handler overhead
// on stats updates and Map operations.
let result;
try {
// Pass empty string (not the request URL) when source is unknown — the
// engine then skips first/third-party determination instead of treating
// the request as same-origin to itself, which would suppress
// $third-party rules entirely.
// The 4th arg MUST be true: with false adblock-rs returns a bare
// boolean instead of the {matched, exception, filter, important}
// object we read below, which silently breaks matching.
result = engineCheck(url, src, normType, true);
} catch (err) {
stats.errors++;
if (enableLogging) {
console.log(formatLogMessage('warn', `${ADBLOCK_RUST_TAG} Error checking ${url}: ${err.message}`));
}
// Don't cache errors — next call may succeed (transient native panic).
return { blocked: false, rule: null, reason: 'error' };
}
// engine.check is contract-bound to return an object; no null guard
// needed. Reading each field once into a local keeps the IC monomorphic.
let r;
if (result.matched) {
const exception = result.exception;
if (exception) {
stats.exceptions++;
r = { blocked: false, rule: exception, reason: 'whitelisted' };
} else {
stats.blocked++;
r = {
blocked: true,
rule: result.filter || null,
reason: result.important ? 'important_rule' : 'adblock_rust'
};
}
} else {
stats.allowed++;
r = NO_MATCH;
}
resultCache.set(key, r);
return r;
},
getStats() {
const total = stats.cacheHits + stats.cacheMisses;
const hitRate = total > 0 ? ((stats.cacheHits / total) * 100).toFixed(1) + '%' : '0%';
return {
...stats,
cache: {
hits: stats.cacheHits,
misses: stats.cacheMisses,
hitRate,
size: resultCache.cache.size,
maxSize: resultCache.maxSize
}
};
}
};
}
module.exports = {
parseAdblockRules
};