@fairfetch/fair-fetch
Protect your site from AI scrapers by adding invisible noise that confuses AI bots while keeping pages looking and functioning normally for your human visitors.
import { BotAgent } from '../types/BotAgent';
import ipaddr from 'ipaddr.js';
// This class will be used to validate bot requests so that we can dynamically *un-pollute* content
// to avoid hurting SEO performance.
// In-memory store of verified CIDR ranges, keyed by bot agent
const cidrsByAgent = {
[BotAgent.GoogleBot]: [],
[BotAgent.BingBot]: [],
[BotAgent.DuckDuckBot]: [],
[BotAgent.Unknown]: [],
};
const GOOGLE_JSON_URLS = {
common: 'https://developers.google.com/search/apis/ipranges/googlebot.json',
special: 'https://developers.google.com/search/apis/ipranges/special-crawlers.json',
userFetch: 'https://developers.google.com/search/apis/ipranges/user-triggered-fetchers.json',
userFetchGoogle: 'https://developers.google.com/search/apis/ipranges/user-triggered-fetchers-google.json',
general: 'https://www.gstatic.com/ipranges/goog.json',
};
const BING_JSON_URLS = {
common: 'https://www.bing.com/toolbox/bingbot.json',
};
const DUCKDUCKGO_JSON_URLS = {
common: 'https://duckduckgo.com/duckduckbot.json',
};
export class BotValidator {
constructor(headers, ipAddress) {
this.ipAddress = ipAddress;
const reqUserAgent = headers.get('User-Agent') ?? '';
switch (reqUserAgent) {
case BotAgent.GoogleBot.toString():
this.userAgent = BotAgent.GoogleBot;
console.log(`Detected GoogleBot: ${this.ipAddress}`);
break;
case BotAgent.BingBot.toString():
this.userAgent = BotAgent.BingBot;
console.log(`Detected BingBot: ${this.ipAddress}`);
break;
case BotAgent.DuckDuckBot.toString():
this.userAgent = BotAgent.DuckDuckBot;
console.log(`Detected DuckDuckBot: ${this.ipAddress}`);
break;
default:
this.userAgent = BotAgent.Unknown;
}
}
/**
* Asynchronously validates the bot based on user agent and IP.
*/
async isValid() {
/**
* Check whether the request IP (v4 or v6) falls inside any CIDR range
* loaded for the detected agent, i.e. whether this is a genuine crawler.
*/
let addr;
try {
addr = ipaddr.parse(this.ipAddress);
}
catch {
console.warn(`Invalid IP address: ${this.ipAddress}`);
return false;
}
const cidrList = cidrsByAgent[this.userAgent];
if (!cidrList || cidrList.length === 0) {
console.warn(`No CIDR ranges loaded for agent: ${this.userAgent}`);
return false;
}
return cidrList.some((cidr) => {
try {
const range = ipaddr.parseCIDR(cidr);
// `match` throws when the address and range are different IP versions,
// so compare kinds first; mismatched kinds can never match anyway.
return addr.kind() === range[0].kind() && addr.match(range);
}
catch (err) {
console.error(`Error checking CIDR ${cidr} for IP ${this.ipAddress}:`, err);
// A CIDR that fails to parse is treated as a non-match.
return false;
}
});
}
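// Illustrative check of the matching semantics used above (addresses are
// examples only; 66.249.64.0/19 is one commonly published Googlebot range):
//
//   ipaddr.parse('66.249.66.1').match(ipaddr.parseCIDR('66.249.64.0/19')); // true
//   ipaddr.parse('203.0.113.7').match(ipaddr.parseCIDR('66.249.64.0/19')); // false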
/**
* Fetch a published JSON range list and extract all CIDR prefixes
* (IPv4 and IPv6).
*/
async fetchCidrs(url) {
const res = await fetch(url);
if (!res.ok)
throw new Error(`Failed to fetch ${url}: ${res.status}`);
const json = await res.json();
// Collect both IPv4 and IPv6 prefixes in a single pass.
return (json.prefixes || []).flatMap((p) => [p.ipv4Prefix, p.ipv6Prefix].filter(Boolean));
}
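// The Google lists above return JSON shaped like the sketch below; this code
// assumes Bing's bingbot.json and DuckDuckGo's duckduckbot.json follow the
// same `prefixes` layout:
//
//   {
//     "creationTime": "…",
//     "prefixes": [
//       { "ipv4Prefix": "66.249.64.0/27" },
//       { "ipv6Prefix": "2001:4860:4801:10::/64" }
//     ]
//   }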
/**
* Load (or reload) all published bot IP ranges (Google, Bing, DuckDuckGo)
* into memory. Call once at startup, and schedule a daily refresh.
*/
async loadIpRanges() {
try {
// Wrap in arrows so `fetchCidrs` keeps its `this` binding if it ever needs it.
cidrsByAgent[BotAgent.GoogleBot] = (await Promise.all(Object.values(GOOGLE_JSON_URLS).map((url) => this.fetchCidrs(url)))).flat();
cidrsByAgent[BotAgent.BingBot] = (await Promise.all(Object.values(BING_JSON_URLS).map((url) => this.fetchCidrs(url)))).flat();
cidrsByAgent[BotAgent.DuckDuckBot] = (await Promise.all(Object.values(DUCKDUCKGO_JSON_URLS).map((url) => this.fetchCidrs(url)))).flat();
// cidrsByAgent[BotAgent.AppleBot] = (await Promise.all(
// Object.values(APPLE_JSON_URLS).map(fetchCidrs),
// )).flat();
console.info(`Loaded CIDR ranges.`, cidrsByAgent);
}
catch (err) {
console.error('Error loading bot IP ranges:', err);
}
}
}
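// Minimal usage sketch (illustrative, not shipped with the package): load the
// ranges once at startup, refresh daily, and validate per request. Assumes a
// Fetch-style Request and that `clientIp` comes from your platform, e.g. a
// trusted X-Forwarded-For value.
//
//   const warmup = new BotValidator(new Headers(), '');
//   await warmup.loadIpRanges();
//   setInterval(() => warmup.loadIpRanges(), 24 * 60 * 60 * 1000);
//
//   async function handleRequest(request, clientIp) {
//     const validator = new BotValidator(request.headers, clientIp);
//     if (await validator.isValid()) {
//       // Verified crawler: serve the clean, un-polluted page to protect SEO.
//     }
//     // Otherwise serve the noised variant to suspected AI scrapers.
//   }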