UNPKG

@botwall/middleware

Version:

Express middleware for BotWall - protects your APIs with pay-per-crawl, signature verification, and bot analytics

514 lines (505 loc) 23.8 kB
"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.validateCrawlRequest = validateCrawlRequest; const verifyEd25519Signature_1 = require("./verifyEd25519Signature"); const node_fetch_1 = __importDefault(require("node-fetch")); // --- Known bots cache --- let knownBotsCache = []; let knownBotsCacheTimestamp = 0; const KNOWN_BOTS_CACHE_TTL = 10 * 60 * 1000; // 10 minutes async function getKnownBots(backendUrl) { const now = Date.now(); if (knownBotsCache.length && now - knownBotsCacheTimestamp < KNOWN_BOTS_CACHE_TTL) { return knownBotsCache; } try { const res = await (0, node_fetch_1.default)(`${backendUrl}/api/known-bots`); if (res.ok) { knownBotsCache = await res.json(); knownBotsCacheTimestamp = now; return knownBotsCache; } } catch (err) { // Fallback to last cache if (knownBotsCache.length) return knownBotsCache; return []; } return []; } // --- Site bot preferences cache (per site) --- const siteBotPrefsCache = {}; const SITE_PREFS_CACHE_TTL = 5 * 60 * 1000; // 5 minutes // --- Unknown bot preferences cache (per site) --- const siteUnknownBotPrefsCache = {}; const SITE_UNKNOWN_PREFS_CACHE_TTL = 5 * 60 * 1000; // 5 minutes async function getSiteBotPrefs(backendUrl, siteId) { const now = Date.now(); if (siteBotPrefsCache[siteId] && now - siteBotPrefsCache[siteId].ts < SITE_PREFS_CACHE_TTL) { return siteBotPrefsCache[siteId].prefs; } try { const res = await (0, node_fetch_1.default)(`${backendUrl}/api/sites/${siteId}/bot-prefs`); if (res.ok) { const prefs = await res.json(); siteBotPrefsCache[siteId] = { prefs, ts: now }; return prefs; } } catch (err) { if (siteBotPrefsCache[siteId]) return siteBotPrefsCache[siteId].prefs; return []; } return []; } async function getSiteUnknownBotPrefs(backendUrl, siteId) { const now = Date.now(); if (siteUnknownBotPrefsCache[siteId] && now - siteUnknownBotPrefsCache[siteId].ts < SITE_UNKNOWN_PREFS_CACHE_TTL) { return siteUnknownBotPrefsCache[siteId].prefs; } try { const res = await (0, node_fetch_1.default)(`${backendUrl}/api/sites/${siteId}/unknown-bot-prefs`); if (res.ok) { const prefs = await res.json(); siteUnknownBotPrefsCache[siteId] = { prefs, ts: now }; return prefs; } } catch (err) { if (siteUnknownBotPrefsCache[siteId]) return siteUnknownBotPrefsCache[siteId].prefs; return []; } return []; } async function discoverUnknownBot(backendUrl, userAgent, botName) { try { await (0, node_fetch_1.default)(`${backendUrl}/api/unknown-bot-discovery`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ user_agent: userAgent, bot_name: botName }) }); } catch (err) { console.log(`❌ Middleware: Failed to discover unknown bot:`, err); } } // --- Site lookup cache (per domain) --- const siteCache = {}; const SITE_CACHE_TTL = 5 * 60 * 1000; // 5 minutes async function getSiteByDomain(backendUrl, domain) { const now = Date.now(); if (siteCache[domain] && now - siteCache[domain].ts < SITE_CACHE_TTL) { return siteCache[domain].siteId; } try { const res = await (0, node_fetch_1.default)(`${backendUrl}/api/sites/domain/${encodeURIComponent(domain)}`); if (res.ok) { const data = await res.json(); siteCache[domain] = { siteId: data.site_id, ts: now }; return data.site_id; } } catch (err) { // Fallback to cached value if available if (siteCache[domain]) return siteCache[domain].siteId; } return null; } // --- Site configuration cache (per site ID) --- const siteConfigCache = {}; const SITE_CONFIG_CACHE_TTL = 5 * 60 * 1000; // 5 minutes async function getSiteConfig(backendUrl, siteId, hostname) { const now = Date.now(); const cacheKey = `${siteId}:${hostname}`; if (siteConfigCache[cacheKey] && now - siteConfigCache[cacheKey].ts < SITE_CONFIG_CACHE_TTL) { return siteConfigCache[cacheKey].config; } try { const res = await (0, node_fetch_1.default)(`${backendUrl}/api/sites/config/${siteId}?hostname=${encodeURIComponent(hostname)}`); if (res.ok) { const config = await res.json(); siteConfigCache[cacheKey] = { config, ts: now }; return config; } } catch (err) { if (siteConfigCache[cacheKey]) return siteConfigCache[cacheKey].config; } return null; } // --- Route matching helper --- function isMonetizedRoute(path, monetizedRoutes) { for (const route of monetizedRoutes) { // Convert glob pattern to regex const pattern = route .replace(/\./g, '\\.') // Escape dots .replace(/\*/g, '.*') // Convert * to .* .replace(/\?/g, '\\.') // Convert ? to . .replace(/\[/g, '\\[') // Escape brackets .replace(/\]/g, '\\]'); const regex = new RegExp(`^${pattern}$`); if (regex.test(path)) { return true; } } return false; } // --- Bot detection helper --- function isBotRequest(userAgent) { // Specific bot patterns that won't catch regular browsers // Using word boundaries (\b) to avoid matching "bot" in "robot" or other words const botPatterns = [ /bot\b/i, /crawler\b/i, /spider\b/i, /scraper\b/i, /gptbot/i, /chatgpt/i, /claude/i, /anthropic/i, /bingbot/i, /googlebot/i, /slurp/i, /duckduckbot/i, /baiduspider/i, /yandexbot/i, /facebookexternalhit/i, /twitterbot/i, /linkedinbot/i, /whatsapp/i, /telegrambot/i, /curl/i, /wget/i, /python/i, /requests/i, /scrapy/i, /selenium/i, /puppeteer/i, /playwright/i ]; return botPatterns.some(pattern => pattern.test(userAgent)); } // --- Bot name extraction helper --- function getBotName(userAgent) { const knownBots = [ { pattern: /gptbot/i, name: 'GPTBot' }, { pattern: /chatgpt/i, name: 'ChatGPT' }, { pattern: /claude/i, name: 'Claude' }, { pattern: /anthropic/i, name: 'Anthropic' }, { pattern: /bingbot/i, name: 'BingBot' }, { pattern: /googlebot/i, name: 'GoogleBot' }, { pattern: /slurp/i, name: 'Yahoo Slurp' }, { pattern: /duckduckbot/i, name: 'DuckDuckBot' }, { pattern: /baiduspider/i, name: 'BaiduSpider' }, { pattern: /yandexbot/i, name: 'YandexBot' }, { pattern: /facebookexternalhit/i, name: 'Facebook' }, { pattern: /twitterbot/i, name: 'TwitterBot' }, { pattern: /linkedinbot/i, name: 'LinkedInBot' }, { pattern: /whatsapp/i, name: 'WhatsApp' }, { pattern: /telegrambot/i, name: 'TelegramBot' } ]; for (const bot of knownBots) { if (bot.pattern.test(userAgent)) { return bot.name; } } return userAgent || 'Unknown Bot'; } // --- Bot classification helper --- async function classifyBot(headers, userAgent, knownBots, backendUrl) { // Signed bot: has crawler-id, signature-input, signature if (headers['crawler-id'] && headers['signature-input'] && headers['signature']) { return { type: 'signed', knownBot: null }; } // Check if this is a registered signed bot (by user agent or bot name) try { const registeredBotRes = await (0, node_fetch_1.default)(`${backendUrl}/api/bots/check-registered`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ user_agent: userAgent }) }); if (registeredBotRes.ok) { const botData = await registeredBotRes.json(); if (botData.is_registered) { return { type: 'registered_signed', knownBot: null, botData: botData }; } } } catch (err) { console.log(`❌ Middleware: Failed to check registered bot status:`, err); } // Known bot: match user-agent for (const bot of knownBots) { try { if (bot.user_agent_pattern && new RegExp(bot.user_agent_pattern, 'i').test(userAgent)) { return { type: 'known', knownBot: bot }; } } catch (e) { // fallback to includes if (userAgent.includes(bot.user_agent_pattern)) { return { type: 'known', knownBot: bot }; } } } // Unknown bot return { type: 'unknown', knownBot: null }; } function validateCrawlRequest(options) { const backendUrl = (options === null || options === void 0 ? void 0 : options.backendUrl) || process.env.BACKEND_URL || 'https://botwall-api.onrender.com'; const configSiteId = options === null || options === void 0 ? void 0 : options.siteId; const configMonetizedRoutes = (options === null || options === void 0 ? void 0 : options.monetizedRoutes) || ['/*']; const configPricePerCrawl = (options === null || options === void 0 ? void 0 : options.pricePerCrawl) || 0.01; return async function (req, res, next) { var _a; const headers = Object.fromEntries(Object.entries(req.headers).map(([k, v]) => [k.toLowerCase(), Array.isArray(v) ? v[0] : v || ''])); const userAgent = headers['user-agent'] || ''; const crawlerId = headers['crawler-id'] || ''; const maxPrice = parseFloat(headers['crawler-max-price'] || '0'); const signatureInput = headers['signature-input'] || ''; const signature = headers['signature'] || ''; // Extract domain with fallbacks for different environments let domain = ''; if (req.hostname) { domain = req.hostname.split(':')[0]; } else if (req.headers.host) { const host = Array.isArray(req.headers.host) ? req.headers.host[0] : req.headers.host; domain = host.split(':')[0]; } else if (req.headers['x-forwarded-host']) { const forwardedHost = Array.isArray(req.headers['x-forwarded-host']) ? req.headers['x-forwarded-host'][0] : req.headers['x-forwarded-host']; domain = forwardedHost.split(':')[0]; } else { // Fallback for edge functions or environments without hostname domain = 'unknown'; } const path = req.path; const now = new Date().toISOString(); const ip = req.ip || ((_a = req.connection) === null || _a === void 0 ? void 0 : _a.remoteAddress) || ''; console.log(`🔍 Middleware: Processing request for domain: ${domain}`); // --- EARLY EXIT: Allow regular browsers to pass through immediately --- const isSignedBot = crawlerId && signatureInput && signature; const isActualBot = isBotRequest(userAgent); // If it's not a signed bot and not an actual bot, allow it immediately if (!isSignedBot && !isActualBot) { console.log(`✅ Middleware: Regular browser detected, allowing access: ${userAgent}`); return next(); } // --- 0. Site ID-based validation (new system) --- if (configSiteId) { console.log(`🔍 Middleware: Using site ID-based validation: ${configSiteId}`); try { // Get site configuration from database const siteConfig = await getSiteConfig(backendUrl, configSiteId, domain); if (!siteConfig) { console.log(`❌ Middleware: Site configuration not found for site ID: ${configSiteId}`); return res.status(403).send('Domain not authorized'); } // Verify domain authorization if (siteConfig.frontendDomain !== domain && siteConfig.backendDomain !== domain) { console.log(`❌ Middleware: Domain ${domain} not authorized for site ID: ${configSiteId}`); return res.status(403).send('Domain not authorized'); } // Always record analytics for bot requests if (isBotRequest(userAgent)) { await logBotCrawl({ backendUrl, siteId: siteConfig.siteId, userAgent, botName: getBotName(userAgent), path, status: 'success', ip, headers }); } // Check if this is a monetized route const monetizedRoutes = siteConfig.monetizedRoutes || configMonetizedRoutes; if (isMonetizedRoute(path, monetizedRoutes)) { console.log(`💰 Middleware: Monetized route detected: ${path}`); // For signed bots, use existing verification if (crawlerId && signatureInput && signature) { // Continue with signed bot verification console.log(`🔍 Middleware: Signed bot on monetized route: ${crawlerId}`); } else { // For unsigned bots, block access to monetized routes console.log(`🚫 Middleware: Blocking unsigned bot on monetized route: ${path}`); return res.status(402).send('Insufficient credits - Please purchase credits to access this API'); } } else { console.log(`✅ Middleware: Non-monetized route, allowing access: ${path}`); return next(); } } catch (error) { console.error(`❌ Middleware: Error in site ID validation:`, error); // Fallback to allow request return next(); } } // --- 1. Check if this is a signed bot request --- if (isSignedBot) { console.log(`🔍 Middleware: Signed bot detected: ${crawlerId}`); // For signed bots, use the original workflow // 1. Fetch public key let publicKey = null; try { const publicKeyUrl = `${backendUrl}/api/bots/${encodeURIComponent(crawlerId)}/public-key`; console.log(`🔍 Middleware: Fetching public key from: ${publicKeyUrl}`); const pkRes = await (0, node_fetch_1.default)(publicKeyUrl); console.log(`🔍 Middleware: Public key response status: ${pkRes.status}`); if (pkRes.ok) { const pkData = await pkRes.json(); console.log(`🔍 Middleware: Public key response data:`, pkData); publicKey = pkData.publicKey || null; } else { const errorText = await pkRes.text(); console.log(`❌ Middleware: Public key fetch failed with status ${pkRes.status}: ${errorText}`); } } catch (err) { console.log(`❌ Middleware: Failed to fetch public key for ${crawlerId}:`, err); return res.status(500).json({ error: 'Failed to fetch public key from backend.' }); } if (!publicKey) { console.log(`❌ Middleware: No public key found for ${crawlerId}`); return res.status(403).send(`This content is protected by BotWall pay-per-crawl system Bots must pay to access this content. Check BotWall.`); } // 2. Verify signature const validSig = (0, verifyEd25519Signature_1.verifyEd25519Signature)(headers, signature, publicKey); if (!validSig) { console.log(`❌ Middleware: Invalid signature for ${crawlerId}`); return res.status(403).send(`This content is protected by BotWall pay-per-crawl system Bots must pay to access this content. Check BotWall.`); } // 3. Call original verify endpoint (this handles credits, site lookup, etc.) try { const verifyRes = await (0, node_fetch_1.default)(`${backendUrl}/api/verify`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ botId: crawlerId, botApiKey: '', // Empty for signed bots path: path, userAgent: userAgent, domain: domain // Pass domain for site lookup }) }); if (!verifyRes.ok) { const errorData = await verifyRes.json(); if (verifyRes.status === 402) { console.log(`❌ Middleware: Insufficient credits for ${crawlerId}`); return res.status(402).send(`This content is protected by BotWall pay-per-crawl system Bots must pay to access this content. Check BotWall.`); } console.log(`❌ Middleware: Verification failed for ${crawlerId}`); return res.status(verifyRes.status).send(`This content is protected by BotWall pay-per-crawl system Bots must pay to access this content. Check BotWall.`); } // Note: Signed bots are logged to 'crawls' table by the verify endpoint // No need to duplicate log to bot_crawl_logs console.log(`✅ Middleware: Signed bot ${crawlerId} verified and allowed`); return next(); } catch (err) { console.log(`❌ Middleware: Verification error for ${crawlerId}:`, err); return res.status(500).json({ error: 'Verification failed' }); } } // --- 2. Handle known/unknown bots (new analytics workflow) --- console.log(`🔍 Middleware: Processing as known/unknown bot`); // Look up siteId by domain for known/unknown bots const siteId = await getSiteByDomain(backendUrl, domain); console.log(`🔍 Middleware: Site lookup result for ${domain}: ${siteId || 'NOT FOUND'}`); if (!siteId) { // Site not found for this domain, allow request but don't log console.log(`⚠️ Middleware: No site found for domain ${domain}, allowing request without logging`); return next(); } // Fetch known bots const knownBots = await getKnownBots(backendUrl); const { type, knownBot, botData } = await classifyBot(headers, userAgent, knownBots, backendUrl); console.log(`🔍 Middleware: Bot classified as ${type}${knownBot ? ` (${knownBot.name})` : ''}`); // Handle registered signed bots without proper headers if (type === 'registered_signed') { console.log(`🔍 Middleware: Registered signed bot detected without proper headers: ${userAgent}`); return res.status(401).send(`🔐 Authentication Required - BotWall Protection This content is protected by BotWall's pay-per-crawl system. Your bot "${(botData === null || botData === void 0 ? void 0 : botData.bot_name) || userAgent}" is registered but missing required authentication headers. Required headers: - crawler-id: Your bot ID - signature-input: Request signature input - signature: Ed25519 signature Please include these headers in your request to access this content. For more information, visit: https://botwall.com`); } // For known bots, check site-specific block/allow if (type === 'known' && knownBot) { const prefs = await getSiteBotPrefs(backendUrl, siteId); const pref = prefs.find((p) => p.known_bot_id === knownBot.id); if (pref && pref.blocked) { console.log(`🚫 Middleware: Blocking ${knownBot.name} for site ${siteId}`); await logBotCrawl({ backendUrl, siteId, userAgent, botName: knownBot.name, path, status: 'blocked', ip, knownBotId: knownBot.id, headers }); return res.status(403).send(`This content is protected by BotWall pay-per-crawl system Bots must pay to access this content. Check BotWall.`); } } // Handle unknown bots - BLOCK BY DEFAULT (these are likely scrapers/crawlers) if (type === 'unknown') { console.log(`🔍 Middleware: Unknown bot detected: ${userAgent}`); // Discover/register the unknown bot await discoverUnknownBot(backendUrl, userAgent, getBotName(userAgent)); // Check site-specific preferences for this unknown bot const unknownBotPrefs = await getSiteUnknownBotPrefs(backendUrl, siteId); const unknownBotPref = unknownBotPrefs.find((p) => p.user_agent === userAgent); // Default to blocked unless explicitly allowed const shouldBlock = !unknownBotPref || unknownBotPref.blocked; if (shouldBlock) { console.log(`🚫 Middleware: Blocking unknown bot for site ${siteId}: ${userAgent}`); await logBotCrawl({ backendUrl, siteId, userAgent, botName: getBotName(userAgent), path, status: 'blocked', ip, headers }); return res.status(403).send(`🚫 Access Denied - BotWall Protection Active This content is protected by BotWall's pay-per-crawl system. Unauthorized bots are not allowed to scrape this content. To access this content, you need to: 1. Register your bot at https://botwall.com 2. Purchase credits for crawling 3. Use proper authentication headers For more information, visit: https://botwall.com`); } else { console.log(`✅ Middleware: Unknown bot allowed by site preference for site ${siteId}: ${userAgent}`); await logBotCrawl({ backendUrl, siteId, userAgent, botName: getBotName(userAgent), path, status: 'success', ip, headers }); return next(); } } // Log and allow known bots if (type === 'known' && knownBot) { console.log(`✅ Middleware: Known bot ${knownBot.name} allowed for site ${siteId}`); await logBotCrawl({ backendUrl, siteId, userAgent, botName: knownBot.name, path, status: 'success', ip, knownBotId: knownBot.id, headers }); } return next(); }; } // --- Helper: Log every crawl to backend --- async function logBotCrawl({ backendUrl, siteId, userAgent, botName, path, status, ip, botId, knownBotId, headers }) { try { await (0, node_fetch_1.default)(`${backendUrl}/api/bot-logs`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ site_id: siteId, bot_id: botId, known_bot_id: knownBotId, user_agent: userAgent, bot_name: botName, path, status, ip_address: ip, raw_headers: headers }) }); } catch (err) { // Fallback: log to console console.log(`[${new Date().toISOString()}] Failed to log bot crawl:`, err); } }