UNPKG

@suzakuteam/scraper-node

Version:

Sebuah Module Scraper yang dibuat oleh Sxyz dan SuzakuTeam untuk memudahkan penggunaan scraper di project ESM maupun CJS.

115 lines (100 loc) 2.97 kB
import axios from "axios";

/**
 * Probe a URL for common anti-scraping defenses — Cloudflare fronting,
 * CAPTCHAs, JavaScript challenges, rate limiting, and blanket robots.txt
 * blocks — and produce a weighted "anti-scrape" score plus a readable level.
 *
 * Never rejects: transport errors are captured in `result.error` and
 * penalized in the score instead of being thrown.
 *
 * @param {string} url - Absolute URL to probe.
 * @returns {Promise<{
 *   url: string,
 *   cloudflare: boolean,
 *   captcha: boolean,
 *   rateLimit: boolean,
 *   robotsBlock: boolean,
 *   suspiciousHeaders: string[],
 *   jsChallenge: boolean,
 *   statusCode: number | null,
 *   antiScrapeScore: number,
 *   scoreLevel: string,
 *   error: string | null,
 * }>} Detection report.
 */
export default async function detect(url) {
  const result = {
    url,
    cloudflare: false,
    captcha: false,
    rateLimit: false,
    robotsBlock: false,
    suspiciousHeaders: [],
    jsChallenge: false,
    statusCode: null,
    antiScrapeScore: 0,
    scoreLevel: "Unknown",
    error: null,
  };

  try {
    const res = await axios.get(url, {
      headers: {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        Accept: "text/html",
      },
      timeout: 10000,
      // Inspect 4xx/5xx responses ourselves instead of throwing.
      validateStatus: () => true,
    });

    result.statusCode = res.status;

    // FIX: res.data is only a string for text responses; axios parses JSON
    // bodies into objects, which previously made the string/regex checks
    // below throw (and get mis-scored as a network error).
    const html =
      typeof res.data === "string" ? res.data : JSON.stringify(res.data ?? "");

    // Cloudflare fingerprints: Server header or the cf-ray response header.
    const server = res.headers.server?.toLowerCase();
    if (server?.includes("cloudflare") || res.headers["cf-ray"]) {
      result.cloudflare = true;
      result.antiScrapeScore += 25;
    }

    if (/captcha|hcaptcha|recaptcha/i.test(html)) {
      result.captcha = true;
      result.antiScrapeScore += 20;
    }

    // Cloudflare "Just a moment" interstitial markers on an otherwise-200 page.
    if (
      res.status === 200 &&
      /cf_chl_opt|Just a moment|Please enable JavaScript and cookies/i.test(
        html,
      )
    ) {
      result.jsChallenge = true;
      result.antiScrapeScore += 15;
    }

    // FIX: the pattern above already matches "_cf_chl_opt", so the original
    // code double-counted the same signal (+15 twice) on a 200 challenge
    // page. Keep the cloudflare reinforcement, but only add the challenge
    // weight if it was not already scored (e.g. non-200 responses).
    if (html.includes("_cf_chl_opt")) {
      result.cloudflare = true; // reinforce detection
      if (!result.jsChallenge) {
        result.jsChallenge = true;
        result.antiScrapeScore += 15;
      }
    }

    // Typical throttling / blocking status codes.
    if ([403, 429, 503].includes(res.status)) {
      result.rateLimit = true;
      result.antiScrapeScore += 20;
    }

    const susHeaders = [
      "cf-ray",
      "x-frame-options",
      "x-captcha",
      "server",
      "set-cookie",
    ];
    result.suspiciousHeaders = Object.keys(res.headers).filter((h) =>
      susHeaders.includes(h.toLowerCase()),
    );
    if (result.suspiciousHeaders.length > 0) {
      result.antiScrapeScore += 10;
    }

    // A tiny 200 body is usually a challenge/interstitial shell.
    if (res.status === 200 && html.length < 300) {
      result.jsChallenge = true;
      result.antiScrapeScore += 10;
    }

    // Near-empty <body> plus many <script> tags suggests a client-rendered
    // page that plain HTTP scraping cannot see.
    const bodyCheck = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
    const bodyContent = bodyCheck?.[1]?.trim() || "";
    if (
      bodyContent.length < 100 &&
      (html.match(/<script/gi) || []).length > 5
    ) {
      result.antiScrapeScore += 10;
    }

    // Best-effort robots.txt check; any failure here is deliberately ignored.
    try {
      const robots = await axios.get(new URL("/robots.txt", url).toString(), {
        headers: { "User-Agent": "Mozilla/5.0" },
        // FIX: was missing — a hanging robots.txt stalled the whole probe.
        timeout: 10000,
      });
      const robotsTxt = typeof robots.data === "string" ? robots.data : "";
      // FIX: substring matching flagged any "Disallow: /path" rule; only a
      // blanket "Disallow: /" line actually blocks the whole site.
      if (/^\s*Disallow:\s*\/\s*$/im.test(robotsTxt)) {
        result.robotsBlock = true;
        result.antiScrapeScore += 5;
      }
    } catch {
      // robots.txt is an optional signal — missing/unreachable is fine.
    }
  } catch (err) {
    // Transport-level failure (DNS, timeout, reset): record and penalize —
    // an unreachable host counts against scrapability.
    result.error = err.message;
    result.antiScrapeScore += 20;
  }

  // Map the accumulated score onto a human-readable difficulty level.
  const score = result.antiScrapeScore;
  if (score <= 30) result.scoreLevel = "Low (Safe to scrape)";
  else if (score <= 60)
    result.scoreLevel = "Medium (May require headers or cookies)";
  else if (score <= 85)
    result.scoreLevel = "High (May require puppeteer or JS rendering)";
  else result.scoreLevel = "Very High (Hard to scrape, not recommended)";

  return result;
}