@suzakuteam/scraper-node
Version:
Sebuah Module Scraper yang dibuat oleh Sxyz dan SuzakuTeam untuk memudahkan penggunaan scraper di project ESM maupun CJS.
115 lines (100 loc) • 2.97 kB
JavaScript
import axios from "axios";
/** Timeout, in milliseconds, applied to every HTTP request issued by `detect`. */
const REQUEST_TIMEOUT_MS = 10000;

/**
 * Normalizes a response body to a string so the text heuristics can run on it.
 *
 * axios parses JSON responses into objects by default; calling string methods
 * on such a value would throw, so non-string bodies are serialized instead.
 *
 * @param {unknown} data - The `data` field of an axios response.
 * @returns {string} The body as text ("" for null/undefined).
 */
function bodyToText(data) {
  if (typeof data === "string") return data;
  if (data == null) return "";
  try {
    // JSON.stringify returns undefined for functions/symbols.
    return JSON.stringify(data) ?? "";
  } catch {
    // Not serializable (e.g. circular structure): fall back to plain coercion.
    return String(data);
  }
}

/**
 * Tells whether a robots.txt body contains a rule that disallows the whole
 * site, i.e. a `Disallow` directive whose value is exactly `/`.
 *
 * Path-specific rules such as `Disallow: /admin` do not count. Trailing
 * `#` comments, surrounding whitespace, CRLF line endings and directive case
 * are ignored. User-agent groups are not taken into account.
 *
 * @param {string} robotsTxt - Raw robots.txt content.
 * @returns {boolean} True when at least one bare `Disallow: /` rule exists.
 */
function robotsDisallowsAll(robotsTxt) {
  return robotsTxt.split(/\r?\n/).some((line) => {
    const rule = line.split("#", 1)[0].trim();
    return /^disallow\s*:\s*\/$/i.test(rule);
  });
}

/**
 * Maps a numeric anti-scrape score to its human-readable level.
 *
 * @param {number} score - Accumulated anti-scrape score.
 * @returns {string} The level label for that score.
 */
function scoreLevelFor(score) {
  if (score <= 30) return "Low (Safe to scrape)";
  if (score <= 60) return "Medium (May require headers or cookies)";
  if (score <= 85) return "High (May require puppeteer or JS rendering)";
  return "Very High (Hard to scrape, not recommended)";
}

/**
 * Probes a URL and estimates how strongly it is protected against scraping.
 *
 * Performs one GET on `url` (any HTTP status is accepted) and a best-effort
 * GET on `/robots.txt` of the same origin, then accumulates a score from a set
 * of heuristics: Cloudflare markers, captcha keywords, JS-challenge markers,
 * blocking status codes, a fixed list of response headers, very short bodies,
 * script-heavy pages with a near-empty body, and a site-wide robots.txt
 * disallow rule.
 *
 * The function does not throw on request failures: the error message is
 * stored in `error` and 20 points are added to the score.
 *
 * @param {string} url - Absolute URL to inspect.
 * @returns {Promise<{
 *   url: string,
 *   cloudflare: boolean,
 *   captcha: boolean,
 *   rateLimit: boolean,
 *   robotsBlock: boolean,
 *   suspiciousHeaders: string[],
 *   jsChallenge: boolean,
 *   statusCode: number | null,
 *   antiScrapeScore: number,
 *   scoreLevel: string,
 *   error: string | null,
 * }>} The detection report.
 */
export default async function detect(url) {
  const result = {
    url,
    cloudflare: false,
    captcha: false,
    rateLimit: false,
    robotsBlock: false,
    suspiciousHeaders: [],
    jsChallenge: false,
    statusCode: null,
    antiScrapeScore: 0,
    scoreLevel: "Unknown",
    error: null,
  };

  try {
    const res = await axios.get(url, {
      headers: {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        Accept: "text/html",
      },
      timeout: REQUEST_TIMEOUT_MS,
      // Never reject on HTTP status: 403/429/503 are signals we want to inspect.
      validateStatus: () => true,
    });

    result.statusCode = res.status;
    const html = bodyToText(res.data);
    const server = res.headers.server?.toLowerCase();

    if (server?.includes("cloudflare") || res.headers["cf-ray"]) {
      result.cloudflare = true;
      result.antiScrapeScore += 25;
    }

    // "captcha" also covers "hcaptcha" and "recaptcha".
    if (/captcha/i.test(html)) {
      result.captcha = true;
      result.antiScrapeScore += 20;
    }

    if (
      res.status === 200 &&
      /cf_chl_opt|Just a moment|Please enable JavaScript and cookies/i.test(
        html,
      )
    ) {
      result.jsChallenge = true;
      result.antiScrapeScore += 15;
    }

    // Status-independent check. Note that `cf_chl_opt` in the pattern above
    // also matches `_cf_chl_opt`, so a 200 challenge page is scored by both.
    if (html.includes("_cf_chl_opt")) {
      result.cloudflare = true;
      result.jsChallenge = true;
      result.antiScrapeScore += 15;
    }

    if ([403, 429, 503].includes(res.status)) {
      result.rateLimit = true;
      result.antiScrapeScore += 20;
    }

    const susHeaders = [
      "cf-ray",
      "x-frame-options",
      "x-captcha",
      "server",
      "set-cookie",
    ];
    result.suspiciousHeaders = Object.keys(res.headers).filter((h) =>
      susHeaders.includes(h.toLowerCase()),
    );
    if (result.suspiciousHeaders.length > 0) {
      result.antiScrapeScore += 10;
    }

    // A tiny 200 response is treated as a JS-rendered shell / challenge stub.
    if (res.status === 200 && html.length < 300) {
      result.jsChallenge = true;
      result.antiScrapeScore += 10;
    }

    // Near-empty <body> combined with many <script> tags.
    const bodyCheck = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
    const bodyContent = bodyCheck?.[1]?.trim() || "";
    if (
      bodyContent.length < 100 &&
      (html.match(/<script/gi) || []).length > 5
    ) {
      result.antiScrapeScore += 10;
    }

    try {
      const robots = await axios.get(new URL("/robots.txt", url).toString(), {
        headers: { "User-Agent": "Mozilla/5.0" },
        timeout: REQUEST_TIMEOUT_MS,
      });
      if (robotsDisallowsAll(bodyToText(robots.data))) {
        result.robotsBlock = true;
        result.antiScrapeScore += 5;
      }
    } catch {
      // Best-effort: a missing or unreachable robots.txt is not a signal,
      // so the failure is intentionally ignored.
    }
  } catch (err) {
    result.error = err.message;
    result.antiScrapeScore += 20;
  }

  result.scoreLevel = scoreLevelFor(result.antiScrapeScore);
  return result;
}