ag-webscrape
Version:
TypeScript web scraper with Playwright fallback for anti-scraping protection
167 lines • 6.01 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.WebScraper = void 0;
const log_1 = require("ag-common/dist/common/helpers/log");
const dom_1 = require("./helpers/dom");
/**
 * Retrieves page HTML with a plain HTTP `fetch` first and, when that does not
 * yield a usable response, falls back to a browser-driven load through
 * `goToPage` from ./helpers/dom.
 *
 * Every scrape result is a plain object of the form
 * `{ url, html, status, method, error?, redirected?, finalUrl? }` where
 * `method` is `'fetch'` for the direct path and `'visual'` for the browser path.
 */
class WebScraper {
    /**
     * @param {object} [options] - Defaults merged under the per-call options.
     *   `timeout` (ms, default 30000) bounds both the direct fetch and the
     *   browser load. `retries` (default 3) and `waitForTimeout` (default 5000)
     *   are stored here but are not read by any method of this class.
     */
    constructor(options = {}) {
        this.userAgent =
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36';
        this.defaultOptions = {
            timeout: 30000,
            retries: 3,
            waitForTimeout: 5000,
            ...options,
        };
    }

    /**
     * Performs a single HTTP GET with browser-like headers.
     *
     * @param {string} url - Address to request.
     * @param {object} [options] - Reads `userAgent`, `headers` (merged over the
     *   built-in headers) and `timeout` (ms; falls back to the instance default).
     * @returns {Promise<object>} Result with `method: 'fetch'`, the response
     *   status, body text, `redirected` flag and `finalUrl`.
     * @throws Whatever `fetch` / `response.text()` rejects with, including the
     *   abort error raised when the timeout elapses.
     */
    async fetchDirectly(url, options = {}) {
        const headers = {
            'User-Agent': options.userAgent || this.userAgent,
            Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            Connection: 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            ...options.headers,
        };
        const controller = new AbortController();
        const timeoutId = setTimeout(() => controller.abort(), options.timeout ?? this.defaultOptions.timeout);
        try {
            const response = await fetch(url, {
                headers,
                signal: controller.signal,
                redirect: 'follow',
            });
            // The timer stays armed while the body is read so that a stalled
            // download is aborted too, not only a stalled connection.
            const html = await response.text();
            return {
                url,
                html,
                status: response.status,
                method: 'fetch',
                redirected: response.redirected,
                finalUrl: response.url,
            };
        }
        finally {
            // Runs on success and on failure, so the timer never outlives the call.
            clearTimeout(timeoutId);
        }
    }

    /**
     * Loads the page through `goToPage` and converts the outcome (including
     * thrown errors) into a result object. This method does not reject:
     * failures are reported through `status` and `error` on the returned value.
     *
     * @param {string} url - Address to load.
     * @param {object} options - Reads `timeout`, `waitForSelector` and
     *   `executablePath`.
     * @returns {Promise<object>} Result with `method: 'visual'`. On a thrown
     *   error `html` is empty and `status` is a best-effort guess derived from
     *   the error (408 timeout, 404, 403, 500, otherwise 0).
     */
    async scrapeWithpuppeteer(url, options) {
        // Stays at the requested URL unless the page load reports another one.
        let finalUrl = url;
        try {
            const pageResult = await (0, dom_1.goToPage)(url, {
                timeout: options.timeout ?? this.defaultOptions.timeout,
                // NOTE(review): key is spelled `wailUntilSelector`; presumably this
                // matches the option name goToPage expects - confirm before renaming.
                wailUntilSelector: options.waitForSelector,
                executablePath: options.executablePath,
            });
            const html = pageResult.html.outerHTML;
            const status = pageResult.status;
            finalUrl = pageResult.url;
            const error = status >= 400 ? `HTTP ${status}: ${pageResult.statusText}` : undefined;
            return {
                url,
                html,
                status,
                method: 'visual',
                error,
                finalUrl,
            };
        }
        catch (err) {
            const errorMessage = err instanceof Error ? err.message : 'Unknown error';
            // Compare case-insensitively: messages such as "Navigation Timeout
            // Exceeded" or "Forbidden" would otherwise fall through to status 0.
            const lowered = errorMessage.toLowerCase();
            const isTimeout = (err instanceof Error && err.name === 'TimeoutError') || lowered.includes('timeout');
            let status = 0;
            if (isTimeout) {
                status = 408;
            }
            else if (lowered.includes('404') || lowered.includes('not found')) {
                status = 404;
            }
            else if (lowered.includes('403') || lowered.includes('forbidden')) {
                status = 403;
            }
            else if (lowered.includes('500')) {
                status = 500;
            }
            return {
                url,
                html: '',
                status,
                method: 'visual',
                error: errorMessage,
                finalUrl,
            };
        }
    }

    /**
     * Scrapes one URL: direct fetch first, browser fallback second.
     *
     * A 2xx fetch response is returned as is. A 404 is returned without trying
     * the fallback. Any other status, or a fetch error, triggers the fallback.
     *
     * @param {string} url - Address to scrape.
     * @param {object} [options] - Per-call options merged over the instance defaults.
     * @returns {Promise<object>} The scrape result; failures are reported via
     *   `status` / `error` rather than by rejecting.
     */
    async scrape(url, options = {}) {
        const mergedOptions = { ...this.defaultOptions, ...options };
        let lastError = null;
        try {
            const result = await this.fetchDirectly(url, mergedOptions);
            if (result.status >= 200 && result.status < 300) {
                return result;
            }
            if (result.status === 404) {
                (0, log_1.info)(`Client error ${result.status} for ${url}. Not retrying with puppeteer as resource doesn't exist`, JSON.stringify(result, null, 2));
                return result;
            }
            (0, log_1.info)(`Direct fetch failed or anti-scraping detected for ${url}. Falling back to puppeteer.`, JSON.stringify(result, null, 2));
        }
        catch (error) {
            lastError =
                error instanceof Error ? error : new Error('Unknown fetch error');
            (0, log_1.info)(`Direct fetch failed for ${url}: ${lastError.message}. Falling back to puppeteer.`);
        }
        try {
            const result = await this.scrapeWithpuppeteer(url, mergedOptions);
            if (result.error) {
                // The fallback reports failures through `error` instead of throwing,
                // so only claim success when that field is empty.
                (0, log_1.warn)(`Puppeteer scrape failed for ${url}: ${result.error}`);
            }
            else {
                (0, log_1.debug)(`Puppeteer scrape successful for ${url}.`, JSON.stringify(result, null, 2));
            }
            return result;
        }
        catch (error) {
            const puppeteerError = error instanceof Error ? error : new Error('Unknown puppeteer error');
            // Use the normalized error for both fields: reading `.message` from the
            // raw caught value throws when something like `null` was thrown.
            const m = `Both methods failed. Fetch: ${lastError?.message || 'Unknown'}. puppeteer: ${puppeteerError.message}. err=${puppeteerError.message}`;
            (0, log_1.warn)(m);
            return {
                url,
                html: '',
                status: 0,
                method: 'visual',
                error: m,
            };
        }
    }

    /**
     * Scrapes several URLs one after another (never concurrently).
     *
     * @param {Iterable<string>} urls - Addresses to scrape, in order.
     * @param {object} [options] - Options forwarded to every `scrape` call.
     * @returns {Promise<object[]>} One result per URL, in input order. If a
     *   `scrape` call rejects, a status-0 placeholder result is recorded instead.
     */
    async scrapeMultiple(urls, options = {}) {
        const results = [];
        for (const url of urls) {
            try {
                results.push(await this.scrape(url, options));
            }
            catch (error) {
                results.push({
                    url,
                    html: '',
                    status: 0,
                    method: 'fetch',
                    error: error instanceof Error ? error.message : 'Unknown error',
                });
            }
        }
        return results;
    }

    /**
     * Releases browser resources by delegating to `closeBrowser` from ./helpers/dom.
     *
     * @returns {Promise<void>}
     */
    async dispose() {
        await (0, dom_1.closeBrowser)();
    }
}
exports.WebScraper = WebScraper;
//# sourceMappingURL=WebScraper.js.map