UNPKG

ag-webscrape

Version:

TypeScript web scraper with Playwright fallback for anti-scraping protection

167 lines 6.01 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.WebScraper = void 0; const log_1 = require("ag-common/dist/common/helpers/log"); const dom_1 = require("./helpers/dom"); class WebScraper { constructor(options = {}) { this.userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'; this.defaultOptions = { timeout: 30000, retries: 3, waitForTimeout: 5000, ...options, }; } async fetchDirectly(url, options) { const headers = { 'User-Agent': options.userAgent || this.userAgent.toString(), Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Accept-Encoding': 'gzip, deflate', Connection: 'keep-alive', 'Upgrade-Insecure-Requests': '1', ...options.headers, }; const controller = new AbortController(); const timeoutId = setTimeout(() => controller.abort(), options.timeout ?? this.defaultOptions.timeout); try { const response = await fetch(url, { headers, signal: controller.signal, redirect: 'follow', }); clearTimeout(timeoutId); const html = await response.text(); return { url, html, status: response.status, method: 'fetch', redirected: response.redirected, finalUrl: response.url, }; } catch (error) { clearTimeout(timeoutId); throw error; } } async scrapeWithpuppeteer(url, options) { let html = ''; let status = 200; let error; let finalUrl = url; try { const pageResult = await (0, dom_1.goToPage)(url, { timeout: options.timeout ?? this.defaultOptions.timeout, wailUntilSelector: options.waitForSelector, executablePath: options.executablePath, }); html = pageResult.html.outerHTML; status = pageResult.status; finalUrl = pageResult.url; if (status >= 400) { error = `HTTP ${status}: ${pageResult.statusText}`; } return { url, html, status, method: 'visual', error, finalUrl, }; } catch (err) { const errorMessage = err instanceof Error ? err.message : 'Unknown error'; if (errorMessage.includes('timeout')) { status = 408; } else if (errorMessage.includes('404') || errorMessage.includes('not found')) { status = 404; } else if (errorMessage.includes('403') || errorMessage.includes('forbidden')) { status = 403; } else if (errorMessage.includes('500')) { status = 500; } else { status = 0; } return { url, html: '', status, method: 'visual', error: errorMessage, finalUrl, }; } } async scrape(url, options = {}) { const mergedOptions = { ...this.defaultOptions, ...options }; let lastError = null; try { const result = await this.fetchDirectly(url, mergedOptions); if (result.status >= 200 && result.status < 300) { return result; } if (result.status === 404) { (0, log_1.info)(`Client error ${result.status} for ${url}. Not retrying with puppeteer as resource doesn't exist`, JSON.stringify(result, null, 2)); return result; } (0, log_1.info)(`Direct fetch failed or anti-scraping detected for ${url}. Falling back to puppeteer.`, JSON.stringify(result, null, 2)); } catch (error) { lastError = error instanceof Error ? error : new Error('Unknown fetch error'); (0, log_1.info)(`Direct fetch failed for ${url}: ${lastError.message}. Falling back to puppeteer.`); } try { const result = await this.scrapeWithpuppeteer(url, mergedOptions); (0, log_1.debug)(`Puppeteer scrape successful for ${url}.`, JSON.stringify(result, null, 2)); return result; } catch (error) { const puppeteerError = error instanceof Error ? error : new Error('Unknown puppeteer error'); const m = `Both methods failed. Fetch: ${lastError?.message || 'Unknown'}. puppeteer: ${puppeteerError.message}. err=${error.message}`; (0, log_1.warn)(m); return { url, html: '', status: 0, method: 'visual', error: m, }; } } async scrapeMultiple(urls, options = {}) { const results = []; for (const url of urls) { try { const result = await this.scrape(url, options); results.push(result); } catch (error) { results.push({ url, html: '', status: 0, method: 'fetch', error: error instanceof Error ? error.message : 'Unknown error', }); } } return results; } async dispose() { await (0, dom_1.closeBrowser)(); } } exports.WebScraper = WebScraper; //# sourceMappingURL=WebScraper.js.map