UNPKG

ag-webscrape

Version:

TypeScript web scraper with Playwright fallback for anti-scraping protection

171 lines 6.13 kB
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.goToPage = exports.closeBrowser = exports.launchBrowser = void 0;
const chromium_1 = __importDefault(require("@sparticuz/chromium"));
const log_1 = require("ag-common/dist/common/helpers/log");
const fs_1 = require("fs");
const node_html_parser_1 = require("node-html-parser");
const puppeteer_core_1 = require("puppeteer-core");

/** Module-wide browser handle shared by every goToPage call; undefined when no browser is open. */
let browser;

/** Navigation timeout (ms) used when the caller does not pass `opt.timeout`. */
const DEFAULT_TIMEOUT_MS = 5000;
/** Extra time (ms) added to the timeout on the retry attempt. */
const RETRY_EXTRA_TIMEOUT_MS = 5000;
/** One initial attempt plus one retry after relaunching the browser. */
const MAX_ATTEMPTS = 2;
/** Pause (ms) before relaunching when the failure looks like a spawn problem. */
const SPAWN_RETRY_DELAY_MS = 1000;

/** Substrings of an error's string form that trigger a browser relaunch and one retry. */
const RETRYABLE_ERROR_MARKERS = [
    'has disconnected',
    'timeout of',
    'frame was detached',
    'Navigating frame was detached',
    'Protocol error',
    'Target closed',
    'ETXTBSY',
    'spawn',
    'ENOENT',
    'EACCES',
];
/** Subset of markers for which we wait before relaunching. */
const SPAWN_ERROR_MARKERS = ['ETXTBSY', 'spawn'];

/** Returns true when `message` contains at least one of `markers`. */
const includesAny = (message, markers) => markers.some((marker) => message.includes(marker));

/**
 * Finds a browser executable to launch.
 *
 * Probes a fixed list of well-known install locations and returns the first
 * one that is accessible; otherwise falls back to the path reported by
 * `@sparticuz/chromium`.
 *
 * @returns {Promise<string>} path of the executable to launch
 */
const getSystemChromePath = async () => {
    const possiblePaths = [
        'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe',
        'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
        'C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe',
        'C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe',
        '/usr/bin/chromium-browser',
    ];
    for (const candidate of possiblePaths) {
        try {
            (0, fs_1.accessSync)(candidate);
            return candidate;
        }
        catch {
            // Not accessible: expected while probing, try the next candidate.
        }
    }
    return chromium_1.default.executablePath();
};

/**
 * Launches a new browser and stores it in the module-level handle,
 * closing any previously stored browser first (best-effort).
 *
 * Runs headless unless the HEADLESS environment variable is exactly 'false'.
 *
 * @param {string} [executablePath] explicit browser binary; when omitted the
 *   path is discovered via getSystemChromePath().
 * @returns {Promise<void>}
 * @throws whatever puppeteer's launch() throws when the browser cannot start
 */
const launchBrowser = async (executablePath) => {
    const browserExecutablePath = executablePath || (await getSystemChromePath());
    const opt = {
        // NOTE(review): height 1920 x width 1080 is a portrait viewport; possibly
        // swapped, kept as-is because callers may rely on it - confirm.
        defaultViewport: {
            height: 1920,
            width: 1080,
        },
        headless: process.env.HEADLESS !== 'false',
        ignoreHTTPSErrors: true,
        devtools: false,
        executablePath: browserExecutablePath,
        args: [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--no-first-run',
            '--no-zygote',
            '--single-process',
            '--disable-gpu',
            '--disable-background-timer-throttling',
            '--disable-renderer-backgrounding',
            '--disable-backgrounding-occluded-windows',
            '--disable-ipc-flooding-protection',
            '--force-color-profile=srgb',
            '--metrics-recording-only',
            '--disable-extensions',
        ],
    };
    (0, log_1.trace)('launch browser, opt=', opt);
    // Drop the old handle before launching so that a failed launch does not
    // leave a closed browser behind in the module-level variable.
    const previous = browser;
    browser = undefined;
    try {
        if (previous?.close) {
            await previous.close();
        }
    }
    catch {
        // Best-effort: the previous browser may already be dead or disconnected.
    }
    browser = (await (0, puppeteer_core_1.launch)(opt));
};
exports.launchBrowser = launchBrowser;

/**
 * Closes the shared browser, if any, and clears the module-level handle so
 * the next goToPage() call launches a fresh browser instead of failing on a
 * closed one. Errors while closing are logged, never thrown.
 *
 * @returns {Promise<void>}
 */
const closeBrowser = async () => {
    if (!browser) {
        return;
    }
    const closing = browser;
    browser = undefined;
    try {
        await closing.close();
    }
    catch (e) {
        (0, log_1.info)('error closing browser:', e);
    }
};
exports.closeBrowser = closeBrowser;

/**
 * Closes a page without letting a close failure mask the outcome of the
 * navigation (the page may already be gone if the browser disconnected).
 */
const closePageQuietly = async (page) => {
    try {
        await page.close();
    }
    catch (e) {
        (0, log_1.debug)('error closing page:', e);
    }
};

/**
 * Navigates to `url` in a new page and returns the parsed document together
 * with the response metadata.
 *
 * Elements matching `.visually-hidden` are removed from the parsed document.
 * If the first attempt fails with an error whose text matches one of
 * RETRYABLE_ERROR_MARKERS, the browser is relaunched and the navigation is
 * retried once with a longer timeout. The page is always closed, whether the
 * attempt succeeds or fails.
 *
 * @param {string|{toString(): string}} url target address (string or URL-like object)
 * @param {object} [opt]
 * @param {number} [opt.timeout] navigation timeout in ms (default 5000)
 * @param {string} [opt.wailUntilSelector] (sic - spelling is the public option
 *   name) when set, additionally wait until this selector is visible
 * @param {string} [opt.executablePath] browser binary passed to launchBrowser
 * @returns {Promise<{html: object, status: number, statusText: string, url: string, headers: object}>}
 * @throws the navigation error when it is not retryable or the retry also fails;
 *   the relaunch error when relaunching the browser fails
 */
const goToPage = async (url, opt) => {
    for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt += 1) {
        const isRetry = attempt > 0;
        try {
            if (!browser) {
                await (0, exports.launchBrowser)(opt?.executablePath);
            }
            (0, log_1.debug)('go to page:' + url);
            const page = await browser.newPage();
            try {
                const timeout = (opt?.timeout ?? DEFAULT_TIMEOUT_MS) + (isRetry ? RETRY_EXTRA_TIMEOUT_MS : 0);
                const urlx = typeof url === 'string' ? url : url.toString();
                let response;
                if (!opt?.wailUntilSelector) {
                    response = await page.goto(urlx, {
                        waitUntil: ['load', 'domcontentloaded'],
                        timeout,
                    });
                }
                else {
                    response = await page.goto(urlx, {
                        waitUntil: ['load'],
                        timeout,
                    });
                    await page.waitForSelector(opt.wailUntilSelector, {
                        timeout,
                        visible: true,
                    });
                }
                if (!response) {
                    throw new Error('No response received from page navigation');
                }
                const content = await page.content();
                const doc = (0, node_html_parser_1.parse)(content);
                doc.querySelectorAll('.visually-hidden')?.forEach((n) => n.remove());
                return {
                    html: doc,
                    status: response.status(),
                    statusText: response.statusText(),
                    url: response.url(),
                    headers: response.headers(),
                };
            }
            finally {
                // Runs on success and on every failure path, so a failed
                // navigation no longer leaves an open tab in the shared browser.
                await closePageQuietly(page);
            }
        }
        catch (err) {
            const message = String(err);
            if (isRetry) {
                (0, log_1.error)('retry already, bail', url, message);
                throw err;
            }
            if (!includesAny(message, RETRYABLE_ERROR_MARKERS)) {
                (0, log_1.error)(`scrape error:${err}`);
                throw err;
            }
            try {
                (0, log_1.debug)('retry:', url, message);
                if (includesAny(message, SPAWN_ERROR_MARKERS)) {
                    // Give the OS a moment before spawning the binary again.
                    await new Promise((resolve) => setTimeout(resolve, SPAWN_RETRY_DELAY_MS));
                }
                await (0, exports.launchBrowser)(opt?.executablePath);
            }
            catch (ex) {
                (0, log_1.error)('error relaunching browser:', ex);
                throw ex;
            }
        }
    }
    // Defensive: every attempt above either returns or throws.
    throw new Error('too many errors');
};
exports.goToPage = goToPage;
//# sourceMappingURL=dom.js.map