ag-webscrape
Version:
TypeScript web scraper with Playwright fallback for anti-scraping protection
171 lines • 6.13 kB
JavaScript
;
// Interop helper emitted by the TypeScript compiler. Reuses an existing
// `this.__importDefault` when one is present; otherwise returns the module
// unchanged if it is flagged `__esModule`, or wraps it as `{ default: mod }`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.goToPage = exports.closeBrowser = exports.launchBrowser = void 0;
const chromium_1 = __importDefault(require("@sparticuz/chromium"));
const log_1 = require("ag-common/dist/common/helpers/log");
const fs_1 = require("fs");
const node_html_parser_1 = require("node-html-parser");
const puppeteer_core_1 = require("puppeteer-core");
// Browser instance shared by launchBrowser, closeBrowser and goToPage.
// Stays undefined until launchBrowser assigns the result of puppeteer's launch().
let browser;
/**
 * Picks the browser binary to launch.
 *
 * Probes a fixed list of well-known Chrome/Edge/Chromium install locations in
 * order and returns the first one `accessSync` accepts. When none of them is
 * accessible, falls back to the path reported by `@sparticuz/chromium`.
 *
 * @returns {Promise<string>} path of the browser executable.
 */
const getSystemChromePath = async () => {
    const candidates = [
        'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe',
        'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
        'C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe',
        'C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe',
        '/usr/bin/chromium-browser',
    ];
    // accessSync throws when the path is not accessible; that is the expected
    // "not installed here" signal, so it is mapped to `false` rather than reported.
    const isAccessible = (candidate) => {
        try {
            (0, fs_1.accessSync)(candidate);
            return true;
        }
        catch {
            return false;
        }
    };
    const installed = candidates.find((candidate) => isAccessible(candidate));
    if (installed !== undefined) {
        return installed;
    }
    return chromium_1.default.executablePath();
};
/**
 * Launches a browser through puppeteer-core and stores it in the module-level
 * `browser` variable, replacing (and closing) any browser stored there before.
 *
 * The stored handle is cleared before the old browser is closed, so if the
 * launch fails `browser` is left undefined instead of pointing at a browser
 * that has already been closed.
 *
 * @param {string} [executablePath] browser binary to run; when falsy the path
 *   is resolved by getSystemChromePath().
 * @returns {Promise<void>}
 * @throws rethrows whatever getSystemChromePath() or puppeteer's launch() throws.
 */
const launchBrowser = async (executablePath) => {
    const browserExecutablePath = executablePath || (await getSystemChromePath());
    const opt = {
        // NOTE(review): height 1920 x width 1080 is a portrait viewport - confirm
        // this is intended and not a swapped pair.
        defaultViewport: {
            height: 1920,
            width: 1080,
        },
        // Headless unless the HEADLESS environment variable is exactly 'false'.
        headless: process.env.HEADLESS !== 'false',
        ignoreHTTPSErrors: true,
        devtools: false,
        executablePath: browserExecutablePath,
        args: [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--no-first-run',
            '--no-zygote',
            '--single-process',
            '--disable-gpu',
            '--disable-background-timer-throttling',
            '--disable-renderer-backgrounding',
            '--disable-backgrounding-occluded-windows',
            '--disable-ipc-flooding-protection',
            '--force-color-profile=srgb',
            '--metrics-recording-only',
            '--disable-extensions',
        ],
    };
    (0, log_1.trace)('launch browser, opt=', opt);
    // Drop the shared reference first: whatever happens below, nobody should
    // find a closed browser in `browser`.
    const previous = browser;
    browser = undefined;
    if (previous?.close) {
        try {
            await previous.close();
        }
        catch (e) {
            // Best effort: the old browser may already be dead. Not fatal.
            (0, log_1.debug)('error closing previous browser:', e);
        }
    }
    browser = (await (0, puppeteer_core_1.launch)(opt));
};
exports.launchBrowser = launchBrowser;
/**
 * Closes the shared browser, if there is one, and clears the module-level
 * `browser` reference so later code does not try to reuse a closed browser.
 *
 * Never rejects: an error raised while closing is logged at info level and
 * swallowed.
 *
 * @returns {Promise<void>}
 */
const closeBrowser = async () => {
    if (!browser) {
        return;
    }
    // Clear the shared handle before awaiting so it is reset even if close() fails.
    const closing = browser;
    browser = undefined;
    try {
        await closing.close();
    }
    catch (e) {
        (0, log_1.info)('error closing browser:', e);
    }
};
exports.closeBrowser = closeBrowser;
// Substrings of an error's text that mark it as worth one relaunch-and-retry.
const RETRYABLE_ERROR_MARKERS = [
    'has disconnected',
    'timeout of',
    'frame was detached',
    'Navigating frame was detached',
    'Protocol error',
    'Target closed',
    'ETXTBSY',
    'spawn',
    'ENOENT',
    'EACCES',
];
// Subset of the above after which the relaunch is delayed by one second.
const SPAWN_ERROR_MARKERS = ['ETXTBSY', 'spawn'];
// One initial attempt plus a single retry.
const MAX_PAGE_ATTEMPTS = 2;
// Extra navigation timeout (ms) granted to the retry attempt.
const RETRY_EXTRA_TIMEOUT_MS = 5000;

// True when `message` contains at least one of `markers`.
const messageHasAny = (message, markers) => markers.some((marker) => message.includes(marker));

// Opens a page in the shared browser, navigates it, and returns the parsed
// result. The page is always closed, on success and on failure.
const loadPage = async (url, opt, timeout) => {
    const page = await browser.newPage();
    try {
        const target = typeof url === 'string' ? url : url.toString();
        const selector = opt?.wailUntilSelector;
        const response = await page.goto(target, {
            waitUntil: selector ? ['load'] : ['load', 'domcontentloaded'],
            timeout,
        });
        if (selector) {
            await page.waitForSelector(selector, {
                timeout,
                visible: true,
            });
        }
        if (!response) {
            throw new Error('No response received from page navigation');
        }
        const doc = (0, node_html_parser_1.parse)(await page.content());
        doc.querySelectorAll('.visually-hidden')?.forEach((n) => n.remove());
        return {
            html: doc,
            status: response.status(),
            statusText: response.statusText(),
            url: response.url(),
            headers: response.headers(),
        };
    }
    finally {
        try {
            await page.close();
        }
        catch (closeErr) {
            // The browser may already be gone; a failed close must not mask
            // the navigation outcome.
            (0, log_1.debug)('error closing page:', closeErr);
        }
    }
};

/**
 * Loads `url` in the shared browser (launching it first if needed) and returns
 * the page HTML parsed with node-html-parser, with every `.visually-hidden`
 * element removed, together with the navigation response metadata.
 *
 * If the first attempt fails with an error whose text matches one of
 * RETRYABLE_ERROR_MARKERS, the browser is relaunched and the load is retried
 * once with 5000 ms added to the timeout. Any other error, or a failure of
 * the retry, is logged and rethrown.
 *
 * @param {string|{toString(): string}} url address to load; non-strings are
 *   converted with `toString()`.
 * @param {object} [opt]
 * @param {string} [opt.executablePath] browser binary handed to launchBrowser.
 * @param {number} [opt.timeout] navigation/selector timeout in ms (default 5000).
 * @param {string} [opt.wailUntilSelector] (sic - existing option name) when
 *   set, navigation waits for 'load' only and then for this selector to be
 *   visible.
 * @returns {Promise<{html: object, status: number, statusText: string, url: string, headers: object}>}
 * @throws the navigation error, or the error raised while relaunching the browser.
 */
const goToPage = async (url, opt) => {
    const baseTimeout = opt?.timeout ?? 5000;
    for (let attempt = 0; attempt < MAX_PAGE_ATTEMPTS; attempt += 1) {
        const isRetry = attempt > 0;
        try {
            if (!browser) {
                await (0, exports.launchBrowser)(opt?.executablePath);
            }
            (0, log_1.debug)(`go to page:${url}`);
            const timeout = isRetry ? baseTimeout + RETRY_EXTRA_TIMEOUT_MS : baseTimeout;
            return await loadPage(url, opt, timeout);
        }
        catch (err) {
            // String() instead of err.toString(): a rejection value may be null/undefined.
            const message = String(err);
            if (isRetry) {
                (0, log_1.error)('retry already, bail', url, message);
                throw err;
            }
            if (!messageHasAny(message, RETRYABLE_ERROR_MARKERS)) {
                (0, log_1.error)(`scrape error:${err}`);
                throw err;
            }
            try {
                (0, log_1.debug)('retry:', url, message);
                if (messageHasAny(message, SPAWN_ERROR_MARKERS)) {
                    // Pause for one second before relaunching after a spawn-type failure.
                    await new Promise((resolve) => setTimeout(resolve, 1000));
                }
                await (0, exports.launchBrowser)(opt?.executablePath);
            }
            catch (ex) {
                (0, log_1.error)('error relaunching browser:', ex);
                throw ex;
            }
        }
    }
    // Defensive: every path above returns or throws within MAX_PAGE_ATTEMPTS.
    throw new Error('too many errors');
};
exports.goToPage = goToPage;
//# sourceMappingURL=dom.js.map