@coya/web-scraper
Version:
Web scraper on top of PhantomJS or Chromium
188 lines (187 loc) • 7.48 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const Logs = require("@coya/logs");
const fs = require("fs");
const puppeteer = require('puppeteer');
const JQUERY_PATH = __dirname + '/resources/jquery.js';
const COOKIE_JAR = __dirname + '/resources/cookies.json';
const DEBUG_SCREENSHOT = __dirname + '/resources/debug.png';
class WebScraper {
constructor(config = { requestsLimitBeforeReboot: 100 }) {
this.logs = new Logs('web_scraper', config);
this.requestsLimitBeforeReboot = config.requestsLimitBeforeReboot;
this.browser = null;
this.requestsQueue = [];
this.globalRequestsCounter = 0;
this.requestsCounter = 0;
}
static getInstance(config) {
if (WebScraper.self == null)
WebScraper.self = new WebScraper(config);
return WebScraper.self;
}
request(req) {
return new Promise(async (resolve, reject) => {
if (!req.url)
return reject('"url" parameter is required.');
if (!req.fct)
return reject('"fct" parameter is required.');
if (!req.url.startsWith('http://') && !req.url.startsWith('https://'))
req.url = 'http://' + req.url;
this.requestsQueue.push({
resolve: resolve,
reject: reject,
content: req
});
if (this.requestsQueue.length == 1)
this.processRequestsQueue();
});
}
async close(params = { clear: false }) {
await this.browser.close();
this.browser = null;
this.requestsCounter = 0;
if (params.clear)
this.requestsQueue = [];
}
processRequestsQueue() {
if (!this.requestsQueue.length) {
this.logs.error('Fatal error : bad call to function "processRequestsQueue()".');
process.exit(1);
}
const currentRequest = this.requestsQueue[0];
this.scrap(currentRequest.content)
.then((res) => {
this.requestsQueue.shift();
if (this.requestsQueue.length)
this.processRequestsQueue();
if (res.err)
currentRequest.reject(res.err);
else
currentRequest.resolve(res.result);
}); // no possible rejection
}
async scrap(req) {
try {
if (!this.browser) {
this.logs.info('Starting headless browser...');
this.browser = await puppeteer.launch({
args: ['--no-sandbox', '--disable-setuid-sandbox'],
devtools: false,
headless: true,
ignoreHTTPSErrors: true
});
this.logs.info('Headless browser started.');
}
this.logs.info('Requesting page with url = "' + req.url + '"...');
const page = await this.createPage(req.referer, req.debug);
await page.goto(req.url, { waitUntil: 'domcontentloaded' });
await page.addScriptTag({ path: JQUERY_PATH });
const result = await page.evaluate(req.fct, req.args);
if (req.debug)
await page.screenshot({ path: DEBUG_SCREENSHOT, fullPage: true });
await this.saveCookies(COOKIE_JAR, page);
await page.close();
this.globalRequestsCounter++;
if (++this.requestsCounter >= this.requestsLimitBeforeReboot)
await this.close();
return { result: result };
}
catch (e) {
this.logs.error(e);
return { err: 'An error has occurred while trying to access to the page.' };
}
}
async createPage(referer, debug) {
this.logs.debug('Creating new page...');
const defaultViewport = {
deviceScaleFactor: 1,
hasTouch: false,
height: 1024,
isLandscape: false,
isMobile: false,
width: 1280
};
const page = await this.browser.newPage();
await page.setViewport(defaultViewport);
//await page.setUserAgent('Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0');
await page.setRequestInterception(true);
await this.loadCookies(COOKIE_JAR, page);
page.on('request', req => {
if (req.url.match(/\.(png|jpg|jpeg|gif)$/))
req.abort();
else
req.continue();
});
if (debug) {
page.once('load', () => {
this.logs.debug('Page loaded.');
});
page.on('console', msg => {
this.logs.info(msg);
});
page.on('error', err => {
this.logs.error(err);
});
page.on('pageerror', err => {
this.logs.error(err);
});
page.on('requestfailed', req => {
if (!req.url.match(/\.(png|jpg|jpeg|gif)$/))
this.logs.warning('Request failed : ' + req.url);
});
/*page.on('requestfinished', req => {
this.logs.debug('Request finished : ' + req.url);
});*/
page.on('response', res => {
this.logs.debug('Response received : ' + res.url);
});
}
if (referer)
await page.setExtraHTTPHeaders({
'Referer': referer
//'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
//'Accept-Language': 'fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3',
//'Accept-Encoding': 'gzip, deflate, br',
//'Connection': 'keep-alive',
//'Pragma': 'no-cache',
//'Cache-Control': 'no-cache'
});
this.logs.debug('Page created.');
return page;
}
loadCookies(cookiesFile, page) {
this.logs.debug('Loading cookies...');
return new Promise((resolve, reject) => {
fs.readFile(cookiesFile, async (err, cookies) => {
if (err && err.code != 'ENOENT')
reject(err);
if (!err && cookies) {
cookies = JSON.parse(cookies);
if (Array.isArray(cookies))
for (let cookie of cookies) {
try {
await page.setCookie(cookie);
}
catch (e) { } // sometimes it may fail (when "expires" value is not an integer for example)
}
}
this.logs.debug('Cookies loaded.');
resolve();
});
});
}
saveCookies(cookiesFile, page) {
this.logs.debug('Saving cookies...');
return new Promise(async (resolve, reject) => {
fs.writeFile(cookiesFile, JSON.stringify(await page.cookies()), ((err) => {
if (err)
reject(err);
this.logs.debug('Cookies saved.');
resolve();
}));
});
}
}
WebScraper.self = null;
exports.WebScraper = WebScraper;