UNPKG

@coya/web-scraper

Version:

Web scraper on top of PhantomJS or Chromium

215 lines (214 loc) 9.17 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const child_process = require("child_process");
const http = require("http");
const path = require("path");
const ps = require("ps-node");
const Logs = require("@coya/logs");

const SCRAPER_FILE = path.join(__dirname, 'PhantomScraper.js');
const SCRAPER_CONFIG_FILE = path.join(__dirname, 'resources/config.json');

// Delay before a request is sent again after a recoverable failure.
const RETRY_DELAY_MS = 3000;
// Number of "page_opening_failed" answers tolerated for a single request.
const MAX_PAGE_OPENING_FAILURES = 5;

/**
 * Builds a one-line description of a rejection value for log messages.
 * Rejections in this module are Errors, strings or plain objects.
 *
 * @param {*} err - The rejection value.
 * @returns {string} A printable description.
 */
function describeError(err) {
    if (err instanceof Error)
        return err.message;
    if (typeof err === 'string')
        return err;
    try {
        return String(JSON.stringify(err));
    }
    catch (e) {
        return String(err);
    }
}

/**
 * Wraps a callback error into an Error instance unless it already is one.
 *
 * @param {*} err - The value received from a node-style callback.
 * @returns {Error}
 */
function toError(err) {
    return err instanceof Error ? err : new Error(err);
}

/**
 * Client that starts a PhantomJS scraper server as a child process and sends
 * it scraping requests over HTTP, one at a time, through a FIFO queue.
 * The server is asked to exit and is restarted every
 * `requestsLimitBeforeReboot` calls to `processRequestsQueue()`.
 */
class ScraperClient {
    /**
     * @param {Object} [config] - Optional settings; also forwarded to the logger.
     * @param {number} [config.port=8080] - Port the scraper server listens on.
     * @param {number} [config.requestsLimitBeforeReboot=10] - Queue rounds before the server is rebooted.
     * @param {number} [config.timeout=30000] - Socket inactivity timeout of a request, in milliseconds.
     */
    constructor(config) {
        const options = config || {};
        this.hostname = 'localhost';
        this.port = options.port || 8080;
        this.requestsLimitBeforeReboot = options.requestsLimitBeforeReboot || 10;
        this.timeout = options.timeout || 30000;
        this.logs = new Logs('scraper_client', config);
        this.scraperProcess = null;
        this.requestsQueue = [];
        this.requestsCounter = 0;
    }

    /**
     * Returns the shared instance, creating it on first call.
     * `config` is only used by the call that creates the instance.
     *
     * @param {Object} [config] - See the constructor.
     * @returns {ScraperClient}
     */
    static getInstance(config) {
        if (ScraperClient.self == null)
            ScraperClient.self = new ScraperClient(config);
        return ScraperClient.self;
    }

    /**
     * Starts the scraper server unless a process is already registered.
     * Any running "phantomjs" process is killed first.
     *
     * @returns {Promise<void>} Resolves once the server has printed "ready" on
     *   its standard output; rejects if the process cannot be created or exits
     *   before being ready.
     */
    runScraper() {
        if (this.scraperProcess)
            return Promise.resolve();
        return this.killExistingProcessIfExists()
            .then(() => new Promise((resolve, reject) => {
                this.logs.info('Starting web scraper server...');
                const command = ['phantomjs', '--config=' + SCRAPER_CONFIG_FILE, SCRAPER_FILE, this.port].join(' ');
                const scraperProcess = child_process.exec(command, { cwd: __dirname });
                this.scraperProcess = scraperProcess;
                if (!scraperProcess) {
                    reject('The web scraper creation process has failed');
                    return; // nothing to attach listeners to
                }
                let ready = false;
                scraperProcess.on('exit', (code) => {
                    this.scraperProcess = null;
                    if (!ready) {
                        // Restarting a server that cannot even start would loop
                        // forever without ever settling the caller's promise.
                        reject(new Error('The web scraper has exited before being ready (code = ' + code + ').'));
                        return;
                    }
                    if (!this.requestsQueue.length)
                        this.logs.error('The web scraper has crashed unexpectedly (code = ' + code + ').');
                    this.runScraper() // restart the server
                        .then(() => {
                            // processRequestsQueue() terminates the host process
                            // when called with an empty queue, so only resume
                            // when requests are actually waiting.
                            if (this.requestsQueue.length)
                                this.processRequestsQueue();
                        })
                        .catch((err) => {
                            this.logs.error('Failed to restart the web scraper and resume the requests queue (' + describeError(err) + ').');
                            this.rejectPendingRequests(err);
                        });
                });
                scraperProcess.stderr.on('data', (data) => {
                    this.logs.warning(data);
                });
                scraperProcess.stdout.on('data', (data) => {
                    for (const line of data.trim().split('\n')) {
                        if (line === 'ready') {
                            ready = true;
                            this.logs.info('Web scraper ready.');
                            resolve();
                        }
                        else
                            this.logs.debug(line);
                    }
                });
            }));
    }

    /**
     * Sends the request at the head of the queue, settles its promise with the
     * outcome, then moves on to the next queued request. When the reboot limit
     * is reached, an exit request is sent instead and processing resumes from
     * the "exit" handler installed by `runScraper()`.
     *
     * Must only be called with a non-empty queue: an empty queue is treated as
     * a fatal programming error and terminates the process.
     */
    processRequestsQueue() {
        if (!this.requestsQueue.length) {
            this.logs.error('Fatal error : bad call to function "processRequestsQueue()".');
            process.exit(1);
        }
        if (++this.requestsCounter >= this.requestsLimitBeforeReboot) {
            const exitingProcess = this.scraperProcess;
            this.sendExitRequest() // and then wait for the "exit" event
                .catch((err) => {
                    this.logs.warning('The exit request has failed (' + describeError(err) + '). Killing process by force...');
                    // Only kill the process the request was meant for; it may
                    // already have exited and been replaced.
                    if (exitingProcess && this.scraperProcess === exitingProcess)
                        exitingProcess.kill();
                });
            return;
        }
        const currentRequest = this.requestsQueue[0];
        this.logs.info('Requesting page with url = "' + currentRequest.content.url + '"...');
        const settle = (callback, value) => {
            // The queue may have been reset while the request was in flight;
            // in that case neither dequeue nor start another round.
            const stillAtHead = this.requestsQueue[0] === currentRequest;
            if (stillAtHead)
                this.requestsQueue.shift();
            callback(value);
            if (stillAtHead && this.requestsQueue.length)
                this.processRequestsQueue();
        };
        this.sendRequest(JSON.stringify(currentRequest.content)).then(
            (result) => settle(currentRequest.resolve, result),
            (err) => settle(currentRequest.reject, err)
        );
    }

    /**
     * POSTs a JSON payload to the scraper server and interprets its JSON answer.
     * The request is sent again after a delay when the server answers
     * "page_opening_failed" (up to a fixed number of times) or when the
     * connection is reset.
     *
     * @param {string} content - Serialized JSON body.
     * @returns {Promise<*>} Resolves with the `result` field of the answer.
     *   Rejects with `{ error: 'scraper_down' }` on an empty answer,
     *   `{ error: 'json_parse_error', data }` on an unreadable answer,
     *   `{ error: 'page_opening_failed' }` once retries are exhausted, the
     *   answer itself when it carries another `error`, or the network error.
     */
    sendRequest(content) {
        const opts = {
            hostname: this.hostname,
            port: this.port,
            path: '/',
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
                'Content-Length': Buffer.byteLength(content)
            },
            timeout: this.timeout
        };
        let pageOpeningsFailedCounter = 0;
        return new Promise((resolve, reject) => {
            const send = () => {
                const request = http.request(opts, (res) => {
                    let body = '';
                    // Decode as a stream so that a multi-byte character split
                    // across two chunks is not corrupted.
                    res.setEncoding('utf8');
                    res.on('data', (chunk) => {
                        body += chunk;
                    });
                    res.on('end', () => handleResponse(body));
                    // A response interrupted midway never emits "end".
                    res.on('error', reject);
                });
                // The "timeout" option only makes Node emit this event; the
                // request has to be destroyed explicitly to be interrupted.
                request.on('timeout', () => {
                    const timeoutError = new Error('The connection to the scraper server has been idle for more than ' + this.timeout + ' ms.');
                    timeoutError.code = 'ETIMEDOUT';
                    request.destroy(timeoutError);
                });
                request.on('error', (error) => {
                    if (error['code'] === 'ECONNRESET') {
                        this.logs.warning('The connection to the scraper server has been reset.');
                        setTimeout(send, RETRY_DELAY_MS); // try again
                    }
                    else
                        reject(error); // fatal error
                });
                request.write(content);
                request.end();
            };
            const handleResponse = (body) => {
                if (body === '') {
                    reject({ error: 'scraper_down' }); // fatal error
                    return;
                }
                let data;
                try {
                    data = JSON.parse(body);
                }
                catch (e) {
                    reject({ error: 'json_parse_error', data: body }); // fatal error
                    return;
                }
                if (data == null) {
                    // "null" is valid JSON but cannot carry a result.
                    reject({ error: 'json_parse_error', data: body });
                    return;
                }
                if (!data['error']) {
                    resolve(data['result']); // the request has succeeded
                    return;
                }
                if (data['error'] !== 'page_opening_failed') {
                    reject(data); // fatal error
                    return;
                }
                this.logs.warning('The page opening has failed, status : "' + data['status'] + '".');
                if (++pageOpeningsFailedCounter >= MAX_PAGE_OPENING_FAILURES)
                    reject({ error: 'page_opening_failed' });
                else
                    setTimeout(send, RETRY_DELAY_MS); // try again
            };
            send();
        });
    }

    /**
     * Asks the scraper server to exit and resets the requests counter.
     * If the server answers anything other than "ok", the child process is
     * killed instead.
     *
     * @returns {Promise<void>} Rejects when the exit request itself fails.
     */
    sendExitRequest() {
        this.logs.info('Sending exit request to scraper server...');
        this.requestsCounter = 0;
        return this.sendRequest(JSON.stringify({ exit: true }))
            .then((result) => {
                if (result === 'ok')
                    return;
                this.logs.error(result);
                this.logs.warning('Scraper server does not want to exit. Killing process by force...');
                if (this.scraperProcess)
                    this.scraperProcess.kill();
            });
    }

    /**
     * Looks up every process whose command is "phantomjs" and sends SIGKILL to
     * each of them.
     *
     * @returns {Promise<void>} Resolves once all of them have been killed (or
     *   when there is none); rejects with an Error if the lookup or a kill fails.
     */
    killExistingProcessIfExists() {
        return new Promise((resolve, reject) => {
            ps.lookup({ command: 'phantomjs' }, (lookupErr, processList) => {
                if (lookupErr) {
                    reject(toError(lookupErr));
                    return;
                }
                const kills = processList.map((entry) => new Promise((done, fail) => {
                    ps.kill(entry.pid, 'SIGKILL', (killErr) => {
                        if (killErr) {
                            fail(toError(killErr));
                            return;
                        }
                        this.logs.info('Existing PhantomJS process killed.');
                        done();
                    });
                }));
                Promise.all(kills).then(() => resolve(), reject);
            });
        });
    }

    /**
     * Rejects every queued request with the given reason and empties the queue.
     *
     * @param {*} err - The rejection reason handed to each request.
     */
    rejectPendingRequests(err) {
        const pending = this.requestsQueue;
        this.requestsQueue = [];
        for (const entry of pending)
            entry.reject(err);
    }

    /**
     * Queues a scraping request and starts processing if the queue was idle.
     * `req` is normalized in place: "http://" is prepended to a URL without a
     * scheme, and a function `fct` is serialized into `fctAsString`.
     *
     * @param {Object} req - The request description.
     * @param {string} req.url - Page address (required).
     * @param {Function|*} req.fct - Function sent to the scraper (required).
     * @returns {Promise<*>} Settles with the outcome of the request.
     */
    request(req) {
        return new Promise((resolve, reject) => {
            if (!req.url)
                return reject('"url" parameter is required.');
            if (!req.fct)
                return reject('"fct" parameter is required.');
            if (req.url.indexOf('http://') !== 0 && req.url.indexOf('https://') !== 0)
                req.url = 'http://' + req.url;
            if (typeof req.fct === 'function')
                req.fctAsString = req.fct.toString();
            this.requestsQueue.push({ resolve: resolve, reject: reject, content: req });
            if (this.requestsQueue.length === 1) {
                this.runScraper()
                    .then(() => {
                        if (this.requestsQueue.length)
                            this.processRequestsQueue();
                    })
                    // A request left in the queue after a failed start-up would
                    // block every later request, so fail all of them.
                    .catch((err) => this.rejectPendingRequests(err));
            }
        });
    }

    /**
     * Stops the scraper server without restarting it and clears the queue.
     * Does nothing when no scraper process is registered.
     *
     * @returns {Promise<void>}
     */
    close() {
        if (!this.scraperProcess)
            return Promise.resolve();
        this.scraperProcess.removeAllListeners('exit'); // to avoid restarting the scraper
        return this.sendExitRequest()
            .then(() => {
                this.scraperProcess = null;
                this.requestsQueue = [];
                this.logs.info('Web scraper process done and connection closed.');
            });
    }
}
ScraperClient.self = null;
exports.ScraperClient = ScraperClient;