UNPKG

@coya/web-scraper

Version:

Web scraper on top of PhantomJS or Chromium

154 lines (153 loc) 6.45 kB
var fs = require('fs'); var server = require('webserver').create(); var system = require('system'); var webpage = require('webpage'); console.error = function () { system.stderr.write(Array.prototype.join.call(arguments, ' ')); }; var JQUERY_PATH = './resources/jquery.js'; var COOKIE_JAR = './resources/cookies.json'; var DEBUG_SCREENSHOT = './resources/debug.png'; var PhantomScraper = /** @class */ (function () { function PhantomScraper() { this.scripts = []; PhantomScraper.importCookies(); server.listen('127.0.0.1:' + system.args[1], function (request, response) { response.statusCode = 200; var req; try { req = JSON.parse(request.post); } catch (e) { response.write('{"error": "json_parse_failed", "msg": "The request JSON parsing has failed."}'); response.close(); return; } if (req.exit) { response.write('{"result": "ok"}'); response.close(); phantom.exit(0); return; } var fct; if (req.fctAsString) fct = req.fctAsString; else { var parts = req.fct.split('#'); if (!this.scripts[parts[0]]) this.scripts[parts[0]] = require(parts[0]); fct = parts.length > 1 ? this.scripts[parts[0]][parts[1]] : this.scripts[parts[0]]; } PhantomScraper.scrap(req.url, fct, req.args, req.referer, req.debug, function (result) { response.write(JSON.stringify(result)); response.close(); }); }.bind(this)); console.log('ready'); } PhantomScraper.createPage = function (referer, debug) { var page = webpage.create(); if (debug) { page.onError = function (msg, trace) { console.error('Error :', msg); }; page.onResourceTimeout = function (request) { console.error('Timeout resource :', JSON.stringify(request)); }; page.onResourceError = function (resourceError) { console.error('Resource error :', JSON.stringify(resourceError)); }; page.onConsoleMessage = function (msg, lineNum, sourceId) { console.log(msg); }; page.onLoadStarted = function () { console.log('Page loading started.'); }; page.onLoadFinished = function (status) { console.log('Page loading finished, status : "' + status + '".'); }; page.onResourceRequested = function (requestData, networkRequest) { }; page.onNavigationRequested = function (url, type, willNavigate, main) { }; } page.onResourceReceived = function (response) { fs.write(COOKIE_JAR, JSON.stringify(phantom.cookies), 'w'); }; page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0'; page.settings.loadImages = false; page.settings.loadPlugins = false; page.settings.javascriptEnabled = true; page.settings.resourceTimeout = 30000; page.customHeaders = { 'Referer': referer, 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3', //'Accept-Encoding': 'gzip, deflate, br', 'Connection': 'keep-alive', 'Pragma': 'no-cache', 'Cache-Control': 'no-cache' }; page.viewportSize = { width: 1600, height: 900 }; return page; }; PhantomScraper.closePage = function (page) { page.clearMemoryCache(); page.close(); page = null; }; PhantomScraper.importCookies = function () { if (fs.isFile(COOKIE_JAR)) { try { var cookies = JSON.parse(fs.read(COOKIE_JAR)); for (var _i = 0, cookies_1 = cookies; _i < cookies_1.length; _i++) { var cookie = cookies_1[_i]; phantom.addCookie(cookie); } } catch (e) { } } }; PhantomScraper.scrap = function (url, fct, args, referer, debug, callback) { var page = PhantomScraper.createPage(referer, debug); page.open(url, function (status) { try { if (status !== 'success' || !page.evaluateJavaScript('function() { return !!document.body; }')) { PhantomScraper.closePage(page); callback({ error: 'page_opening_failed', msg: 'An error has occurred when opening the page.', status: status }); } else { if (debug) page.render(DEBUG_SCREENSHOT); if (page.injectJs(JQUERY_PATH)) { var result = typeof fct == 'function' ? page.evaluate(fct, args) : page.evaluateJavaScript(fct); PhantomScraper.closePage(page); callback({ result: result }); } else { PhantomScraper.closePage(page); callback({ error: 'script_injection_failed', msg: 'The script injection has failed.' }); } } } catch (e) { PhantomScraper.closePage(page); callback({ error: e }); } }); }; return PhantomScraper; }()); new PhantomScraper(); // https://github.com/ariya/phantomjs/blob/master/examples/waitfor.js var waitFor = function (testFct, readyFct, timeOutMillis) { var maxtimeOutMillis = timeOutMillis ? timeOutMillis : 3000, start = new Date().getTime(), condition = false, interval = setInterval(function () { if ((new Date().getTime() - start < maxtimeOutMillis) && !condition) condition = testFct(); else { clearInterval(interval); readyFct(!condition); } }, 250); };