// @coya/web-scraper
// Version:
// Web scraper on top of PhantomJS or Chromium
// 154 lines (153 loc) • 6.45 kB
// JavaScript
var fs = require('fs');
var server = require('webserver').create();
var system = require('system');
var webpage = require('webpage');
/**
 * Redirects console.error to the PhantomJS standard error stream.
 * The arguments are joined with a single space and written as-is:
 * no line terminator is appended.
 */
console.error = function () {
    var parts = Array.prototype.slice.call(arguments);
    var message = parts.join(' ');
    system.stderr.write(message);
};
// jQuery build injected into each opened page before the scraping function is evaluated.
var JQUERY_PATH = './resources/jquery.js';
// JSON file the PhantomJS cookies are written to and read back from when the scraper starts.
var COOKIE_JAR = './resources/cookies.json';
// File the page is rendered to when a scraping request is made in debug mode.
var DEBUG_SCREENSHOT = './resources/debug.png';
var PhantomScraper = /** @class */ (function () {
function PhantomScraper() {
this.scripts = [];
PhantomScraper.importCookies();
server.listen('127.0.0.1:' + system.args[1], function (request, response) {
response.statusCode = 200;
var req;
try {
req = JSON.parse(request.post);
}
catch (e) {
response.write('{"error": "json_parse_failed", "msg": "The request JSON parsing has failed."}');
response.close();
return;
}
if (req.exit) {
response.write('{"result": "ok"}');
response.close();
phantom.exit(0);
return;
}
var fct;
if (req.fctAsString)
fct = req.fctAsString;
else {
var parts = req.fct.split('#');
if (!this.scripts[parts[0]])
this.scripts[parts[0]] = require(parts[0]);
fct = parts.length > 1 ? this.scripts[parts[0]][parts[1]] : this.scripts[parts[0]];
}
PhantomScraper.scrap(req.url, fct, req.args, req.referer, req.debug, function (result) {
response.write(JSON.stringify(result));
response.close();
});
}.bind(this));
console.log('ready');
}
PhantomScraper.createPage = function (referer, debug) {
var page = webpage.create();
if (debug) {
page.onError = function (msg, trace) {
console.error('Error :', msg);
};
page.onResourceTimeout = function (request) {
console.error('Timeout resource :', JSON.stringify(request));
};
page.onResourceError = function (resourceError) {
console.error('Resource error :', JSON.stringify(resourceError));
};
page.onConsoleMessage = function (msg, lineNum, sourceId) {
console.log(msg);
};
page.onLoadStarted = function () {
console.log('Page loading started.');
};
page.onLoadFinished = function (status) {
console.log('Page loading finished, status : "' + status + '".');
};
page.onResourceRequested = function (requestData, networkRequest) {
};
page.onNavigationRequested = function (url, type, willNavigate, main) {
};
}
page.onResourceReceived = function (response) {
fs.write(COOKIE_JAR, JSON.stringify(phantom.cookies), 'w');
};
page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0';
page.settings.loadImages = false;
page.settings.loadPlugins = false;
page.settings.javascriptEnabled = true;
page.settings.resourceTimeout = 30000;
page.customHeaders = {
'Referer': referer,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3',
//'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Pragma': 'no-cache',
'Cache-Control': 'no-cache'
};
page.viewportSize = { width: 1600, height: 900 };
return page;
};
PhantomScraper.closePage = function (page) {
page.clearMemoryCache();
page.close();
page = null;
};
PhantomScraper.importCookies = function () {
if (fs.isFile(COOKIE_JAR)) {
try {
var cookies = JSON.parse(fs.read(COOKIE_JAR));
for (var _i = 0, cookies_1 = cookies; _i < cookies_1.length; _i++) {
var cookie = cookies_1[_i];
phantom.addCookie(cookie);
}
}
catch (e) {
}
}
};
PhantomScraper.scrap = function (url, fct, args, referer, debug, callback) {
var page = PhantomScraper.createPage(referer, debug);
page.open(url, function (status) {
try {
if (status !== 'success' || !page.evaluateJavaScript('function() { return !!document.body; }')) {
PhantomScraper.closePage(page);
callback({ error: 'page_opening_failed', msg: 'An error has occurred when opening the page.', status: status });
}
else {
if (debug)
page.render(DEBUG_SCREENSHOT);
if (page.injectJs(JQUERY_PATH)) {
var result = typeof fct == 'function' ? page.evaluate(fct, args) : page.evaluateJavaScript(fct);
PhantomScraper.closePage(page);
callback({ result: result });
}
else {
PhantomScraper.closePage(page);
callback({ error: 'script_injection_failed', msg: 'The script injection has failed.' });
}
}
}
catch (e) {
PhantomScraper.closePage(page);
callback({ error: e });
}
});
};
return PhantomScraper;
}());
// Start the service: the constructor imports the saved cookies and starts listening for requests.
new PhantomScraper();
// Adapted from https://github.com/ariya/phantomjs/blob/master/examples/waitfor.js
/**
 * Polls a condition every 250 ms until it holds or the time budget is spent.
 *
 * @param {Function} testFct Called on each tick while the budget lasts; a truthy return ends the polling.
 * @param {Function} readyFct Called once, on the tick following the end of the polling, with
 *                            `true` when the condition never held and `false` when it did.
 * @param {number} [timeOutMillis] Time budget in milliseconds; 3000 when omitted or falsy.
 */
var waitFor = function (testFct, readyFct, timeOutMillis) {
    var budget = timeOutMillis ? timeOutMillis : 3000;
    var startedAt = new Date().getTime();
    var satisfied = false;
    var timer = setInterval(function () {
        var withinBudget = new Date().getTime() - startedAt < budget;
        if (withinBudget && !satisfied) {
            satisfied = testFct();
            return;
        }
        clearInterval(timer);
        readyFct(!satisfied);
    }, 250);
};