/*
 * chen-crawler
 * Version:
 * Web Crawler Provider for Chen Framework
 * 70 lines • 2.29 kB
 * JavaScript
 */
;
// PhantomJS built-in modules used by this script.
const system = require('system');
const webpage_1 = require('webpage');
/**
 * Target URL, read from the first command-line argument after the script name.
 * NOTE(review): the usage text also mentions a "<JSON config>" argument, but
 * nothing in this block reads it.
 */
const url = system.args[1];
if (!url) {
    console.log('Usage: phantomjs extractor.js "<some URL>" "<JSON config>"');
    // NOTE(review): phantom.exit() may not stop the script synchronously;
    // confirm the statements that follow are harmless when no URL is given.
    phantom.exit(1);
}
/**
 * The single web page object that the rest of the script configures and opens.
 */
const page = webpage_1.create();
/**
 * Forwards console output produced by the loaded web page to this script's
 * own console, prefixed with "CONSOLE: " and followed by the origin of the message.
 * @param {string} message  text logged by the page
 * @param {number} lineNum  line number reported for the message
 * @param {string} sourceId identifier of the source that logged the message
 */
page.onConsoleMessage = function (message, lineNum, sourceId) {
    const origin = ' (from line #' + lineNum + ' in "' + sourceId + '")';
    console.log('CONSOLE: ' + message + origin);
};
/**
 * Substrings that mark a request as unnecessary for extraction (social
 * widgets, analytics, ads, scripts, CDNs). A request is aborted when its URL
 * contains any of them.
 * NOTE(review): these are plain substring matches, so '.js' also hits URLs
 * containing '.json' or '.jsp', and 'google.com' hits any URL that merely
 * mentions it — confirm that this breadth is intended.
 * @type {string[]}
 */
const BLOCKED_URL_FRAGMENTS = [
    'facebook.com',
    'twitter.com',
    'google.com',
    'youtube.com',
    'google-analytics.com',
    'mixpanel.com',
    '.js',
    'doubleclick.net',
    'cdn.com'
];
/**
 * Matches the image extensions that are never fetched, anywhere in the URL
 * and case-insensitively. Deliberately NOT global: test() on a /g regex
 * advances lastIndex between calls, which would make a shared pattern give
 * alternating results.
 * @type {RegExp}
 */
const BLOCKED_IMAGE_PATTERN = /\.(?:png|jpg|gif|svg)/i;
/**
 * Decides whether a resource request should be dropped.
 * @param {string} requestUrl URL of the requested resource
 * @returns {boolean} true when the URL contains a blocked fragment or an
 *   image extension
 */
function shouldAbortRequest(requestUrl) {
    const hasBlockedFragment = BLOCKED_URL_FRAGMENTS.some(function (fragment) {
        return requestUrl.indexOf(fragment) !== -1;
    });
    return hasBlockedFragment || BLOCKED_IMAGE_PATTERN.test(requestUrl);
}
/**
 * This callback is invoked when the page requests a resource. Requests for
 * blocked third-party/script URLs and for images are aborted; everything
 * else is left untouched.
 * @param {requestData} request
 * @param {networkRequest} network
 */
page.onResourceRequested = function (request, network) {
    if (shouldAbortRequest(request.url)) {
        network.abort();
    }
};
/**
 * User agent string sent to the server with every request the page makes.
 * @type {string}
 */
page.settings.userAgent = 'Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36';
/**
 * Time after which a pending resource request is given up so the rest of the
 * page can proceed: 5 minutes, expressed in milliseconds.
 * @type {Number}
 */
page.settings.resourceTimeout = 5 * 60 * 1000;
/**
 * Viewport dimensions used when laying out the page.
 * @type {Object}
 */
page.viewportSize = { width: 1440, height: 900 };
/**
 * Loads the target url and, once the load callback fires, prints the page
 * markup between two '--BOUNDARY' marker lines before exiting PhantomJS.
 * NOTE(review): `status` is not inspected, so a failed load is reported the
 * same way as a successful one — confirm the consumer expects that.
 */
page.open(url, function (status) {
    const boundary = '--BOUNDARY';
    const markup = page.content;
    console.log(boundary);
    console.log(markup);
    console.log(boundary);
    phantom.exit();
});
//# sourceMappingURL=extractor.js.map