UNPKG

chen-crawler

Version:

Web Crawler Provider for Chen Framework

70 lines 2.29 kB
"use strict";
/**
 * PhantomJS script that loads one URL and prints the resulting page HTML to
 * stdout between two `--BOUNDARY` marker lines.
 *
 * Only the first command line argument (the URL) is read by this script.
 * Exit code: 0 when the page loaded, 1 when the URL is missing or the load
 * did not succeed.
 */
const system = require('system');
const webpage_1 = require('webpage');
/**
 * URL fragments of resources that are not needed to extract the page markup
 * (social widgets, analytics, ads, scripts, CDNs). Matched as plain substrings.
 * @type {string[]}
 */
const BLOCKED_URL_FRAGMENTS = [
    'facebook.com',
    'twitter.com',
    'google.com',
    'youtube.com',
    'google-analytics.com',
    'mixpanel.com',
    '.js',
    'doubleclick.net',
    'cdn.com'
];
/**
 * Matches image resources by extension, case-insensitively and anywhere in the URL.
 * Deliberately not global: a `g` regex keeps `lastIndex` state between `.test()` calls.
 * @type {RegExp}
 */
const IMAGE_EXTENSION_PATTERN = /\.(png|jpg|gif|svg)/i;
/**
 * Tells whether a sub-resource request should be aborted.
 * @param {string} resourceUrl URL of the requested resource
 * @returns {boolean} true when the URL is an image or contains a blocked fragment
 */
function isBlockedResource(resourceUrl) {
    if (IMAGE_EXTENSION_PATTERN.test(resourceUrl)) {
        return true;
    }
    return BLOCKED_URL_FRAGMENTS.some(function (fragment) {
        return resourceUrl.indexOf(fragment) !== -1;
    });
}
/**
 * Opens the given URL in a fresh page, prints its content and exits PhantomJS.
 * @param {string} targetUrl URL of the page to extract
 */
function extract(targetUrl) {
    const page = webpage_1.create();
    // `var` on purpose: the flag is reassigned and the file otherwise sticks to ES5 + `const`.
    var mainRequestSeen = false;
    /**
     * This callback is invoked when there is a JavaScript console message on the web page
     * @param {string} message
     * @param {number} lineNum
     * @param {string} sourceId
     */
    page.onConsoleMessage = function (message, lineNum, sourceId) {
        console.log('CONSOLE: ' + message + ' (from line #' + lineNum + ' in "' + sourceId + '")');
    };
    /**
     * This callback is invoked when the page requests a resource
     * @param {requestData} request
     * @param {networkRequest} network
     */
    page.onResourceRequested = function (request, network) {
        // Never filter the first request: it is assumed to be the main document
        // (TODO confirm for redirects). Without this, a target URL that contains
        // '.js' (e.g. '.jsp') or a blocked domain would abort itself and yield an empty page.
        if (!mainRequestSeen) {
            mainRequestSeen = true;
            return;
        }
        // other unnecessary links and images
        if (isBlockedResource(request.url)) {
            network.abort();
        }
    };
    /**
     * This property sets the size of the viewport for the layout process
     * @type {Object}
     */
    page.viewportSize = {
        width: 1440,
        height: 900
    };
    /**
     * defines the timeout (in milliseconds) after which any resource requested will stop trying
     * and proceed with other parts of the page
     * @type {Number}
     */
    page.settings.resourceTimeout = 300000;
    /**
     * defines the user agent sent to server when the web page requests resources
     * @type {string}
     */
    page.settings.userAgent = 'Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36';
    /**
     * Opens the url and loads it to the page
     */
    page.open(targetUrl, function (status) {
        const loaded = status === 'success';
        if (!loaded) {
            // Report on stderr so the stdout payload between the boundaries stays untouched.
            system.stderr.writeLine('Failed to load "' + targetUrl + '" (status: ' + status + ')');
        }
        console.log('--BOUNDARY');
        console.log(page.content);
        console.log('--BOUNDARY');
        // Signal a failed load through the exit code instead of always reporting success.
        phantom.exit(loaded ? 0 : 1);
    });
}
const url = system.args[1];
if (!url) {
    console.log('Usage: phantomjs extractor.js "<some URL>" "<JSON config>"');
    phantom.exit(1);
}
else {
    // phantom.exit() does not halt the current script turn, so the extraction
    // must be skipped explicitly when no URL was given.
    extract(url);
}
//# sourceMappingURL=extractor.js.map