crawl

Website crawler and differencer

/*!
 * Crawl - crawler
 * Copyright(c) 2012 Mike Moulton <mike@meltmedia.com>
 * MIT Licensed
 */

var events = require("events"),
    Crawler = require('simplecrawler').Crawler,
    hash = require('node_hash'),
    _ = require('underscore'),
    fs = require("fs"),
    util = require("util"),
    urlUtil = require('url');

var crawler = new events.EventEmitter();

/**
 * crawler.crawl
 *
 * Crawls a website, starting at `url`, finding all linked pages within the same domain.
 * The `url` can also be a filesystem path containing a stringified JSON object of a past crawl.
 * This can be generated using the included CLI and the '--json' option.
 *
 * This function also supports several `options`:
 *   - headers {Boolean}: include the raw http headers from the response in the results
 *   - body {Boolean}: include the http response body in the results
 *
 * This function is asynchronous and requires a `callback` with the following signature:
 *   - function(err, pages)
 * where `pages` is an array with the following object structure for each result:
 *   {
 *     url: URL Object provided from Node URL parser,
 *     status: HTTP status code,
 *     contentType: the MIME type for the resource,
 *     checksum: SHA 256 hash of the response body,
 *     links: Array of links found on this page,
 *     referrers: Array of URLs within the crawled site that refer to this page,
 *     body: The response body (optional),
 *     headers: The response headers (optional),
 *     date: Timestamp of the page crawl
 *   }
 *
 * @param {String} url -- http(s) url of site to crawl, or filesystem path to JSON file to import
 * @param {Object} options (optional)
 * @param {Function} callback -- function(err, pages)
 * @api public
 */
crawler.crawl = function(url, options, callback) {

  if (typeof options === 'function') {
    callback = options;
    options = {};
  }
  options = options || {};

  // setup some default options for the node.io job
  options['crawler'] = crawler;

  var pages = {},
      potentialPages = {},
      urlParts = urlUtil.parse(url, true);

  // do a web crawl of url if it's an http protocol
  if (urlParts.protocol == "https:" || urlParts.protocol == "http:") {

    var port = urlParts.port ? urlParts.port : 80,
        siteCrawler = new Crawler(urlParts.hostname, urlParts.path, port);

    // configure crawler
    siteCrawler.interval = 10;
    siteCrawler.maxConcurrency = 10;
    siteCrawler.scanSubdomains = true;
    siteCrawler.downloadUnsupported = false;

    if (options.username && options.password) {
      siteCrawler.needsAuth = true;
      siteCrawler.authUser = options.username;
      siteCrawler.authPass = options.password;
    }

    // Returns true if the response's MIME type matches one of simplecrawler's supported types
    function mimeTypeSupported(MIMEType) {
      var supported = false;
      siteCrawler.supportedMimeTypes.forEach(function(mimeCheck) {
        if (!!mimeCheck.exec(MIMEType)) {
          supported = true;
        }
      });
      return supported;
    }

    // Record a crawled page, merging in any referrer info collected before the page was fetched
    var pageHandler = function(queueItem, responseBuffer, response) {
      if (mimeTypeSupported(queueItem.stateData.contentType)) {
        crawler.emit("crawl", queueItem.url);

        var data = responseBuffer.toString(),
            parsedUrl = urlUtil.parse(queueItem.url, true),
            page = {};

        page.url = queueItem.url;
        page.status = queueItem.stateData.code;
        page.contentType = queueItem.stateData.contentType;
        page.checksum = hash.sha256(data);
        page.date = new Date().toJSON();
        if (options.headers) page.headers = queueItem.stateData.headers;
        if (options.body) page.body = data;

        if (potentialPages[parsedUrl.path]) {
          page = _.extend(page, potentialPages[parsedUrl.path]);
        }

        pages[parsedUrl.path] = page;
      }
    };

    siteCrawler.on("discoverycomplete", function(queueItem, resources) {
      var parsedUrl = urlUtil.parse(queueItem.url, true);

      // Save the outbound links for the item that just completed discovery
      pages[parsedUrl.path].links = resources;

      // Update each linked-to page, storing us as a referrer
      resources.forEach(function(link) {

        // A normalized, resolved URL, used for unique identification purposes
        var resourceUrl = urlUtil.parse(urlUtil.resolve(parsedUrl, urlUtil.parse(link, true)), true);

        // Links found in the discovery of this resource may not have been crawled yet.
        // In the case of links that have not yet been crawled, we save them as potential
        // pages that may meet the crawling criteria (content-type, etc).
        if (!pages[resourceUrl.path]) {
          if (!potentialPages[resourceUrl.path]) potentialPages[resourceUrl.path] = { referrers: [] };
          potentialPages[resourceUrl.path].referrers.push(queueItem.url);

        // Otherwise the resource has already been crawled, so we can store ourselves as a referrer.
        } else {
          if (!pages[resourceUrl.path].referrers) pages[resourceUrl.path].referrers = [];
          pages[resourceUrl.path].referrers.push(queueItem.url);
        }

      });
    });

    // handle normal pages
    siteCrawler.on("fetchcomplete", pageHandler);

    // handle broken links
    siteCrawler.on("fetch404", pageHandler);

    // on completion, parse broken links and report
    siteCrawler.on("complete", function(queueItem, responseBuffer, response) {
      callback(null, _.map(pages, function(value, key) {
        // De-dup referrers. Note: this could be done much more efficiently if performance
        // becomes a problem.
        if (value.referrers) value.referrers = _.uniq(value.referrers);
        return value;
      }));
    });

    siteCrawler.start(); // crawl the site

  // otherwise we load a json file, assumed to be generated by the CLI using the '--json' option
  } else if (urlParts.path) {
    var path = urlParts.path;
    try {
      if (fs.statSync(path).isFile()) {
        fs.readFile(path, function(err, data) {
          if (err) return callback(err);
          var pages = JSON.parse(data);
          callback(null, pages);
        });
      }
    } catch (err) {
      callback(err);
    }
  } else {
    callback(util.format("Unable to interpret url as path or web address: %s", url));
  }

};

module.exports = crawler;
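/*
 * Usage sketch (illustrative, not part of the original source). It assumes this
 * module is installed as the `crawl` package named above and that the URL below
 * is a reachable placeholder; a filesystem path to a '--json' export from the CLI
 * could be passed instead of a URL.
 */
var crawl = require('crawl');

// The exported module is an EventEmitter; "crawl" fires once per fetched page.
crawl.on("crawl", function(url) {
  console.log("fetched: " + url);
});

// Crawl a site and print a summary of each page found.
crawl.crawl("http://example.com/", { headers: false, body: false }, function(err, pages) {
  if (err) return console.error(err);
  pages.forEach(function(page) {
    console.log(page.status, page.url, page.checksum,
                (page.referrers || []).length + " referrers");
  });
});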