supercrawler
Supercrawler is a Node.js web crawler. It crawls websites automatically; you define custom handlers to parse the content. It obeys robots.txt, rate limits, and concurrency limits.
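As a quick orientation before the test suite, here is a minimal usage sketch. It assumes the package's documented exports (Crawler, Url) and the option names this suite exercises; treat it as illustrative rather than canonical.

var supercrawler = require("supercrawler");

// One queue poll per second, at most five requests in flight.
var crawler = new supercrawler.Crawler({
  interval: 1000,
  concurrentRequestsLimit: 5,
  userAgent: "mybot/1.1"
});

// Handlers receive { body, url, contentType }; returned URLs are enqueued.
crawler.addHandler("text/html", function (context) {
  console.log("crawled", context.url);
  return [];
});

// Seed the queue and start crawling; stop() halts the polling loop.
crawler.getUrlList().insertIfNotExists(new supercrawler.Url("https://example.com/"));
crawler.start();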
var proxyquire = require('proxyquire'),
sinon = require("sinon"),
expect = require("chai").expect,
makeUrl = require("./utils/makeUrl"),
Promise = require("bluebird");
// Note: I cannot get sinon's useFakeTimers to behave with Bluebird's
// promises; the .finally(fn) callbacks only seem to fire at the last
// minute. Instead, these tests use real timing, which is not ideal.
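// For reference, the fake-timer pattern that proved unreliable here (kept
// as a comment only; sinon's clock must be ticked manually past every
// Bluebird delay):
//   var clock = sinon.useFakeTimers();
//   crawler.start();
//   clock.tick(100); // advance simulated time instead of waiting
//   clock.restore();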
describe("Crawler", function () {
var Crawler,
listSize,
FifoUrlListMock,
requestSpy,
insertIfNotExistsSpy,
insertIfNotExistsBulkSpy,
upsertSpy,
pageContentType,
pageLocationHeader,
pageStatusCode,
pageBody,
robotsStatusCode,
robotsTxt;
beforeEach(function () {
pageContentType = "text/html";
pageStatusCode = 200;
pageBody = "<html><body>test</body></html>";
robotsStatusCode = 200;
robotsTxt = ["User-agent: *",
"Allow: /",
"Disallow: /index17.html"
].join("\n");
listSize = 20;
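// Mock of the URL-list (queue) contract used by the Crawler: getNextUrl
// resolves successive example URLs until listSize is exhausted, then rejects
// with a RangeError to signal an empty list. insertIfNotExists and upsert
// are attached below as sinon spies.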
FifoUrlListMock = function () {
this.callCount = 0;
this.delayTime = 1;
};
FifoUrlListMock.prototype.getNextUrl = function () {
var self = this;
this.callCount++;
if (this.callCount > listSize) {
return Promise.reject(new RangeError("rangeerror"));
}
return Promise.delay(this.delayTime).then(function () {
return makeUrl("https://example.com/index" + self.callCount + ".html");
});
};
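// Stub for the `request` module: URLs containing "robots" receive a fake
// robots.txt; all other URLs receive a configurable page response. Setting
// a status code of 0 simulates a transport-level request error.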
requestSpy = sinon.spy(function (opts, cb) {
if (opts.url.indexOf("robots") === -1) {
setTimeout(function () {
var headers = {};
if (pageContentType) {
headers["content-type"] = pageContentType;
}
if (pageLocationHeader) {
headers.location = pageLocationHeader;
}
if (pageStatusCode === 0) {
return cb(new Error("Some request error"));
}
cb(null, {
headers: headers,
statusCode: pageStatusCode,
body: Buffer.from(pageBody)
});
}, 1);
} else {
setTimeout(function () {
if (robotsStatusCode === 0) {
return cb(new Error("Some request error"));
}
cb(null, {
headers: {
"content-type": "text/plain"
},
statusCode: robotsStatusCode,
body: Buffer.from(robotsTxt)
});
}, 1);
}
});
insertIfNotExistsSpy = sinon.spy(function () {
return Promise.resolve();
});
FifoUrlListMock.prototype.insertIfNotExists = insertIfNotExistsSpy;
insertIfNotExistsBulkSpy = sinon.spy(function () {
return Promise.resolve();
});
upsertSpy = sinon.spy(function () {
return Promise.resolve();
});
FifoUrlListMock.prototype.upsert = upsertSpy;
Crawler = proxyquire("../lib/Crawler", {
"./FifoUrlList": FifoUrlListMock,
"request": requestSpy
});
});
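// Helper: counts requests made for `url` with the given followRedirect
// setting; robots.txt is always fetched with redirects followed.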
var numCrawlsOfUrl = function (url, followRedirect) {
var numCalls = 0;
var n = 0;
var call;
while (requestSpy.getCall(n)) {
call = requestSpy.getCall(n);
if (call.calledWith(sinon.match({
url: url,
forever: true,
followRedirect: followRedirect
}))) {
numCalls++;
}
n++;
}
return numCalls;
};
var numRobotsCalls = function () {
return numCrawlsOfUrl("https://example.com/robots.txt", true);
};
it("returns an instance when called as a function", function () {
expect(Crawler()).to.be.an.instanceOf(Crawler);
});
describe("#getUrlList", function () {
it("if no urlList is specified, defaults to a FifoUrlList", function () {
var crawler = new Crawler();
expect(crawler.getUrlList()).to.be.an.instanceOf(FifoUrlListMock);
});
it("can use a specified UrlList instance", function () {
var urlList = new FifoUrlListMock();
var crawler = new Crawler({
urlList: urlList
});
expect(crawler.getUrlList()).to.equal(urlList);
});
});
describe("#getInterval", function () {
it("uses a default interval of 1000ms", function () {
expect(new Crawler().getInterval()).to.equal(1000);
});
it("will use a specified interval", function () {
expect(new Crawler({
interval: 5000
}).getInterval()).to.equal(5000);
});
});
describe("#getConcurrentRequestsLimit", function () {
it("uses a default setting of 5", function () {
expect(new Crawler().getConcurrentRequestsLimit()).to.equal(5);
});
it("will use a specified limit", function () {
expect(new Crawler({
concurrentRequestsLimit: 99
}).getConcurrentRequestsLimit()).to.equal(99);
});
});
describe("#getUserAgent", function () {
it("uses a default user agent", function () {
expect(new Crawler().getUserAgent()).to.equal("Mozilla/5.0 " +
"(compatible; supercrawler/1.0; " +
"+https://github.com/brendonboshell/supercrawler)");
});
it("will use a specified user agent", function () {
expect(new Crawler({
userAgent: "mybot/1.1"
}).getUserAgent()).to.equal("mybot/1.1");
});
it("will accept a function as a user agent", function () {
expect(new Crawler({
userAgent: () => "mybot/1.1"
}).getUserAgent()).to.equal("mybot/1.1");
expect(new Crawler({
userAgent: (url) => {
if (url === 'http://www.example.com/some/random/page') {
return 'url specific user agent';
}
return "mybot/1.1";
}
}).getUserAgent()).to.equal("mybot/1.1");
});
});
describe("#start", function () {
it("returns false if crawl is already running", function () {
var crawler;
crawler = new Crawler();
crawler.start();
expect(crawler.start()).to.equal(false);
crawler.stop();
});
it("returns true if crawl is not already started", function () {
var crawler;
crawler = new Crawler();
expect(crawler.start()).to.equal(true);
crawler.stop();
});
it("emits a urllistempty event if no URLs", function (done) {
var crawler = new Crawler({ interval: 50 }),
listenSpy = sinon.spy();
listSize = 0;
crawler.on("urllistempty", listenSpy);
crawler.start();
setTimeout(function () {
crawler.stop();
sinon.assert.called(listenSpy);
done();
}, 100);
});
it("emits urllistempty if list is intermittently empty", function (done) {
var urlList = new FifoUrlListMock(),
crawler = new Crawler({
urlList: urlList,
interval: 200,
concurrentRequestsLimit: 2
}),
listenSpy = sinon.spy();
listSize = 1;
urlList.upsert = function () {
return Promise.delay(250).then(function () {
urlList.callCount = 0;
});
};
crawler.on("urllistempty", listenSpy);
crawler.start();
setTimeout(function () {
crawler.stop();
sinon.assert.called(listenSpy);
done();
}, 300);
});
it("does not emit urllistcomplete if list is intermittently empty", function (done) {
var urlList = new FifoUrlListMock(),
crawler = new Crawler({
urlList: urlList,
interval: 200,
concurrentRequestsLimit: 2
}),
listenSpy = sinon.spy();
listSize = 1;
urlList.upsert = function () {
return Promise.delay(250).then(function () {
urlList.callCount = 0;
});
};
crawler.on("urllistcomplete", listenSpy);
crawler.start();
setTimeout(function () {
crawler.stop();
sinon.assert.notCalled(listenSpy);
done();
}, 300);
});
it("emits urllistcomplete if list is permanently empty", function (done) {
var urlList = new FifoUrlListMock(),
crawler = new Crawler({
urlList: urlList,
interval: 200,
concurrentRequestsLimit: 2
}),
listenSpy = sinon.spy();
listSize = 0;
urlList.upsert = function () {
return Promise.delay(250).then(function () {
urlList.callCount = 0;
});
};
crawler.on("urllistcomplete", listenSpy);
crawler.start();
setTimeout(function () {
crawler.stop();
sinon.assert.called(listenSpy);
done();
}, 300);
});
it("throttles requests according to the interval", function (done) {
var crawler = new Crawler({
interval: 50
});
var fifoUrlList = crawler.getUrlList();
crawler.start();
// call at 0ms, 50ms, 100ms
setTimeout(function () {
crawler.stop();
expect(fifoUrlList.callCount).to.equal(3);
done();
}, 130);
});
it("obeys the concurrency limit", function (done) {
var crawler = new Crawler({
interval: 50,
concurrentRequestsLimit: 1
});
var fifoUrlList = crawler.getUrlList();
// simulate each request taking 75ms
fifoUrlList.delayTime = 75;
crawler.start();
// call at 0ms finished at 75ms
// call at 75ms finishes at 150ms
// call at 150ms finishes at 225ms
// call at 225ms finishes at 300ms
setTimeout(function () {
crawler.stop();
expect(fifoUrlList.callCount).to.equal(4);
done();
}, 280);
});
it("caches robots.txt for a default of 60 minutes", function () {
var crawler = new Crawler({
interval: 1000 * 60 * 5, // get page every 5 minutes
concurrentRequestsLimit: 1
});
// There's no easy way to test this without using sinon's fakeTimers (see
// the note at the top of this file).
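// (_robotsCache is assumed to be a node-cache instance; stdTTL is its
// default time-to-live, in seconds.)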
expect(crawler._robotsCache.options.stdTTL).to.equal(3600);
});
it("caches robots.txt for a specified amount of time", function (done) {
var crawler = new Crawler({
interval: 20, // 20 ms
concurrentRequestsLimit: 1,
robotsCacheTime: 100 // 100ms
});
crawler.start();
// get robots at 0ms, 100ms, 200ms
setTimeout(function () {
crawler.stop();
expect(numRobotsCalls()).to.equal(3);
done();
}, 280);
});
it("adds the robots.txt file itself to the crawl queue", function (done) {
var crawler = new Crawler({
interval: 10
});
crawler.start();
setTimeout(function () {
crawler.stop();
sinon.assert.calledWith(insertIfNotExistsSpy, sinon.match({
_url: "https://example.com/robots.txt"
}));
done();
}, 200);
});
it("will add destination URL to queue when redirected", function (done) {
var crawler = new Crawler({ interval: 10 });
crawler.start();
pageStatusCode = 301;
pageLocationHeader = "https://example.com/destination.html";
setTimeout(function () {
crawler.stop();
sinon.assert.calledWith(insertIfNotExistsSpy, sinon.match({
_url: "https://example.com/destination.html"
}));
done();
}, 200);
});
it("will add relative destination URL to queue when redirected", function (done) {
var crawler = new Crawler({ interval: 10 });
crawler.start();
pageStatusCode = 301;
pageLocationHeader = "/destination2.html";
setTimeout(function () {
crawler.stop();
sinon.assert.calledWith(insertIfNotExistsSpy, sinon.match({
_url: "https://example.com/destination2.html"
}));
done();
}, 200);
});
it("requests a page that is not excluded by robots.txt", function (done) {
var crawler = new Crawler({
interval: 10
});
crawler.start();
setTimeout(function () {
crawler.stop();
expect(numCrawlsOfUrl("https://example.com/index18.html", false)).to.equal(1);
done();
}, 200);
});
it("emits the crawlurl event", function (done) {
var crawler = new Crawler({ interval: 10 });
var spy = sinon.spy();
crawler.on("crawlurl", spy);
crawler.start();
setTimeout(function () {
crawler.stop();
sinon.assert.calledWith(spy, "https://example.com/index18.html");
done();
}, 200);
});
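// "crawledurl" listeners receive (url, errorCode, statusCode); the UrlList
// error test near the end of this file also receives an error message as a
// fourth argument.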
it("emits a crawled event", function (done) {
var crawler = new Crawler({ interval: 10 });
var spy = sinon.spy();
crawler.on("crawledurl", spy);
crawler.start();
setTimeout(function () {
crawler.stop();
sinon.assert.calledWith(spy, "https://example.com/index1.html", null, 200);
done();
}, 200);
});
it("skips page excluded by robots.txt, even if robots.txt not in cache", function (done) {
var crawler = new Crawler({
interval: 10
});
robotsTxt = ["User-agent: *",
"Allow: /",
"Disallow: /index1.html"
].join("\n");
crawler.start();
setTimeout(function () {
crawler.stop();
expect(numCrawlsOfUrl("https://example.com/index1.html", false)).to.equal(0);
done();
}, 200);
});
it("works with multiple User agent rows", function (done) {
var crawler = new Crawler({
interval: 10
});
robotsTxt = ["User-agent: *",
"User-agent: googlebot",
"Allow: /",
"Disallow: /index1.html"
].join("\n");
crawler.start();
setTimeout(function () {
crawler.stop();
expect(numCrawlsOfUrl("https://example.com/index1.html", false)).to.equal(0);
done();
}, 200);
});
it("skips a page that is excluded by robots.txt", function (done) {
var crawler = new Crawler({
interval: 10
});
crawler.start();
setTimeout(function () {
crawler.stop();
expect(numCrawlsOfUrl("https://example.com/index17.html", false)).to.equal(0);
done();
}, 200);
});
it("crawls all pages if robots.txt is 404", function (done) {
var crawler = new Crawler({
interval: 10
});
crawler.start();
robotsStatusCode = 404;
setTimeout(function () {
crawler.stop();
expect(numCrawlsOfUrl("https://example.com/index17.html", false)).to.equal(1);
done();
}, 200);
});
it("crawls all pages if robots.txt is 410", function (done) {
var crawler = new Crawler({
interval: 10
});
crawler.start();
robotsStatusCode = 410;
setTimeout(function () {
crawler.stop();
expect(numCrawlsOfUrl("https://example.com/index17.html", false)).to.equal(1);
done();
}, 200);
});
it("crawls all pages if robots.txt is 500 (robotsIgnoreServerError flag)", function (done) {
var crawler = new Crawler({
interval: 10,
robotsIgnoreServerError: true
});
crawler.start();
robotsStatusCode = 500;
setTimeout(function () {
crawler.stop();
expect(numCrawlsOfUrl("https://example.com/index17.html", false)).to.equal(1);
done();
}, 200);
});
it("does not crawl pages if robots.txt is 500", function (done) {
var crawler = new Crawler({
interval: 10
});
crawler.start();
robotsStatusCode = 500;
setTimeout(function () {
crawler.stop();
expect(numCrawlsOfUrl("https://example.com/index17.html", false)).to.equal(0);
done();
}, 200);
});
it("excludes all pages if robots.txt could not be crawled", function (done) {
var crawler = new Crawler({
interval: 10
});
crawler.start();
robotsStatusCode = 600;
setTimeout(function () {
crawler.stop();
expect(numCrawlsOfUrl("https://example.com/index5.html", false)).to.equal(0);
done();
}, 200);
});
it("includes all pages if robots.txt disabled", function (done) {
var crawler = new Crawler({
interval: 10,
robotsEnabled: false
});
crawler.start();
robotsStatusCode = 600;
setTimeout(function () {
crawler.stop();
expect(numCrawlsOfUrl("https://example.com/index5.html", false)).to.equal(1);
done();
}, 200);
});
it("updates the error code to ROBOTS_NOT_ALLOWED", function (done) {
var crawler = new Crawler({
interval: 10
});
crawler.start();
setTimeout(function () {
crawler.stop();
sinon.assert.calledWith(upsertSpy, sinon.match({
_url: "https://example.com/index17.html",
_errorCode: "ROBOTS_NOT_ALLOWED"
}));
done();
}, 200);
});
it("updates the error code for a 404", function (done) {
var crawler = new Crawler({
interval: 10
});
crawler.start();
pageStatusCode = 404;
setTimeout(function () {
crawler.stop();
sinon.assert.calledWith(upsertSpy, sinon.match({
_url: "https://example.com/index1.html",
_errorCode: "HTTP_ERROR",
_statusCode: 404
}));
done();
}, 200);
});
it("updates the error code for a request error", function (done) {
var crawler = new Crawler({
interval: 10
});
crawler.start();
pageStatusCode = 0;
setTimeout(function () {
crawler.stop();
sinon.assert.calledWith(upsertSpy, sinon.match({
_url: "https://example.com/index1.html",
_errorCode: "REQUEST_ERROR"
}));
done();
}, 200);
});
it("asks for a gzipped response", function (done) {
var crawler = new Crawler({
interval: 10
});
crawler.start();
setTimeout(function () {
crawler.stop();
sinon.assert.calledWith(requestSpy, sinon.match({
gzip: true
}));
done();
}, 100);
});
it("accepts custom request options", function (done) {
var crawler = new Crawler({
interval: 50,
request: {
headers: {
'x-test-header': 'test'
}
}
});
crawler.start();
setTimeout(function () {
crawler.stop();
sinon.assert.calledWith(requestSpy, sinon.match({
headers: {
'x-test-header': 'test'
}
}));
done();
}, 200);
});
});
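// Handler contract exercised below: handler(context) with context =
// { body: Buffer, url: String, contentType: String }. Returning an array of
// URL strings enqueues them; a rejected promise records HANDLERS_ERROR.
// For example:
//   crawler.addHandler("text/html", function (context) {
//     return ["https://example.com/next.html"];
//   });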
describe("#addHandler", function () {
var handler,
handlerRet,
handlerType;
beforeEach(function () {
handlerRet = [];
handlerType = 'resolve';
handler = sinon.spy(function () {
if (handlerType === 'reject') {
return Promise.reject(handlerRet);
} else {
return handlerRet;
}
});
});
it("can listen to all content types", function (done) {
var crawler = new Crawler({
interval: 100
});
crawler.addHandler(handler);
crawler.start();
pageContentType = "text/plain";
setTimeout(function () {
crawler.stop();
sinon.assert.calledWith(handler, sinon.match({
body: sinon.match(Buffer.from("<html><body>test</body></html>")),
url: "https://example.com/index1.html",
contentType: "text/plain"
}));
done();
}, 200);
});
it("fires for matching content types", function (done) {
var crawler = new Crawler({
interval: 100
});
pageContentType = "text/html";
crawler.addHandler("text/html", handler);
crawler.start();
setTimeout(function () {
crawler.stop();
sinon.assert.calledWith(handler, sinon.match({
body: sinon.match(Buffer.from("<html><body>test</body></html>")),
url: "https://example.com/index1.html",
contentType: "text/html"
}));
done();
}, 200);
});
it("does not fire for non-matching content types", function (done) {
var crawler = new Crawler({
interval: 100
});
pageContentType = "text/plain";
crawler.addHandler("text/html", handler);
crawler.start();
setTimeout(function () {
crawler.stop();
expect(handler.calledWith(sinon.match({
body: sinon.match("<html><body>test</body></html>"),
url: "https://example.com/index1.html"
}))).to.equal(false);
done();
}, 200);
});
it("fires for a wide-matching content type", function (done) {
var crawler = new Crawler({
interval: 100
});
pageContentType = "text/plain";
crawler.addHandler("text", handler);
crawler.start();
setTimeout(function () {
crawler.stop();
expect(handler.calledWith(sinon.match({
body: sinon.match(Buffer.from("<html><body>test</body></html>")),
url: "https://example.com/index1.html"
}))).to.equal(true);
done();
}, 200);
});
it("fires for a array content type", function (done) {
var crawler = new Crawler({
interval: 100
});
pageContentType = "text/html";
crawler.addHandler(["text/plain", "text/html"], handler);
crawler.start();
setTimeout(function () {
crawler.stop();
expect(handler.calledWith(sinon.match({
body: sinon.match(Buffer.from("<html><body>test</body></html>")),
url: "https://example.com/index1.html"
}))).to.equal(true);
done();
}, 200);
});
it("can hold fire for a array content type", function (done) {
var crawler = new Crawler({
interval: 100
});
pageContentType = "text/xml";
crawler.addHandler(["text/plain", "text/html"], handler);
crawler.start();
setTimeout(function () {
crawler.stop();
expect(handler.calledWith(sinon.match({
body: sinon.match(Buffer.from("<html><body>test</body></html>")),
url: "https://example.com/index1.html"
}))).to.equal(false);
done();
}, 200);
});
it("can fire when content type determined from extension", function (done) {
var crawler = new Crawler({
interval: 100
});
pageContentType = "";
crawler.addHandler("text/html", handler);
crawler.start();
setTimeout(function () {
crawler.stop();
expect(handler.calledWith(sinon.match({
body: sinon.match(Buffer.from("<html><body>test</body></html>")),
url: "https://example.com/index1.html"
}))).to.equal(true);
done();
}, 200);
});
it("can hold fire when content type determined from extension", function (done) {
var crawler = new Crawler({
interval: 100
});
pageContentType = "";
crawler.addHandler("text/plain", handler);
crawler.start();
setTimeout(function () {
crawler.stop();
expect(handler.calledWith(sinon.match({
body: sinon.match("<html><body>test</body></html>"),
url: "https://example.com/index1.html"
}))).to.equal(false);
done();
}, 200);
});
it("passes the content type as the third argument", function (done) {
var crawler = new Crawler({
interval: 100
});
crawler.addHandler(handler);
crawler.start();
pageContentType = "text/plain";
setTimeout(function () {
crawler.stop();
sinon.assert.calledWith(handler, sinon.match({
body: sinon.match(Buffer.from("<html><body>test</body></html>")),
url: "https://example.com/index1.html",
contentType: "text/plain"
}));
done();
}, 200);
});
it("adds URL to the queue", function (done) {
var crawler = new Crawler({
interval: 100
});
handlerRet = [
"https://example.com/page98.html",
"https://example.com/page99.html"
];
crawler.addHandler(handler);
crawler.start();
setTimeout(function () {
crawler.stop();
sinon.assert.calledWith(insertIfNotExistsSpy, sinon.match({
_url: "https://example.com/page99.html"
}));
sinon.assert.calledWith(insertIfNotExistsSpy, sinon.match({
_url: "https://example.com/page98.html"
}));
done();
}, 200);
});
it("uses the bulk insert method if it exists", function (done) {
var crawler = new Crawler({
interval: 100
});
handlerRet = [
"https://example.com/page98.html",
"https://example.com/page99.html"
];
crawler.addHandler(handler);
crawler.getUrlList().insertIfNotExistsBulk = insertIfNotExistsBulkSpy;
crawler.start();
setTimeout(function () {
crawler.stop();
sinon.assert.calledWith(insertIfNotExistsBulkSpy, sinon.match([sinon.match({
_url: "https://example.com/page98.html"
}), sinon.match({
_url: "https://example.com/page99.html"
})]));
done();
}, 200);
});
it("works if handler returns invalid value", function (done) {
var crawler = new Crawler({
interval: 100
});
handlerRet = true;
crawler.addHandler(handler);
crawler.start();
setTimeout(function () {
crawler.stop();
expect(insertIfNotExistsSpy.calledWith(sinon.match({
_url: true
}))).to.equal(false);
done();
}, 200);
});
it("records a HANDLERS_ERROR if exception thrown", function (done) {
var crawler = new Crawler({
interval: 100
});
handlerType = 'reject';
handlerRet = new Error("Something went wrong");
crawler.addHandler(handler);
crawler.start();
setTimeout(function () {
crawler.stop();
sinon.assert.calledWith(upsertSpy, sinon.match({
_url: "https://example.com/index1.html",
_errorCode: "HANDLERS_ERROR"
}));
done();
}, 200);
});
it("emits a handlersError event if exception thrown", function (done) {
var crawler = new Crawler({
interval: 100
});
var listenSpy = sinon.spy();
var err = new Error("test");
handlerType = 'reject';
handlerRet = err;
crawler.addHandler(handler);
crawler.on("handlersError", listenSpy);
crawler.start();
setTimeout(function () {
crawler.stop();
sinon.assert.calledWith(listenSpy, err);
done();
}, 200);
});
it("emits arbitrary errors thrown by the UrlList", function (done) {
var crawler = new Crawler({
interval: 100
});
var spy = sinon.spy();
crawler.getUrlList().insertIfNotExistsBulk = sinon.spy(function () {
return Promise.reject(new Error('arbitrary error'));
});
crawler.on("crawledurl", spy);
crawler.start();
setTimeout(function () {
crawler.stop();
sinon.assert.calledWith(spy, "https://example.com/index1.html", "OTHER_ERROR", null, "arbitrary error");
done();
}, 200);
});
});
});