supercrawler
A web crawler. Supercrawler automatically crawls websites, calling the custom handlers you define to parse content. It obeys robots.txt and respects rate and concurrency limits.
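Below is the unit test suite for the robotsParser handler, which scans a fetched robots.txt file for Sitemap: directives and returns the sitemap URLs it discovers. For context, a crawler is typically wired up along these lines; this is a minimal sketch assuming the constructor options and handler registration shown in the Supercrawler README, with example.com as a placeholder hostname.

var supercrawler = require("supercrawler");

// Sketch: create a crawler with a rate limit (ms between requests)
// and a cap on concurrent requests.
var crawler = new supercrawler.Crawler({
  interval: 1000,
  concurrentRequestsLimit: 5
});

// Register the robots.txt handler tested below, so the sitemap URLs it
// returns are added back onto the crawl queue.
crawler.addHandler("text/plain", supercrawler.handlers.robotsParser());

// Seed the queue and start crawling.
crawler.getUrlList()
  .insertIfNotExists(new supercrawler.Url("http://example.com/"))
  .then(function () {
    return crawler.start();
  });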
var robotsParser = require("../../lib/handlers/robotsParser"),
    expect = require("chai").expect;

describe("robotsParser", function () {
  var rb,
      robotsTxt;

  beforeEach(function () {
    // Fixture: a robots.txt with one absolute-URL Sitemap directive.
    robotsTxt = [
      "User-agent: *",
      "Disallow: /test/",
      "",
      //"Sitemap: /sitemap_index.xml",
      "Sitemap: http://subdomain.example.com/sitemap_index_1.xml"
    ].join("\n");
    rb = robotsParser();
  });
it("can extract extract a absolute path sitemap", function () {
expect(rb({
body: new Buffer(robotsTxt),
url: "http://example.com/robots.txt"
})).to.deep.equal([
"http://subdomain.example.com/sitemap_index_1.xml"
]);
});
it("can extract extract a relative path sitemap", function () {
robotsTxt += "\nSitemap: /sitemap_index.xml";
expect(rb({
body: new Buffer(robotsTxt),
url: "http://example.com/robots.txt"
})).to.deep.equal([
"http://subdomain.example.com/sitemap_index_1.xml",
"http://example.com/sitemap_index.xml"
]);
});
it ("can apply a filter to the URLs discovered", function () {
var rb = robotsParser({
urlFilter: function (sitemapUrl) {
return sitemapUrl.indexOf("sitemap_index.xml") === -1;
}
});
expect(rb({
body: new Buffer(robotsTxt),
url: "http://example.com/robots.txt"
})).to.deep.equal([
"http://subdomain.example.com/sitemap_index_1.xml"
]);
});
it("returns empty when there are no sitemaps", function () {
robotsTxt = "";
expect(rb({
body: new Buffer(robotsTxt),
url: "http://example.com/robots.txt"
})).to.deep.equal([]);
});
it("returns empty when the URL path is not /robots.txt", function () {
expect(rb({
body: new Buffer(robotsTxt),
url: "http://example.com/Iamnotarobots.txt"
})).to.deep.equal([]);
});
});
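
The suite above also documents the handler contract: a handler is a plain function that receives a context object (body, a Buffer of the response, and url, the fetched address) and returns an array of URLs to add to the crawl queue; robotsParser returns an empty array for any path other than /robots.txt. A custom handler along the same lines might look like this sketch; the textLinkParser name and its naive regex are illustrative, not part of the library.

// Sketch of a custom handler following the contract exercised above:
// receive { body, url }, return an array of discovered URLs to enqueue.
function textLinkParser(context) {
  var text = context.body.toString("utf8");

  // Naive extraction: collect anything that looks like an http(s) URL.
  var matches = text.match(/https?:\/\/[^\s"'<>]+/g);

  return matches || [];
}

module.exports = textLinkParser;

Registered with crawler.addHandler for a matching content type, such a function runs against every fetched page, just as robotsParser runs against robots.txt responses.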