UNPKG

supercrawler

Version:

A web crawler. Supercrawler automatically crawls websites. Define custom handlers to parse content. Obeys robots.txt, rate limits and concurrency limits.

github.com/brendonboshell/supercrawler

brendonboshell/supercrawler

164 lines (132 loc) • 3.65 kB

JavaScript

var htmlLinkParser = require("../../lib/handlers/htmlLinkParser"),
  expect = require("chai").expect,
  makeHtmlWithLinks;

/**
 * Builds a minimal HTML document containing one anchor element per link.
 *
 * Each entry is inserted verbatim (no escaping) into an `href` attribute, so
 * callers must only pass values that are safe inside a double-quoted
 * attribute.
 *
 * @param {string[]} links - href values, one per generated `<a>` element.
 * @returns {Buffer} The HTML document as a Buffer.
 */
makeHtmlWithLinks = function (links) {
  var html = "<html><head></head><body>";

  links.forEach(function (link) {
    html += '<a href="' + link + '">anchor</a>';
  });

  html += "</body></html>";

  // Buffer.from replaces the deprecated `new Buffer(string)` constructor.
  return Buffer.from(html);
};

describe("HtmlLinkParser", function () {
  it("can return an absolute url", function () {
    var hlp = htmlLinkParser(),
      html;

    html = makeHtmlWithLinks(["https://example.com/test"]);

    expect(hlp({
      body: html,
      url: "https://example2.com/index"
    })).to.deep.equal([
      "https://example.com/test"
    ]);
  });

  it("can return an relative url", function () {
    var hlp = htmlLinkParser(),
      html;

    html = makeHtmlWithLinks(["page2.html"]);

    // The relative href is expected to resolve against the page's directory.
    expect(hlp({
      body: html,
      url: "https://example.com/my/page.html"
    })).to.deep.equal([
      "https://example.com/my/page2.html"
    ]);
  });

  it("can return an root-relative url", function () {
    var hlp = htmlLinkParser(),
      html;

    html = makeHtmlWithLinks(["/page2.html"]);

    // A leading slash is expected to resolve against the origin, not /my/.
    expect(hlp({
      body: html,
      url: "https://example.com/my/page.html"
    })).to.deep.equal([
      "https://example.com/page2.html"
    ]);
  });

  it("can return multiple URLs", function () {
    var hlp = htmlLinkParser(),
      html;

    html = makeHtmlWithLinks([
      "/page2.html",
      "page3.html",
      "https://example2.com/55"
    ]);

    // Results are expected in the same order as the anchors in the document.
    expect(hlp({
      body: html,
      url: "https://example.com/my/page.html"
    })).to.deep.equal([
      "https://example.com/page2.html",
      "https://example.com/my/page3.html",
      "https://example2.com/55"
    ]);
  });

  it("ignores javascript urls", function () {
    var hlp = htmlLinkParser(),
      html;

    html = makeHtmlWithLinks([
      "javascript:alert('test')"
    ]);

    expect(hlp({
      body: html,
      url: "https://example.com/my/page.html"
    })).to.deep.equal([]);
  });

  it("can restrict to specific hosts", function () {
    var hlp = htmlLinkParser({
        hostnames: ["example.com"]
      }),
      html;

    html = makeHtmlWithLinks([
      "/page2.html",
      "page3.html",
      "https://example.com/101?q=str",
      "https://example2.com/55"
    ]);

    // The example2.com link must be dropped; it is not in `hostnames`.
    expect(hlp({
      body: html,
      url: "https://example.com/my/page.html"
    })).to.deep.equal([
      "https://example.com/page2.html",
      "https://example.com/my/page3.html",
      "https://example.com/101?q=str"
    ]);
  });

  // The two <link> tests below pass the body as a plain string rather than a
  // Buffer, so they also exercise string input to the handler.
  it("identifies <link rel='alternate'> tags", function () {
    var hlp = htmlLinkParser(),
      html;

    html = "<link href=\"/index-es/\" hreflang=\"es\" rel=\"alternate\">";

    expect(hlp({
      body: html,
      url: "http://example.com"
    })).to.deep.equal([
      "http://example.com/index-es/"
    ]);
  });

  it("does not identify <link rel='stylesheet'> tags", function () {
    var hlp = htmlLinkParser(),
      html;

    html = "<link href=\"styles.css\" type=\"text/css\" rel=\"stylesheet\">";

    expect(hlp({
      body: html,
      url: "http://example.com"
    })).to.deep.equal([]);
  });

  it("can apply a filter to the URLs discovered", function () {
    var hlp = htmlLinkParser({
        // Keep only URLs that do not contain "page1".
        urlFilter: function (url) {
          return url.indexOf("page1") === -1;
        }
      }),
      html;

    html = makeHtmlWithLinks([
      "page1.html",
      "page2.html"
    ]);

    expect(hlp({
      body: html,
      url: "http://example.com"
    })).to.deep.equal([
      "http://example.com/page2.html"
    ]);
  });
});