crawler
Version:
Crawler is a web spider written with Node.js. It gives you the full power of jQuery on the server to parse a large number of pages as they are downloaded, asynchronously.
43 lines (35 loc) • 1.06 kB
JavaScript
const Crawler = require("crawler");
// Root URL to crawl; all relative links are resolved against this.
const base = "https://zenhub.com";
// Dedupe map: href -> true once a page has been queued, so each URL is crawled at most once.
// Seeded with the base URL itself.
const crawledPages = { [base]: true };
// CSS selector suffix that excludes links to binary/media assets (by file extension).
const ignoreSelector = `:not([href$=".png"]):not([href$=".jpg"]):not([href$=".mp4"]):not([href$=".mp3"]):not([href$=".gif"])`;
// Options merged into every crawler.direct() request.
// skipEventRequest: false makes direct() still emit 'request' events — TODO confirm against crawler docs.
const crawlOptions = {
  skipEventRequest: false,
};
/**
 * Page handler invoked by the crawler for every fetched page.
 * On success, scans the page for same-site links (relative or absolute),
 * deduplicates them via `crawledPages`, and queues new ones with `directCrawl`.
 *
 * @param {Error|null} error - Fetch/parse error, if any.
 * @param {object} res - Crawler response; `res.$` is the cheerio handle for the page.
 */
const callback = (error, res) => {
  if (error) {
    console.error(error);
    return;
  }
  const $ = res.$;
  // Match both relative ("/...") and absolute ("https://zenhub.com/...") same-site
  // links, skipping media assets via ignoreSelector.
  $(`a[href^="/"]${ignoreSelector},a[href^="${base}"]${ignoreSelector}`).each(
    (_i, elem) => {
      const href = elem.attribs.href;
      // Normalize absolute links down to a path. Without this, absolute hrefs
      // were re-prefixed with `base`, producing malformed URLs like
      // "https://zenhub.comhttps://zenhub.com/...", and the dedupe map used
      // inconsistent keys (path vs. full URL) for the same page.
      const path = href.startsWith(base) ? href.slice(base.length) : href;
      if (!crawledPages[path]) {
        crawledPages[path] = true;
        directCrawl(`${base}${path}`);
      }
    }
  );
};
// Shared crawler instance; every queued request runs the page handler above.
const crawler = new Crawler({
  // Up to 10 pages fetched concurrently.
  maxConnections: 10,
  // No artificial delay between requests.
  rateLimit: 0,
  callback: callback,
});
/**
 * Queue a single URL for immediate crawling.
 * Merges the shared `crawlOptions` into the request (its keys take precedence).
 *
 * @param {string} uri - Absolute URL to fetch.
 */
const directCrawl = (uri) => {
  const request = { uri, callback };
  Object.assign(request, crawlOptions);
  crawler.direct(request);
};

// Kick off the crawl from the root URL.
directCrawl(base);