@ragnaroksj/sitemap-generator
Version:
Easily create XML sitemaps for your website with url-parse 1.5.10 based on sitemap generator
61 lines (47 loc) • 1.56 kB
JavaScript
const url = require('url');
const cheerio = require('cheerio');
module.exports = (buffer, queueItem) => {
const $ = cheerio.load(buffer.toString('utf8'));
const metaRobots = $('meta[name="robots"]');
if (metaRobots.length && /nofollow/i.test(metaRobots.attr('content'))) {
return [];
}
const links = $('a[href]').map(function iteratee() {
let href = $(this).attr('href');
// exclude "mailto:" etc
if (/^[a-z]+:(?!\/\/)/i.test(href)) {
return null;
}
// exclude rel="nofollow" links
const rel = $(this).attr('rel');
if (/nofollow/i.test(rel)) {
return null;
}
// remove anchors
href = href.replace(/(#.*)$/, '');
//remove basic authentication
href = href.replace(/^\/?([^/]*@)/, '');
// handle "//"
if (/^\/\//.test(href)) {
return `${queueItem.protocol}:${href}`;
}
// check if link is relative
// (does not start with "http(s)" or "//")
if (!/^https?:\/\//.test(href)) {
const base = $('base').first();
if (base.length) {
// base tag is set, prepend it
if (base.attr('href') !== undefined) {
// base tags sometimes don't define href, they sometimes they only set target="_top", target="_blank"
href = url.resolve(base.attr('href'), href);
}
}
// handle links such as "./foo", "../foo", "/foo"
if (/^\.\.?\/.*/.test(href) || /^\/[^/].*/.test(href)) {
href = url.resolve(queueItem.url, href);
}
}
return href;
});
return links.get();
};