advanced-sitemap-generator
Version:
Easily create XML sitemaps for your website.
495 lines (448 loc) • 15.2 kB
JavaScript
const fs = require('fs');
const http = require('follow-redirects').http;
const https = require('follow-redirects').https;
const path = require('path');
const parseURL = require('url-parse');
const cpFile = require('cp-file');
const normalizeUrl = require('normalize-url');
const eachSeries = require('async/eachSeries');
const mitt = require('mitt');
const async = require('async');
const puppeteer = require('puppeteer');
const discoverResources = require('./discoverResources');
const createCrawler = require('./createCrawler');
const SitemapRotator = require('./SitemapRotator');
const createSitemapIndex = require('./createSitemapIndex');
const extendFilename = require('./helpers/extendFilename');
const validChangeFreq = require('./helpers/validChangeFreq');
const getLangCodeMap = require('./helpers/getLangCodeMap');
const isValidURL = require('./helpers/isValidURL');
const msg = require('./helpers/msg-helper');
const getCurrentDateTime = require('./helpers/getCurrentDateTime');
module.exports = function SitemapGenerator(uri, opts) {
// Puppeteer browser handle; remains null unless one is launched during initialisation.
let browser = null;

// Baseline settings. Keys supplied through `opts` override these.
const defaultOpts = {
  stripQuerystring: true,
  maxEntriesPerFile: 50000,
  filterByDomain: true,
  ignoreWWWDomain: true,
  maxDepth: 0,
  maxConcurrency: 10,
  filepath: path.join(process.cwd(), 'sitemap.xml'),
  userAgent: 'Node/SitemapGenerator',
  respectRobotsTxt: true,
  ignoreInvalidSSL: true,
  recommendAlternatives: false,
  timeout: 120000,
  decodeResponses: true,
  changeFreq: '',
  priorityMap: [],
  forcedURLs: []
};

// A falsy start URL is rejected before anything else is set up.
if (!uri) {
  throw new Error('Requires a valid URL.');
}

// Effective configuration: defaults first, caller options layered on top.
const options = Object.assign({}, defaultOpts, opts);

// Mutable generator state shared by the closures defined below.
let realCrawlingDepth = 0; // deepest queue item depth seen so far
let savedOnDiskSitemapPaths = []; // final sitemap file paths, filled once files are written
let crawler = null; // assigned during initialisation

// Event counters reported through getStats().
const stats = {
  add: 0,
  ignore: 0,
  error: 0
};
// Returns the crawler queue entries that are completely processed:
// visited, discovery finished, and `fetched` strictly equal to true.
const getQueueReadyItems = () => {
  const isReady = (entry) => {
    if (!entry.visited) {
      return false;
    }
    if (!entry.isDiscoveryProcessDone) {
      return false;
    }
    return entry.fetched === true;
  };
  return crawler.queue.filter((entry) => isReady(entry));
};
// Merges crawl metadata of `from` into `to` (mutates `to`).
//
// - depth: `to` takes the smaller depth of the two.
// - lastMod: `to` takes `from.lastMod` only when its own value is the empty string.
// - deep (truthy): the alternatives of `from` are folded into `to.alternatives`:
//     * unknown language and unknown URL -> the alternative is appended;
//     * same URL, different language     -> the longer language code wins (en vs en-US);
//     * same language, different URL     -> the URL coming from `from` wins.
//
// Either item may have no `alternatives` list; a missing list is treated as empty
// instead of throwing.
const mergeQueueItems = (from, to, deep) => {
  to.depth = to.depth > from.depth ? from.depth : to.depth;
  to.lastMod = to.lastMod === '' ? from.lastMod : to.lastMod;
  if (!deep) {
    return;
  }

  const incoming = from.alternatives || [];
  if (!incoming.length) {
    return;
  }
  if (!to.alternatives) {
    to.alternatives = [];
  }

  for (const fromAlter of incoming) {
    const sameLang = to.alternatives.find((item) => item.lang === fromAlter.lang);
    const sameURL = to.alternatives.find((item) => item.urlNormalized === fromAlter.urlNormalized);

    if (!sameLang && !sameURL) {
      to.alternatives.push(fromAlter);
    } else if (sameURL && !sameLang) {
      // en and en-US: keep the more specific (longer) language code.
      sameURL.lang = sameURL.lang.length > fromAlter.lang.length ? sameURL.lang : fromAlter.lang;
    } else if (sameLang && !sameURL) {
      // Same language but different URLs: always prefer the one coming from `from`.
      sameLang.url = fromAlter.url;
      sameLang.urlNormalized = normalizeUrl(fromAlter.url, {
        removeTrailingSlash: false,
        forceHttps: true
      });
    }
  }
};
// Builds a snapshot of the current run: event counters, a trimmed copy of every
// ready queue item, and the deepest crawl depth reached so far.
const getStats = () => {
  // Only these properties of a queue item are exposed to callers.
  const exportedFields = [
    'url',
    'lastMod',
    'canonical',
    'lang',
    'referrer',
    'depth',
    'protocol',
    'path',
    'uriPath',
    'port',
    'host'
  ];

  const urls = getQueueReadyItems().map((entry) => {
    const summary = {};
    for (const field of exportedFields) {
      summary[field] = entry[field];
    }
    return summary;
  });

  return {
    added: stats.add || 0,
    ignored: stats.ignore || 0,
    errored: stats.error || 0,
    urls,
    realCrawlingDepth
  };
};
// Exposes the list of sitemap file paths recorded while files were written to disk.
// The same array instance is returned, not a copy.
const getPaths = () => savedOnDiskSitemapPaths;
// If a changeFreq option was passed, run it through validChangeFreq and store the result.
if (opts && opts.changeFreq) {
  options.changeFreq = validChangeFreq(opts.changeFreq);
}

// Event bus used for the 'ready', 'add', 'ignore', 'error' and 'done' events.
const emitter = mitt();

// Start URL, normalized (www prefix and trailing slash are kept) and parsed.
const parsedUrl = parseURL(
  normalizeUrl(uri, {
    stripWWW: false,
    removeTrailingSlash: false
  })
);

// Absolute path of the final sitemap file.
const sitemapPath = path.resolve(options.filepath);

// NODE_TLS_REJECT_UNAUTHORIZED is process-wide: setting it to '0' disables certificate
// verification for every TLS connection made by this Node process, not just the crawler.
// Only do so when the caller has not opted out via `ignoreInvalidSSL: false`
// (the option defaults to true, so default behaviour is unchanged).
if (options.ignoreInvalidSSL) {
  process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0';
}
// Starts the crawler. Nothing is returned to the caller.
const start = () => void crawler.start();
// Stops a running crawler. After a 60 second delay the regular completion pipeline
// (onCrawlerComplete) is triggered and a message is logged. When the crawler is not
// running, only a message is logged.
const stop = () => {
  const delayBeforeCompletionMs = 60000;

  if (crawler.running) {
    crawler.stop();
    setTimeout(() => {
      onCrawlerComplete();
      msg.error('STOPPING THE CRAWLER');
    }, delayBeforeCompletionMs);
    return;
  }

  msg.error('CRAWLER ALREADY STOPPED');
};
// Hands a URL to the crawler queue and logs a message when the crawler reports
// a truthy result for it. Nothing is returned to the caller.
const queueURL = (url, referrer, force) => {
  const wasQueued = crawler.queueURL(url, referrer, force);
  if (!wasQueued) {
    return;
  }
  msg.info('NEW ITEM ADDED TO THE QUEUE MANUALLY: ' + url);
};
// Create the sitemap writer from the effective options. Later code in this file feeds it
// through addURL()/flush(), finalises it with finish() and reads the files it produced
// via getPaths().
const sitemap = SitemapRotator(options);
// Remembers which (code, url) pairs were already reported: code -> Set of urls.
// A Map/Set pair is used instead of nested plain objects so that values such as
// "constructor" or "__proto__" are not mistaken for entries that already exist
// (plain-object lookups would find them on Object.prototype and swallow the error).
const isEmittedBefore = new Map();

// Emits an 'error' event at most once per (code, url) pair.
//
// code - HTTP status code or an error code string (e.g. 'ENOTFOUND'); 404 and '404'
//        share one bucket, exactly as property-key coercion did before.
// url  - the URL (or host / message) the error refers to.
//
// The payload is { code, message, url } where message is the entry of
// http.STATUS_CODES for `code` (undefined for non-HTTP codes).
const emitError = (code, url) => {
  const codeKey = String(code);
  const urlKey = String(url);

  let reportedUrls = isEmittedBefore.get(codeKey);
  if (!reportedUrls) {
    reportedUrls = new Set();
    isEmittedBefore.set(codeKey, reportedUrls);
  }
  if (reportedUrls.has(urlKey)) {
    return;
  }
  reportedUrls.add(urlKey);

  emitter.emit('error', {
    code,
    message: http.STATUS_CODES[code],
    url
  });
};
const onCrawlerComplete = () => {
let queuedItems = getQueueReadyItems();
msg.green('CRAWLER HAS ' + queuedItems.length + ' ITEMS IN THE QUEUE');
// Adds every entry of options.forcedURLs to the list of items that will be written.
//
// Each forced entry is read as { value, alternatives? } where every alternative carries
// its URL in `value`. A forced URL that is already present (same `url`) is merged into
// the existing item; otherwise a new item with depth 100 and an empty lastMod is appended.
//
// Differences from a naive implementation, on purpose:
// - a forced URL without `alternatives` is accepted (treated as an empty list);
// - alternatives are copied before `url`/`urlNormalized` are added, so the objects the
//   caller passed in through the options are not modified.
const addBaseURLsToQueue = () => {
  msg.info('ADDING BASE URLS TO THE GENERATED SITEMAP');

  const normalize = (value) =>
    normalizeUrl(value, {
      removeTrailingSlash: false,
      forceHttps: true
    });

  for (const url of options.forcedURLs || []) {
    const item = {
      depth: 100,
      lastMod: '',
      url: url.value,
      urlNormalized: normalize(url.value)
    };
    item.alternatives = (url.alternatives || []).map((alter) =>
      Object.assign({}, alter, {
        url: alter.value,
        urlNormalized: normalize(alter.value)
      })
    );

    const existingItem = queuedItems.filter((queueItem) => item.url === queueItem.url)[0];
    if (existingItem) {
      mergeQueueItems(item, existingItem, true);
    } else {
      queuedItems.push(item);
    }
  }
};
// Returns the item's URL with its language path segment removed, so that translations of
// the same page can be compared.
//
// For every code yielded by getLangCodeMap(queueItem.lang) the first "/<code>" occurrence
// is removed, but only when it ends at a segment boundary (i.e. it is not followed by a
// letter, digit, underscore or hyphen). A plain substring replace would also cut "/en" out
// of "/energy" or "/en-us" and produce a corrupted URL.
// Assumes getLangCodeMap returns an iterable of language-code strings - TODO confirm.
const getLangFreeURL = (queueItem) => {
  const langs = getLangCodeMap(queueItem.lang);
  let pureURL = queueItem.url;
  for (const lang of langs) {
    // Escape regex metacharacters so the code is matched literally.
    const literal = String(lang).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    pureURL = pureURL.replace(new RegExp('/' + literal + '(?![\\w-])'), '');
  }
  return pureURL;
};
// Cross-links items that are translations of each other.
//
// Two items are considered translations when their language-free URLs (getLangFreeURL)
// are equal and their URLs differ. Each item receives an alternative entry for every such
// sibling, unless it already lists an alternative with the same normalized URL or the same
// language. An item that ends up with at least one alternative also receives a
// self-referencing entry, under the same "not already listed" rule.
//
// The language-free URL is computed once per item and items are bucketed by it, instead of
// recomputing it for every pair of items. Items without an `alternatives` list get an empty
// one rather than causing a TypeError.
const recommendAlternatives = () => {
  msg.info('RECOMMENDING ALTERNATIVES');

  const pureURLByItem = new Map();
  const itemsByPureURL = new Map();
  for (const item of queuedItems) {
    const pureURL = getLangFreeURL(item);
    pureURLByItem.set(item, pureURL);
    if (!itemsByPureURL.has(pureURL)) {
      itemsByPureURL.set(pureURL, []);
    }
    itemsByPureURL.get(pureURL).push(item);
  }

  const toAlternative = (item) => ({
    url: item.url,
    urlNormalized: normalizeUrl(item.url, {
      removeTrailingSlash: false,
      forceHttps: true
    }),
    flushed: false,
    lang: item.lang
  });

  // True when the list already holds this URL or another entry for this language.
  const isListed = (alternatives, item) =>
    alternatives.some((alter) => alter.urlNormalized === item.urlNormalized || alter.lang === item.lang);

  for (const queueItem of queuedItems) {
    if (!queueItem.alternatives) {
      queueItem.alternatives = [];
    }
    const alternatives = queueItem.alternatives;

    for (const otherQueueItem of itemsByPureURL.get(pureURLByItem.get(queueItem))) {
      if (queueItem.url === otherQueueItem.url || isListed(alternatives, otherQueueItem)) {
        continue;
      }
      alternatives.push(toAlternative(otherQueueItem));
    }

    if (alternatives.length === 0 || isListed(alternatives, queueItem)) {
      continue;
    }
    alternatives.push(toAlternative(queueItem));
  }
};
// Folds every item whose canonical URL belongs to another item into that other item.
// The first item whose `url` equals the canonical and whose `id` differs is the merge
// target; the source item is then flagged with `shouldBeDelete`.
const handleCanonicals = () => {
  msg.info('HANDLING CANONICAL URLS');

  for (const candidate of queuedItems) {
    // Look for the canonical target among the collected items.
    let target;
    for (const entry of queuedItems) {
      if (candidate.canonical === entry.url && candidate.id !== entry.id) {
        target = entry;
        break;
      }
    }
    if (!target) {
      continue;
    }
    mergeQueueItems(candidate, target, true);
    candidate.shouldBeDelete = true;
  }
};
// Collapses items whose URLs differ only in letter case.
//
// For each item the first other item (different `id`) with the same lower-cased URL is
// its twin. If the twin's URL contains an upper-case letter the current item is merged
// into the twin and flagged with `shouldBeDelete`; otherwise the twin is merged into the
// current item and the twin is flagged.
//
// Items flagged by this pass are skipped afterwards, both as the current item and as a
// twin candidate. Without that, two mixed-case variants (e.g. /Foo and /FOO) would each
// be merged into the other and both be flagged, dropping the page from the sitemap.
const handleUppercaseLettersURLs = () => {
  msg.info('HANDLING SIMILAR URLS BUT WITH DIFFERENT CASE LETTERS');

  const droppedInThisPass = new Set();

  for (const queueItem of queuedItems) {
    if (droppedInThisPass.has(queueItem)) {
      continue;
    }

    // Find another surviving item with the same URL when case is ignored.
    const loweredURL = queueItem.url.toLowerCase();
    let twin;
    for (const item of queuedItems) {
      if (
        !droppedInThisPass.has(item) &&
        queueItem.id !== item.id &&
        loweredURL === item.url.toLowerCase()
      ) {
        twin = item;
        break;
      }
    }
    if (!twin) {
      continue;
    }

    const twinHasUppercase = twin.url.toLowerCase() !== twin.url;
    const dropped = twinHasUppercase ? queueItem : twin;
    const kept = twinHasUppercase ? twin : queueItem;

    mergeQueueItems(dropped, kept, true);
    dropped.shouldBeDelete = true;
    droppedInThisPass.add(dropped);
  }
};
// Post-processes the collected items, hands them to the sitemap writer and, after a
// 10 second grace period, moves the generated files to their final location.
//
// Events: 'done' (with getStats()) is emitted when the files are in place. If copying a
// temporary file or writing the sitemap index fails, an 'error' event
// ({ code, message, url: <target sitemap path> }) is emitted first and 'done' still
// follows, so listeners waiting for completion are not left hanging.
const init = () => {
  msg.green('CRAWLER COMPLETE CRAWLING THE WEBSITE');

  // Finalises the writer and moves its temporary files next to `sitemapPath`.
  const finish = () => {
    sitemap.finish();
    const sitemaps = sitemap.getPaths();
    msg.info(sitemaps);
    // Treat a missing path list like an empty one in every branch below.
    const tmpPaths = sitemaps || [];

    const cb = () => emitter.emit('done', getStats());
    const fail = (err) => {
      msg.error('FAILED TO SAVE THE SITEMAP: ' + err.message);
      emitter.emit('error', {
        code: err.code || 500,
        message: err.message,
        url: sitemapPath
      });
      cb();
    };

    if (tmpPaths.length > 1) {
      // Multiple sitemaps: move each part, then write an index file at `sitemapPath`.
      let count = 1;
      eachSeries(
        tmpPaths,
        (tmpPath, done) => {
          const newPath = extendFilename(sitemapPath, `_part${count}`);
          count += 1;
          savedOnDiskSitemapPaths.push(newPath);
          // Copy, then remove the temporary file (removal errors are ignored on purpose).
          (async () => {
            try {
              await cpFile(tmpPath, newPath);
            } catch (err) {
              done(err);
              return;
            }
            fs.unlink(tmpPath, () => {
              done();
            });
          })();
        },
        (err) => {
          if (err) {
            fail(err);
            return;
          }
          const filename = path.basename(sitemapPath);
          savedOnDiskSitemapPaths.push(sitemapPath);
          fs.writeFile(
            sitemapPath,
            createSitemapIndex(parsedUrl.toString(), filename, tmpPaths.length),
            (writeErr) => {
              if (writeErr) {
                fail(writeErr);
                return;
              }
              cb();
            }
          );
        }
      );
    } else if (tmpPaths.length) {
      // Single sitemap: move it straight to the target path.
      savedOnDiskSitemapPaths.push(sitemapPath);
      (async () => {
        msg.green('SITEMAP GENERATED ON: ' + tmpPaths[0]);
        try {
          await cpFile(tmpPaths[0], sitemapPath);
        } catch (err) {
          fail(err);
          return;
        }
        msg.green('MOVING SITEMAP TO THE TARGET DIR: ' + sitemapPath);
        fs.unlink(tmpPaths[0], cb);
      })();
    } else {
      cb();
    }
  };

  addBaseURLsToQueue();
  handleCanonicals();
  handleUppercaseLettersURLs();

  msg.info('STARTING WITH ITEMS THAT ARE NOT DELETED');
  queuedItems = queuedItems.filter((item) => {
    return !item.shouldBeDelete;
  });

  if (options.recommendAlternatives) {
    recommendAlternatives();
  }

  for (const queueItem of queuedItems) {
    msg.blue('FLUSHING: ' + queueItem.url + ' WITH ' + (queueItem.alternatives ? queueItem.alternatives.length : 0) + ' ALTERNATIVES');
    sitemap.addURL(queueItem);
  }
  sitemap.flush();

  // Wait an extra 10 seconds so the writer has time to get the sitemaps onto disk.
  // TODO: replace the fixed delay with a completion signal from the writer.
  setTimeout(finish, 10000);
};
// Wait extra 60 seconds to make sure that all pages were handled
setTimeout(init, 60000);
};
// Creates the crawler and wires its events to this generator's emitter and counters.
//
// When options.deep is truthy a headless Puppeteer browser is launched first and handed
// to createCrawler together with the parsed start URL and the options.
// Resolves once every listener is registered; rejects if the browser launch or the
// crawler creation fails.
const init = async () => {
  if (options.deep) {
    // NOTE(review): nothing in this file closes the launched browser - confirm that
    // createCrawler (or the caller) takes care of shutting it down.
    browser = await puppeteer.launch({
      headless: true,
      args: ['--lang=en-US,us']
    });
  }
  crawler = createCrawler(parsedUrl, options, browser);

  // Crawler events that are reported as an 'error' with a fixed status code.
  const statusByEvent = {
    fetch404: 404,
    fetchtimeout: 408,
    fetch410: 410,
    invaliddomain: 403,
    fetchprevented: 403,
    queueerror: 500,
    fetchconditionerror: 500
  };
  for (const eventName of Object.keys(statusByEvent)) {
    crawler.on(eventName, ({ url }) => emitError(statusByEvent[eventName], url));
  }

  crawler.on('fetcherror', (queueItem, response) =>
    emitError(response.statusCode, queueItem.url)
  );
  crawler.on('fetchclienterror', (queueError, errorData) => {
    if (errorData.code === 'ENOTFOUND') {
      emitError(errorData.code, errorData.hostname);
    } else {
      emitError(400, errorData.message);
    }
  });
  crawler.on('fetchdisallowed', ({ url }) => emitter.emit('ignore', url));

  // A URL was queued again: fold the duplicate's depth/lastMod into the queued item.
  crawler.on('queueduplicate', (queueItem) => {
    const existingItem = crawler.queue.filter((item) => {
      return item.url === queueItem.url;
    })[0];
    if (!existingItem) {
      // Nothing to merge into; mergeQueueItems would throw on an undefined target.
      return;
    }
    mergeQueueItems(queueItem, existingItem, false);
  });

  // Response headers arrived: mark the item and record lastMod and the depth reached.
  crawler.on('fetchheaders', (queueItem) => {
    queueItem.flushed = false;
    queueItem.visited = true;
    const lastMod = queueItem.stateData.headers['last-modified'];
    queueItem.lastMod = getCurrentDateTime(lastMod);
    if (queueItem.depth > realCrawlingDepth) {
      realCrawlingDepth = queueItem.depth;
    }
  });

  crawler.on('fetchcomplete', (queueItem, page) => {
    const { url } = queueItem;
    // Pages carrying a robots "noindex" meta tag are ignored.
    if (/<meta(?=[^>]+noindex).*?>/.test(page)) {
      emitter.emit('ignore', queueItem);
    } else if (isValidURL(url)) {
      msg.yellowBright('ADDING PROCESS FOR: ' + url + ' WAS DONE');
      emitter.emit('add', queueItem);
    } else {
      // Numeric code, consistent with every other emitError call in this function.
      emitError(404, url);
    }
  });

  crawler.on('complete', onCrawlerComplete);

  // Keep the counters reported by getStats() in sync with the emitted events.
  emitter.on('add', () => {
    stats.add++;
  });
  emitter.on('ignore', () => {
    stats.ignore++;
  });
  emitter.on('error', () => {
    stats.error++;
  });
};
// Kick off initialisation right away. 'ready' is emitted once the crawler is wired up.
// A failed initialisation (e.g. the browser could not be launched) is logged and reported
// through an 'error' event instead of surfacing as an unhandled promise rejection.
init().then(
  () => {
    emitter.emit('ready');
  },
  (err) => {
    msg.error('CRAWLER INITIALIZATION FAILED: ' + err.message);
    emitter.emit('error', {
      code: 500,
      message: err.message,
      url: parsedUrl.toString()
    });
  }
);
return {
getStats,
start,
stop,
queueURL,
on: emitter.on,
off: emitter.off,
getPaths
};
};