@crstn/redirect
Version:
A small package that crawls a site and returns a redirect template. This is helpful when migrating from one website to another with a different URL scheme.
115 lines (98 loc) • 4 kB
JavaScript
const program = require('commander');
const pkginfo = require('pkginfo')(module, 'version');
const _ = require('lodash');
const validator = require('validator');
// Register the package version with commander. The value is read from
// module.exports.version, which the pkginfo(module, 'version') call above is
// expected to have populated from package.json.
program.version(module.exports.version);
// Crawl handler. The url is passed in the position of a command name, so the
// catch-all `command:*` event is what starts the work.
//
// Positional arguments (read from program.args):
//   [0] root url, protocol required (e.g. http://www.example.com)
//   [1] crawl depth, defaults to 2
//   [2] comma separated file extensions to skip, or 'n' for none (default)
//   [3] comma separated status codes to leave out of the redirect list
//
// Output goes to stdout. The process exits with code 1 when the url is
// invalid, the site cannot be reached, or nothing was crawled.
program.on('command:*', () => {
  const [urlArg, depthArg, excludeArg, statusArg] = program.args;

  if (!validator.isURL(urlArg, {require_protocol: true})) {
    console.log('\nPlease pass a canonical url.');
    console.log('Example: http://www.example.com\n');
    process.exit(1);
  }

  // Drop a single trailing slash so that removing rootUrl from a crawled url
  // leaves a path that starts with '/'.
  const rootUrl = urlArg.replace(/\/$/, '');
  const crawlDepth = depthArg ? _.toSafeInteger(depthArg) : 2;
  const exclude = excludeArg ? _.toString(excludeArg) : 'n';
  const status = statusArg ? _.toString(statusArg).split(',') : [];
  const excludedStatus = new Set(status);

  const Crawler = require('js-crawler');
  const RegexEscape = require('regex-escape');

  // Build a RegExp matching urls that end with one of the excluded file
  // types, optionally followed by a query string. Every comma separated entry
  // becomes its own alternative; empty entries (e.g. from "jpg,") are dropped.
  let exReg = null;
  if (exclude !== 'n') {
    const extensions = exclude
      .split(',')
      .map((extension) => extension.trim())
      .filter((extension) => extension !== '')
      .map((extension) => RegexEscape(extension));
    if (extensions.length > 0) {
      exReg = new RegExp('\\.(' + extensions.join('|') + ')(?=(\\?.+)?$)');
    }
  }

  // True when url is rootUrl itself or lies below it. The character following
  // the prefix must start a path, query or fragment, otherwise a root of
  // http://example.com would also accept http://example.com.other.org.
  const isWithinRoot = (url) => {
    if (!url.startsWith(rootUrl)) {
      return false;
    }
    const next = url.charAt(rootUrl.length);
    return next === '' || next === '/' || next === '?' || next === '#';
  };

  const crawler = new Crawler().configure({
    depth: crawlDepth,
    // userAgent: 'crawler/js-crawler',
    // maxRequestsPerSecond: 100,
    // maxConcurrentRequests: 10,
    shouldCrawl: (url) => {
      // Skip the url if it matches the exclude pattern.
      if (exReg !== null && exReg.test(url)) {
        return false;
      }
      return isWithinRoot(url);
    }
  });

  console.log(`\nCrawling ${rootUrl} with depth ${crawlDepth} ...`);
  if (exReg !== null) {
    console.log(`Excluding urls with pattern: ${exReg}`);
  }
  console.log('\n');

  // Status per crawled url, used later to filter the redirect list.
  const pages = {};

  crawler.crawl({
    url: rootUrl,
    success: (page) => {
      pages[page.url] = page.status;
      console.log(`${page.status} – ${page.url}`);
    },
    failure: (page) => {
      // A failure without any status is treated as "site not reachable".
      if (typeof page.status === 'undefined') {
        console.log('Not found. Please pass a canonical url.\n');
        process.exit(1);
      } else {
        pages[page.url] = page.status;
        console.log(`Failed: ${page.status} – ${page.url}`);
      }
    },
    finished: (crawledUrls) => {
      if (crawledUrls.length === 0) {
        console.log('Empty response. Tried a canonical url?\n');
        process.exit(1);
      }
      console.log('\n--------------------------------------\n');
      console.log('Redirects ready. Adjust to your needs:');
      if (status.length > 0) {
        console.log(`Status excluded: ${status}`);
      }
      console.log('\n--------------------------------------\n');
      // Redirects for .htaccess by default.
      crawledUrls.forEach((url) => {
        // If the page has a status to exclude, don't output it.
        if (!excludedStatus.has(_.toString(pages[url]))) {
          console.log(`Redirect 301 ${url.replace(rootUrl, '')} ${rootUrl}`);
        }
      });
      console.log('\n');
    }
  });
});
// Extend the default help output with usage examples, one console line each.
program.on('--help', () => {
  const exampleLines = [
    '\n Examples:\n',
    ' $ redirect http://example.com create redirects for .htaccess',
    ' $ redirect http://example.com 3 custom crawl depth – default 2, do not pass a negative number',
    ' $ redirect http://example.com 3 jpg,pdf exclude urls with file types – separated by comma',
    ' $ redirect http://example.com 3 n 404,401 skip excluding file types and exclude status codes instead – separated by comma',
    ' $ redirect http://example.com 3 jpg,pdf 404,401 or exclude both'
  ];
  exampleLines.forEach((line) => {
    console.log(line);
  });
});
program.parse(process.argv);
// Fall back to the help screen when nothing follows the first two argv
// entries. This has to happen after parse() so that the examples are shown.
const suppliedArgCount = process.argv.length - 2;
if (suppliedArgCount <= 0) {
  program.help();
}