UNPKG

@crstn/redirect

Version:

A small package that crawls a site and returns a redirect template. This is helpful when migrating from one website to another that uses a different URL scheme.

115 lines (98 loc) 4 kB
#!/usr/bin/env node

/**
 * Command line tool that crawls a site and prints one `Redirect 301` line per
 * crawled url, each pointing back at the root url.
 *
 * Positional arguments (see the `--help` examples at the bottom):
 *   1. root url     - required, must include the protocol
 *   2. crawl depth  - optional, defaults to 2
 *   3. file types   - optional comma separated list of file extensions whose
 *                     urls are not crawled, or `n` to exclude nothing
 *   4. status codes - optional comma separated list of status codes; pages
 *                     with one of these statuses are left out of the output
 *
 * The process exits with code 1 when the url is invalid, the root url cannot
 * be fetched, or the crawl returns no urls.
 */

const program = require('commander');
const _ = require('lodash');
const validator = require('validator');

// Called for its side effect only; presumably this is what populates
// `module.exports.version`, which is read on the next statement.
require('pkginfo')(module, 'version');

program.version(module.exports.version);

// Get to work by passing the url only as command.
program.on('command:*', () => {
  if (!validator.isURL(program.args[0], { require_protocol: true })) {
    console.log('\nPlease pass a canonical url.');
    console.log('Example: http://www.example.com\n');
    process.exit(1);
  }

  // Drop a single trailing slash so paths can be derived by stripping rootUrl.
  const rootUrl = program.args[0].replace(/\/$/, '');
  const crawlDepth = program.args[1] ? _.toSafeInteger(program.args[1]) : 2;
  const exclude = program.args[2] ? _.toString(program.args[2]) : 'n';
  const status = program.args[3] ? _.toString(program.args[3]).split(',') : [];

  // Loaded here rather than at the top so they are only required when a crawl
  // is actually requested.
  const Crawler = require('js-crawler');
  const RegexEscape = require('regex-escape');

  // Match urls that start with rootUrl. The prefix must end at a path, query
  // or fragment delimiter (or the end of the url) so that a root of
  // http://example.com does not also admit http://example.com.other.org.
  const urlReg = new RegExp('^' + RegexEscape(rootUrl) + '(?:[/?#]|$)');

  // Match urls ending with one of the excluded file types, optionally
  // followed by a query string. Stays undefined when nothing is excluded.
  let exReg;
  if (exclude !== 'n') {
    // Escape every entry on its own and join them afterwards, so that all
    // commas (not just the first one) become alternation separators.
    const fileTypes = exclude
      .split(',')
      .map((fileType) => fileType.trim())
      .filter((fileType) => fileType !== '')
      .map((fileType) => RegexEscape(fileType));

    if (fileTypes.length > 0) {
      exReg = new RegExp('\\.(' + fileTypes.join('|') + ')(?=(\\?.+)?$)');
    }
  }

  const crawler = new Crawler().configure({
    depth: crawlDepth,
    // userAgent: 'crawler/js-crawler',
    // maxRequestsPerSecond: 100,
    // maxConcurrentRequests: 10,
    shouldCrawl: (url) => {
      // Exclude current url if it matches the exclude pattern.
      if (exReg && exReg.test(url)) {
        return false;
      }
      return urlReg.test(url);
    }
  });

  console.log(`\nCrawling ${rootUrl} with depth ${crawlDepth} ...`);
  if (exReg) {
    console.log(`Excluding urls with pattern: ${exReg}`);
  }
  console.log('\n');

  // Status per crawled url, used to filter the redirect list later.
  const pages = new Map();

  crawler.crawl({
    url: rootUrl,
    success: (page) => {
      pages.set(page.url, page.status);
      console.log(`${page.status} – ${page.url}`);
    },
    failure: (page) => {
      // A failure without any status is treated as "the url could not be
      // reached at all" and aborts the run.
      if (typeof page.status === 'undefined') {
        console.log('Not found. \nPlease pass a canonical url.\n');
        process.exit(1);
      }
      pages.set(page.url, page.status);
      console.log(`Failed: ${page.status} – ${page.url}`);
    },
    finished: (crawledUrls) => {
      if (crawledUrls.length === 0) {
        console.log('Empty response. Tried a canonical url?\n');
        process.exit(1);
      }

      console.log('\n--------------------------------------\n');
      console.log('Redirects ready. Adjust to your needs:');
      if (status.length > 0) {
        console.log(`Status excluded: ${status}`);
      }
      console.log('\n--------------------------------------\n');

      // Redirects for .htaccess by default.
      crawledUrls.forEach((url) => {
        // If page has status to exclude, don't output it.
        if (status.indexOf(_.toString(pages.get(url))) === -1) {
          console.log(`Redirect 301 ${url.replace(rootUrl, '')} ${rootUrl}`);
        }
      });
      console.log('\n');
    }
  });
});

// Add examples to default help output.
program.on('--help', () => {
  console.log('\n Examples:\n');
  console.log(' $ redirect http://example.com create redirects for .htaccess');
  console.log(' $ redirect http://example.com 3 custom crawl depth – default 2, do not pass a negative number');
  console.log(' $ redirect http://example.com 3 jpg,pdf exclude urls with file types – separated by comma');
  console.log(' $ redirect http://example.com 3 n 404,401 skip excluding file types and exclude status codes instead – separated by comma');
  console.log(' $ redirect http://example.com 3 jpg,pdf 404,401 or exclude both');
});

program.parse(process.argv);

// Display help if no command is passed.
// Needs to come after parse to see examples.
if (process.argv.length <= 2) {
  program.help();
}