UNPKG

@deptagency/octopus

Version:

Recursive and multi-threaded broken link checker

343 lines (281 loc) 9.7 kB
/** * Octopus module * @module lib/app */ /** * Required modules */ const got = require('got'); const { EOL } = require('os'); const async = require('async'); const { URL } = require('url'); const justify = require('justify'); const prettyMs = require('pretty-ms'); const prependHttp = require('prepend-http'); const cheerioLoad = require('cheerio')['load']; const differenceBy = require('lodash.differenceby'); const windowWidth = require('term-size')()['columns']; /** * App defaults */ let config; let baseUrl; let baseHost; let crawledLinks = []; let inboundLinks = []; let brokenLinks = []; /** * CLI colors */ const COLOR_GRAY = '\x1b[90m'; const COLOR_GREEN = '\x1b[32m'; const FORMAT_END = '\x1b[0m'; /** * App timing */ const NS_PER_SEC = 1e9; const MS_PER_NS = 1e-6; const executionTime = process.hrtime(); /** * Blacklisted protocols */ const ignoreProtocols = [ '[href^="javascript:"]', '[href^="mailto:"]', '[href^="telnet:"]', '[href^="file:"]', '[href^="news:"]', '[href^="tel:"]', '[href^="ftp:"]', '[href^="#"]' ]; /** * Output line length */ const maxLength = windowWidth - 20; /** * Console streaming */ require('draftlog').into(console); console.stream = console.draft(EOL); /** * Magic function for the brokenLinks object */ const brokenLinksObserver = new Proxy(brokenLinks, { set: function(target, key, value) { // Extract variables const {requestUrl, referenceUrl, statusMessage, statusCode} = value; // Push to object target.push(requestUrl); // Terminal output console.log( '%s%s%s%s%s: %s%s%s: %s (%d)%s', justify('⚠️', null, 5), requestUrl.substr(0, maxLength), EOL, COLOR_GRAY, justify(null, 'APPEARS ON', 14), referenceUrl.substr(0, maxLength), EOL, justify(null,'STATUS MSG', 14), statusMessage, statusCode, FORMAT_END ); // Slack notification config['slack-webhook'] && got( config['slack-webhook'], { method: 'POST', body: JSON.stringify({ "attachments": [ { "fallback": `Broken url: ${requestUrl}${EOL}Appears on: ${referenceUrl}${EOL}Status msg: ${statusMessage} (${statusCode})`, "fields": [ { "title": "Broken url", "value": requestUrl, }, { "title": "Appears on", "value": referenceUrl, }, { "title": "Status code", "value": statusCode, "short": true }, { "title": "Status message", "value": statusMessage, "short": true } ], "color": "danger" } ] }) } ); } } ); /** * Executes the URL request * @param {String} requestUrl - URL of the requested link * @param {String} referenceUrl - URL of the reference page * @param {Function} requestCallback - Callback function * @returns {Function} Callback function */ const request = async (requestUrl, referenceUrl, requestCallback) => { // Encode Url const encodedUrl = requestUrl.match(/%[0-9a-f]{2}/i) ? requestUrl : encodeURI(requestUrl); try { // Start request const response = await got( encodedUrl, { timeout: config.timeout, headers: { 'user-agent': 'Octopus' } } ); // Extract response data const { statusCode, statusMessage, headers, timings, body } = response; const contentType = headers['content-type']; // Parse url const parsedUrl = new URL(requestUrl); // Default let pageLinks = []; // Update stream if ( ! config.silent ) { console.stream( '%s%s %s(%d ms)%s', justify('🤖', null, 4), requestUrl.substr(0, maxLength), COLOR_GRAY, timings['phases'].total, FORMAT_END ); } // Check for status code if ( ! [200, 204].includes(statusCode) ) { if ( ! brokenLinks.includes(requestUrl) ) { brokenLinksObserver[brokenLinks.length] = { requestUrl, referenceUrl, statusCode, statusMessage }; } // Extract links only from internal HTML pages } else if ( parsedUrl.host === baseHost && contentType.startsWith('text/html') ) { const $ = cheerioLoad(body); $('a[href]').not( ignoreProtocols.join(',') ).each( (i, elem) => { if (elem.attribs.href) { const hrefUrl = new URL(elem.attribs.href, baseUrl).href; if ( ! pageLinks.includes(hrefUrl) ) { pageLinks.push(hrefUrl); } } }); if ( config['include-images'] ) { $('img[src]').each((i, elem) => { if (elem.attribs.src) { const srcUrl = new URL(elem.attribs.src, baseUrl).href; if (!pageLinks.includes(srcUrl)) { pageLinks.push(srcUrl); } } }); } } // Execute callback return requestCallback(requestUrl, pageLinks); } catch ( error ) { // Add to broken links on request error if ( ! brokenLinks.includes(requestUrl) ) { const statusCode = error.statusCode || ''; const statusMessage = ( error.code || error.statusMessage ).toUpperCase(); brokenLinksObserver[brokenLinks.length] = { requestUrl, referenceUrl, statusCode, statusMessage }; } // Execute callback return requestCallback(requestUrl, []); } }; /** * Starts the page crawling * @param {String} crawlUrl - URL of the crawled page * @param {String} [referenceUrl] - URL of the reference page * @returns {Promise} Promise object represents the crawling request */ const crawl = ( crawlUrl, referenceUrl = '' ) => { return request( crawlUrl, referenceUrl, (requestUrl, pageLinks) => { // Mark url as crawled crawledLinks.push( { 'requestUrl': requestUrl } ); // Async loop async.eachSeries( pageLinks, (pageLink, crawlCallback) => { // Parse url const parsedLink = new URL(pageLink); if ( ( ! config['ignore-external'] || ( config['ignore-external'] && parsedLink.host === baseHost ) ) && ( ! parsedLink.searchParams || ( parsedLink.searchParams && ! config['ignore-query'].filter(query => parsedLink.searchParams.get(query)).length ) ) && ( ! inboundLinks.filter(item => item.requestUrl === pageLink).length ) ) { inboundLinks.push( { 'referenceUrl': requestUrl, 'requestUrl': pageLink } ); } crawlCallback(); }, () => { // Evaluate links to crawl const nextUrls = differenceBy( inboundLinks, crawledLinks, 'requestUrl' ); // Stream and check next link if ( Object.getOwnPropertyNames(nextUrls).length > 1 ) { return crawl( nextUrls[0].requestUrl, nextUrls[0].referenceUrl ); // Nothing to check, log & exit } else { const diff = process.hrtime(executionTime); const ms = (diff[0] * NS_PER_SEC + diff[1]) * MS_PER_NS; console.log( '%s%s%s%d %s %s%s', EOL, COLOR_GREEN, justify('✅', null, 3), inboundLinks.length, 'links checked in', prettyMs( ms, { compact: true } ), FORMAT_END ); process.exit( 0 ); } } ); } ); }; /** * Initializes the website crawling * @param {Object} argv - CLI arguments provided from mri package * @returns {Promise} Promise object represents the crawling loop */ module.exports = (argv) => { // Config config = { 'timeout': Number(argv.timeout), 'silent': Boolean(argv['silent']), 'ignore-query': (Array.isArray(argv['ignore-query']) ? argv['ignore-query'] : Array(argv['ignore-query'])), 'ignore-external': Boolean(argv['ignore-external']), 'include-images': Boolean(argv['include-images']), 'slack-webhook': String(argv['slack-webhook']), }; // Skip nofollow links if ( argv['ignore-nofollow'] ) { ignoreProtocols.push('[rel~="nofollow"]'); } // Base data baseUrl = prependHttp(argv._[0], {https: true}); baseHost = new URL(baseUrl).host; // Fire! return crawl(baseUrl); };