/*
 * @passmarked/links
 * Version:
 * Rules related to the links/assets on the page
 * 249 lines (172 loc) • 6.3 kB
 * JavaScript
 */
// get our required modules
const _ = require('underscore');
const S = require('string');
const cheerio = require('cheerio');
const url = require('url');
const async = require('async');
const request = require('request');
/**
* Checks if any links on the page return a 404
**/
module.exports = exports = function(payload, fn) {
// data
var data = payload.getData();
// get the page content
payload.getPageContent(function(err, content) {
// did we get a error ?
if(err) {
// output to stderr
payload.error(err);
// calback is done
return setImmediate(fn, err);
}
// sanity check if empty
if(S(content || '').isEmpty() === true)
return setImmediate(fn, null);
// local variables to work with
var lines = content.split('\n');
var last_current_line = -1;
// default url
var uri = url.parse(data.url);
// list of links that we found to test
var testableLinks = [];
var testingSlugs = [];
// local cheerio instance for lookups
var $ = cheerio.load(content);
// get all the links
$('body a').each(function(i, elem) {
// ref of the link
var href = S($(elem).attr('href') || '').trim().s;
// double check that we got a ref ...
if(S(href || '').isEmpty() === true) return;
// if the link does not point with a protocol
// we can assume this is a local link
if(href.toLowerCase().indexOf('https://') !== 0 &&
href.toLowerCase().indexOf('http://') !== 0 &&
href.toLowerCase().indexOf('//') !== 0 &&
href.toLowerCase().indexOf('/') !== 0) {
// resolve the path to make a complete url
href = url.resolve(uri.protocol + "//" + uri.host, href);
}
// check if it starts with relative
if(href.indexOf('//') == 0) {
// set it
href = uri.protocol + href;
} else if(href.indexOf('/') == 0) {
// set it
href = uri.protocol + '//' + uri.host + href;
}
// check that this is a actual url, this should stop
// a few other links like #,javascript:void(0); too
if(href.toLowerCase().indexOf('http://') !== 0 &&
href.toLowerCase().indexOf('https://') !== 0) return;
// check that this is not a js function
if(href.match(/^[A-Za-z]+.*\(/gi) !== null) return;
// now parse the href
var hrefUri = url.parse(href);
// could we parse the href ?
if(!hrefUri) return;
// skip links to current page
if(uri.hostname === hrefUri.hostname &&
uri.pathname.toLowerCase() === hrefUri.pathname.toLowerCase())
return;
// the final ref
var hrefTarget = url.format(hrefUri);
var hrefSlugged = S(hrefTarget).slugify().s;
// sanity check
if(S(hrefTarget || '').isEmpty() === true) return;
// avoid dups
if(testingSlugs.indexOf(hrefSlugged) != -1) return;
// add it
testingSlugs.push(hrefSlugged);
// add the link
testableLinks.push({
url: hrefTarget,
uri: hrefUri,
external: hrefUri.hostname != uri.hostname
});
});
// keep track of url last lines
var urlLastLines = {};
// loop each of the links
async.eachLimit(testableLinks || [], 100, function(link, cb) {
// params for the rule
var params = {
url: link.url,
display: 'url'
};
// get the url
var entryUri = url.parse(link.url);
// get the current line
var build = payload.getSnippetManager().build(lines, urlLastLines[link.url] || -1, entryUri.path);
// if we found it, this is code !
if(build) {
// set as code !!!!!!
params.display = 'code';
params.code = build;
// set the last line
urlLastLines[link.url] = build.subject;
}
// do the requests
payload.doRequest({
url: link.url,
type: 'GET',
session: data.session,
kill: true
}, function(err, response, body) {
// get the status code
var statusCode = parseInt((response || {}).statusCode || '');
// check for timeout
if(err &&
err.code === 'ETIMEDOUT') {
// output error
payload.error('Problem requesting page contents timed out', err);
// set the message
params.identifiers = [ link.url, 10 ]
params.message = 'GET request to $ timed out after $ seconds'
// add each rule
payload.addRule({
type: 'error',
message: 'Broken links in page content',
key: 'link'
}, params);
} else if(err &&
err.code === 'ECONNREFUSED') {
// output error
payload.error('Problem requesting page contents', err);
// set the message
params.identifiers = [ link.url ]
params.message = 'Unable to verify that $ is working with GET request'
// add each rule
payload.addRule({
type: 'error',
message: 'Broken links in page content',
key: 'link'
}, params);
} else if(!err &&
statusCode !== null &&
statusCode !== undefined &&
statusCode !== NaN &&
statusCode !== 405 &&
statusCode !== 403 &&
statusCode > 400 &&
statusCode <= 500) {
// set the message
params.identifiers = [ link.url, statusCode ];
params.message = '$ returned status code of $';
// add each rule
payload.addRule({
type: link.external === true ? 'notice' : 'error',
message: 'Broken links in page content',
key: 'link'
}, params);
}
// done
setImmediate(cb, null);
});
}, function(err) {
// done
setImmediate(fn, null);
});
});
};