// @passmarked/seo
// Version:
// Rules related to checking for any SEO issues on the page given
// 422 lines (285 loc) • 10.3 kB
// JavaScript
// load in the required modules
const cheerio = require('cheerio')
const url = require('url')
const S = require('string')
const async = require('async')
const request = require('request')
const _ = require('underscore')
const xml2js = require('xml2js');
// returns the sitemap txt based on the payload
var getSitemapContent = function(payload, options, fn) {
// get the data
var data = payload.getData();
// is sitemap defined ... ?
if(data.testingSitemap) return fn(null, data.testingSitemapResponse, data.testingSitemap);
// do the actual request
payload.doRequest({
type: 'GET',
url: options.url,
session: data.session
}, function(err, response, body) {
// check for a error
if(err) {
// output
payload.error('Problem downloading the website sitemap.xml', err);
// return error
return fn(null, response, '');
}
// check if the response is 200
if((response || {}).statusCode !== 200) {
// debug
payload.debug('The response code from the sitemap.xml given was ' + ((response || {}).statusCode || 200));
// nope out of there
return fn(null, response, '');
}
// done
setImmediate(fn, null, response, body || '');
})
};
// expose the items
module.exports = exports = function(payload, fn) {
// get the data
var data = payload.getData()
// go get the sitemap file
var uri = url.parse(data.url);
// create the actual target
var href = url.format( _.extend({}, uri, {
pathname: '/sitemap.xml',
search: ''
}) );
// list of known properties
const knownProperties = [
'loc',
'lastmod',
'changefreq',
'priority'
];
// list of required properties
const requiredProperties = [
'loc'
];
// the subject address
const subjectAddress = {
key: 'seo',
rule: 'sitemap',
subject: href
};
// check if not checked already for this site
payload.isMentioned(subjectAddress, function(err, isMentioned) {
// check for a error
if(err) {
// output to log
payload.error('Problem checking the sitemap txt', err);
// finish
return fn(null);
}
// is sitemap not empty
if(isMentioned === true) {
// done
return fn(null);
}
// else mention and do the check
payload.mention(subjectAddress, function() {
// get the sitemap from site
getSitemapContent(payload, {
method: 'GET',
type: 'GET',
url: href,
timeout: 5 * 1000
}, function(err, response, sitemapTxt) {
// check for a error
if(err) {
// output to log
payload.error('Problem checking the sitemap txt', err);
// finish
return fn(null);
}
// is sitemap not empty
if(S(sitemapTxt).isEmpty() === true) {
// done
return fn(null);
}
// check if the server handled the error
if((response || {}).statusCode !== 200) {
// all good
return fn(null);
}
// then we check the content of the sitemap
var contentType = S( ((response || {}).headers || {})['content-type'] || '' ).trim().s.toLowerCase();
// split the sitemap lines
var lines = (sitemapTxt || '').split('\n');
// did we find the content type ?
if(contentType.indexOf('application/xml') === -1 &&
contentType.indexOf('text/xml') === -1) {
// bad return format ...
payload.addRule({
type: 'error',
key: 'sitemap.format',
message: 'Defined Sitemap must return XML'
}, {
message: 'The sitemap at $, returned $ instead of XML',
display: 'code',
identifiers: [ href, contentType ],
code: {
text: lines.slice(0, 5),
start: 0,
end: lines.slice(0, 5).length + 1,
subject: 0
}
});
// done
return fn(null);
}
// else we try to parse the response
xml2js.parseString(sitemapTxt, function (err, result) {
// generic details of the rule itself,
// occurrences give more details to the rule itself
const rulePrologMeta = {
key: 'sitemap.prolog',
message: 'Missing or misconfigured prolog defined for sitemap',
type: 'error'
};
// gets the first line of the file
var firstLine = S(S(sitemapTxt).trim().s.toLowerCase().split('\n')[0] || '').trim().s;
// the file must start with the encoding
if( firstLine.startsWith('<?xml') === false ) {
// add the occurence
payload.addRule(rulePrologMeta, {
message: 'Sitemap must start with $ as part of $',
identifiers: [ '<?xml', '<?xml version="1.0" encoding="UTF-8" ?>' ],
file: href,
display: 'code',
code: {
text: lines.slice(0, 10),
start: 0,
end: lines.slice(0, 10).length + 1,
subject: 0
}
});
}
// the file must start with the encoding
if( firstLine.indexOf('version="1.0"') === -1 ) {
// add the occurence
payload.addRule(rulePrologMeta, {
message: 'Sitemap prolog must contain $ somewhere between $ and $ on the first line',
identifiers: [ 'version="1.0"', '<?xml', '?>' ],
file: href,
display: 'code',
code: {
text: lines.slice(0, 10),
start: 0,
end: lines.slice(0, 10).length + 1,
subject: 0
}
});
}
// the file must start with the encoding
if( firstLine.indexOf('encoding="') === -1 ) {
// add the occurence
payload.addRule(rulePrologMeta, {
message: 'Sitemap prolog must contain the encoding (like $) somewhere between $ and $ on the first line',
identifiers: [ 'encoding="UTF-8"', '<?xml', '?>' ],
file: href,
display: 'code',
code: {
text: lines.slice(0, 10),
start: 0,
end: lines.slice(0, 10).length + 1,
subject: 0
}
});
}
// prolog must end with ?>
if( S(firstLine).endsWith('?>') === false ) {
// add the occurence
payload.addRule(rulePrologMeta, {
message: 'Sitemap prolog have $ as a closing tag',
identifiers: [ '?>' ],
file: href,
display: 'code',
code: {
text: lines.slice(0, 10),
start: 0,
end: lines.slice(0, 10).length + 1,
subject: 0
}
});
}
// we got a error O_O, so this was not valid XML ...
if(err) {
// register that we are unable to parse this XML ...
payload.addRule({
type: 'critical',
key: 'sitemap.invalid',
message: 'Unable to parse defined sitemap'
}, {
message: 'Unable to parse the sitemap at $ as XML, indicating invalid XML',
file: href,
display: 'code',
identifiers: [ href, contentType ],
code: {
text: lines.slice(0, 20),
start: 0,
end: lines.slice(0, 20).length + 1,
subject: 0
}
});
// done
return fn(null);
}
// check if blank ... ?
if(result === null ||
result === undefined ||
S(sitemapTxt).isEmpty() === true) {
// add the rule
payload.addRule({
key: 'sitemap.empty',
message: 'Defined sitemap was empty',
type: 'error'
}, {
message: 'Sitemap at $ should have some content',
identifiers: [ '/sitemap.xml' ],
file: href,
display: 'code',
code: {
text: lines.slice(0, 10),
start: 0,
end: lines.slice(0, 10).length + 1,
subject: 0
}
});
// stop here then
return fn(null);
}
// get the keys of the root
var rootKeys = _.keys(result);
// constant meta for the schema output
const ruleSchemaMeta = {
key: 'sitemap.schema',
message: 'Defined sitemap contains incorrect schema',
type: 'error'
};
// there should only be one key ...
if(rootKeys.length !== 1 ||
(rootKeys[0] || '').toLowerCase() != 'urlset') {
// add the rule
payload.addRule(ruleSchemaMeta, {
message: 'Sitemap should contain only one root node of $',
identifiers: [ '<urlset>' ],
file: href,
display: 'code',
code: {
text: lines.slice(0, 10),
start: 0,
end: lines.slice(0, 10).length + 1,
subject: 0
}
});
}
// done
setImmediate(fn, null);
});
});
});
});
};