@sector-labs/seo-slip
Version:
Catches SEO regressions by sampling and checking a website against a set of rules
124 lines (103 loc) • 4.22 kB
JavaScript
const http = require('http');
const https = require('https');
const Crawler = require('simplecrawler');
const {
newEmptyAnalysis,
newEmptyAnalyses,
mergeItemAnalysis,
pushItemAnalysis,
newEmptyItemReport,
newEmptyReport,
mergeItemReport,
pushItemReport,
newEmptyItemResult,
newEmptyResults,
mergeItemResult,
pushItemResult,
} = require('../reporting');
module.exports = (fullPath, maxDepth, variables, checkers, done) => {
const crawler = new Crawler(fullPath);
crawler.needsAuth = variables.needsAuth;
crawler.authUser = variables.authUser;
crawler.authPass = variables.authPass;
crawler.maxConcurrency = variables.maxConcurrency || 3;
crawler.maxDepth = maxDepth || 1;
crawler.respectRobotsTxt = variables.respectRobotsTxt;
crawler.userAgent = variables.userAgent || 'seo-slip';
crawler.scanSubdomains = variables.scanSubdomains;
crawler.domainWhitelist = variables.domainWhitelist;
crawler.decodeResponses = variables.decodeResponses;
crawler.timeout = variables.timeout || 60000;
crawler.customHeaders = variables.customHeaders;
crawler.parseHTMLComments = variables.parseHTMLComments;
crawler.parseScriptTags = variables.parseScriptTags;
crawler.httpAgent = http.Agent({ keepAlive: true });
crawler.httpsAgent = https.Agent({ keepAlive: true });
const skipUrls = (variables.skipUrls || []).map((str) => new RegExp(str));
crawler.addFetchCondition((queueItem, referrerQueueItem, callback) => {
const fetch = !skipUrls.some((skipUrl) => skipUrl.test(queueItem.url));
callback(null, fetch);
});
const header = new Set();
let analyses = newEmptyAnalyses();
let report = newEmptyReport();
let results = newEmptyResults();
let isCrawlCompleted = false;
const fetchCompleted = (queueItem, responseBody, response) => {
const analysis = checkers
.filter((checker) => checker.analysis)
.map((checker) => checker.analysis(queueItem, responseBody, response))
.reduce(mergeItemAnalysis, newEmptyAnalysis());
analyses = pushItemAnalysis(analysis, analyses);
const itemReport = checkers
.filter((checker) => checker.report)
.map((checker) => checker.report(analysis))
.reduce(mergeItemReport, newEmptyItemReport());
report = pushItemReport(itemReport, report);
const itemResult = checkers
.filter((checker) => checker.check)
.map((checker) => checker.check(analysis))
.reduce(mergeItemResult, newEmptyItemResult());
results = pushItemResult(itemResult, results);
Object.keys(itemReport).forEach((key) => header.add(key));
};
const crawlCompleted = () => {
if (isCrawlCompleted) {
// workaround because the events library used by the crawler has a bug
return;
} else {
isCrawlCompleted = true;
}
results = checkers
.filter((checker) => checker.finalCheck)
.map((checker) => checker.finalCheck(analyses, report))
.reduce(pushItemResult, results);
done(results, Array.from(header), report);
};
const fetchStarted = (queueItem) => {
const stop = checkers
.filter((checker) => checker.shouldStop)
.map((checker) => checker.shouldStop(queueItem))
.some((shouldStop) => shouldStop);
if (stop) {
crawler.stop();
crawlCompleted();
}
};
crawler.on('fetchstart', fetchStarted);
crawler.on('fetchcomplete', fetchCompleted);
crawler.on('fetchredirect', fetchCompleted);
crawler.on('fetch404', fetchCompleted);
crawler.on('fetch410', fetchCompleted);
crawler.on('fetcherror', fetchCompleted);
crawler.on('fetchprevented', (queueItem) =>
console.log(`fetch for url=${queueItem.url} prevented by the rules`)
);
crawler.on('complete', crawlCompleted);
Promise.all(
checkers
.map((checker) => checker.init)
.filter((init) => init)
.map((init) => init())
).then(() => crawler.start());
};