@cake-hub/cake-screenshot_diffs
Version:
A CAKE Screenshot diffing tool that includes a setup to compare two given resources by screenshots taken from the available pages.
147 lines (127 loc) • 4.56 kB
JavaScript
const path = require ("path");
const axios = require ("axios");
const { JSDOM } = require("jsdom");
const { default: Queue } = require('p-queue-es5');
/**
 * Crawls a website starting from a base URL and collects every page that is
 * reachable through `<a>` links and accepted by a caller-supplied filter.
 *
 * Pages are fetched with axios, parsed with JSDOM and scheduled through a
 * queue so that the number of concurrently running tasks is bounded by
 * `maxConcurrentConnections`.
 */
class Crawler {

    /**
     * Number of started queue tasks between two progress log messages.
     *
     * @returns {number}
     */
    static get LOG_MESSAGE_INTERVAL () {
        return 50;
    }

    /**
     * @param {string} url - Base URL the crawl starts from. Any fragment or
     *   query string is stripped before it is stored.
     * @param {object} [config]
     * @param {number} [config.maxLinkFollowDepth=Infinity] - Deepest link level
     *   (start page = 0) that is still visited.
     * @param {number} [config.maxConcurrentConnections=Infinity] - Passed to the
     *   queue as its `concurrency` option.
     * @param {string[]} [config.linkFileExtensionsToFollow=[]] - Extensions
     *   (including the dot, as returned by `path.extname`) a URL path must have
     *   to be followed. An empty list disables this check.
     * @param {string[]} [config.assetFileExtensions=[]] - Extensions that are
     *   never followed.
     */
    constructor (url, config = {}) {
        // The former default value (`{ maxLinkFollowDepth, … }`) referenced
        // undeclared identifiers and threw a ReferenceError whenever `config`
        // was omitted. Destructuring defaults only kick in for `undefined`, so
        // explicitly passed values (including `null`) behave as before.
        const {
            maxLinkFollowDepth = Infinity,
            maxConcurrentConnections = Infinity,
            linkFileExtensionsToFollow = [],
            assetFileExtensions = [],
        } = config || {};

        this._url = this._cleanUrl (url);
        this._maxLinkFollowDepth = maxLinkFollowDepth;
        this._maxConcurrentConnections = maxConcurrentConnections;
        this._linkFileExtensionsToFollow = linkFileExtensionsToFollow;
        this._assetFileExtensions = assetFileExtensions;

        // Accepted links in discovery order (returned by getLinks ()).
        this._links = [];
        // Same entries as `_links`, kept for constant-time duplicate checks.
        this._visited = new Set ();

        this._queue = new Queue ({
            concurrency: this._maxConcurrentConnections,
            autoStart: true,
        });
    }

    /**
     * Resolves once the queue reports that it is idle. While waiting, a
     * progress message is logged every `LOG_MESSAGE_INTERVAL` `'active'`
     * events of the queue.
     *
     * @returns {Promise<*>} Whatever `queue.onIdle ()` resolves with.
     */
    async _waitForQueueFinished () {
        let crawledLinks = 0;
        const onActive = () => {
            crawledLinks += 1;
            if (crawledLinks % Crawler.LOG_MESSAGE_INTERVAL !== 0) {
                return;
            }
            // Read-only use of the counter: the previous `++crawledLinks`
            // inside the message incremented it a second time and skewed all
            // following progress messages.
            console.log ("Crawler", `Crawling ${this._url}`, `Crawled ${crawledLinks} of ${crawledLinks + this._queue.size} pages. (queue: ${this._queue.size}, active: ${this._queue.pending})`);
        };

        this._queue.on ('active', onActive);
        try {
            return await this._queue.onIdle ();
        } finally {
            // Detach the listener so repeated getLinks () calls do not stack
            // up one progress logger per call.
            this._queue.removeListener ('active', onActive);
        }
    }

    /**
     * Downloads `uri`, parses the response as a document and returns the
     * resolved `href` of every `<a>` element that points to an http(s) URL.
     * Failures are logged and result in the links collected so far (usually
     * an empty list); this method does not throw.
     *
     * @param {string} uri - Absolute URL of the page to inspect.
     * @returns {Promise<string[]>}
     */
    async _extractLinks (uri) {
        const links = [];
        let window = null;
        try {
            const result = await axios.get (uri);
            if (!result.data || result.status >= 400) {
                return links;
            }
            ({ window } = new JSDOM (result.data, {
                url: uri,
                contentType: result.headers ["content-type"] || "text/html",
            }));
            for (const linkElement of window.document.querySelectorAll ("a")) {
                // Only keep real http(s) targets. The old `indexOf ("http")`
                // test also accepted e.g. `mailto:` links that merely contain
                // "http" somewhere in their query.
                if (!/^https?:\/\//.test (linkElement.href)) {
                    continue;
                }
                links.push (linkElement.href);
            }
        } catch (e) {
            console.error ("Crawler", "_extractLinks ()", "uri", uri, e.message);
        } finally {
            // Release the JSDOM window even if link extraction failed.
            if (window) {
                window.close ();
            }
        }
        return links;
    }

    /**
     * Crawls the site starting at the base URL and returns all accepted links.
     *
     * @param {function(string, string): boolean} [filterMethod] - Called with
     *   `(baseUrl, link)`; a link is only visited when it returns a truthy
     *   value. By default only links starting with the base URL are kept.
     * @returns {Promise<string[]>} Every accepted link with the first
     *   `baseUrl.length` characters removed, in discovery order (the start
     *   page therefore appears as `""`). NOTE(review): with a custom filter
     *   that accepts links outside the base URL the cut-off prefix is not the
     *   base URL — confirm callers expect that.
     */
    async getLinks (filterMethod = (url, link) => link.startsWith (url)) {
        this._links = [];
        this._visited = new Set ();

        // Start link crawling
        this._enqueue (this._url, filterMethod, 0);

        // Wait for all queued tasks to finish
        await this._waitForQueueFinished ();

        return this._links.map ((link) => link.slice (this._url.length));
    }

    /**
     * Schedules `_iterateLinks ()` for `url` on the queue. A failing task is
     * logged instead of surfacing as an unhandled promise rejection.
     *
     * @param {string} url
     * @param {function(string, string): boolean} filterMethod
     * @param {number} depth
     */
    _enqueue (url, filterMethod, depth) {
        const task = this._queue.add (() => this._iterateLinks (url, filterMethod, depth));
        if (task && typeof task.catch === "function") {
            task.catch ((e) => {
                console.error ("Crawler", "_iterateLinks ()", "url", url, e && e.message);
            });
        }
    }

    /**
     * Visits a single URL: validates it, records it and enqueues all links
     * found on the page with an incremented depth.
     *
     * @param {string} url - Candidate URL (cleaned before use).
     * @param {function(string, string): boolean} filterMethod - See getLinks ().
     * @param {number} [currentDepth=0] - Link distance from the start page.
     * @returns {Promise<void>}
     */
    async _iterateLinks (url, filterMethod, currentDepth = 0) {
        // Check depth of site
        if (currentDepth > this._maxLinkFollowDepth) {
            return;
        }

        // Clean url and check against filters, link-list, …
        const cleanedUrl = this._cleanUrl (url);
        if (!cleanedUrl || !filterMethod (this._url, cleanedUrl)) {
            return;
        }
        if (this._visited.has (cleanedUrl) || !this._checkUrl (cleanedUrl)) {
            return;
        }
        // Record the URL before the first await so concurrently running tasks
        // do not crawl the same page twice.
        this._visited.add (cleanedUrl);
        this._links.push (cleanedUrl);

        // Extract links of page and schedule each of them
        const links = await this._extractLinks (cleanedUrl);
        for (const link of links) {
            this._enqueue (link, filterMethod, currentDepth + 1);
        }
    }

    /**
     * Removes the fragment (`#…`) and the query string (`?…`) from a URL.
     *
     * @param {string} url
     * @returns {?string} The shortened URL, or `null` for a falsy input.
     */
    _cleanUrl (url) {
        if (!url) {
            return null;
        }
        let cleaned = url;
        for (const delimiter of ["#", "?"]) {
            const index = cleaned.indexOf (delimiter);
            if (index >= 0) {
                // Cut exactly at the delimiter. The previous `index - 1` also
                // dropped the last character of the actual URL.
                cleaned = cleaned.slice (0, index);
            }
        }
        return cleaned;
    }

    /**
     * Decides whether a URL should be crawled, based on the file extension of
     * its path.
     *
     * @param {string} url - Absolute URL.
     * @returns {boolean} `false` for empty or unparsable URLs, for paths whose
     *   extension is missing from a non-empty `linkFileExtensionsToFollow`
     *   list, and for extensions listed in `assetFileExtensions`; otherwise
     *   `true`. URLs without a path (start page) are always accepted.
     */
    _checkUrl (url) {
        if (!url) {
            return false;
        }

        let pathname;
        try {
            pathname = new URL (url).pathname;
        } catch (e) {
            // Malformed URLs cannot be crawled; reject them instead of throwing.
            return false;
        }

        // No path available (startpage)
        if (!pathname || pathname === "/") {
            return true;
        }

        // Check if the extension fits the configuration
        const extName = path.extname (pathname);
        const followList = this._linkFileExtensionsToFollow;
        if (followList.length > 0 && !followList.includes (extName)) {
            return false;
        }
        return !this._assetFileExtensions.includes (extName);
    }
}
// Expose the Crawler class as this module's export (CommonJS).
module.exports = Crawler;