UNPKG

@cake-hub/cake-screenshot_diffs

Version:

A CAKE Screenshot diffing tool that includes a setup to compare two given resources by screenshots taken from the available pages.

147 lines (127 loc) 4.56 kB
const path = require("path");
const axios = require("axios");
const { JSDOM } = require("jsdom");
const { default: Queue } = require("p-queue-es5");

/**
 * Crawls a website starting at a root URL and collects every reachable link
 * that passes the configured filters. Pages are fetched through a
 * concurrency-limited queue.
 */
class Crawler {
    /** Number of processed queue items between two progress log messages. */
    static get LOG_MESSAGE_INTERVAL() {
        return 50;
    }

    /**
     * @param {string} url - Root URL where crawling starts (fragment and query are stripped).
     * @param {object} [config] - Crawler options.
     * @param {number} [config.maxLinkFollowDepth=Infinity] - Maximum link depth to follow from the root.
     * @param {number} [config.maxConcurrentConnections=10] - Concurrency passed to the queue.
     * @param {string[]} [config.linkFileExtensionsToFollow=[]] - Allowed path extensions (e.g. ".html");
     *     an empty list allows every extension.
     * @param {string[]} [config.assetFileExtensions=[]] - Path extensions that are never followed.
     */
    constructor(url, config = {}) {
        // The previous default value referenced undeclared identifiers and threw a
        // ReferenceError whenever `config` was omitted; use real defaults instead.
        const {
            maxLinkFollowDepth = Infinity,
            maxConcurrentConnections = 10,
            linkFileExtensionsToFollow = [],
            assetFileExtensions = [],
        } = config;

        this._url = this._cleanUrl(url);
        this._maxLinkFollowDepth = maxLinkFollowDepth;
        this._maxConcurrentConnections = maxConcurrentConnections;
        this._linkFileExtensionsToFollow = linkFileExtensionsToFollow;
        this._assetFileExtensions = assetFileExtensions;

        this._links = [];
        this._queue = new Queue({
            concurrency: this._maxConcurrentConnections,
            autoStart: true,
        });
    }

    /**
     * Resolves once the queue is idle. While waiting, logs a progress message
     * every LOG_MESSAGE_INTERVAL processed queue items.
     *
     * @returns {Promise<void>}
     */
    async _waitForQueueFinished() {
        let crawledLinks = 0;

        const logProgress = () => {
            crawledLinks += 1;
            if (crawledLinks % Crawler.LOG_MESSAGE_INTERVAL !== 0) {
                return;
            }
            // Do not increment the counter again inside the message; that skewed
            // both the reported number and the logging interval.
            console.log(
                "Crawler",
                `Crawling ${this._url}`,
                `Crawled ${crawledLinks} of ${crawledLinks + this._queue.size} pages. (queue: ${this._queue.size}, active: ${this._queue.pending})`
            );
        };

        this._queue.on("active", logProgress);
        try {
            return await this._queue.onIdle();
        } finally {
            // Detach the listener so repeated getLinks () calls do not stack up
            // listeners and emit duplicate progress messages.
            this._queue.removeListener("active", logProgress);
        }
    }

    /**
     * Downloads a page and returns the `href` of every anchor element whose
     * resolved URL contains "http". Failures are logged and yield the links
     * collected so far (possibly none).
     *
     * @param {string} uri - Page to download.
     * @returns {Promise<string[]>} Resolved link targets found on the page.
     */
    async _extractLinks(uri) {
        const links = [];
        let window = null;

        try {
            const result = await axios.get(uri);
            if (!result.data || result.status >= 400) {
                return links;
            }

            ({ window } = new JSDOM(result.data, {
                url: uri,
                contentType: result.headers["content-type"] || "text/html",
            }));

            for (const linkElement of window.document.querySelectorAll("a")) {
                if (!linkElement.href.includes("http")) {
                    continue;
                }
                links.push(linkElement.href);
            }
        } catch (e) {
            console.error("Crawler", "_extractLinks ()", "uri", uri, e.message);
        } finally {
            // Always release the JSDOM window, even when parsing failed half-way.
            if (window) {
                window.close();
            }
        }

        return links;
    }

    /**
     * Crawls the site and returns every collected link relative to the root URL.
     *
     * @param {function(string, string): boolean} [filterMethod] - Receives the root URL and a
     *     candidate link; return true to keep and follow the link. Defaults to keeping links
     *     that start with the root URL.
     * @returns {Promise<string[]>} Collected links with the root URL prefix cut off.
     */
    async getLinks(filterMethod = (url, link) => link.startsWith(url)) {
        this._links = [];

        // Start link crawling
        this._enqueue(this._url, filterMethod, 0);

        // Wait for all queued crawl tasks to be settled
        await this._waitForQueueFinished();

        return this._links.map((link) => link.slice(this._url.length));
    }

    /**
     * Processes a single URL: validates it, records it and queues every link
     * found on the page for the next depth level.
     *
     * @param {string} url - Candidate URL.
     * @param {function(string, string): boolean} filterMethod - See getLinks ().
     * @param {number} [currentDepth=0] - Link depth of `url` relative to the root.
     * @returns {Promise<void>}
     */
    async _iterateLinks(url, filterMethod, currentDepth = 0) {
        // Check depth of site
        if (currentDepth > this._maxLinkFollowDepth) {
            return;
        }

        // Clean url and check against filters, link-list, …
        const cleanedUrl = this._cleanUrl(url);
        if (!cleanedUrl || !filterMethod(this._url, cleanedUrl)) {
            return;
        }
        if (this._links.includes(cleanedUrl) || !this._checkUrl(cleanedUrl)) {
            return;
        }
        this._links.push(cleanedUrl);

        // Extract links of page and queue each of them for recursive crawling
        const links = await this._extractLinks(cleanedUrl);
        for (const link of links) {
            this._enqueue(link, filterMethod, currentDepth + 1);
        }
    }

    /**
     * Adds a crawl task for `url` to the queue. Errors of the task are logged
     * instead of being left as unhandled promise rejections.
     *
     * @param {string} url - URL to crawl.
     * @param {function(string, string): boolean} filterMethod - See getLinks ().
     * @param {number} depth - Link depth of `url`.
     */
    _enqueue(url, filterMethod, depth) {
        this._queue
            .add(() => this._iterateLinks(url, filterMethod, depth))
            .catch((e) => {
                console.error("Crawler", "_iterateLinks ()", "url", url, e.message);
            });
    }

    /**
     * Strips the fragment ("#…") and the query string ("?…") from a URL.
     *
     * @param {string} url - URL to clean.
     * @returns {string|null} Cleaned URL, or null for an empty/missing input.
     */
    _cleanUrl(url) {
        if (!url) {
            return null;
        }

        let cleaned = url;
        // Cut exactly at the delimiter; cutting one character earlier also
        // removed the last real character of the URL.
        for (const delimiter of ["#", "?"]) {
            const position = cleaned.indexOf(delimiter);
            if (position >= 0) {
                cleaned = cleaned.slice(0, position);
            }
        }
        return cleaned;
    }

    /**
     * Decides whether a URL may be followed, based on the extension of its path.
     *
     * @param {string} url - Absolute URL to check.
     * @returns {boolean} True when the URL should be crawled.
     */
    _checkUrl(url) {
        if (!url) {
            return false;
        }

        let urlObj;
        try {
            urlObj = new URL(url);
        } catch (e) {
            // Not a parseable absolute URL, so it cannot be crawled.
            return false;
        }

        // No path available (startpage)
        if (!urlObj.pathname || urlObj.pathname === "/") {
            return true;
        }

        // Check if the extension fits the configuration
        const extName = path.extname(urlObj.pathname);
        if (
            this._linkFileExtensionsToFollow.length > 0 &&
            !this._linkFileExtensionsToFollow.includes(extName)
        ) {
            return false;
        }
        if (this._assetFileExtensions.includes(extName)) {
            return false;
        }
        return true;
    }
}

module.exports = Crawler;