squint-cli
Version:
Squint makes visual reviews of web app releases easy
76 lines (75 loc) • 3.2 kB
JavaScript
;
// CommonJS/ES-module interop helper: reuses an existing `__importDefault` found
// on `this` when there is one, otherwise falls back to the local implementation.
// A module flagged with `__esModule` is returned untouched; any other value is
// wrapped so it can be read through `.default`.
var __importDefault = (this && this.__importDefault) || function (imported) {
    if (imported && imported.__esModule) {
        return imported;
    }
    return { "default": imported };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.crawlPaths = void 0;
const url_1 = require("url");
const lodash_1 = __importDefault(require("lodash"));
const p_queue_1 = __importDefault(require("p-queue"));
const chalk_1 = __importDefault(require("chalk"));
/**
 * Rewrites a URL into the canonical form used to compare links.
 *
 * @param {string} url - Absolute URL to normalize.
 * @param {object} opts - Normalization options.
 * @param {boolean} [opts.includeHash] - Keep the `#fragment` when truthy; it is cleared otherwise.
 * @param {boolean} [opts.includeSearchQuery] - Keep the `?query` when truthy; it is cleared otherwise.
 * @param {string} [opts.trailingSlashMode] - `'add'` makes the pathname end with exactly one `/`,
 *   `'remove'` strips every trailing `/`; any other value leaves the pathname alone.
 * @returns {string} The serialized, normalized URL.
 */
function normalizeUrl(url, opts) {
    const parsed = new url_1.URL(url);
    if (!opts.includeHash) {
        parsed.hash = '';
    }
    if (!opts.includeSearchQuery) {
        parsed.search = '';
    }
    switch (opts.trailingSlashMode) {
        case 'add': {
            const withoutSlashes = lodash_1.default.trimEnd(parsed.pathname, '/');
            parsed.pathname = `${withoutSlashes}/`;
            break;
        }
        case 'remove':
            parsed.pathname = lodash_1.default.trimEnd(parsed.pathname, '/');
            break;
        default:
            // Unknown or missing mode: keep the pathname exactly as parsed.
            break;
    }
    return parsed.toString();
}
/**
 * Breadth-first traversal of urls.
 *
 * Each call processes one depth level: every URL in `inputs.urlsToVisit` is
 * opened through `inputs.pagePool`, the `href` of every `<a>` element is
 * collected, and links that have not been seen yet are queued for a recursive
 * call one level deeper.
 *
 * @param {object} inputs - Crawl settings; also passed to `normalizeUrl` as its options.
 * @param {object} inputs.pagePool - Object with `use(callback)`; the callback receives a
 *   page offering `goto(url, options)` and `$$eval(selector, fn)` (presumably a Puppeteer page).
 * @param {Iterable<string>} inputs.urlsToVisit - URLs to open at this level; must provide `forEach`.
 * @param {Function} [inputs.shouldVisit] - Filter called as
 *   `shouldVisit(resolvedHref, { currentUrl, href }, visited)`; defaults to accepting every link.
 * @param {number} [inputs.maxDepth] - Once `memory.depth >= maxDepth` the URLs of the level
 *   are no longer opened. When omitted the comparison is never true, so depth is unlimited.
 * @param {object} [memory] - State shared across recursion levels.
 * @param {Set<string>} [memory.visited] - Every URL discovered so far, seeded with `urlsToVisit`.
 * @param {number} [memory.depth] - Depth of the current level, starting at 0.
 * @returns {Promise<string[]>} Unique `pathname + search + hash` of every URL in `memory.visited`.
 * @throws Rejects with the first error raised while visiting a URL of this level
 *   (navigation failure, `shouldVisit` throwing, ...), after all visits of the level settled.
 */
async function crawlPaths(inputs, memory = {
    visited: new Set(inputs.urlsToVisit),
    depth: 0,
}) {
    const { pagePool, urlsToVisit, shouldVisit = () => true, maxDepth } = inputs;
    // Remember that memory is shared between the tasks that run concurrently
    // with PQueue
    const queue = new p_queue_1.default();
    const newUrlsToVisit = new Set();
    const pendingVisits = [];
    const failures = [];
    // Opens one URL, reads its links and records the ones worth visiting next.
    const visitUrl = async (url) => {
        const hrefs = await pagePool.use(async (page) => {
            await page.goto(url, { waitUntil: 'networkidle2' });
            return page.$$eval('a', (as) => as.map((a) => a.href));
        });
        console.error(chalk_1.default.dim `Visited ${url}`);
        hrefs.forEach((href) => {
            let resolvedHrefParts;
            try {
                resolvedHrefParts = new url_1.URL(href, url);
            }
            catch (error) {
                // A malformed href (e.g. "http://") must not abort the whole page.
                console.error(chalk_1.default.dim `Skipped invalid link ${href} on ${url}`);
                return;
            }
            const resolvedHref = normalizeUrl(resolvedHrefParts.toString(), inputs);
            const isVisitedAlready = memory.visited.has(resolvedHref);
            const isCorrectProtocol = ['http:', 'https:'].includes(resolvedHrefParts.protocol);
            const shouldVisitResult = isCorrectProtocol &&
                !isVisitedAlready &&
                shouldVisit(resolvedHref, { currentUrl: url, href }, memory.visited);
            if (shouldVisitResult) {
                newUrlsToVisit.add(resolvedHref);
                memory.visited.add(resolvedHref);
            }
        });
    };
    // Stop following links if max depth has been reached already
    const maxLinkDepthReached = memory.depth >= maxDepth;
    if (!maxLinkDepthReached) {
        urlsToVisit.forEach((url) => {
            // Keep the promise returned by the queue: dropping it would turn a
            // failed visit into an unhandled rejection instead of a crawl error.
            const visit = queue.add(() => visitUrl(url)).catch((error) => {
                failures.push(error);
            });
            pendingVisits.push(visit);
        });
    }
    // Let every visit of this level settle before reporting a failure, so no
    // task is still using the page pool when the caller sees the rejection.
    await Promise.all(pendingVisits);
    if (failures.length > 0) {
        throw failures[0];
    }
    if (newUrlsToVisit.size > 0) {
        await crawlPaths({ ...inputs, urlsToVisit: newUrlsToVisit }, { ...memory, depth: memory.depth + 1 });
    }
    // The Set removes duplicates that only differed by origin.
    const urlPaths = new Set([...memory.visited].map((url) => {
        const parts = new url_1.URL(url);
        return `${parts.pathname}${parts.search}${parts.hash}`;
    }));
    return [...urlPaths];
}
exports.crawlPaths = crawlPaths;