/*
 * @qualweb/crawler — Crawler.object.js
 * Compiled CommonJS bundle as served via UNPKG (original file: 345 lines, 14.4 kB).
 */
"use strict";
// tsc-emitted interop helper: wraps a CommonJS export so it can be consumed
// as if it had an ES-module default export.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.Crawler = void 0;
const log_update_1 = __importDefault(require("log-update"));
/**
 * Breadth-first website crawler driven by a Puppeteer-compatible browser.
 *
 * Starting from a seed URL it repeatedly opens pages, harvests `<a href>`
 * links in the page context, resolves relative paths by actually navigating
 * to them, and accumulates every discovered in-scope URL until a depth,
 * URL-count, or timeout limit is reached. Results are read via getResults().
 */
class Crawler {
    browser;     // Puppeteer Browser used to open pages
    viewport;    // optional viewport applied to every opened page
    startingUrl; // normalized seed URL (origin + pathname, trailing slash)
    isDomain;    // true when the seed URL is a domain root ('/')
    waitUntil;   // Puppeteer navigation lifecycle event to await
    urls;        // final list of crawled URLs, filled by crawl()
    /**
     * @param browser Puppeteer-compatible Browser instance.
     * @param startingUrl Seed URL; normalized (query/hash dropped, trailing slash added).
     * @param viewport Optional Puppeteer viewport for every page.
     * @param waitUntil Navigation event to wait for; defaults to 'domcontentloaded'.
     */
    constructor(browser, startingUrl, viewport, waitUntil) {
        this.browser = browser;
        this.startingUrl = this.verifyStartingUrl(startingUrl);
        this.isDomain = this.isStaringUrlADomain(startingUrl);
        this.viewport = viewport;
        this.waitUntil = waitUntil ?? 'domcontentloaded';
        this.urls = new Array();
    }
    // Normalizes the seed URL to `origin + pathname` (dropping query and hash)
    // and guarantees a trailing slash so later prefix comparisons line up.
    verifyStartingUrl(startingUrl) {
        const url = new URL(decodeURIComponent(startingUrl));
        const newStartingUrl = url.origin + url.pathname;
        if (!newStartingUrl.endsWith('/')) {
            return newStartingUrl + '/';
        }
        else {
            return newStartingUrl;
        }
    }
    // True when the seed URL points at a domain root (pathname === '/').
    // NOTE(review): "Staring" is a typo for "Starting", but the method is part
    // of the class's public surface, so the name is kept for compatibility.
    isStaringUrlADomain(startingUrl) {
        const url = new URL(startingUrl);
        return url.pathname === '/';
    }
    /**
     * Runs the crawl until maxDepth/maxUrls/timeout (whichever comes first)
     * or until a depth yields no new URLs.
     *
     * @param options Optional settings:
     *   - maxDepth: stop after this many link levels (-1 = unlimited).
     *   - maxUrls: stop once this many URLs are known (-1 = unlimited).
     *   - maxParallelCrawls: pages fetched concurrently per batch (default 5).
     *   - timeout: wall-clock limit in seconds (-1 = none).
     *   - logging: live progress output via log-update.
     */
    async crawl(options) {
        const maxDepth = options?.maxDepth ?? -1;
        const maxUrls = options?.maxUrls ?? -1;
        // `||` (not `??`) is deliberate upstream behavior: 0 also falls back to 5.
        const parallel = options?.maxParallelCrawls || 5;
        const timeout = options?.timeout ?? -1;
        let currentDepth = 0;
        let currentUrlCount = 1;
        let continueCrawling = true;
        let surpassedMax = false;
        let timer = 0;
        // Coarse 2-second ticker used purely for progress display.
        const timerHandle = setInterval(() => {
            timer += 2;
            if (options?.logging) {
                this.log(currentDepth, currentUrlCount, timer);
            }
        }, 2000);
        let timeoutHandle = null;
        let timeoutReached = false;
        if (timeout > 0) {
            timeoutHandle = setTimeout(() => (timeoutReached = true), timeout * 1000);
        }
        if (options?.logging) {
            this.log(currentDepth, currentUrlCount, timer);
        }
        const urlsByDepth = {};   // depth -> URLs discovered at that depth
        const urlsCrawled = {};   // URL -> true once fetched, false when only discovered
        urlsCrawled[this.startingUrl] = true;
        // Depth 0: links on the seed page plus resolved relative paths.
        const [firstPageUrls, relativePathsToTest] = await this.fetchPageLinks(this.startingUrl);
        urlsByDepth[currentDepth] = [...firstPageUrls];
        const newUrls = this.normalizeAndSort(await this.checkRelativePathsUrls(relativePathsToTest));
        urlsByDepth[currentDepth] = [...urlsByDepth[currentDepth], ...newUrls];
        this.addUrlsToCrawl(urlsCrawled, firstPageUrls);
        this.addUrlsToCrawl(urlsCrawled, newUrls);
        currentUrlCount += firstPageUrls.length + newUrls.length;
        if (options?.logging) {
            this.log(currentDepth, currentUrlCount, timer);
        }
        if (maxUrls >= 0 && currentUrlCount >= maxUrls) {
            surpassedMax = true;
        }
        while (currentDepth !== maxDepth && currentUrlCount !== maxUrls && continueCrawling) {
            currentDepth++;
            let depthCompleted = false;
            if (options?.logging) {
                this.log(currentDepth, currentUrlCount, timer);
            }
            // Accumulate this depth's discoveries batch by batch.
            urlsByDepth[currentDepth] = new Array();
            while (!depthCompleted) {
                // FIX: `promises` was previously declared once per depth, so every
                // batch re-awaited and fully re-processed all earlier batches'
                // results — including re-running checkRelativePathsUrls, which
                // re-opened browser pages for every relative path on every batch.
                // Scoping the batch's promises here removes that quadratic rework;
                // the final URL set is unchanged (urlsCrawled dedupes by key).
                const promises = new Array();
                const letsCrawl = new Array();
                let count = 0;
                // Pick up to `parallel` not-yet-fetched URLs from the previous depth.
                for (const url of urlsByDepth[currentDepth - 1] ?? []) {
                    if (!urlsCrawled[url]) {
                        urlsCrawled[url] = true;
                        letsCrawl.push(url);
                        count++;
                    }
                    if (count === parallel) {
                        break;
                    }
                }
                // Fewer than a full batch left => previous depth is exhausted.
                if (count < parallel) {
                    depthCompleted = true;
                }
                for (const url of letsCrawl ?? []) {
                    promises.push(this.fetchPageLinks(url));
                }
                const listUrls = await Promise.all(promises);
                for (const [urls, relativePaths] of listUrls ?? []) {
                    urlsByDepth[currentDepth] = [...urlsByDepth[currentDepth], ...urls];
                    const newUrls = this.normalizeAndSort(await this.checkRelativePathsUrls(relativePaths));
                    urlsByDepth[currentDepth] = [...urlsByDepth[currentDepth], ...newUrls];
                    this.addUrlsToCrawl(urlsCrawled, urls);
                    this.addUrlsToCrawl(urlsCrawled, newUrls);
                    currentUrlCount = Object.keys(urlsCrawled).length;
                    if (options?.logging) {
                        this.log(currentDepth, currentUrlCount, timer);
                    }
                    if (maxUrls >= 0 && currentUrlCount >= maxUrls) {
                        surpassedMax = true;
                        depthCompleted = true;
                        continueCrawling = false;
                        break;
                    }
                }
                if (timeoutReached) {
                    continueCrawling = false;
                    break;
                }
            }
            // A depth that produced nothing new ends the crawl.
            if (!urlsByDepth[currentDepth]?.length) {
                continueCrawling = false;
            }
        }
        if (timeoutHandle) {
            clearTimeout(timeoutHandle);
        }
        clearInterval(timerHandle);
        if (surpassedMax) {
            this.urls = Object.keys(urlsCrawled).slice(0, maxUrls);
        }
        else {
            this.urls = Object.keys(urlsCrawled);
        }
    }
    // Live progress line(s) rendered in place via log-update.
    log(currentDepth, currentUrlCount, timer) {
        (0, log_update_1.default)(`Starting url: ${this.startingUrl}
Current depth: ${currentDepth}
Urls found: ${currentUrlCount}
Time passed: ${timer} seconds`);
    }
    // Registers URLs as discovered (value false) without overwriting ones
    // already marked as fetched (value true).
    addUrlsToCrawl(urlsCrawled, urls) {
        for (const url of urls ?? []) {
            if (!urlsCrawled[url]) {
                urlsCrawled[url] = false;
            }
        }
    }
    /**
     * Opens `url` in a new page and extracts candidate links in page context.
     *
     * @returns `[[], merged]` where `merged` is the relative paths to probe
     *   followed by the normalized absolute URLs (upstream shape preserved:
     *   the first element is always an empty array).
     */
    async fetchPageLinks(url) {
        let urls = new Array();
        let relativePathsToTest = new Array();
        let page = null;
        try {
            page = await this.browser.newPage();
            if (this.viewport) {
                await page.setViewport(this.viewport);
            }
            await page.goto(url, { waitUntil: this.waitUntil });
            // This callback is serialized and runs inside the browser page,
            // so it may only use DOM APIs and its two arguments.
            [urls, relativePathsToTest] = await page.evaluate((startingUrl, isDomain) => {
                // Trims the last path segment unless the URL already ends in '/'.
                function getUrlWithoutExtension(url) {
                    if (!url.endsWith('/')) {
                        const parts = url.split('/');
                        parts.pop();
                        return parts.join('/') + '/';
                    }
                    else {
                        return url;
                    }
                }
                // Suffixes treated as non-HTML resources and skipped.
                // NOTE(review): matched without a leading dot, so e.g. 'js'
                // also rejects any href merely ending in "js" — upstream quirk kept.
                const notHtml = 'css|jpg|jpeg|gif|svg|pdf|docx|js|png|ico|xml|mp4|mp3|mkv|wav|rss|json|pptx|txt'.split('|');
                const links = document.querySelectorAll('body a');
                const urls = new Array();
                const relativePathsToTest = new Array();
                links.forEach((link) => {
                    if (link.hasAttribute('href')) {
                        let href = link.getAttribute('href')?.trim();
                        // Protocol-relative link: assume https.
                        if (href?.startsWith('//'))
                            href = href.replace('//', 'https://');
                        // Sub-path crawl: collect relative links for live probing.
                        if (href && !isDomain && !href.startsWith('http') && !href.startsWith('#') && !href.includes('javascript:') && !href.includes('tel:') && !href.includes('mailto:')) {
                            let valid = true;
                            for (const not of notHtml || []) {
                                if (href.endsWith(not)) {
                                    valid = false;
                                    break;
                                }
                                const parts = href.split('/');
                                if (parts.length > 0) {
                                    const lastPart = parts[parts.length - 1];
                                    if (lastPart.startsWith('#')) {
                                        valid = false;
                                        break;
                                    }
                                }
                            }
                            if (valid) {
                                if (href.startsWith('/')) {
                                    const url = new URL(window.location.href);
                                    relativePathsToTest.push(url.origin + href);
                                }
                                else {
                                    relativePathsToTest.push(getUrlWithoutExtension(window.location.href) + href);
                                }
                            }
                        }
                        // Domain-root crawl: resolve links against the seed URL directly.
                        if (href && isDomain && (href.startsWith(startingUrl) || href.startsWith('/') || href.startsWith('./') || (!href.startsWith('http') && !href.startsWith('#'))) && !href.includes('javascript:') && !href.includes('tel:') && !href.includes('mailto:')) {
                            let valid = true;
                            for (const not of notHtml || []) {
                                if (href.endsWith(not)) {
                                    valid = false;
                                    break;
                                }
                                const parts = href.split('/');
                                if (parts.length > 0) {
                                    const lastPart = parts[parts.length - 1];
                                    if (lastPart.startsWith('#')) {
                                        valid = false;
                                        break;
                                    }
                                }
                            }
                            if (valid) {
                                try {
                                    let correctUrl = '';
                                    if (href.startsWith(startingUrl)) {
                                        correctUrl = href;
                                    }
                                    else if (href.startsWith('./')) {
                                        correctUrl = startingUrl + href.slice(2);
                                    }
                                    else if (href.startsWith('/')) {
                                        correctUrl = startingUrl + href.slice(1);
                                    }
                                    else {
                                        correctUrl = startingUrl + href;
                                    }
                                    // Drop fragment-bearing URLs (same page, different anchor).
                                    const parsedUrl = new URL(correctUrl);
                                    if (parsedUrl.hash.trim() === '') {
                                        urls.push(correctUrl);
                                    }
                                }
                                catch (err) {
                                    console.error(err);
                                }
                            }
                        }
                    }
                });
                return [urls, relativePathsToTest];
            }, this.startingUrl, this.isDomain);
        }
        catch (err) {
            console.error(err);
        }
        finally {
            // FIX: the page was previously never closed here, leaking one
            // browser tab per crawled URL for the lifetime of the crawl.
            if (page) {
                try {
                    await page.close();
                }
                catch (err) {
                    console.error(err);
                }
            }
        }
        return [[], [...relativePathsToTest, ...this.normalizeAndSort(urls)]];
    }
    /**
     * Navigates to each candidate relative URL and keeps the ones whose final
     * location (after redirects) is still under the starting URL.
     */
    async checkRelativePathsUrls(urls) {
        const newUrlsToValidate = new Array();
        await Promise.all(urls.map(async (url) => {
            let page = null;
            try {
                page = await this.browser.newPage();
                if (this.viewport) {
                    await page.setViewport(this.viewport);
                }
                await page.goto(url, { waitUntil: this.waitUntil });
                const newUrl = await page.evaluate((startingUrl) => {
                    function getUrlWithoutExtension(url) {
                        if (!url.endsWith('/')) {
                            const parts = url.split('/');
                            parts.pop();
                            return parts.join('/') + '/';
                        }
                        else {
                            return url;
                        }
                    }
                    if (window.location.href.startsWith(getUrlWithoutExtension(startingUrl))) {
                        return window.location.href;
                    }
                    else {
                        return null;
                    }
                }, this.startingUrl);
                if (newUrl !== null) {
                    newUrlsToValidate.push(newUrl);
                }
            }
            catch (err) {
                console.error(err);
            }
            finally {
                // FIX: close() used to run only on the success path, leaking the
                // page whenever navigation or evaluation threw.
                if (page) {
                    try {
                        await page.close();
                    }
                    catch (err) {
                        console.error(err);
                    }
                }
            }
        }));
        return newUrlsToValidate;
    }
    // Strips the last '#fragment', prefixes the starting URL onto bare paths,
    // percent-decodes, dedupes, and returns the URLs sorted lexicographically.
    normalizeAndSort(urls) {
        const normalizedUrls = urls.map((u) => {
            if (u.includes('#')) {
                const parts = u.split('#');
                parts.pop();
                u = parts.join('#');
            }
            if (u.startsWith(this.startingUrl)) {
                return u.trim();
            }
            else {
                return (this.startingUrl + u).trim();
            }
        });
        const unique = [...new Set(normalizedUrls)]
            .map((u) => {
            try {
                return decodeURIComponent(u);
            }
            catch (err) {
                // Malformed percent-encoding: drop the URL rather than crash.
                return null;
            }
        })
            .filter((u) => u !== null);
        return unique.sort();
    }
    // Returns the URLs collected by the last crawl() run.
    getResults() {
        return this.urls;
    }
}
exports.Crawler = Crawler;
//# sourceMappingURL=Crawler.object.js.map