// @qualweb/crawler
// Webpage crawler for QualWeb (compiled JavaScript output)
"use strict";
// TypeScript-generated interop helper: wraps a CommonJS export so that
// `.default` access works whether or not the required module is an ES module.
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.Crawler = void 0;
// log-update rewrites a single terminal line in place; used for crawl progress output.
const log_update_1 = __importDefault(require("log-update"));
/**
 * Breadth-first webpage crawler. Starting from a seed URL it opens pages in a
 * (presumably Puppeteer-compatible — TODO confirm) browser, extracts same-site
 * anchor links, and collects them until an optional depth/count/time limit is hit.
 */
class Crawler {
    browser;      // browser instance providing newPage() (assumed Puppeteer-like)
    viewport;     // optional viewport applied to every opened page
    startingUrl;  // normalized seed: origin + pathname, always with trailing '/'
    isDomain;     // true when the seed's pathname is exactly '/'
    waitUntil;    // page.goto lifecycle event to wait for
    urls;         // final crawl result, filled by crawl()
    /**
     * @param browser - browser used to open pages (must expose newPage()).
     * @param startingUrl - seed URL; normalized via verifyStartingUrl().
     * @param viewport - optional viewport object passed to page.setViewport().
     * @param waitUntil - goto waitUntil event; defaults to 'domcontentloaded'.
     */
    constructor(browser, startingUrl, viewport, waitUntil) {
        this.browser = browser;
        this.startingUrl = this.verifyStartingUrl(startingUrl);
        this.isDomain = this.isStaringUrlADomain(startingUrl);
        this.viewport = viewport;
        this.waitUntil = waitUntil ?? 'domcontentloaded';
        this.urls = new Array();
    }
    /**
     * Normalizes the seed URL: decodes percent-encoding, strips query/fragment
     * (keeps origin + pathname only) and guarantees a trailing slash.
     * @throws {TypeError} when the input is not a parseable URL.
     */
    verifyStartingUrl(startingUrl) {
        const url = new URL(decodeURIComponent(startingUrl));
        const normalized = url.origin + url.pathname;
        return normalized.endsWith('/') ? normalized : normalized + '/';
    }
    /**
     * True when the seed points at a site root (pathname === '/').
     * NOTE: method name keeps the original "Staring" typo — it is public API.
     */
    isStaringUrlADomain(startingUrl) {
        return new URL(startingUrl).pathname === '/';
    }
    /**
     * Runs the crawl. Results are retrieved afterwards via getResults().
     * @param options.maxDepth           maximum link depth, -1 = unlimited (default).
     * @param options.maxUrls            maximum urls to collect, -1 = unlimited (default).
     * @param options.maxParallelCrawls  pages fetched concurrently (default 5).
     * @param options.timeout            wall-clock limit in seconds, -1 = none (default).
     * @param options.logging            when true, progress is printed via log-update.
     */
    async crawl(options) {
        const maxDepth = options?.maxDepth ?? -1;
        const maxUrls = options?.maxUrls ?? -1;
        // `||` (not `??`) is deliberate: a parallelism of 0 would dead-lock the batch loop.
        const parallel = options?.maxParallelCrawls || 5;
        const timeout = options?.timeout ?? -1;
        let currentDepth = 0;
        let currentUrlCount = 1;
        let continueCrawling = true;
        let surpassedMax = false;
        let timer = 0;
        // Progress ticker: advances the elapsed-seconds counter every 2s.
        const timerHandle = setInterval(() => {
            timer += 2;
            if (options?.logging) {
                this.log(currentDepth, currentUrlCount, timer);
            }
        }, 2000);
        let timeoutHandle = null;
        let timeoutReached = false;
        if (timeout > 0) {
            timeoutHandle = setTimeout(() => (timeoutReached = true), timeout * 1000);
        }
        if (options?.logging) {
            this.log(currentDepth, currentUrlCount, timer);
        }
        const urlsByDepth = {};
        // Value true = fetched (or the seed); false = discovered, not yet fetched.
        const urlsCrawled = {};
        urlsCrawled[this.startingUrl] = true;
        // Seed page: fetchPageLinks funnels everything through the relative-path
        // slot, so firstPageUrls is always empty (see note in fetchPageLinks).
        const [firstPageUrls, relativePathsToTest] = await this.fetchPageLinks(this.startingUrl);
        urlsByDepth[currentDepth] = [...firstPageUrls];
        const seedUrls = this.normalizeAndSort(await this.checkRelativePathsUrls(relativePathsToTest));
        urlsByDepth[currentDepth] = [...urlsByDepth[currentDepth], ...seedUrls];
        this.addUrlsToCrawl(urlsCrawled, firstPageUrls);
        this.addUrlsToCrawl(urlsCrawled, seedUrls);
        currentUrlCount += firstPageUrls.length + seedUrls.length;
        if (options?.logging) {
            this.log(currentDepth, currentUrlCount, timer);
        }
        if (maxUrls >= 0 && currentUrlCount >= maxUrls) {
            surpassedMax = true;
        }
        while (currentDepth !== maxDepth && currentUrlCount !== maxUrls && continueCrawling) {
            currentDepth++;
            let depthCompleted = false;
            if (options?.logging) {
                this.log(currentDepth, currentUrlCount, timer);
            }
            // Collect this depth's discoveries once, appending batch by batch.
            urlsByDepth[currentDepth] = new Array();
            while (!depthCompleted) {
                // BUGFIX: the promise list is reset for every batch. The original
                // accumulated promises across batches, so each Promise.all
                // re-processed every earlier batch's results — duplicate page
                // visits in checkRelativePathsUrls and quadratic work. Final
                // results are unchanged (urlsCrawled is a de-duplicating map).
                const promises = new Array();
                const letsCrawl = new Array();
                let count = 0;
                for (const url of urlsByDepth[currentDepth - 1] ?? []) {
                    if (!urlsCrawled[url]) {
                        urlsCrawled[url] = true;
                        letsCrawl.push(url);
                        count++;
                    }
                    if (count === parallel) {
                        break;
                    }
                }
                // A partial batch means the previous depth is exhausted.
                if (count < parallel) {
                    depthCompleted = true;
                }
                for (const url of letsCrawl) {
                    promises.push(this.fetchPageLinks(url));
                }
                const listUrls = await Promise.all(promises);
                for (const [urls, relativePaths] of listUrls) {
                    urlsByDepth[currentDepth] = [...urlsByDepth[currentDepth], ...urls];
                    const discovered = this.normalizeAndSort(await this.checkRelativePathsUrls(relativePaths));
                    urlsByDepth[currentDepth] = [...urlsByDepth[currentDepth], ...discovered];
                    this.addUrlsToCrawl(urlsCrawled, urls);
                    this.addUrlsToCrawl(urlsCrawled, discovered);
                    currentUrlCount = Object.keys(urlsCrawled).length;
                    if (options?.logging) {
                        this.log(currentDepth, currentUrlCount, timer);
                    }
                    if (maxUrls >= 0 && currentUrlCount >= maxUrls) {
                        surpassedMax = true;
                        depthCompleted = true;
                        continueCrawling = false;
                        break;
                    }
                }
                if (timeoutReached) {
                    continueCrawling = false;
                    break;
                }
            }
            // Nothing new discovered at this depth: the crawl is done.
            if (!urlsByDepth[currentDepth]?.length) {
                continueCrawling = false;
            }
        }
        if (timeoutHandle) {
            clearTimeout(timeoutHandle);
        }
        clearInterval(timerHandle);
        this.urls = surpassedMax
            ? Object.keys(urlsCrawled).slice(0, maxUrls)
            : Object.keys(urlsCrawled);
    }
    // Single-line progress output (log-update rewrites the line in place).
    log(currentDepth, currentUrlCount, timer) {
        (0, log_update_1.default)(`Starting url: ${this.startingUrl} Current depth: ${currentDepth} Urls found: ${currentUrlCount} Time passed: ${timer} seconds`);
    }
    // Registers urls as "discovered but not yet fetched" without clobbering
    // entries already marked true (fetched).
    addUrlsToCrawl(urlsCrawled, urls) {
        for (const url of urls ?? []) {
            if (!urlsCrawled[url]) {
                urlsCrawled[url] = false;
            }
        }
    }
    /**
     * Opens `url` in a fresh page and harvests candidate links from its anchors.
     * Returns [urls, relativePathsToTest]; by design the first slot is always
     * empty — every candidate is routed through the relative-path slot so that
     * checkRelativePathsUrls() validates it by actually visiting it.
     */
    async fetchPageLinks(url) {
        let urls = new Array();
        let relativePathsToTest = new Array();
        let page = null;
        try {
            page = await this.browser.newPage();
            if (this.viewport) {
                await page.setViewport(this.viewport);
            }
            await page.goto(url, {
                waitUntil: this.waitUntil
            });
            // Runs inside the page; helpers must stay within this callback.
            [urls, relativePathsToTest] = await page.evaluate((startingUrl, isDomain) => {
                // Drops the last path segment unless the url already ends with '/'.
                function getUrlWithoutExtension(url) {
                    if (!url.endsWith('/')) {
                        const parts = url.split('/');
                        parts.pop();
                        return parts.join('/') + '/';
                    }
                    return url;
                }
                // NOTE(review): extensions are matched with endsWith(ext), not
                // endsWith('.' + ext), so e.g. a path ending in "blogrss" is
                // also skipped — confirm before tightening.
                const notHtml = 'css|jpg|jpeg|gif|svg|pdf|docx|js|png|ico|xml|mp4|mp3|mkv|wav|rss|json|pptx|txt'.split('|');
                // Rejects non-page extensions and fragment-only last segments.
                function looksLikePage(href) {
                    for (const not of notHtml) {
                        if (href.endsWith(not)) {
                            return false;
                        }
                        const parts = href.split('/');
                        if (parts.length > 0 && parts[parts.length - 1].startsWith('#')) {
                            return false;
                        }
                    }
                    return true;
                }
                const links = document.querySelectorAll('body a');
                const urls = new Array();
                const relativePathsToTest = new Array();
                links.forEach((link) => {
                    if (!link.hasAttribute('href')) {
                        return;
                    }
                    let href = link.getAttribute('href')?.trim();
                    // Protocol-relative links are forced to https.
                    if (href?.startsWith('//'))
                        href = href.replace('//', 'https://');
                    if (href &&
                        !isDomain &&
                        !href.startsWith('http') &&
                        !href.startsWith('#') &&
                        !href.includes('javascript:') &&
                        !href.includes('tel:') &&
                        !href.includes('mailto:')) {
                        // Sub-path crawl: relative links are resolved and queued
                        // for live validation against the starting url.
                        if (looksLikePage(href)) {
                            if (href.startsWith('/')) {
                                const url = new URL(window.location.href);
                                relativePathsToTest.push(url.origin + href);
                            }
                            else {
                                relativePathsToTest.push(getUrlWithoutExtension(window.location.href) + href);
                            }
                        }
                    }
                    if (href &&
                        isDomain &&
                        (href.startsWith(startingUrl) ||
                            href.startsWith('/') ||
                            href.startsWith('./') ||
                            (!href.startsWith('http') && !href.startsWith('#'))) &&
                        !href.includes('javascript:') &&
                        !href.includes('tel:') &&
                        !href.includes('mailto:')) {
                        // Domain-root crawl: absolute and relative links are
                        // rebuilt against the starting url; fragments dropped.
                        if (looksLikePage(href)) {
                            try {
                                let correctUrl = '';
                                if (href.startsWith(startingUrl)) {
                                    correctUrl = href;
                                }
                                else if (href.startsWith('./')) {
                                    correctUrl = startingUrl + href.slice(2);
                                }
                                else if (href.startsWith('/')) {
                                    correctUrl = startingUrl + href.slice(1);
                                }
                                else {
                                    correctUrl = startingUrl + href;
                                }
                                const parsedUrl = new URL(correctUrl);
                                if (parsedUrl.hash.trim() === '') {
                                    urls.push(correctUrl);
                                }
                            }
                            catch (err) {
                                console.error(err);
                            }
                        }
                    }
                });
                return [urls, relativePathsToTest];
            }, this.startingUrl, this.isDomain);
        }
        catch (err) {
            console.error(err);
        }
        finally {
            // BUGFIX: the original never closed the page — one leaked page (and
            // its browser resources) per crawled url.
            try {
                await page?.close();
            }
            catch {
                // best-effort cleanup; the page may already be gone
            }
        }
        return [[], [...relativePathsToTest, ...this.normalizeAndSort(urls)]];
    }
    /**
     * Visits each candidate url and keeps it only when the page it actually
     * lands on (after redirects) still lives under the starting url.
     */
    async checkRelativePathsUrls(urls) {
        const newUrlsToValidate = new Array();
        await Promise.all(urls.map(async (url) => {
            let page = null;
            try {
                page = await this.browser.newPage();
                if (this.viewport) {
                    await page.setViewport(this.viewport);
                }
                await page.goto(url, {
                    waitUntil: this.waitUntil
                });
                const newUrl = await page.evaluate((startingUrl) => {
                    function getUrlWithoutExtension(url) {
                        if (!url.endsWith('/')) {
                            const parts = url.split('/');
                            parts.pop();
                            return parts.join('/') + '/';
                        }
                        return url;
                    }
                    return window.location.href.startsWith(getUrlWithoutExtension(startingUrl))
                        ? window.location.href
                        : null;
                }, this.startingUrl);
                if (newUrl !== null) {
                    newUrlsToValidate.push(newUrl);
                }
            }
            catch (err) {
                console.error(err);
            }
            finally {
                // BUGFIX: the original closed the page only on the success path;
                // a failed goto/evaluate leaked it.
                try {
                    await page?.close();
                }
                catch {
                    // best-effort cleanup
                }
            }
        }));
        return newUrlsToValidate;
    }
    /**
     * Strips fragments, prefixes the starting url onto non-matching entries,
     * de-duplicates, percent-decodes (dropping undecodable entries) and sorts.
     */
    normalizeAndSort(urls) {
        const normalizedUrls = urls.map((u) => {
            if (u.includes('#')) {
                const parts = u.split('#');
                parts.pop();
                u = parts.join('#');
            }
            if (u.startsWith(this.startingUrl)) {
                return u.trim();
            }
            return (this.startingUrl + u).trim();
        });
        const unique = [...new Set(normalizedUrls)]
            .map((u) => {
            try {
                return decodeURIComponent(u);
            }
            catch (err) {
                // malformed percent-encoding: discard below
                return null;
            }
        })
            .filter((u) => u !== null);
        return unique.sort();
    }
    /** Returns the urls collected by the last crawl() run. */
    getResults() {
        return this.urls;
    }
}
// CommonJS export of the crawler class.
exports.Crawler = Crawler;
//# sourceMappingURL=Crawler.object.js.map