spido
Version:
Web crawler/spider for node.js & nest.js server.
105 lines (104 loc) • 3.71 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.Spido = void 0;
const queue_1 = require("./queue");
const utils_1 = require("./utils");
// Define the main Spido crawler class.
/**
 * Queue-driven website crawler that collects SEO data for every page it processes.
 *
 * All network access and HTML parsing is delegated to the `Utils` helper; URL
 * scheduling is delegated to the `Queue` helper. This class only orchestrates
 * them and keeps the crawl state (visited set, response cache, results).
 */
class Spido {
    /** Start URL handed to the constructor. */
    url;
    /** Effective options: defaults merged with the caller's overrides. */
    options;
    /** Per-URL record of already processed responses, keyed by URL string. */
    cache;
    /** Helper used for fetching, parsing and URL inspection. */
    utils;
    /** Set of URLs that have already been taken off the queue and processed. */
    visited;
    /** Pending URLs waiting to be crawled. */
    queue;
    /** SEO data entries collected so far, in processing order. */
    websiteSeoData;

    /**
     * @param {string} url - URL the crawl starts from.
     * @param {object} [options] - Overrides for the defaults below.
     * @param {boolean} [options.internalLinks=true]
     * @param {boolean} [options.sitemap=false]
     * @param {number} [options.depth=0] - Maximum URL path depth; 0 disables the limit.
     */
    constructor(url, options) {
        this.url = url;
        this.options = {
            internalLinks: true,
            sitemap: false,
            depth: 0,
            ...options,
        };
        this.cache = {};
        this.queue = new queue_1.Queue();
        this.visited = new Set();
        this.websiteSeoData = [];
        this.utils = new utils_1.Utils();
    }

    /**
     * Crawls from the base URL of `this.url`, following the internal links
     * discovered on each page until the queue is empty.
     *
     * @returns {Promise<Array>} The accumulated `websiteSeoData` array.
     */
    async crawl() {
        const baseURL = await this.utils.getBaseUrl(this.url);
        this.queue.enqueue(baseURL);
        while (!this.queue.isEmpty()) {
            const currentURL = this.queue.dequeue();
            if (!currentURL ||
                this.visited.has(currentURL) ||
                this.isDepthExceeded(currentURL)) {
                continue;
            }
            // Mark the URL before processing it so that a page linking to
            // itself is not put back on the queue by enqueueURLs().
            this.visited.add(currentURL);
            if (this.cache[currentURL]) {
                // Already processed: handleResponse() ignores cached URLs,
                // so there is nothing left to do for this entry.
                continue;
            }
            const response = await this.utils.getResponse(currentURL);
            if (response) {
                await this.handleResponse(currentURL, response);
            }
        }
        return this.websiteSeoData;
    }

    /**
     * Tells whether `url` lies deeper than the configured `depth` option.
     * A depth of 0 means "no limit".
     *
     * @param {string} url
     * @returns {boolean}
     */
    isDepthExceeded(url) {
        const depth = this.options.depth;
        return depth !== 0 && this.utils.getUrlPathDepth(url) > depth;
    }

    /**
     * Processes a freshly fetched response: extracts SEO data, caches the
     * result, records the SEO data and enqueues the page's internal links.
     * URLs that already have a cache entry are ignored. Errors are logged
     * and not rethrown so that one bad page does not abort the whole crawl.
     *
     * @param {string} url - URL the response belongs to.
     * @param {object} response - Value returned by `utils.getResponse(url)`.
     * @returns {Promise<void>}
     */
    async handleResponse(url, response) {
        if (this.cache[url]) {
            return;
        }
        try {
            const SEOData = await this.utils.getSeoDataFromResponse(response.response, url);
            const cacheResponse = {
                response: response,
                internalLinks: await this.utils.getInternalLinks(response),
                isValid: await this.utils.isValidUrl(response.response.status),
                pathDepth: this.utils.getUrlPathDepth(url),
            };
            this.cache[url] = cacheResponse;
            this.websiteSeoData.push(SEOData);
            await this.enqueueURLs(cacheResponse.internalLinks);
        }
        catch (error) {
            console.log(error);
        }
    }

    /**
     * Adds every URL that is neither visited nor already queued to the queue.
     *
     * @param {Iterable<string>} urls
     * @returns {Promise<void>}
     */
    async enqueueURLs(urls) {
        for (const currentURL of urls) {
            if (!this.visited.has(currentURL) &&
                !this.queue.isURLInQueue(currentURL)) {
                this.queue.enqueue(currentURL);
            }
        }
    }

    /**
     * Fetches a single page, extracts its SEO data, appends it to
     * `websiteSeoData` and resolves with that SEO data.
     *
     * @param {string} url
     * @returns {Promise<*>} The SEO data extracted for `url`.
     */
    async fetch(url) {
        const response = await this.utils.getResponse(url);
        // NOTE(review): handleResponse() passes `response.response` to
        // getSeoDataFromResponse while this passes `response.response.data`;
        // confirm which shape the helper expects.
        const responseData = response?.response.data;
        const seoData = await this.utils.getSeoDataFromResponse(responseData, url);
        // Array.prototype.push returns the new length, so push and return separately.
        this.websiteSeoData.push(seoData);
        return seoData;
    }

    /**
     * Fetches `url` and enqueues its internal links that are not queued yet.
     * Nothing is enqueued when the page could not be fetched or when its
     * status is not considered valid by `utils.isValidUrl`.
     *
     * @param {string} url
     * @returns {Promise<void>} Resolves once all links have been enqueued.
     */
    async internalLinksEnabled(url) {
        const response = await this.utils.getResponse(url);
        if (!response) {
            return;
        }
        // NOTE(review): handleResponse() passes the whole `response` to
        // getInternalLinks while this passes `response.response.data`;
        // confirm which shape the helper expects.
        const responseData = response.response.data;
        const internalLinks = await this.utils.getInternalLinks(responseData);
        // The status sits next to `data` on the HTTP response (same access as
        // in handleResponse) and is identical for every link, so check it once.
        const isValidLink = await this.utils.isValidUrl(response.response.status);
        if (!isValidLink) {
            return;
        }
        for (const link of internalLinks) {
            if (!this.queue.urls.includes(link)) {
                this.queue.enqueue(link);
            }
        }
    }
}
exports.Spido = Spido;
module.exports = { Spido };