UNPKG

spido

Version:

Web crawler/spider for Node.js and NestJS servers.

105 lines (104 loc) 3.71 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.Spido = void 0; const queue_1 = require("./queue"); const utils_1 = require("./utils"); //define main spido crawler module class class Spido { url; options; cache; utils; visited; queue; websiteSeoData; constructor(url, options) { this.url = url; this.options = { internalLinks: true, sitemap: false, depth: 0, }; this.cache = {}; Object.assign(this.options, options); this.queue = new queue_1.Queue(); this.visited = new Set(); this.websiteSeoData = []; this.utils = new utils_1.Utils(); } async crawl() { const baseURL = await this.utils.getBaseUrl(this.url); this.queue.enqueue(baseURL); while (!this.queue.isEmpty()) { const currentURL = this.queue.dequeue(); if (!currentURL || this.visited.has(currentURL) || this.isDepthExceeded(currentURL)) { continue; } const cachedResponse = this.cache[currentURL]; if (cachedResponse) { await this.handleResponse(currentURL, cachedResponse.response); } else { const response = await this.utils.getResponse(currentURL); if (response) { await this.handleResponse(currentURL, response); } } this.visited.add(currentURL); } return this.websiteSeoData; } isDepthExceeded(url) { const depth = this.options.depth; return depth !== 0 && this.utils.getUrlPathDepth(url) > depth; } async handleResponse(url, response) { try { if (!this.cache[url]) { const SEOData = await this.utils.getSeoDataFromResponse(response.response, url); const cacheResponse = { response: response, internalLinks: await this.utils.getInternalLinks(response), isValid: await this.utils.isValidUrl(response.response.status), pathDepth: this.utils.getUrlPathDepth(url), }; this.cache[url] = cacheResponse; this.websiteSeoData.push(SEOData); await this.enqueueURLs(cacheResponse.internalLinks); } } catch (error) { console.log(error); } } async enqueueURLs(urls) { for (const currentURL of urls) { if (!this.visited.has(currentURL) && !this.queue.isURLInQueue(currentURL)) { this.queue.enqueue(currentURL); 
} } } //fetching single page seo data from url & resolve promise with the data async fetch(url) { const response = await this.utils.getResponse(url); const responseData = response?.response.data; const seoData = await this.utils.getSeoDataFromResponse(responseData, url); return this.websiteSeoData.push(seoData); } async internalLinksEnabled(url) { const response = await this.utils.getResponse(url); const responseData = response?.response.data; const internalLinks = await this.utils.getInternalLinks(responseData); internalLinks.forEach(async (link) => { const isValidLink = await this.utils.isValidUrl(responseData.response.status); if (isValidLink && !this.queue.urls.includes(link)) { this.queue.enqueue(link); } }); } } exports.Spido = Spido; module.exports = { Spido };