spido
Version:
Web crawler/spider for node.js & nest.js server.
317 lines (316 loc) • 12.8 kB
JavaScript
;
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.Utils = void 0;
const axios_1 = __importDefault(require("axios"));
const cheerio = __importStar(require("cheerio"));
class Utils {
/**
* Retrieves the HTML from a specified URL using Axios.
* @async
* @function
* @param {string} url - The URL to retrieve the HTML from.
* @returns {Promise<HttpResponse>} - A Promise that resolves to an object containing the response and the response URL.
* @throws {Error} - If there is an error with the Axios request, an error will be thrown.
*/
getResponse = async (url) => {
try {
const response = await axios_1.default.get(url);
const responseURL = response.request.res.responseUrl;
if (response.status >= 200 && response.status < 400) {
return { response, responseURL };
}
}
catch (error) {
if (axios_1.default.isAxiosError(error) && error.response) {
const response = error.response;
const responseURL = response?.request.res.responseUrl;
return { response, responseURL };
}
else {
throw new Error(`Invalid URL! - ${url}`);
}
}
};
/**
* Extracts the base URL from a given URL.
* @param {string} url - The URL to extract the base URL from.
* @returns {Promise<string>} - The base URL of the URL.
* @throws {Error} - If the URL is invalid or the response status is not valid.
*/
getBaseUrl = async (url) => {
try {
const response = await this.getResponse(url);
if (!response) {
throw new Error(`Invalid URL! - ${url}`);
}
const responseURL = response.responseURL;
if (response.response.status >= 200 && response.response.status < 400) {
// If the response status is valid, return the base URL
return new URL(responseURL).origin;
}
else {
// If the response status is not valid, throw an error
throw new Error(`Invalid URL! - ${url}`);
}
}
catch (err) {
throw new Error(`Invalid URL! - ${url}`);
}
};
/**
* Checks if the given URL is valid and the response is OK.
* @param {number} status - The status code of the response to be checked.
* @returns {Promise<boolean>} - A Promise that resolves to a boolean indicating whether the URL is valid and the response is OK.
* @example
* isValidUrl(200)
* .then(valid => console.log(valid)); // returns true or false
*/
isValidUrl = async (status) => {
try {
if (status === 200) {
return true;
}
else {
return false;
}
}
catch (error) {
if (axios_1.default.isAxiosError(error) && error.response?.status) {
const status = error.response.status;
if ([300, 301, 302, 303, 304, 305, 306, 307, 308].includes(status)) {
try {
const response = await axios_1.default.get(error.response.headers.location);
if (response.status === 200) {
return true;
}
else {
return false;
}
}
catch {
return false;
}
}
}
return false;
}
};
/**
* Retrieves the sitemap URL of a given URL.
* @async
* @param {string} url - The URL to retrieve the sitemap URL from.
* @returns {Promise<string>} - A Promise that resolves to the sitemap URL.
*/
getSitemap = async (url) => {
const baseUrl = await this.getBaseUrl(url);
const sitemap = `${baseUrl}sitemap.xml`;
return sitemap;
};
/**
* Checks if the sitemap URL is valid and the response is 200.
* @async
* @param {string} url - The sitemap URL to check.
* @returns {Promise<boolean>} - A Promise that resolves to true if the sitemap URL is valid and the response is 200, otherwise false.
*/
isSitemap = async (url) => {
try {
const sitemapUrl = await this.getSitemap(url);
const httpResponse = await this.getResponse(sitemapUrl);
if (!httpResponse) {
throw new Error(`Invalid response! - ${url}`);
}
return httpResponse.response.status === 200;
}
catch (error) {
return false;
}
};
/**
* Retrieves links from a website sitemap.
* @async
* @function
* @param {string} url - The URL of the website.
* @returns {Promise<string[]>} - A Promise that resolves to an array of unique links from the sitemap.
*/
getLinksFromSitemap = async (url) => {
const sitemapUrl = await this.getSitemap(url);
const response = await this.getResponse(sitemapUrl);
if (!response) {
throw new Error(`Invalid response! - ${url}`);
}
const responseData = response.response.data;
const $ = cheerio.load(responseData);
const links = [];
$("loc").each((i, link) => {
links.push($(link).text());
});
const uniqueLinks = [...new Set(links)];
return uniqueLinks;
};
/**
* Retrieves SEO data from the HTML source code.
* @async
* @function
* @param {AxiosResponse} response - The Axios response containing the HTML source code.
* @param {string} url - The URL of the HTML source code.
* @returns {Promise<Metadata>} - A Promise that resolves to an object containing SEO data extracted from the HTML source code.
* @throws {Error} - If an error occurs while processing the HTML source code.
* @typedef {Object} Metadata
*/
getSeoDataFromResponse = async (response, url) => {
const $ = cheerio.load(response.data);
const seoData = {
url: url.toString(),
title: $("title").text(),
description: $("meta[name='description']").attr("content"),
canonical: $("link[rel='canonical']").attr("href"),
robots: $("meta[name='robots']").attr("content"),
links: (await this.getLinks(url, response)).length,
status: response.status,
};
return seoData;
};
/**
* Retrieves all the links from the HTML content of a webpage.
* @async
* @function
* @param {string} url - The URL of the webpage to be crawled.
* @param {AxiosResponse} response - The Axios response containing the HTML content of the webpage.
* @returns {Promise<string[]>} - A Promise that resolves to an array of links extracted from the HTML content.
* The links are normalized to include the hostname if they are relative URLs.
*/
getLinks = async (url, response) => {
const $ = cheerio.load(response.data);
const baseUrl = url;
const links = [];
$("a").each((i, link) => {
const href = $(link).attr("href");
if (href) {
// If the link is not a relative URL
if (href.startsWith(baseUrl)) {
links.push(href);
}
// If the link is a relative URL
else if (href.startsWith("/")) {
links.push(`${baseUrl}${href}`);
}
// Push every remaining link
else {
links.push(href);
}
}
});
return links;
};
/**
* Extracts internal links from the given HTML document.
* @async
* @function
* @param {HttpResponse} response - The HTTP response containing the HTML document.
* @returns {Promise<string[]>} - A Promise that resolves to an array of internal links extracted from the HTML document.
*/
getInternalLinks = async (response) => {
const responseURL = new URL(response.responseURL).origin;
const links = await this.getLinks(responseURL, response.response);
const internalLinks = links.filter((link) => link.startsWith(responseURL));
const uniqueLinks = [...new Set(internalLinks)];
return uniqueLinks;
};
/**
* Extracts external links from the given HTML response.
* @async
* @function
* @param {HttpResponse} response - The HTTP response containing the HTML document.
* @returns {Promise<string[]>} - A Promise that resolves to an array of external links extracted from the HTML document.
*/
getExternalLinks = async (response) => {
const responseURL = new URL(response.responseURL).origin;
const links = await this.getLinks(responseURL, response.response);
const externalLinks = links.filter((link) => !link.startsWith(responseURL));
return externalLinks;
};
/**
* Extracts images from an HTML page.
* @async
* @function
* @param {string} url - The URL of the HTML page to extract images from.
* @returns {Promise<Image[]>} - A Promise that resolves to an array of objects representing images in the HTML page. Each object has an `alt` property representing the alternative text of the image and a `src` property representing the source URL of the image.
*/
getImages = async (url) => {
const response = await this.getResponse(url);
const responseData = response?.response.data;
const images = [];
const $ = cheerio.load(responseData);
$("img").each((i, element) => {
const alt = $(element).attr("alt") || "";
const src = $(element).attr("src");
if (src) {
images.push({ alt, src });
}
});
return images;
};
/**
* Extracts headings from an HTML page.
* @async
* @function
* @param {string} url - The URL of the HTML page to extract headings from.
* @returns {Promise<Array<{ tag: string, text: string }>>} - A Promise that resolves to an array of objects representing headings in the HTML page. Each object has a `tag` property representing the heading tag (e.g., 'h1', 'h2', etc.) and a `text` property representing the text content of the heading.
*/
getHeadings = async (url) => {
const response = await this.getResponse(url);
const responseData = response?.response.data;
const headings = [];
const $ = cheerio.load(responseData);
$("h1, h2, h3, h4, h5, h6").each((i, element) => {
headings.push({
tag: element.name,
text: $(element).text(),
});
});
return headings;
};
/**
* Get the depth of a URL based on its subfolders.
* @function
* @param {string} url - The URL to get the depth from.
* @returns {number} - The number of subfolders in the URL.
*/
getUrlPathDepth = (url_param) => {
const url = new URL(url_param);
let path = url.pathname;
if (path.endsWith("/") && path !== "/") {
path = path.slice(0, -1);
}
const levels = (path.match(/\//g) || []).length;
return levels;
};
}
exports.Utils = Utils;
module.exports = { Utils: Utils };