UNPKG

spido

Version:

Web crawler/spider for Node.js and NestJS servers.

115 lines (114 loc) 5.57 kB
import { AxiosResponse } from "axios";
import { HttpResponse, Image, Metadata } from "./interfaces";
/**
 * Type declaration for the crawler's utility helpers: fetching a page,
 * resolving and reading sitemaps, and extracting links, SEO metadata,
 * images and headings.
 *
 * Every member is declared as an instance property holding a function
 * (`name: (...) => ...`), not as a prototype method. This file only
 * declares the shapes; the behavior described below is taken from the
 * package's own documentation comments and cannot be verified from the
 * declarations alone.
 */
export declare class Utils {
    /**
     * Retrieves the HTML from a specified URL using Axios.
     * @async
     * @param {string} url - The URL to retrieve the HTML from.
     * @returns {Promise<HttpResponse | undefined>} - A Promise that resolves to an object containing the
     * response and the response URL, or to `undefined`.
     * NOTE(review): the declared type allows `undefined`, but the conditions under which it is
     * returned are not visible here — presumably a failed request; confirm against the implementation.
     * @throws {Error} - If there is an error with the Axios request, an error will be thrown.
     * NOTE(review): this may conflict with the `undefined` result above — confirm which failures
     * throw and which resolve to `undefined`.
     */
    getResponse: (url: string) => Promise<HttpResponse | undefined>;
    /**
     * Extracts the base URL from a given URL.
     * @async
     * @param {string} url - The URL to extract the base URL from.
     * @returns {Promise<string>} - A Promise that resolves to the base URL of the given URL.
     * @throws {Error} - If the URL is invalid or the response status is not valid.
     */
    getBaseUrl: (url: string) => Promise<string>;
    /**
     * Checks whether an HTTP response status code indicates a valid (OK) response.
     *
     * Despite the name, this takes a numeric status code, not a URL.
     * NOTE(review): which status codes count as valid is not visible in this declaration.
     * @async
     * @param {number} status - The status code of the response to be checked.
     * @returns {Promise<boolean>} - A Promise that resolves to a boolean indicating whether the
     * status represents a valid, OK response.
     * @example
     * isValidUrl(200)
     *   .then(valid => console.log(valid)); // logs true or false
     */
    isValidUrl: (status: number) => Promise<boolean>;
    /**
     * Retrieves the sitemap URL of a given URL.
     * @async
     * @param {string} url - The URL to retrieve the sitemap URL from.
     * @returns {Promise<string>} - A Promise that resolves to the sitemap URL.
     */
    getSitemap: (url: string) => Promise<string>;
    /**
     * Checks if the sitemap URL is valid and the response is 200.
     * @async
     * @param {string} url - The sitemap URL to check.
     * @returns {Promise<boolean>} - A Promise that resolves to true if the sitemap URL is valid
     * and the response is 200, otherwise false.
     */
    isSitemap: (url: string) => Promise<boolean>;
    /**
     * Retrieves links from a website sitemap.
     * @async
     * @param {string} url - The URL of the website.
     * @returns {Promise<string[]>} - A Promise that resolves to an array of unique links from the sitemap.
     */
    getLinksFromSitemap: (url: string) => Promise<string[]>;
    /**
     * Retrieves SEO data from the HTML source code.
     *
     * Note the parameter order: `response` first, then `url` — the reverse of `getLinks`.
     * @async
     * @param {AxiosResponse} response - The Axios response containing the HTML source code.
     * @param {string} url - The URL of the HTML source code.
     * @returns {Promise<Metadata>} - A Promise that resolves to an object containing SEO data
     * extracted from the HTML source code (see `Metadata` in `./interfaces`).
     * @throws {Error} - If an error occurs while processing the HTML source code.
     */
    getSeoDataFromResponse: (response: AxiosResponse, url: string) => Promise<Metadata>;
    /**
     * Retrieves all the links from the HTML content of a webpage.
     *
     * Note the parameter order: `url` first, then `response` — the reverse of
     * `getSeoDataFromResponse`.
     * @async
     * @param {string} url - The URL of the webpage to be crawled.
     * @param {AxiosResponse} response - The Axios response containing the HTML content of the webpage.
     * @returns {Promise<string[]>} - A Promise that resolves to an array of links extracted from the
     * HTML content. The links are normalized to include the hostname if they are relative URLs.
     */
    getLinks: (url: string, response: AxiosResponse) => Promise<string[]>;
    /**
     * Extracts internal links from the given HTML document.
     * @async
     * @param {HttpResponse} response - The HTTP response containing the HTML document.
     * @returns {Promise<string[]>} - A Promise that resolves to an array of internal links
     * extracted from the HTML document.
     */
    getInternalLinks: (response: HttpResponse) => Promise<string[]>;
    /**
     * Extracts external links from the given HTML response.
     * @async
     * @param {HttpResponse} response - The HTTP response containing the HTML document.
     * @returns {Promise<string[]>} - A Promise that resolves to an array of external links
     * extracted from the HTML document.
     */
    getExternalLinks: (response: HttpResponse) => Promise<string[]>;
    /**
     * Extracts images from an HTML page.
     * @async
     * @param {string} url - The URL of the HTML page to extract images from.
     * @returns {Promise<Image[]>} - A Promise that resolves to an array of objects representing
     * images in the HTML page. Each object has an `alt` property representing the alternative text
     * of the image and a `src` property representing the source URL of the image.
     */
    getImages: (url: string) => Promise<Image[]>;
    /**
     * Extracts headings from an HTML page.
     * @async
     * @param {string} url - The URL of the HTML page to extract headings from.
     * @returns {Promise<Array<{ tag: string, text: string }>>} - A Promise that resolves to an array
     * of objects representing headings in the HTML page. Each object has a `tag` property
     * representing the heading tag (e.g., 'h1', 'h2', etc.) and a `text` property representing the
     * text content of the heading.
     */
    getHeadings: (url: string) => Promise<Array<{
        tag: string;
        text: string;
    }>>;
    /**
     * Gets the depth of a URL based on its subfolders.
     *
     * Unlike the other members, this one is synchronous: it returns a plain number, not a Promise.
     * @param {string} url_param - The URL to get the depth from.
     * @returns {number} - The number of subfolders in the URL.
     */
    getUrlPathDepth: (url_param: string) => number;
}