/*
 * spido
 * Version:
 * Web crawler/spider for Node.js & NestJS servers.
 * 115 lines (114 loc) • 5.57 kB
 * TypeScript
 */
import { AxiosResponse } from "axios";
import { HttpResponse, Image, Metadata } from "./interfaces";
/**
* Type declaration of the crawler's helper class.
*
* Every member is declared as a function-valued instance property rather than
* a prototype method, so the helpers are reached through a `Utils` instance.
* Only signatures are visible in this declaration; the behavioural notes below
* restate the documented contract, and anything the signatures cannot confirm
* is flagged with NOTE(review).
*
* `HttpResponse`, `Image` and `Metadata` are imported from `./interfaces`;
* `AxiosResponse` is imported from `axios`.
*/
export declare class Utils {
/**
* Retrieves the HTML from a specified URL using Axios.
* @async
* @function
* @param {string} url - The URL to retrieve the HTML from.
* @returns {Promise<HttpResponse | undefined>} - A Promise that resolves to an object containing the response and the response URL, or to `undefined`.
* NOTE(review): the declared type allows `undefined`, but the condition that produces it is not visible here — presumably a failed request; confirm against the implementation.
* @throws {Error} - If there is an error with the Axios request, an error will be thrown.
* NOTE(review): confirm whether request failures are thrown, reported as `undefined`, or both.
*/
getResponse: (url: string) => Promise<HttpResponse | undefined>;
/**
* Extracts the base URL from a given URL.
* @async
* @param {string} url - The URL to extract the base URL from.
* @returns {Promise<string>} - A Promise that resolves to the base URL of the URL.
* @throws {Error} - If the URL is invalid or the response status is not valid.
*/
getBaseUrl: (url: string) => Promise<string>;
/**
* Checks whether a response status code counts as a valid (OK) response.
* Despite the name, the argument is the numeric status code of a response, not a URL.
* NOTE(review): which status codes are accepted is not visible in this declaration.
* @async
* @param {number} status - The status code of the response to be checked.
* @returns {Promise<boolean>} - A Promise that resolves to a boolean indicating whether the status is considered valid.
* @example
* // `utils` is a Utils instance
* utils.isValidUrl(200)
* .then(valid => console.log(valid)); // logs true or false
*/
isValidUrl: (status: number) => Promise<boolean>;
/**
* Retrieves the sitemap URL of a given URL.
* @async
* @param {string} url - The URL to retrieve the sitemap URL from.
* @returns {Promise<string>} - A Promise that resolves to the sitemap URL.
*/
getSitemap: (url: string) => Promise<string>;
/**
* Checks if the sitemap URL is valid and the response is 200.
* @async
* @param {string} url - The sitemap URL to check.
* @returns {Promise<boolean>} - A Promise that resolves to true if the sitemap URL is valid and the response is 200, otherwise false.
*/
isSitemap: (url: string) => Promise<boolean>;
/**
* Retrieves links from a website sitemap.
* @async
* @function
* @param {string} url - The URL of the website.
* @returns {Promise<string[]>} - A Promise that resolves to an array of unique links from the sitemap.
*/
getLinksFromSitemap: (url: string) => Promise<string[]>;
/**
* Retrieves SEO data from the HTML source code.
* Argument order is (response, url) — the reverse of `getLinks`.
* @async
* @function
* @param {AxiosResponse} response - The Axios response containing the HTML source code.
* @param {string} url - The URL of the HTML source code.
* @returns {Promise<Metadata>} - A Promise that resolves to an object containing SEO data extracted from the HTML source code. `Metadata` is the interface imported from `./interfaces`.
* @throws {Error} - If an error occurs while processing the HTML source code.
*/
getSeoDataFromResponse: (response: AxiosResponse, url: string) => Promise<Metadata>;
/**
* Retrieves all the links from the HTML content of a webpage.
* Argument order is (url, response) — the reverse of `getSeoDataFromResponse`.
* @async
* @function
* @param {string} url - The URL of the webpage to be crawled.
* @param {AxiosResponse} response - The Axios response containing the HTML content of the webpage.
* @returns {Promise<string[]>} - A Promise that resolves to an array of links extracted from the HTML content.
* The links are normalized to include the hostname if they are relative URLs.
*/
getLinks: (url: string, response: AxiosResponse) => Promise<string[]>;
/**
* Extracts internal links from the given HTML document.
* NOTE(review): the criterion for "internal" (presumably same host as the response URL) is not visible in this declaration.
* @async
* @function
* @param {HttpResponse} response - The HTTP response containing the HTML document.
* @returns {Promise<string[]>} - A Promise that resolves to an array of internal links extracted from the HTML document.
*/
getInternalLinks: (response: HttpResponse) => Promise<string[]>;
/**
* Extracts external links from the given HTML response.
* NOTE(review): the criterion for "external" (presumably a host different from the response URL) is not visible in this declaration.
* @async
* @function
* @param {HttpResponse} response - The HTTP response containing the HTML document.
* @returns {Promise<string[]>} - A Promise that resolves to an array of external links extracted from the HTML document.
*/
getExternalLinks: (response: HttpResponse) => Promise<string[]>;
/**
* Extracts images from an HTML page.
* @async
* @function
* @param {string} url - The URL of the HTML page to extract images from.
* @returns {Promise<Image[]>} - A Promise that resolves to an array of objects representing images in the HTML page.
* Each object is documented as having an `alt` property (alternative text) and a `src` property (source URL).
* NOTE(review): confirm those fields against the `Image` interface in `./interfaces`.
*/
getImages: (url: string) => Promise<Image[]>;
/**
* Extracts headings from an HTML page.
* @async
* @function
* @param {string} url - The URL of the HTML page to extract headings from.
* @returns {Promise<Array<{ tag: string, text: string }>>} - A Promise that resolves to an array of objects representing headings in the HTML page. Each object has a `tag` property representing the heading tag (e.g., 'h1', 'h2', etc.) and a `text` property representing the text content of the heading.
*/
getHeadings: (url: string) => Promise<Array<{
tag: string;
text: string;
}>>;
/**
* Get the depth of a URL based on its subfolders.
* Unlike the other helpers in this class, this one is synchronous: it returns a plain number, not a Promise.
* @function
* @param {string} url_param - The URL to get the depth from.
* @returns {number} - The number of subfolders in the URL.
*/
getUrlPathDepth: (url_param: string) => number;
}