@crawlee/utils
A set of shared utilities that can be used by crawlers
TypeScript
import { Sitemap } from './sitemap';
/**
* Loads and queries information from a [robots.txt file](https://en.wikipedia.org/wiki/Robots.txt).
*
* **Example usage:**
* ```javascript
* // Load the robots.txt file
* const robots = await RobotsTxtFile.find('https://crawlee.dev/js/docs/introduction/first-crawler');
*
* // Check if a URL should be crawled according to robots.txt
* const url = 'https://crawlee.dev/api/puppeteer-crawler/class/PuppeteerCrawler';
* if (robots.isAllowed(url)) {
* await crawler.addRequests([url]);
* }
*
* // Enqueue all links in the sitemap(s)
* await crawler.addRequests(await robots.parseUrlsFromSitemaps());
* ```
*/
export declare class RobotsTxtFile {
private robots;
private proxyUrl?;
private constructor();
/**
* Determine the location of a robots.txt file for a URL and fetch it.
* @param url the URL to fetch robots.txt for
* @param [proxyUrl] a proxy to be used for fetching the robots.txt file
*/
static find(url: string, proxyUrl?: string): Promise<RobotsTxtFile>;
/**
* Allows providing the URL and robots.txt content explicitly instead of loading it from the target site.
* @param url the URL for robots.txt file
* @param content contents of robots.txt
* @param [proxyUrl] a proxy to be used for fetching the robots.txt file
*/
static from(url: string, content: string, proxyUrl?: string): RobotsTxtFile;
protected static load(url: string, proxyUrl?: string): Promise<RobotsTxtFile>;
/**
* Check if a URL is allowed to be crawled according to the rules in the robots.txt file.
* @param url the URL to check against the rules in robots.txt
* @param [userAgent] relevant user agent, default to `*`
*/
isAllowed(url: string, userAgent?: string): boolean;
/**
* Get URLs of sitemaps referenced in the robots file.
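*
* A minimal usage sketch:
* ```javascript
* // Sitemap URLs declared via `Sitemap:` directives in the robots.txt file
* for (const sitemapUrl of robots.getSitemaps()) {
*     console.log(sitemapUrl);
* }
* ```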
*/
getSitemaps(): string[];
/**
* Parse all the sitemaps referenced in the robots file.
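*
* A minimal usage sketch (assumes the robots.txt file references at least one sitemap):
* ```javascript
* const sitemap = await robots.parseSitemaps();
* console.log(sitemap.urls); // all URLs collected from the referenced sitemaps
* ```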
*/
parseSitemaps(): Promise<Sitemap>;
/**
* Get all URLs from all the sitemaps referenced in the robots file. A shorthand for `(await robots.parseSitemaps()).urls`.
*/
parseUrlsFromSitemaps(): Promise<string[]>;
}
export { RobotsTxtFile as RobotsFile };
//# sourceMappingURL=robots.d.ts.map