@crawlee/utils
A set of shared utilities that can be used by crawlers
TypeScript
import { Sitemap } from './sitemap';
/**
* Loads and queries information from a [robots.txt file](https://en.wikipedia.org/wiki/Robots.txt).
*
* **Example usage:**
* ```javascript
* // Load the robots.txt file
* const robots = await RobotsTxtFile.find('https://crawlee.dev/js/docs/introduction/first-crawler');
*
* // Check if a URL should be crawled according to robots.txt
* const url = 'https://crawlee.dev/api/puppeteer-crawler/class/PuppeteerCrawler';
* if (robots.isAllowed(url)) {
* await crawler.addRequests([url]);
* }
*
* // Enqueue all links in the sitemap(s)
* await crawler.addRequests(await robots.parseUrlsFromSitemaps());
* ```
*/
export declare class RobotsTxtFile {
private robots;
private proxyUrl?;
private constructor();
/**
* Determine the location of a robots.txt file for a URL and fetch it.
* @param url the URL to fetch robots.txt for
* @param [proxyUrl] a proxy to be used for fetching the robots.txt file
*/
static find(url: string, proxyUrl?: string): Promise<RobotsTxtFile>;
/**
* Allows providing the URL and robots.txt content explicitly instead of loading it from the target site.
* @param url the URL for robots.txt file
* @param content contents of robots.txt
* @param [proxyUrl] a proxy to be used for fetching the robots.txt file
*/
static from(url: string, content: string, proxyUrl?: string): RobotsTxtFile;
protected static load(url: string, proxyUrl?: string): Promise<RobotsTxtFile>;
/**
* Check if a URL is allowed to be crawled according to the rules in the robots.txt file.
* @param url the URL to check against the rules in robots.txt
* @param [userAgent] relevant user agent, default to `*`
*/
isAllowed(url: string, userAgent?: string): boolean;
/**
* Get URLs of sitemaps referenced in the robots file.
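*
* A minimal usage sketch:
* ```javascript
* // Sitemap URLs declared via `Sitemap:` directives in the robots.txt file
* for (const sitemapUrl of robots.getSitemaps()) {
*     console.log(sitemapUrl);
* }
* ```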
*/
getSitemaps(): string[];
/**
* Parse all the sitemaps referenced in the robots file.
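*
* A minimal usage sketch (assumes the robots.txt file references at least one sitemap):
* ```javascript
* const sitemap = await robots.parseSitemaps();
* console.log(sitemap.urls); // all URLs collected from the referenced sitemaps
* ```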
*/
parseSitemaps(): Promise<Sitemap>;
/**
* Get all URLs from all the sitemaps referenced in the robots file. A shorthand for `(await robots.parseSitemaps()).urls`.
*/
parseUrlsFromSitemaps(): Promise<string[]>;
}
export { RobotsTxtFile as RobotsFile };
//# sourceMappingURL=robots.d.ts.map