UNPKG

@crawlee/utils

Version:

A set of shared utilities that can be used by crawlers

117 lines • 4.16 kB

JavaScript

"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.RobotsFile = void 0;
const tslib_1 = require("tslib");
const robots_parser_1 = tslib_1.__importDefault(require("robots-parser"));
const gotScraping_1 = require("./gotScraping");
const sitemap_1 = require("./sitemap");
// Resolved lazily: populated by a dynamic `import('got-scraping')` on the first `RobotsFile.load()` call.
let HTTPError;
/**
 * Loads and queries information from a [robots.txt file](https://en.wikipedia.org/wiki/Robots.txt).
 *
 * **Example usage:**
 * ```javascript
 * // Load the robots.txt file
 * const robots = await RobotsFile.find('https://crawlee.dev/docs/introduction/first-crawler');
 *
 * // Check if a URL should be crawled according to robots.txt
 * const url = 'https://crawlee.dev/api/puppeteer-crawler/class/PuppeteerCrawler';
 * if (robots.isAllowed(url)) {
 *   await crawler.addRequests([url]);
 * }
 *
 * // Enqueue all links in the sitemap(s)
 * await crawler.addRequests(await robots.parseUrlsFromSitemaps());
 * ```
 */
class RobotsFile {
    /**
     * @param robots an object exposing `isAllowed(url, userAgent)` and `getSitemaps()`
     * @param [proxyUrl] a proxy URL, later passed to `Sitemap.load` by `parseSitemaps()`
     */
    constructor(robots, proxyUrl) {
        Object.defineProperty(this, "robots", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: robots
        });
        Object.defineProperty(this, "proxyUrl", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: proxyUrl
        });
    }
    /**
     * Determine the location of a robots.txt file for a URL and fetch it.
     * @param url the URL to fetch robots.txt for
     * @param [proxyUrl] a proxy to be used for fetching the robots.txt file
     * @returns a promise resolving to the loaded `RobotsFile`
     * @throws {TypeError} if `url` cannot be parsed by `new URL()`
     */
    static async find(url, proxyUrl) {
        const robotsFileUrl = new URL(url);
        robotsFileUrl.pathname = '/robots.txt';
        robotsFileUrl.search = '';
        // Drop the fragment as well, so `https://host/page#section` does not become `https://host/robots.txt#section`.
        robotsFileUrl.hash = '';
        return RobotsFile.load(robotsFileUrl.toString(), proxyUrl);
    }
    /**
     * Allows providing the URL and robots.txt content explicitly instead of loading it from the target site.
     * @param url the URL for robots.txt file
     * @param content contents of robots.txt
     * @param [proxyUrl] a proxy stored on the instance; nothing is fetched here, it is only used later
     *   when `parseSitemaps()` loads the referenced sitemaps
     */
    static from(url, content, proxyUrl) {
        return new RobotsFile((0, robots_parser_1.default)(url, content), proxyUrl);
    }
    /**
     * Fetch the robots.txt file from the given URL with a GET request and parse its body.
     *
     * An HTTP 404 response yields a permissive instance (every URL allowed, no sitemaps).
     * Any other error is rethrown unchanged.
     * @param url the URL of the robots.txt file
     * @param [proxyUrl] a proxy to be used for fetching the robots.txt file
     */
    static async load(url, proxyUrl) {
        if (!HTTPError) {
            HTTPError = (await import('got-scraping')).HTTPError;
        }
        try {
            const response = await (0, gotScraping_1.gotScraping)({
                url,
                proxyUrl,
                method: 'GET',
                responseType: 'text',
            });
            return new RobotsFile((0, robots_parser_1.default)(url.toString(), response.body), proxyUrl);
        }
        catch (e) {
            if (e instanceof HTTPError && e.response.statusCode === 404) {
                // No robots.txt on the site: fall back to a stub that allows everything and lists no sitemaps.
                return new RobotsFile({
                    isAllowed() {
                        return true;
                    },
                    getSitemaps() {
                        return [];
                    },
                }, proxyUrl);
            }
            throw e;
        }
    }
    /**
     * Check if a URL should be crawled by robots.
     * @param url the URL to check against the rules in robots.txt
     * @param [userAgent] relevant user agent, default to `*`
     */
    isAllowed(url, userAgent = '*') {
        // `undefined` means that there is no explicit rule for the requested URL - assume it's allowed
        return this.robots.isAllowed(url, userAgent) ?? true;
    }
    /**
     * Get URLs of sitemaps referenced in the robots file.
     */
    getSitemaps() {
        return this.robots.getSitemaps();
    }
    /**
     * Parse all the sitemaps referenced in the robots file.
     * The sitemaps are loaded through `Sitemap.load`, using the proxy URL stored on this instance (if any).
     */
    async parseSitemaps() {
        return sitemap_1.Sitemap.load(this.robots.getSitemaps(), this.proxyUrl);
    }
    /**
     * Get all URLs from all the sitemaps referenced in the robots file. A shorthand for `(await robots.parseSitemaps()).urls`.
     */
    async parseUrlsFromSitemaps() {
        return (await this.parseSitemaps()).urls;
    }
}
exports.RobotsFile = RobotsFile;
//# sourceMappingURL=robots.js.map