@crawlee/utils
A set of shared utilities that can be used by crawlers
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.RobotsFile = exports.RobotsTxtFile = void 0;
const tslib_1 = require("tslib");
const robots_parser_1 = tslib_1.__importDefault(require("robots-parser"));
const gotScraping_1 = require("./gotScraping");
const sitemap_1 = require("./sitemap");
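// `HTTPError` is imported lazily via dynamic `import()` inside `load()` below:
// got-scraping is distributed as an ES module, so it cannot be `require()`d from
// this CommonJS file.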
let HTTPError;
/**
* Loads and queries information from a [robots.txt file](https://en.wikipedia.org/wiki/Robots.txt).
*
* **Example usage:**
* ```javascript
* // Load the robots.txt file
* const robots = await RobotsTxtFile.find('https://crawlee.dev/js/docs/introduction/first-crawler');
*
* // Check if a URL should be crawled according to robots.txt
* const url = 'https://crawlee.dev/api/puppeteer-crawler/class/PuppeteerCrawler';
* if (robots.isAllowed(url)) {
* await crawler.addRequests([url]);
* }
*
* // Enqueue all links in the sitemap(s)
* await crawler.addRequests(await robots.parseUrlsFromSitemaps());
* ```
*/
class RobotsTxtFile {
constructor(robots, proxyUrl) {
Object.defineProperty(this, "robots", {
enumerable: true,
configurable: true,
writable: true,
value: robots
});
Object.defineProperty(this, "proxyUrl", {
enumerable: true,
configurable: true,
writable: true,
value: proxyUrl
});
}
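// Note: the `Object.defineProperty` calls above are compiled output. In effect they
// simply store the constructor arguments as the `robots` and `proxyUrl` instance
// properties, i.e. `this.robots = robots; this.proxyUrl = proxyUrl;`.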
/**
* Determine the location of a robots.txt file for a URL and fetch it.
* @param url the URL to fetch robots.txt for
* @param [proxyUrl] a proxy to be used for fetching the robots.txt file
*/
static async find(url, proxyUrl) {
const robotsTxtFileUrl = new URL(url);
robotsTxtFileUrl.pathname = '/robots.txt';
robotsTxtFileUrl.search = '';
return RobotsTxtFile.load(robotsTxtFileUrl.toString(), proxyUrl);
}
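// Example for `find()` above (a sketch; `example.com` is a placeholder):
//   await RobotsTxtFile.find('https://example.com/some/page?q=1')
// replaces the path with `/robots.txt` and clears the query string, so it loads
// `https://example.com/robots.txt`.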
/**
* Allows providing the URL and robots.txt content explicitly instead of loading it from the target site.
* @param url the URL for robots.txt file
* @param content contents of robots.txt
* @param [proxyUrl] a proxy to be used when loading the sitemaps referenced in the file
*/
static from(url, content, proxyUrl) {
return new RobotsTxtFile((0, robots_parser_1.default)(url, content), proxyUrl);
}
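// Usage sketch for `from()` (hypothetical values): build the instance from content
// obtained elsewhere, e.g. a cached copy, without requesting the target site:
//   const robots = RobotsTxtFile.from(
//       'https://example.com/robots.txt',
//       'User-agent: *\nDisallow: /private/',
//   );
//   robots.isAllowed('https://example.com/private/page'); // false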
static async load(url, proxyUrl) {
if (!HTTPError) {
HTTPError = (await import('got-scraping')).HTTPError;
}
try {
const response = await (0, gotScraping_1.gotScraping)({
url,
proxyUrl,
method: 'GET',
responseType: 'text',
});
return new RobotsTxtFile((0, robots_parser_1.default)(url.toString(), response.body), proxyUrl);
}
catch (e) {
if (e instanceof HTTPError && e.response.statusCode === 404) {
return new RobotsTxtFile({
isAllowed() {
return true;
},
getSitemaps() {
return [];
},
}, proxyUrl);
}
throw e;
}
}
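// Note: a missing robots.txt (HTTP 404) is treated as "everything is allowed": the
// stub object above always answers `isAllowed()` with `true` and reports no sitemaps.
// Any other error from the request is rethrown.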
/**
* Check if a URL is allowed to be crawled according to the rules in robots.txt.
* @param url the URL to check against the rules in robots.txt
* @param [userAgent] relevant user agent, defaults to `*`
*/
isAllowed(url, userAgent = '*') {
return this.robots.isAllowed(url, userAgent) ?? true; // `undefined` means that there is no explicit rule for the requested URL - assume it's allowed
}
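// Sketch (assuming a `robots` instance and a placeholder URL): rules can be checked
// for a specific bot as well as for the default `*` user agent:
//   robots.isAllowed('https://example.com/admin', 'Googlebot');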
/**
* Get URLs of sitemaps referenced in the robots file.
*/
getSitemaps() {
return this.robots.getSitemaps();
}
/**
* Parse all the sitemaps referenced in the robots file.
*/
async parseSitemaps() {
return sitemap_1.Sitemap.load(this.robots.getSitemaps(), this.proxyUrl);
}
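// The `Sitemap` instance returned by `parseSitemaps()` (see ./sitemap) exposes a
// `urls` array with every URL collected from the referenced sitemaps;
// `parseUrlsFromSitemaps()` below simply returns that array.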
/**
* Get all URLs from all the sitemaps referenced in the robots file. A shorthand for `(await robots.parseSitemaps()).urls`.
*/
async parseUrlsFromSitemaps() {
return (await this.parseSitemaps()).urls;
}
}
exports.RobotsTxtFile = RobotsTxtFile;
exports.RobotsFile = RobotsTxtFile;
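// Consumer sketch (inside an async function; `example.com` is a placeholder):
//   const { RobotsTxtFile } = require('@crawlee/utils');
//   const robots = await RobotsTxtFile.find('https://example.com');
//   if (robots.isAllowed('https://example.com/products')) { /* safe to crawl */ }
// `RobotsFile` is re-exported above as a backwards-compatible alias of `RobotsTxtFile`.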
//# sourceMappingURL=robots.js.map