@crawlee/utils
A set of shared utilities that can be used by crawlers
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.RobotsFile = exports.RobotsTxtFile = void 0;
const tslib_1 = require("tslib");
const robots_parser_1 = tslib_1.__importDefault(require("robots-parser"));
const gotScraping_1 = require("./gotScraping");
const sitemap_1 = require("./sitemap");
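// `HTTPError` is imported lazily via dynamic `import()` inside `load()` below:
// got-scraping is distributed as an ES module, so it cannot be `require()`d from
// this CommonJS file.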
let HTTPError;
/**
* Loads and queries information from a [robots.txt file](https://en.wikipedia.org/wiki/Robots.txt).
*
* **Example usage:**
* ```javascript
* // Load the robots.txt file
* const robots = await RobotsTxtFile.find('https://crawlee.dev/js/docs/introduction/first-crawler');
*
* // Check if a URL should be crawled according to robots.txt
* const url = 'https://crawlee.dev/api/puppeteer-crawler/class/PuppeteerCrawler';
* if (robots.isAllowed(url)) {
* await crawler.addRequests([url]);
* }
*
* // Enqueue all links in the sitemap(s)
* await crawler.addRequests(await robots.parseUrlsFromSitemaps());
* ```
*/
class RobotsTxtFile {
constructor(robots, proxyUrl) {
Object.defineProperty(this, "robots", {
enumerable: true,
configurable: true,
writable: true,
value: robots
});
Object.defineProperty(this, "proxyUrl", {
enumerable: true,
configurable: true,
writable: true,
value: proxyUrl
});
}
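// Note: the `Object.defineProperty` calls above are compiled output. In effect they
// simply store the constructor arguments as the `robots` and `proxyUrl` instance
// properties, i.e. `this.robots = robots; this.proxyUrl = proxyUrl;`.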
/**
* Determine the location of a robots.txt file for a URL and fetch it.
* @param url the URL to fetch robots.txt for
* @param [proxyUrl] a proxy to be used for fetching the robots.txt file
*/
static async find(url, proxyUrl) {
const robotsTxtFileUrl = new URL(url);
robotsTxtFileUrl.pathname = '/robots.txt';
robotsTxtFileUrl.search = '';
return RobotsTxtFile.load(robotsTxtFileUrl.toString(), proxyUrl);
}
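// Example for `find()` above (a sketch; `example.com` is a placeholder):
//   await RobotsTxtFile.find('https://example.com/some/page?q=1')
// replaces the path with `/robots.txt` and clears the query string, so it loads
// `https://example.com/robots.txt`.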
/**
* Allows providing the URL and robots.txt content explicitly instead of loading it from the target site.
* @param url the URL for robots.txt file
* @param content contents of robots.txt
* @param [proxyUrl] a proxy to be used when loading the sitemaps referenced in the file
*/
static from(url, content, proxyUrl) {
return new RobotsTxtFile((0, robots_parser_1.default)(url, content), proxyUrl);
}
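// Usage sketch for `from()` (hypothetical values): build the instance from content
// obtained elsewhere, e.g. a cached copy, without requesting the target site:
//   const robots = RobotsTxtFile.from(
//       'https://example.com/robots.txt',
//       'User-agent: *\nDisallow: /private/',
//   );
//   robots.isAllowed('https://example.com/private/page'); // false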
static async load(url, proxyUrl) {
if (!HTTPError) {
HTTPError = (await import('got-scraping')).HTTPError;
}
try {
const response = await (0, gotScraping_1.gotScraping)({
url,
proxyUrl,
method: 'GET',
responseType: 'text',
});
return new RobotsTxtFile((0, robots_parser_1.default)(url.toString(), response.body), proxyUrl);
}
catch (e) {
if (e instanceof HTTPError && e.response.statusCode === 404) {
return new RobotsTxtFile({
isAllowed() {
return true;
},
getSitemaps() {
return [];
},
}, proxyUrl);
}
throw e;
}
}
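// Note: a missing robots.txt (HTTP 404) is treated as "everything is allowed": the
// stub object above always answers `isAllowed()` with `true` and reports no sitemaps.
// Any other error from the request is rethrown.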
/**
* Check if a URL is allowed to be crawled according to the rules in robots.txt.
* @param url the URL to check against the rules in robots.txt
* @param [userAgent] relevant user agent, defaults to `*`
*/
isAllowed(url, userAgent = '*') {
return this.robots.isAllowed(url, userAgent) ?? true; // `undefined` means that there is no explicit rule for the requested URL - assume it's allowed
}
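// Sketch (assuming a `robots` instance and a placeholder URL): rules can be checked
// for a specific bot as well as for the default `*` user agent:
//   robots.isAllowed('https://example.com/admin', 'Googlebot');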
/**
* Get URLs of sitemaps referenced in the robots file.
*/
getSitemaps() {
return this.robots.getSitemaps();
}
/**
* Parse all the sitemaps referenced in the robots file.
*/
async parseSitemaps() {
return sitemap_1.Sitemap.load(this.robots.getSitemaps(), this.proxyUrl);
}
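// The `Sitemap` instance returned by `parseSitemaps()` (see ./sitemap) exposes a
// `urls` array with every URL collected from the referenced sitemaps;
// `parseUrlsFromSitemaps()` below simply returns that array.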
/**
* Get all URLs from all the sitemaps referenced in the robots file. A shorthand for `(await robots.parseSitemaps()).urls`.
*/
async parseUrlsFromSitemaps() {
return (await this.parseSitemaps()).urls;
}
}
exports.RobotsTxtFile = RobotsTxtFile;
exports.RobotsFile = RobotsTxtFile;
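// Consumer sketch (inside an async function; `example.com` is a placeholder):
//   const { RobotsTxtFile } = require('@crawlee/utils');
//   const robots = await RobotsTxtFile.find('https://example.com');
//   if (robots.isAllowed('https://example.com/products')) { /* safe to crawl */ }
// `RobotsFile` is re-exported above as a backwards-compatible alias of `RobotsTxtFile`.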
//# sourceMappingURL=robots.js.map