UNPKG

@crawlee/core

Version:

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.

439 lines • 18.4 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.Request = exports.RequestState = void 0;
const tslib_1 = require("tslib");
const node_crypto_1 = tslib_1.__importDefault(require("node:crypto"));
const node_util_1 = tslib_1.__importDefault(require("node:util"));
const ow_1 = tslib_1.__importDefault(require("ow"));
const utilities_1 = require("@apify/utilities");
const log_1 = require("./log");
const typedefs_1 = require("./typedefs");
// new properties on the Request object breaks serialization
const log = log_1.log.child({ prefix: 'Request' });
/**
 * Lifecycle states of a request. Compiled as a bidirectional (name <-> number) enum object.
 * Declared before the validation predicates so the `state` predicate can derive its
 * upper bound from the last member instead of a hard-coded number.
 */
var RequestState;
(function (RequestState) {
    RequestState[RequestState["UNPROCESSED"] = 0] = "UNPROCESSED";
    RequestState[RequestState["BEFORE_NAV"] = 1] = "BEFORE_NAV";
    RequestState[RequestState["AFTER_NAV"] = 2] = "AFTER_NAV";
    RequestState[RequestState["REQUEST_HANDLER"] = 3] = "REQUEST_HANDLER";
    RequestState[RequestState["DONE"] = 4] = "DONE";
    RequestState[RequestState["ERROR_HANDLER"] = 5] = "ERROR_HANDLER";
    RequestState[RequestState["ERROR"] = 6] = "ERROR";
    RequestState[RequestState["SKIPPED"] = 7] = "SKIPPED";
})(RequestState || (exports.RequestState = RequestState = {}));
/**
 * Validation predicates for every optional `RequestOptions` property.
 * The constructor applies a predicate only to properties actually present on the options object.
 */
const requestOptionalPredicates = {
    id: ow_1.default.optional.string,
    loadedUrl: ow_1.default.optional.string.url,
    uniqueKey: ow_1.default.optional.string,
    method: ow_1.default.optional.string,
    payload: ow_1.default.optional.any(ow_1.default.string, ow_1.default.uint8Array),
    noRetry: ow_1.default.optional.boolean,
    retryCount: ow_1.default.optional.number,
    sessionRotationCount: ow_1.default.optional.number,
    maxRetries: ow_1.default.optional.number,
    errorMessages: ow_1.default.optional.array.ofType(ow_1.default.string),
    headers: ow_1.default.optional.object,
    userData: ow_1.default.optional.object,
    label: ow_1.default.optional.string,
    handledAt: ow_1.default.optional.any(ow_1.default.string.date, ow_1.default.date),
    keepUrlFragment: ow_1.default.optional.boolean,
    useExtendedUniqueKey: ow_1.default.optional.boolean,
    skipNavigation: ow_1.default.optional.boolean,
    crawlDepth: ow_1.default.optional.number.greaterThanOrEqual(0),
    // The upper bound follows the enum so that every member (including SKIPPED) validates.
    state: ow_1.default.optional.number.greaterThanOrEqual(0).lessThanOrEqual(RequestState.SKIPPED),
};
/** Fallback text stored when a thrown value cannot be turned into a message at all. */
const UNEXTRACTABLE_MESSAGE = 'Unable to extract any message from the received object.';
/** Renders any value with `util.inspect`, never throwing. */
function inspectSafely(value) {
    try {
        return node_util_1.default.inspect(value);
    }
    catch {
        return UNEXTRACTABLE_MESSAGE;
    }
}
/**
 * Describes a non-null, non-Error object that has no `message` property.
 * Prefers a custom `toString()` result; falls back to `util.inspect` for the default
 * `[object Object]`, for objects without a callable `toString` (e.g. null-prototype
 * objects) and for a `toString` that throws.
 */
function describeObject(value) {
    try {
        if (typeof value.toString === 'function') {
            const text = value.toString();
            if (text !== '[object Object]') {
                return text;
            }
        }
    }
    catch {
        // fall through to inspection below
    }
    return inspectSafely(value);
}
/**
 * Represents a URL to be crawled, optionally including HTTP method, headers, payload and other metadata.
 * The `Request` object also stores information about errors that occurred during processing of the request.
 *
 * Each `Request` instance has the `uniqueKey` property, which can be either specified
 * manually in the constructor or generated automatically from the URL. Two requests with the same `uniqueKey`
 * are considered as pointing to the same web resource. This behavior applies to all Crawlee classes,
 * such as {@link RequestList}, {@link RequestQueue}, {@link PuppeteerCrawler} or {@link PlaywrightCrawler}.
 *
 * > To access and examine the actual request sent over http, with all autofilled headers you can access
 * `response.request` object from the request handler
 *
 * Example use:
 *
 * ```javascript
 * const request = new Request({
 *     url: 'http://www.example.com',
 *     headers: { Accept: 'application/json' },
 * });
 *
 * ...
 *
 * request.userData.foo = 'bar';
 * request.pushErrorMessage(new Error('Request failed!'));
 *
 * ...
 *
 * const foo = request.userData.foo;
 * ```
 * @category Sources
 */
class Request {
    /**
     * `Request` parameters including the URL, HTTP method and headers, and others.
     *
     * @param options Must be an object with a string `url`; every other known property is
     *   validated against `requestOptionalPredicates` when present.
     * @throws If validation fails, or if a `GET` request is given a payload.
     */
    constructor(options) {
        /** Request ID */
        Object.defineProperty(this, "id", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        /** URL of the web page to crawl. */
        Object.defineProperty(this, "url", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        /**
         * An actually loaded URL after redirects, if present. HTTP redirects are guaranteed
         * to be included.
         *
         * When using {@link PuppeteerCrawler} or {@link PlaywrightCrawler}, meta tag and JavaScript redirects may,
         * or may not be included, depending on their nature. This generally means that redirects,
         * which happen immediately will most likely be included, but delayed redirects will not.
         */
        Object.defineProperty(this, "loadedUrl", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        /**
         * A unique key identifying the request.
         * Two requests with the same `uniqueKey` are considered as pointing to the same URL.
         */
        Object.defineProperty(this, "uniqueKey", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        /** HTTP method, e.g. `GET` or `POST`. */
        Object.defineProperty(this, "method", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        /** HTTP request payload, e.g. for POST requests. */
        Object.defineProperty(this, "payload", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        /** The `true` value indicates that the request will not be automatically retried on error. */
        Object.defineProperty(this, "noRetry", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        /** Indicates the number of times the crawling of the request has been retried on error. */
        Object.defineProperty(this, "retryCount", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        /** An array of error messages from request processing. */
        Object.defineProperty(this, "errorMessages", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        /** Object with HTTP headers. Key is header name, value is the value. */
        Object.defineProperty(this, "headers", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        /** Private store for the custom user data assigned to the request. */
        Object.defineProperty(this, "_userData", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: {}
        });
        /** Custom user data assigned to the request. */
        Object.defineProperty(this, "userData", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: {}
        });
        /**
         * ISO datetime string that indicates the time when the request has been processed.
         * Is `null` if the request has not been crawled yet.
         */
        Object.defineProperty(this, "handledAt", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        (0, ow_1.default)(options, 'RequestOptions', ow_1.default.object);
        (0, ow_1.default)(options.url, 'RequestOptions.url', ow_1.default.string);
        // 'ow' validation is slow, because it checks all predicates
        // even if the validated object has only 1 property.
        // This custom validation loop iterates only over existing
        // properties and speeds up the validation cca 3-fold.
        // See https://github.com/sindresorhus/ow/issues/193
        (0, typedefs_1.keys)(options).forEach((prop) => {
            // skip url, because it is validated above
            if (prop === 'url') {
                return;
            }
            const predicate = requestOptionalPredicates[prop];
            const value = options[prop];
            if (predicate) {
                (0, ow_1.default)(value, `RequestOptions.${prop}`, predicate);
            }
        });
        const {
            id,
            url,
            loadedUrl,
            uniqueKey,
            payload,
            noRetry = false,
            retryCount = 0,
            // No default here: the getter already reports 0 when nothing is stored, and
            // we must be able to tell "not provided" apart from an explicit value below.
            sessionRotationCount,
            maxRetries,
            errorMessages = [],
            headers = {},
            userData = {},
            label,
            handledAt,
            keepUrlFragment = false,
            useExtendedUniqueKey = false,
            skipNavigation,
            enqueueStrategy,
            crawlDepth,
        } = options;
        let { method = 'GET' } = options;
        method = method.toUpperCase();
        if (method === 'GET' && payload) {
            throw new Error('Request with GET method cannot have a payload.');
        }
        this.id = id;
        this.url = url;
        this.loadedUrl = loadedUrl;
        this.uniqueKey = uniqueKey || Request.computeUniqueKey({ url, method, payload, keepUrlFragment, useExtendedUniqueKey });
        this.method = method;
        this.payload = payload;
        this.noRetry = noRetry;
        this.retryCount = retryCount;
        // Copies, so later mutations of the instance do not leak into the caller's options.
        this.errorMessages = [...errorMessages];
        this.headers = { ...headers };
        this.handledAt = handledAt instanceof Date ? handledAt.toISOString() : handledAt;
        if (label) {
            userData.label = label;
        }
        // Replace the plain `userData` / `_userData` fields declared above with an accessor pair:
        // `_userData` becomes non-enumerable and `userData` reads/writes through it.
        Object.defineProperties(this, {
            _userData: {
                value: { __crawlee: {}, ...userData },
                enumerable: false,
                writable: true,
            },
            userData: {
                get: () => this._userData,
                set: (value) => {
                    // Carry the internal `__crawlee` bag over to the new object (hidden from enumeration)
                    // and attach a `toJSON` that re-exposes it only when it is non-empty.
                    Object.defineProperties(value, {
                        __crawlee: {
                            value: this._userData.__crawlee,
                            enumerable: false,
                            writable: true,
                        },
                        toJSON: {
                            value: () => {
                                if (Object.keys(this._userData.__crawlee).length > 0) {
                                    return {
                                        ...this._userData,
                                        __crawlee: this._userData.__crawlee,
                                    };
                                }
                                return this._userData;
                            },
                            enumerable: false,
                            writable: true,
                        },
                    });
                    this._userData = value;
                },
                enumerable: true,
            },
        });
        // reassign userData to ensure internal `__crawlee` object is non-enumerable
        this.userData = userData;
        // Everything below is stored inside `userData.__crawlee`, so it has to run after the
        // accessor above is installed; writing earlier would land on the discarded placeholder.
        if (skipNavigation != null) {
            this.skipNavigation = skipNavigation;
        }
        if (maxRetries != null) {
            this.maxRetries = maxRetries;
        }
        // An explicitly passed rotation count is applied unless `userData.__crawlee` already
        // carries one, mirroring the "don't override" rule used for `crawlDepth` below.
        if (sessionRotationCount != null && this.userData.__crawlee?.sessionRotationCount == null) {
            this.sessionRotationCount = sessionRotationCount;
        }
        if (crawlDepth != null) {
            const internals = this.userData.__crawlee;
            if (internals.crawlDepth == null) {
                internals.crawlDepth = crawlDepth;
            }
        }
        // If it's already set, don't override it (for instance when fetching from storage)
        if (enqueueStrategy) {
            this.enqueueStrategy ?? (this.enqueueStrategy = enqueueStrategy);
        }
    }
    /** Tells the crawler processing this request to skip the navigation and process the request directly. */
    get skipNavigation() {
        return this.userData.__crawlee?.skipNavigation ?? false;
    }
    /** Tells the crawler processing this request to skip the navigation and process the request directly. */
    set skipNavigation(value) {
        if (!this.userData.__crawlee) {
            this.userData.__crawlee = { skipNavigation: value };
        }
        else {
            this.userData.__crawlee.skipNavigation = value;
        }
    }
    /**
     * Depth of the request in the current crawl tree.
     * Note that this is dependent on the crawler setup and might produce unexpected results when used with multiple crawlers.
     */
    get crawlDepth() {
        return this.userData.__crawlee?.crawlDepth ?? 0;
    }
    /**
     * Depth of the request in the current crawl tree.
     * Note that this is dependent on the crawler setup and might produce unexpected results when used with multiple crawlers.
     */
    set crawlDepth(value) {
        const data = this.userData;
        if (data.__crawlee == null) {
            data.__crawlee = {};
        }
        this.userData.__crawlee.crawlDepth = value;
    }
    /** Indicates the number of times the crawling of the request has rotated the session due to a session or a proxy error. */
    get sessionRotationCount() {
        return this.userData.__crawlee?.sessionRotationCount ?? 0;
    }
    /** Indicates the number of times the crawling of the request has rotated the session due to a session or a proxy error. */
    set sessionRotationCount(value) {
        if (!this.userData.__crawlee) {
            this.userData.__crawlee = { sessionRotationCount: value };
        }
        else {
            this.userData.__crawlee.sessionRotationCount = value;
        }
    }
    /** shortcut for getting `request.userData.label` */
    get label() {
        return this.userData.label;
    }
    /** shortcut for setting `request.userData.label` */
    set label(value) {
        this.userData.label = value;
    }
    /** Maximum number of retries for this request. Allows to override the global `maxRequestRetries` option of `BasicCrawler`. */
    get maxRetries() {
        return this.userData.__crawlee?.maxRetries;
    }
    /** Maximum number of retries for this request. Allows to override the global `maxRequestRetries` option of `BasicCrawler`. */
    set maxRetries(value) {
        if (!this.userData.__crawlee) {
            this.userData.__crawlee = { maxRetries: value };
        }
        else {
            this.userData.__crawlee.maxRetries = value;
        }
    }
    /** Describes the request's current lifecycle state. */
    get state() {
        return this.userData.__crawlee?.state ?? RequestState.UNPROCESSED;
    }
    /** Describes the request's current lifecycle state. */
    set state(value) {
        if (!this.userData.__crawlee) {
            this.userData.__crawlee = { state: value };
        }
        else {
            this.userData.__crawlee.state = value;
        }
    }
    /** Enqueue strategy stored in `userData.__crawlee`, or `undefined` when none is stored. */
    get enqueueStrategy() {
        return this.userData.__crawlee?.enqueueStrategy;
    }
    /** Stores the enqueue strategy in `userData.__crawlee`, creating that bag if it is missing. */
    set enqueueStrategy(value) {
        if (!this.userData.__crawlee) {
            this.userData.__crawlee = { enqueueStrategy: value };
        }
        else {
            this.userData.__crawlee.enqueueStrategy = value;
        }
    }
    /**
     * Stores information about an error that occurred during processing of this request.
     *
     * You should always use Error instances when throwing errors in JavaScript.
     *
     * Nevertheless, to improve the debugging experience when using third party libraries
     * that may not always throw an Error instance, the function performs a type
     * inspection of the passed argument and attempts to extract as much information
     * as possible, since just throwing a bad type error makes any debugging rather difficult.
     *
     * The stored entry is always a string, so `errorMessages` keeps satisfying the
     * `array.ofType(string)` predicate the constructor applies to it.
     *
     * @param errorOrMessage Error object or error message to be stored in the request.
     * @param [options]
     * @param [options.omitStack] When truthy, stores `error.message` instead of `error.stack` for Error instances.
     */
    pushErrorMessage(errorOrMessage, options = {}) {
        const { omitStack } = options;
        let message;
        const type = typeof errorOrMessage;
        if (type === 'object') {
            if (!errorOrMessage) {
                message = 'null';
            }
            else if (errorOrMessage instanceof Error) {
                // .stack includes the message; fall back to .message when there is no stack
                message = omitStack
                    ? errorOrMessage.message
                    : (errorOrMessage.stack ?? errorOrMessage.message);
            }
            else if (Reflect.has(Object(errorOrMessage), 'message')) {
                message = Reflect.get(Object(errorOrMessage), 'message');
            }
            else {
                message = describeObject(errorOrMessage);
            }
        }
        else if (type === 'undefined') {
            message = 'undefined';
        }
        else {
            message = errorOrMessage.toString();
        }
        // A `message` property (or a custom `toString`) may yield a non-string value.
        this.errorMessages.push(typeof message === 'string' ? message : inspectSafely(message));
    }
    // TODO: only for better BC, remove in v4
    _computeUniqueKey(options) {
        return Request.computeUniqueKey(options);
    }
    // TODO: only for better BC, remove in v4
    _hashPayload(payload) {
        return Request.hashPayload(payload);
    }
    /**
     * Computes the unique key for the given request parameters.
     *
     * Returns the normalized URL by default; with `useExtendedUniqueKey` the result is
     * `METHOD(payloadHash):normalizedUrl`, where the hash part is empty when there is no payload.
     *
     * @internal
     */
    static computeUniqueKey({ url, method = 'GET', payload, keepUrlFragment = false, useExtendedUniqueKey = false, }) {
        const normalizedMethod = method.toUpperCase();
        const normalizedUrl = (0, utilities_1.normalizeUrl)(url, keepUrlFragment) || url; // It returns null when url is invalid, causing weird errors.
        if (!useExtendedUniqueKey) {
            if (normalizedMethod !== 'GET' && payload) {
                // Using log.deprecated to log only once. We should add log.once or some such.
                log.deprecated(`We've encountered a ${normalizedMethod} Request with a payload. ` +
                    'This is fine. Just letting you know that if your requests point to the same URL ' +
                    'and differ only in method and payload, you should see the "useExtendedUniqueKey" option of Request constructor.');
            }
            return normalizedUrl;
        }
        const payloadHash = payload ? Request.hashPayload(payload) : '';
        return `${normalizedMethod}(${payloadHash}):${normalizedUrl}`;
    }
    /**
     * Returns the first 8 characters of the payload's base64 SHA-256 digest,
     * after stripping the `+`, `/` and `=` characters.
     *
     * @internal
     */
    static hashPayload(payload) {
        return node_crypto_1.default.createHash('sha256').update(payload).digest('base64').replace(/[+/=]/g, '').substring(0, 8);
    }
}
exports.Request = Request;
//# sourceMappingURL=request.js.map