UNPKG

@crawlee/core

Version:

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs — not only with headless Chrome and Puppeteer.

333 lines • 17.3 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.RequestQueueV1 = void 0; const promises_1 = require("node:timers/promises"); const consts_1 = require("@apify/consts"); const configuration_1 = require("../configuration"); const access_checking_1 = require("./access_checking"); const request_provider_1 = require("./request_provider"); const utils_1 = require("./utils"); const MAX_CACHED_REQUESTS = 1000000; /** * This number must be large enough so that processing of all these requests cannot be done in * a time lower than expected maximum latency of DynamoDB, but low enough not to waste too much memory. * @internal */ const RECENTLY_HANDLED_CACHE_SIZE = 1000; /** * Represents a queue of URLs to crawl, which is used for deep crawling of websites * where you start with several URLs and then recursively * follow links to other pages. The data structure supports both breadth-first and depth-first crawling orders. * * Each URL is represented using an instance of the {@link Request} class. * The queue can only contain unique URLs. More precisely, it can only contain {@link Request} instances * with distinct `uniqueKey` properties. By default, `uniqueKey` is generated from the URL, but it can also be overridden. * To add a single URL multiple times to the queue, * corresponding {@link Request} objects will need to have different `uniqueKey` properties. * * Do not instantiate this class directly, use the {@link RequestQueue.open} function instead. * * `RequestQueue` is used by {@link BasicCrawler}, {@link CheerioCrawler}, {@link PuppeteerCrawler} * and {@link PlaywrightCrawler} as a source of URLs to crawl. * Unlike {@link RequestList}, `RequestQueue` supports dynamic adding and removing of requests. * On the other hand, the queue is not optimized for operations that add or remove a large number of URLs in a batch. 
* * `RequestQueue` stores its data either on local disk or in the Apify Cloud, * depending on whether the `APIFY_LOCAL_STORAGE_DIR` or `APIFY_TOKEN` environment variable is set. * * If the `APIFY_LOCAL_STORAGE_DIR` environment variable is set, the queue data is stored in * that directory in an SQLite database file. * * If the `APIFY_TOKEN` environment variable is set but `APIFY_LOCAL_STORAGE_DIR` is not, the data is stored in the * [Apify Request Queue](https://docs.apify.com/storage/request-queue) * cloud storage. Note that you can force usage of the cloud storage also by passing the `forceCloud` * option to {@link RequestQueue.open} function, * even if the `APIFY_LOCAL_STORAGE_DIR` variable is set. * * **Example usage:** * * ```javascript * // Open the default request queue associated with the crawler run * const queue = await RequestQueue.open(); * * // Open a named request queue * const queueWithName = await RequestQueue.open('some-name'); * * // Enqueue few requests * await queue.addRequest({ url: 'http://example.com/aaa' }); * await queue.addRequest({ url: 'http://example.com/bbb' }); * await queue.addRequest({ url: 'http://example.com/foo/bar' }, { forefront: true }); * ``` * @category Sources * * @deprecated RequestQueue v1 is deprecated and will be removed in the future. Please use {@link RequestQueue} instead. 
*/ class RequestQueue extends request_provider_1.RequestProvider { /** * @internal */ constructor(options, config = configuration_1.Configuration.getGlobalConfig()) { super({ ...options, logPrefix: 'RequestQueue', recentlyHandledRequestsMaxSize: RECENTLY_HANDLED_CACHE_SIZE, requestCacheMaxSize: MAX_CACHED_REQUESTS, }, config); Object.defineProperty(this, "queryQueueHeadPromise", { enumerable: true, configurable: true, writable: true, value: null }); Object.defineProperty(this, "inProgress", { enumerable: true, configurable: true, writable: true, value: new Set() }); } /** * @internal */ inProgressCount() { return this.inProgress.size; } /** * Returns a next request in the queue to be processed, or `null` if there are no more pending requests. * * Once you successfully finish processing of the request, you need to call * {@link RequestQueue.markRequestHandled} * to mark the request as handled in the queue. If there was some error in processing the request, * call {@link RequestQueue.reclaimRequest} instead, * so that the queue will give the request to some other consumer in another call to the `fetchNextRequest` function. * * Note that the `null` return value doesn't mean the queue processing finished, * it means there are currently no pending requests. * To check whether all requests in queue were finished, * use {@link RequestQueue.isFinished} instead. * * @returns * Returns the request object or `null` if there are no more pending requests. */ async fetchNextRequest() { (0, access_checking_1.checkStorageAccess)(); this.lastActivity = new Date(); await this.ensureHeadIsNonEmpty(); const nextRequestId = this.queueHeadIds.removeFirst(); // We are likely done at this point. if (!nextRequestId) return null; // This should never happen, but... 
if (this.inProgress.has(nextRequestId) || this.recentlyHandledRequestsCache.get(nextRequestId)) { this.log.warning('Queue head returned a request that is already in progress?!', { nextRequestId, inProgress: this.inProgress.has(nextRequestId), recentlyHandled: !!this.recentlyHandledRequestsCache.get(nextRequestId), }); return null; } this.inProgress.add(nextRequestId); this.lastActivity = new Date(); let request; try { request = await this.getRequest(nextRequestId); } catch (e) { // On error, remove the request from in progress, otherwise it would be there forever this.inProgress.delete(nextRequestId); throw e; } // NOTE: It can happen that the queue head index is inconsistent with the main queue table. This can occur in two situations: // 1) Queue head index is ahead of the main table and the request is not present in the main table yet (i.e. getRequest() returned null). // In this case, keep the request marked as in progress for a short while, // so that isFinished() doesn't return true and _ensureHeadIsNonEmpty() doesn't not load the request // into the queueHeadDict straight again. After the interval expires, fetchNextRequest() // will try to fetch this request again, until it eventually appears in the main table. if (!request) { this.log.debug('Cannot find a request from the beginning of queue, will be retried later', { nextRequestId, }); setTimeout(() => { this.inProgress.delete(nextRequestId); }, utils_1.STORAGE_CONSISTENCY_DELAY_MILLIS); return null; } // 2) Queue head index is behind the main table and the underlying request was already handled // (by some other client, since we keep the track of handled requests in recentlyHandled dictionary). // We just add the request to the recentlyHandled dictionary so that next call to _ensureHeadIsNonEmpty() // will not put the request again to queueHeadDict. 
if (request.handledAt) { this.log.debug('Request fetched from the beginning of queue was already handled', { nextRequestId }); this.recentlyHandledRequestsCache.add(nextRequestId, true); return null; } return request; } async ensureHeadIsNonEmpty() { // Alias for backwards compatibility await this._ensureHeadIsNonEmpty(); } /** * We always request more items than is in progress to ensure that something falls into head. * * @param [ensureConsistency] If true then query for queue head is retried until queueModifiedAt * is older than queryStartedAt by at least API_PROCESSED_REQUESTS_DELAY_MILLIS to ensure that queue * head is consistent. * @default false * @param [limit] How many queue head items will be fetched. * @param [iteration] Used when this function is called recursively to limit the recursion. * @returns Indicates if queue head is consistent (true) or inconsistent (false). */ async _ensureHeadIsNonEmpty(ensureConsistency = false, limit = Math.max(this.inProgressCount() * utils_1.QUERY_HEAD_BUFFER, utils_1.QUERY_HEAD_MIN_LENGTH), iteration = 0) { // If we are paused for migration, resolve immediately. if (this.queuePausedForMigration) { return true; } // If is nonempty resolve immediately. if (this.queueHeadIds.length() > 0) { return true; } if (!this.queryQueueHeadPromise) { const queryStartedAt = new Date(); this.queryQueueHeadPromise = this.client .listHead({ limit }) .then(({ items, queueModifiedAt, hadMultipleClients }) => { items.forEach(({ id: requestId, uniqueKey }) => { // Queue head index might be behind the main table, so ensure we don't recycle requests if (!requestId || !uniqueKey || this.inProgress.has(requestId) || this.recentlyHandledRequestsCache.get(requestId)) return; this.queueHeadIds.add(requestId, requestId, false); const forefront = this.requestCache.get((0, utils_1.getRequestId)(uniqueKey))?.forefront ?? 
false; this._cacheRequest((0, utils_1.getRequestId)(uniqueKey), { requestId, wasAlreadyHandled: false, wasAlreadyPresent: true, uniqueKey, forefront, }); }); // This is needed so that the next call to _ensureHeadIsNonEmpty() will fetch the queue head again. this.queryQueueHeadPromise = null; return { wasLimitReached: items.length >= limit, prevLimit: limit, queueModifiedAt: new Date(queueModifiedAt), queryStartedAt, hadMultipleClients, }; }); } const { queueModifiedAt, wasLimitReached, prevLimit, queryStartedAt, hadMultipleClients } = await this.queryQueueHeadPromise; // TODO: I feel this code below can be greatly simplified... // If queue is still empty then one of the following holds: // - the other calls waiting for this promise already consumed all the returned requests // - the limit was too low and contained only requests in progress // - the writes from other clients were not propagated yet // - the whole queue was processed and we are done // If limit was not reached in the call then there are no more requests to be returned. if (prevLimit >= consts_1.REQUEST_QUEUE_HEAD_MAX_LIMIT) { this.log.warning(`Reached the maximum number of requests in progress: ${consts_1.REQUEST_QUEUE_HEAD_MAX_LIMIT}.`); } const shouldRepeatWithHigherLimit = this.queueHeadIds.length() === 0 && wasLimitReached && prevLimit < consts_1.REQUEST_QUEUE_HEAD_MAX_LIMIT; // If ensureConsistency=true then we must ensure that either: // - queueModifiedAt is older than queryStartedAt by at least API_PROCESSED_REQUESTS_DELAY_MILLIS // - hadMultipleClients=false and this.assumedTotalCount<=this.assumedHandledCount const isDatabaseConsistent = +queryStartedAt - +queueModifiedAt >= utils_1.API_PROCESSED_REQUESTS_DELAY_MILLIS; const isLocallyConsistent = !hadMultipleClients && this.assumedTotalCount <= this.assumedHandledCount; // Consistent information from one source is enough to consider request queue finished. 
const shouldRepeatForConsistency = ensureConsistency && !isDatabaseConsistent && !isLocallyConsistent; // If both are false then head is consistent and we may exit. if (!shouldRepeatWithHigherLimit && !shouldRepeatForConsistency) return true; // If we are querying for consistency then we limit the number of queries to MAX_QUERIES_FOR_CONSISTENCY. // If this is reached then we return false so that empty() and finished() returns possibly false negative. if (!shouldRepeatWithHigherLimit && iteration > utils_1.MAX_QUERIES_FOR_CONSISTENCY) return false; const nextLimit = shouldRepeatWithHigherLimit ? Math.round(prevLimit * 1.5) : prevLimit; // If we are repeating for consistency then wait required time. if (shouldRepeatForConsistency) { const delayMillis = utils_1.API_PROCESSED_REQUESTS_DELAY_MILLIS - (Date.now() - +queueModifiedAt); this.log.info(`Waiting for ${delayMillis}ms before considering the queue as finished to ensure that the data is consistent.`); await (0, promises_1.setTimeout)(delayMillis); } return this._ensureHeadIsNonEmpty(ensureConsistency, nextLimit, iteration + 1); } // RequestQueue v1 behavior overrides below async isFinished() { (0, access_checking_1.checkStorageAccess)(); if (Date.now() - +this.lastActivity > this.internalTimeoutMillis) { const message = `The request queue seems to be stuck for ${this.internalTimeoutMillis / 1e3}s, resetting internal state.`; this.log.warning(message, { inProgress: [...this.inProgress] }); this._reset(); } if (this.inProgressRequestBatchCount > 0) { return false; } if (this.queueHeadIds.length() > 0 || this.inProgressCount() > 0) return false; const isHeadConsistent = await this._ensureHeadIsNonEmpty(true); return isHeadConsistent && this.queueHeadIds.length() === 0 && this.inProgressCount() === 0; } /** * Reclaims a failed request back to the queue, so that it can be returned for processing later again * by another call to {@link RequestQueue.fetchNextRequest}. 
* The request record in the queue is updated using the provided `request` parameter. * For example, this lets you store the number of retries or error messages for the request. */ async reclaimRequest(...args) { (0, access_checking_1.checkStorageAccess)(); const [request, options] = args; const forefront = options?.forefront ?? false; const result = await super.reclaimRequest(...args); // Wait a little to increase a chance that the next call to fetchNextRequest() will return the request with updated data. // This is to compensate for the limitation of DynamoDB, where writes might not be immediately visible to subsequent reads. setTimeout(() => { if (!this.inProgress.has(request.id)) { this.log.debug('The request is no longer marked as in progress in the queue?!', { requestId: request.id, }); return; } this.inProgress.delete(request.id); // Performance optimization: add request straight to head if possible this._maybeAddRequestToQueueHead(request.id, forefront); }, utils_1.STORAGE_CONSISTENCY_DELAY_MILLIS); return result; } /** * @inheritdoc */ async markRequestHandled(request) { const res = await super.markRequestHandled(request); this.inProgress.delete(request.id); return res; } _reset() { super._reset(); this.inProgress.clear(); } /** * Opens a request queue and returns a promise resolving to an instance * of the {@link RequestQueue} class. * * {@link RequestQueue} represents a queue of URLs to crawl, which is stored either on local filesystem or in the cloud. * The queue is used for deep crawling of websites, where you start with several URLs and then * recursively follow links to other pages. The data structure supports both breadth-first * and depth-first crawling orders. * * For more details and code examples, see the {@link RequestQueue} class. * * @param [queueIdOrName] * ID or name of the request queue to be opened. If `null` or `undefined`, * the function returns the default request queue associated with the crawler run. 
* @param [options] Open Request Queue options. */ static async open(...args) { return super.open(...args); } } exports.RequestQueueV1 = RequestQueue; //# sourceMappingURL=request_queue.js.map