// @crawlee/core — RequestQueue v2 (compiled output)
// The scalable web crawling and scraping library for JavaScript/Node.js.
// Enables development of data extraction and web automation jobs (not only)
// with headless Chrome and Puppeteer.
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.RequestQueue = void 0;
const configuration_1 = require("../configuration");
const access_checking_1 = require("./access_checking");
const request_provider_1 = require("./request_provider");
const utils_1 = require("./utils");
// Double the limit of RequestQueue v1 (1_000_000) as we also store keyed by request.id, not just from uniqueKey
const MAX_CACHED_REQUESTS = 2000000;
/**
 * This number must be large enough so that processing of all these requests cannot be done in
 * a time lower than expected maximum latency of DynamoDB, but low enough not to waste too much memory.
 * @internal
 */
const RECENTLY_HANDLED_CACHE_SIZE = 1000;
// Maximum number of requests fetched (and locked) by a single `listAndLockHead()` call.
const LIST_AND_LOCK_HEAD_LIMIT = 25;
// Refill the local queue head once it shrinks to this many requests or fewer
// (we prefetch ahead of time to minimize dead time while crawling).
const QUEUE_HEAD_REFILL_THRESHOLD = 1;
/**
* Represents a queue of URLs to crawl, which is used for deep crawling of websites
* where you start with several URLs and then recursively
* follow links to other pages. The data structure supports both breadth-first and depth-first crawling orders.
*
* Each URL is represented using an instance of the {@link Request} class.
* The queue can only contain unique URLs. More precisely, it can only contain {@link Request} instances
* with distinct `uniqueKey` properties. By default, `uniqueKey` is generated from the URL, but it can also be overridden.
* To add a single URL multiple times to the queue,
* corresponding {@link Request} objects will need to have different `uniqueKey` properties.
*
* Do not instantiate this class directly, use the {@link RequestQueue.open} function instead.
*
* `RequestQueue` is used by {@link BasicCrawler}, {@link CheerioCrawler}, {@link PuppeteerCrawler}
* and {@link PlaywrightCrawler} as a source of URLs to crawl.
* Unlike {@link RequestList}, `RequestQueue` supports dynamic adding and removing of requests.
* On the other hand, the queue is not optimized for operations that add or remove a large number of URLs in a batch.
*
* **Example usage:**
*
* ```javascript
* // Open the default request queue associated with the crawler run
* const queue = await RequestQueue.open();
*
* // Open a named request queue
* const queueWithName = await RequestQueue.open('some-name');
*
* // Enqueue few requests
* await queue.addRequest({ url: 'http://example.com/aaa' });
* await queue.addRequest({ url: 'http://example.com/bbb' });
* await queue.addRequest({ url: 'http://example.com/foo/bar' }, { forefront: true });
* ```
* @category Sources
*/
class RequestQueue extends request_provider_1.RequestProvider {
    /**
     * @param options Queue options, forwarded to {@link RequestProvider} together
     *   with the v2-specific cache sizes and log prefix.
     * @param config Crawlee configuration; defaults to the global configuration.
     */
    constructor(options, config = configuration_1.Configuration.getGlobalConfig()) {
        super({
            ...options,
            logPrefix: 'RequestQueue2',
            recentlyHandledRequestsMaxSize: RECENTLY_HANDLED_CACHE_SIZE,
            requestCacheMaxSize: MAX_CACHED_REQUESTS,
        }, config);
        // In-flight `_listHeadAndLock()` promise, shared so that concurrent
        // `ensureHeadIsNonEmpty()` callers await a single fetch instead of racing
        // (see `ensureHeadIsNonEmpty()`). `null` when no fetch is running.
        Object.defineProperty(this, "listHeadAndLockPromise", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: null
        });
        // Whether the remote queue still contains requests locked by some client.
        // Stays `undefined` until the storage client reports it via
        // `listAndLockHead()`; once defined, `isFinished()` uses it instead of
        // the legacy head-listing algorithm.
        Object.defineProperty(this, "queueHasLockedRequests", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: undefined
        });
        // Set when a new forefront request was enqueued or reclaimed; forces the
        // next `ensureHeadIsNonEmpty()` to re-list the head even if the local
        // head is non-empty, so the forefront request is not processed late.
        Object.defineProperty(this, "shouldCheckForForefrontRequests", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: false
        });
        // Count of requests handed out by `fetchNextRequest()` and not yet
        // marked handled; used by `isFinished()` to decide whether locked
        // requests in the queue belong to us or to another client.
        Object.defineProperty(this, "dequeuedRequestCount", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: 0
        });
        // Release all locks we hold when this crawler run is migrated or aborted,
        // so other clients can pick the requests up immediately.
        const eventManager = config.getEventManager();
        eventManager.on("migrating" /* EventType.MIGRATING */, async () => {
            await this._clearPossibleLocks();
        });
        eventManager.on("aborting" /* EventType.ABORTING */, async () => {
            await this._clearPossibleLocks();
        });
    }
    /**
     * Caches information about request to beware of unneeded addRequest() calls.
     *
     * On top of the base class entry (keyed by the unique-key hash), stores a
     * second entry keyed by the request id, so head processing can look
     * requests up by id (see `getOrHydrateRequest()`).
     */
    _cacheRequest(cacheKey, queueOperationInfo) {
        super._cacheRequest(cacheKey, queueOperationInfo);
        // Drop any stale id-keyed entry before re-inserting the fresh one.
        this.requestCache.remove(queueOperationInfo.requestId);
        this.requestCache.add(queueOperationInfo.requestId, {
            id: queueOperationInfo.requestId,
            isHandled: queueOperationInfo.wasAlreadyHandled,
            uniqueKey: queueOperationInfo.uniqueKey,
            forefront: queueOperationInfo.forefront,
            hydrated: null,
            lockExpiresAt: null,
        });
    }
    /**
     * @inheritDoc
     */
    async addRequest(requestLike, options = {}) {
        const result = await super.addRequest(requestLike, options);
        // A newly added forefront request must trigger a head re-fetch so that it
        // can jump ahead of the locally cached queue head.
        if (!result.wasAlreadyPresent && options.forefront) {
            this.shouldCheckForForefrontRequests = true;
        }
        return result;
    }
    /**
     * @inheritDoc
     */
    async addRequests(requestsLike, options = {}) {
        const result = await super.addRequests(requestsLike, options);
        // One new forefront request is enough to require a head re-fetch.
        for (const request of result.processedRequests) {
            if (!request.wasAlreadyPresent && options.forefront) {
                this.shouldCheckForForefrontRequests = true;
                break;
            }
        }
        return result;
    }
    /**
     * @inheritDoc
     */
    async fetchNextRequest() {
        (0, access_checking_1.checkStorageAccess)();
        if (this.queuePausedForMigration) {
            return null;
        }
        this.lastActivity = new Date();
        await this.ensureHeadIsNonEmpty();
        const nextRequestId = this.queueHeadIds.removeFirst();
        // We are likely done at this point.
        if (!nextRequestId) {
            return null;
        }
        const request = await this.getOrHydrateRequest(nextRequestId);
        // NOTE: It can happen that the queue head index is inconsistent with the main queue table. This can occur in two situations:
        // 1) Queue head index is ahead of the main table and the request is not present in the main table yet (i.e. getRequest() returned null).
        // In this case, keep the request marked as in progress for a short while,
        // so that isFinished() doesn't return true and _ensureHeadIsNonEmpty() doesn't not load the request
        // into the queueHeadDict straight again. After the interval expires, fetchNextRequest()
        // will try to fetch this request again, until it eventually appears in the main table.
        if (!request) {
            this.log.debug('Cannot find a request from the beginning of queue or lost lock, will be retried later', {
                nextRequestId,
            });
            return null;
        }
        // 2) Queue head index is behind the main table and the underlying request was already handled
        // (by some other client, since we keep the track of handled requests in recentlyHandled dictionary).
        // We just add the request to the recentlyHandled dictionary so that next call to _ensureHeadIsNonEmpty()
        // will not put the request again to queueHeadDict.
        if (request.handledAt) {
            this.log.debug('Request fetched from the beginning of queue was already handled', { nextRequestId });
            return null;
        }
        // The request is now "ours" until it is marked handled or reclaimed.
        this.dequeuedRequestCount += 1;
        return request;
    }
    /**
     * @inheritDoc
     */
    async markRequestHandled(request) {
        // NOTE(review): the counter is decremented before awaiting the parent call,
        // so a failure in `super.markRequestHandled()` leaves the count reduced -
        // presumably acceptable since the count only dampens a log message in
        // `isFinished()`; confirm before relying on it elsewhere.
        this.dequeuedRequestCount -= 1;
        return await super.markRequestHandled(request);
    }
    /**
     * @inheritDoc
     */
    async isFinished() {
        // We are not finished if we're still adding new requests in the background
        if (this.inProgressRequestBatchCount > 0) {
            return false;
        }
        // If the local queue head is non-empty, we don't need to query the "upstream" queue to know we are not finished yet
        if (this.queueHeadIds.length() > 0) {
            return false;
        }
        // Local queue head is empty - try to fetch and lock more requests
        await this.ensureHeadIsNonEmpty();
        // We managed to lock something - we are not finished
        if (this.queueHeadIds.length() > 0) {
            return false;
        }
        // We could not lock any new requests - decide based on whether the queue contains requests locked by another client
        if (this.queueHasLockedRequests !== undefined) {
            // The `% 25` was absolutely arbitrarily picked. It's just to not spam the logs too much.
            if (this.queueHasLockedRequests &&
                this.dequeuedRequestCount === 0 &&
                ++this.isFinishedCalledWhileHeadWasNotEmpty % 25 === 0) {
                this.log.info('The queue still contains requests locked by another client');
            }
            return !this.queueHasLockedRequests;
        }
        // The following is a legacy algorithm for checking if the queue is finished. It is used only for request queue clients that do not provide the `queueHasLockedRequests` flag.
        const currentHead = await this.client.listHead({ limit: 2 });
        if (currentHead.items.length === 0) {
            return true;
        }
        // Give users some more concrete info as to why their crawlers seem to be "hanging" doing nothing while we're waiting because the queue is technically
        // not empty. We decided that a queue with elements in its head but that are also locked shouldn't return true in this function.
        // If that ever changes, this function might need a rewrite
        // The `% 25` was absolutely arbitrarily picked. It's just to not spam the logs too much. This is also a very specific path that most crawlers shouldn't hit
        if (++this.isFinishedCalledWhileHeadWasNotEmpty % 25 === 0) {
            const requests = await Promise.all(currentHead.items.map(async (item) => this.client.getRequest(item.id)));
            this.log.info(`Queue head still returned requests that need to be processed (or that are locked by other clients)`, {
                requests: requests
                    .map((r) => {
                    if (!r) {
                        return null;
                    }
                    return {
                        id: r.id,
                        lockExpiresAt: r.lockExpiresAt,
                        lockedBy: r.lockByClient,
                    };
                })
                    .filter(Boolean),
                clientKey: this.clientKey,
            });
        }
        else {
            this.log.debug('Queue head still returned requests that need to be processed (or that are locked by other clients)', {
                requestIds: currentHead.items.map((item) => item.id),
            });
        }
        return false;
    }
    /**
     * @inheritDoc
     */
    async reclaimRequest(...args) {
        const res = await super.reclaimRequest(...args);
        if (res) {
            const [request, options] = args;
            // A reclaimed forefront request should be re-listed as soon as possible.
            if (options?.forefront) {
                this.shouldCheckForForefrontRequests = true;
            }
            // Try to delete the request lock if possible
            try {
                await this.client.deleteRequestLock(request.id, { forefront: options?.forefront ?? false });
            }
            catch (err) {
                this.log.debug(`Failed to delete request lock for request ${request.id}`, { err });
            }
        }
        return res;
    }
    /**
     * Ensures that the locally cached queue head has requests to process,
     * fetching (and locking) more from the remote queue when it runs low.
     * Concurrent callers share a single in-flight fetch via
     * `listHeadAndLockPromise`.
     */
    async ensureHeadIsNonEmpty() {
        (0, access_checking_1.checkStorageAccess)();
        // Stop fetching if we are paused for migration
        if (this.queuePausedForMigration) {
            return;
        }
        // We want to fetch ahead of time to minimize dead time
        // If we need to check for newly added forefront requests, we do it even if we already have some locked requests
        if (this.queueHeadIds.length() > QUEUE_HEAD_REFILL_THRESHOLD && !this.shouldCheckForForefrontRequests) {
            return;
        }
        // Start a fetch only if one is not already running; clear the shared
        // promise once it settles so the next call can start a fresh fetch.
        this.listHeadAndLockPromise ?? (this.listHeadAndLockPromise = this._listHeadAndLock().finally(() => {
            this.listHeadAndLockPromise = null;
        }));
        await this.listHeadAndLockPromise;
    }
    /**
     * Best-effort release of the lock on a single request. Failures are only
     * logged at debug level - the lock will expire on its own eventually.
     */
    async giveUpLock(id, uniqueKey) {
        if (id === undefined) {
            return;
        }
        try {
            await this.client.deleteRequestLock(id);
        }
        catch {
            this.log.debug('Failed to delete request lock', { id, uniqueKey });
        }
    }
    /**
     * Fetches (and locks) a batch of requests from the remote queue head,
     * caches them locally and inserts them into the local queue head -
     * forefront requests first. Requests that would overflow the local head
     * limit are unlocked again.
     */
    async _listHeadAndLock() {
        // Make a copy so that we can clear the flag only if the whole method executes after the flag was set
        // (i.e, it was not set in the middle of the execution of the method)
        const shouldCheckForForefrontRequests = this.shouldCheckForForefrontRequests;
        // NOTE: in theory, if we're not checking for forefront requests, we could fetch just enough requests to fill the local queue head to the limit.
        // We chose not to do this because 1. it's simpler and 2. the queue is being processed while we're fetching and we want to avoid underruns.
        // If we are checking for forefront requests, we need to fetch enough requests to be sure that we won't miss any new forefront ones.
        const headData = await this.client.listAndLockHead({
            limit: LIST_AND_LOCK_HEAD_LIMIT,
            lockSecs: this.requestLockSecs,
        });
        this.queueHasLockedRequests = headData.queueHasLockedRequests;
        const headIdBuffer = [];
        const forefrontHeadIdBuffer = [];
        // Go through the fetched requests, ensure they are cached locally and sort them into normal and forefront groups
        for (const { id, uniqueKey } of headData.items) {
            if (!id || !uniqueKey) {
                this.log.warning(`Skipping request from queue head as it's invalid. Please report this with the provided metadata!`, {
                    id,
                    uniqueKey,
                });
                // Remove the lock from the request for now, so that it can be picked up later
                // This may/may not succeed, but that's fine
                await this.giveUpLock(id, uniqueKey);
                continue;
            }
            // If we remember that we added the request ourselves and we added it to the forefront,
            // we will put it to the beginning of the local queue head to preserve the expected order.
            // If we do not remember that, we will enqueue it normally.
            const forefront = this.requestCache.get((0, utils_1.getRequestId)(uniqueKey))?.forefront ?? false;
            if (forefront) {
                // `unshift` reverses the order so that, after insertion below, the
                // newest forefront request ends up first in the local head.
                forefrontHeadIdBuffer.unshift(id);
            }
            else {
                headIdBuffer.push(id);
            }
            // Ensure that the request is cached locally
            this._cacheRequest((0, utils_1.getRequestId)(uniqueKey), {
                requestId: id,
                uniqueKey,
                wasAlreadyPresent: true,
                wasAlreadyHandled: false,
                forefront,
            });
        }
        // Insert the newly fetched requests into the local queue head
        for (const id of headIdBuffer) {
            this.queueHeadIds.add(id, id, false);
        }
        for (const id of forefrontHeadIdBuffer) {
            this.queueHeadIds.add(id, id, true);
        }
        // Unlock and forget requests that would make the local queue head grow over the limit
        const toUnlock = [];
        const limit = shouldCheckForForefrontRequests
            ? LIST_AND_LOCK_HEAD_LIMIT // we may have received up to LIST_AND_LOCK_HEAD_LIMIT newly added forefront requests - we need to make sure that anything we already had in the queue gets unlocked
            : LIST_AND_LOCK_HEAD_LIMIT + QUEUE_HEAD_REFILL_THRESHOLD; // we tolerate up to QUEUE_HEAD_REFILL_THRESHOLD additional requests to avoid frequent, yet unnecessary unlocks
        while (this.queueHeadIds.length() > limit) {
            toUnlock.push(this.queueHeadIds.removeLast());
        }
        if (toUnlock.length > 0) {
            await Promise.all(toUnlock.map(async (id) => await this.giveUpLock(id)));
        }
        // We went through the whole procedure after `this.shouldCheckForForefrontRequests` was set -> we can clear the flag now
        if (shouldCheckForForefrontRequests) {
            this.shouldCheckForForefrontRequests = false;
        }
    }
    /**
     * Returns the full (hydrated) request for the given id while making sure we
     * still hold a valid lock on it. Returns `null` when the lock was lost or
     * the request is not present in the main queue table yet.
     */
    async getOrHydrateRequest(requestId) {
        (0, access_checking_1.checkStorageAccess)();
        const cachedEntry = this.requestCache.get(requestId);
        if (!cachedEntry) {
            // 2.1. Attempt to prolong the request lock to see if we still own the request
            const prolongResult = await this._prolongRequestLock(requestId);
            if (!prolongResult) {
                return null;
            }
            // 2.1.1. If successful, hydrate the request and return it
            const hydratedRequest = await this.getRequest(requestId);
            // Queue head index is ahead of the main table and the request is not present in the main table yet (i.e. getRequest() returned null).
            if (!hydratedRequest) {
                // Remove the lock from the request for now, so that it can be picked up later
                // This may/may not succeed, but that's fine
                try {
                    await this.client.deleteRequestLock(requestId);
                }
                catch {
                    // Ignore
                }
                return null;
            }
            this.requestCache.add(requestId, {
                id: requestId,
                uniqueKey: hydratedRequest.uniqueKey,
                hydrated: hydratedRequest,
                isHandled: hydratedRequest.handledAt !== null,
                lockExpiresAt: prolongResult.getTime(),
                forefront: false,
            });
            return hydratedRequest;
        }
        // 1.1. If hydrated, prolong the lock more and return it
        if (cachedEntry.hydrated) {
            // 1.1.1. If the lock expired on the hydrated requests, try to prolong. If we fail, we lost the request (or it was handled already)
            if (cachedEntry.lockExpiresAt && cachedEntry.lockExpiresAt < Date.now()) {
                const prolonged = await this._prolongRequestLock(cachedEntry.id);
                if (!prolonged) {
                    return null;
                }
                cachedEntry.lockExpiresAt = prolonged.getTime();
            }
            return cachedEntry.hydrated;
        }
        // 1.2. If not hydrated, try to prolong the lock first (to ensure we keep it in our queue), hydrate and return it
        const prolonged = await this._prolongRequestLock(cachedEntry.id);
        if (!prolonged) {
            return null;
        }
        // This might still return null if the queue head is inconsistent with the main queue table.
        const hydratedRequest = await this.getRequest(cachedEntry.id);
        cachedEntry.hydrated = hydratedRequest;
        // Queue head index is ahead of the main table and the request is not present in the main table yet (i.e. getRequest() returned null).
        if (!hydratedRequest) {
            // Remove the lock from the request for now, so that it can be picked up later
            // This may/may not succeed, but that's fine
            try {
                await this.client.deleteRequestLock(cachedEntry.id);
            }
            catch {
                // Ignore
            }
            return null;
        }
        return hydratedRequest;
    }
    /**
     * Extends our lock on the given request by `this.requestLockSecs`.
     * @returns The new lock expiration `Date`, or `null` if prolonging failed -
     *   most likely because we no longer own the lock or the request was handled.
     */
    async _prolongRequestLock(requestId) {
        try {
            const res = await this.client.prolongRequestLock(requestId, { lockSecs: this.requestLockSecs });
            return res.lockExpiresAt;
        }
        catch (err) {
            // Most likely we do not own the lock anymore
            this.log.warning(`Failed to prolong lock for cached request ${requestId}, either lost the lock or the request was already handled\n`, {
                err,
            });
            return null;
        }
    }
    /**
     * Resets the v2-specific state on top of the base class reset.
     */
    _reset() {
        super._reset();
        this.listHeadAndLockPromise = null;
        this.queueHasLockedRequests = undefined;
    }
    _maybeAddRequestToQueueHead() {
        // Do nothing for request queue v2, as we are only able to lock requests when listing the head
    }
    /**
     * Pauses the queue and releases the locks on every request in the local
     * queue head, so other clients can pick them up. Invoked on the
     * "migrating" and "aborting" events (see constructor).
     */
    async _clearPossibleLocks() {
        this.queuePausedForMigration = true;
        let requestId;
        // eslint-disable-next-line no-cond-assign
        while ((requestId = this.queueHeadIds.removeFirst()) !== null) {
            try {
                await this.client.deleteRequestLock(requestId);
            }
            catch {
                // We don't have the lock, or the request was never locked. Either way it's fine
            }
        }
    }
    /**
     * @inheritDoc
     */
    static async open(...args) {
        return super.open(...args);
    }
}
exports.RequestQueue = RequestQueue;
//# sourceMappingURL=request_queue_v2.js.map