@crawlee/core
Version:
The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs with (but not limited to) headless Chrome and Puppeteer.
143 lines • 4.78 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.RequestManagerTandem = void 0;
const log_1 = require("../log");
/**
 * A request manager that combines a RequestList and a RequestQueue.
 *
 * Requests are read from the RequestList and moved into the RequestQueue
 * (one request per `fetchNextRequest()` call). Fetching, handling, reclaiming
 * and adding of requests is then delegated to the RequestQueue.
 */
class RequestManagerTandem {
    /**
     * @param requestList List whose requests are drained into the queue.
     * @param requestQueue Queue that fetch/handle/reclaim/add calls are delegated to.
     */
    constructor(requestList, requestQueue) {
        // Declare the instance fields as own, enumerable, writable properties
        // before assigning them (mirrors class-field "define" semantics).
        Object.defineProperty(this, "log", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "requestList", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "requestQueue", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        this.log = log_1.log.child({ prefix: 'RequestManagerTandem' });
        this.requestList = requestList;
        this.requestQueue = requestQueue;
    }
    /**
     * Moves the next request (a single one per call) from the RequestList
     * to the RequestQueue, passing `{ forefront: true }` to the queue.
     *
     * - When the list yields no request (`null` or `undefined`), nothing happens.
     * - When adding to the queue fails, the error is logged and the request is
     *   reclaimed back to the list; the error is not rethrown.
     * - When adding succeeds, the request is marked as handled in the list.
     * @private
     */
    async transferNextBatchToQueue() {
        const request = await this.requestList.fetchNextRequest();
        // `== null` covers both `null` and `undefined`, matching the falsy check
        // used by the async iterator below; there is nothing to transfer.
        if (request == null) {
            return;
        }
        try {
            await this.requestQueue.addRequest(request, { forefront: true });
        }
        catch (error) {
            // If requestQueue.addRequest() fails here then we must reclaim it back to
            // the RequestList because probably it's not yet in the queue!
            // The caught error is attached to the log record so the cause is not lost.
            this.log.error('Adding of request from the RequestList to the RequestQueue failed, reclaiming request back to the list.', { request, error });
            await this.requestList.reclaimRequest(request);
            return;
        }
        await this.requestList.markRequestHandled(request);
    }
    /**
     * Fetches the next request from the RequestQueue. Before doing so, if the
     * RequestList reports that it is neither empty nor finished, one request is
     * transferred from the list to the queue.
     * @inheritdoc
     * @returns Whatever `requestQueue.fetchNextRequest()` resolves to.
     */
    async fetchNextRequest() {
        // First, try to transfer a request from the requestList
        const [listEmpty, listFinished] = await Promise.all([
            this.requestList.isEmpty(),
            this.requestList.isFinished(),
        ]);
        if (!listEmpty && !listFinished) {
            await this.transferNextBatchToQueue();
        }
        // Try to fetch from queue after potential transfer
        return this.requestQueue.fetchNextRequest();
    }
    /**
     * Resolves to `true` only when both the RequestList and the RequestQueue
     * report that they are finished.
     * @inheritdoc
     */
    async isFinished() {
        const storagesFinished = await Promise.all([this.requestList.isFinished(), this.requestQueue.isFinished()]);
        return storagesFinished.every(Boolean);
    }
    /**
     * Resolves to `true` only when both the RequestList and the RequestQueue
     * report that they are empty.
     * @inheritdoc
     */
    async isEmpty() {
        const storagesEmpty = await Promise.all([this.requestList.isEmpty(), this.requestQueue.isEmpty()]);
        return storagesEmpty.every(Boolean);
    }
    /**
     * Resolves to the handled count reported by the RequestQueue.
     * @inheritdoc
     */
    async handledCount() {
        // Since one of the stores needs to have priority when both are present, we query the request queue - the request list will first be dumped into the queue and then left empty.
        return await this.requestQueue.handledCount();
    }
    /**
     * Returns the total count reported by the RequestQueue.
     * @inheritdoc
     */
    getTotalCount() {
        return this.requestQueue.getTotalCount();
    }
    /**
     * Returns the queue's pending count plus the number of list requests that
     * the list has not yet counted as handled (list length minus its handled count).
     * @inheritdoc
     */
    getPendingCount() {
        return this.requestQueue.getPendingCount() + this.requestList.length() - this.requestList.handledCount();
    }
    /**
     * Yields requests obtained via `fetchNextRequest()` until it returns a
     * falsy value.
     * @inheritdoc
     */
    async *[Symbol.asyncIterator]() {
        while (true) {
            const req = await this.fetchNextRequest();
            if (!req)
                break;
            yield req;
        }
    }
    /**
     * Delegates to `requestQueue.markRequestHandled()`.
     * @inheritdoc
     */
    async markRequestHandled(request) {
        return this.requestQueue.markRequestHandled(request);
    }
    /**
     * Delegates to `requestQueue.reclaimRequest()`.
     * @inheritdoc
     */
    async reclaimRequest(request, options) {
        return await this.requestQueue.reclaimRequest(request, options);
    }
    /**
     * Delegates to `requestQueue.addRequest()`; the RequestList is not involved.
     * @inheritdoc
     */
    async addRequest(requestLike, options) {
        return await this.requestQueue.addRequest(requestLike, options);
    }
    /**
     * Delegates to `requestQueue.addRequestsBatched()`; the RequestList is not involved.
     * @inheritdoc
     */
    async addRequestsBatched(requests, options) {
        return await this.requestQueue.addRequestsBatched(requests, options);
    }
}
exports.RequestManagerTandem = RequestManagerTandem;
//# sourceMappingURL=request_manager_tandem.js.map