@crawlee/core
Version:
The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.
682 lines • 28.5 kB
JavaScript
"use strict";
// Flag the exports object with `__esModule: true` (the interop marker emitted for transpiled ES modules).
Object.defineProperty(exports, "__esModule", { value: true });
// Placeholder for the export; the real binding is assigned after the class definition below.
exports.AutoscaledPool = void 0;
const tslib_1 = require("tslib");
// `ow` validates the constructor options and the values passed to the concurrency setters.
const ow_1 = tslib_1.__importDefault(require("ow"));
// Provides `addTimeoutToPromise`, used to enforce the per-task timeout.
const timeout_1 = require("@apify/timeout");
// Provides `betterSetInterval` / `betterClearInterval`, used for the pool's periodic jobs.
const utilities_1 = require("@apify/utilities");
const configuration_1 = require("../configuration");
// Provides `CriticalError`, which the pool avoids logging a second time.
const errors_1 = require("../errors");
// Default logger; the pool derives a child logger prefixed with 'AutoscaledPool'.
const log_1 = require("../log");
const snapshotter_1 = require("./snapshotter");
const system_status_1 = require("./system_status");
/**
 * Manages a pool of asynchronous resource-intensive tasks that are executed in parallel.
 * The pool only starts new tasks if there is enough free CPU and memory available
 * and the JavaScript event loop is not blocked.
 *
 * The information about the CPU and memory usage is obtained by the {@link Snapshotter} class,
 * which makes regular snapshots of system resources that may be either local
 * or from the Apify cloud infrastructure in case the process is running on the Apify platform.
 * Meaningful data gathered from these snapshots is provided to `AutoscaledPool` by the {@link SystemStatus} class.
 *
 * Before running the pool, you need to implement the following three functions:
 * {@link AutoscaledPoolOptions.runTaskFunction},
 * {@link AutoscaledPoolOptions.isTaskReadyFunction} and
 * {@link AutoscaledPoolOptions.isFinishedFunction}.
 *
 * The auto-scaled pool is started by calling the {@link AutoscaledPool.run} function.
 * The pool periodically queries the {@link AutoscaledPoolOptions.isTaskReadyFunction} function
 * for more tasks, managing optimal concurrency, until the function resolves to `false`. The pool then queries
 * the {@link AutoscaledPoolOptions.isFinishedFunction}. If it resolves to `true`, the run finishes after all running tasks complete.
 * If it resolves to `false`, it assumes there will be more tasks available later and keeps periodically querying for tasks.
 * If any of the tasks throws then the {@link AutoscaledPool.run} function rejects the promise with an error.
 *
 * The pool evaluates whether it should start a new task every time one of the tasks finishes
 * and also in the interval set by the `options.maybeRunIntervalSecs` parameter.
 *
 * **Example usage:**
 *
 * ```javascript
 * const pool = new AutoscaledPool({
 *     maxConcurrency: 50,
 *     runTaskFunction: async () => {
 *         // Run some resource-intensive asynchronous operation here.
 *     },
 *     isTaskReadyFunction: async () => {
 *         // Tell the pool whether more tasks are ready to be processed.
 *         // Return true or false
 *     },
 *     isFinishedFunction: async () => {
 *         // Tell the pool whether it should finish
 *         // or wait for more tasks to become available.
 *         // Return true or false
 *     }
 * });
 *
 * await pool.run();
 * ```
 * @category Scaling
 */
class AutoscaledPool {
    /**
     * Validates the options, applies defaults and wires up the snapshotter and system status helpers.
     *
     * Defaults applied when an option is omitted: `maxConcurrency` 200, `minConcurrency` 1,
     * `desiredConcurrencyRatio` 0.9, `scaleUpStepRatio` / `scaleDownStepRatio` 0.05,
     * `maybeRunIntervalSecs` 0.5, `loggingIntervalSecs` 60, `taskTimeoutSecs` 0 (no timeout),
     * `autoscaleIntervalSecs` 10, `maxTasksPerMinute` Infinity (no rate limit).
     *
     * @param options pool options; must match the exact shape validated below, otherwise `ow` throws
     * @param config configuration instance, defaults to the global configuration
     */
    constructor(options, config = configuration_1.Configuration.getGlobalConfig()) {
        Object.defineProperty(this, "config", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: config
        });
        Object.defineProperty(this, "log", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        // Configurable properties.
        Object.defineProperty(this, "desiredConcurrencyRatio", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "scaleUpStepRatio", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "scaleDownStepRatio", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "maybeRunIntervalMillis", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "loggingIntervalMillis", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "autoscaleIntervalMillis", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "taskTimeoutMillis", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "runTaskFunction", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "isFinishedFunction", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "isTaskReadyFunction", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "maxTasksPerMinute", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        // Internal properties.
        Object.defineProperty(this, "_minConcurrency", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "_maxConcurrency", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "_desiredConcurrency", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "_currentConcurrency", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: 0
        });
        Object.defineProperty(this, "isStopped", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: false
        });
        Object.defineProperty(this, "lastLoggingTime", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "resolve", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: null
        });
        Object.defineProperty(this, "reject", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: null
        });
        Object.defineProperty(this, "snapshotter", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "systemStatus", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "autoscaleInterval", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "maybeRunInterval", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "queryingIsTaskReady", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "queryingIsFinished", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "tasksDonePerSecondInterval", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        // Sliding window of 60 counters; index 0 counts the tasks started in the current slot.
        Object.defineProperty(this, "_tasksPerMinute", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: Array.from({ length: 60 }, () => 0)
        });
        (0, ow_1.default)(options, ow_1.default.object.exactShape({
            runTaskFunction: ow_1.default.function,
            isFinishedFunction: ow_1.default.function,
            isTaskReadyFunction: ow_1.default.function,
            maxConcurrency: ow_1.default.optional.number.integer.greaterThanOrEqual(1),
            minConcurrency: ow_1.default.optional.number.integer.greaterThanOrEqual(1),
            desiredConcurrency: ow_1.default.optional.number.integer.greaterThanOrEqual(1),
            desiredConcurrencyRatio: ow_1.default.optional.number.greaterThan(0).lessThan(1),
            scaleUpStepRatio: ow_1.default.optional.number.greaterThan(0).lessThan(1),
            scaleDownStepRatio: ow_1.default.optional.number.greaterThan(0).lessThan(1),
            maybeRunIntervalSecs: ow_1.default.optional.number.greaterThan(0),
            loggingIntervalSecs: ow_1.default.any(ow_1.default.number.greaterThan(0), ow_1.default.nullOrUndefined),
            autoscaleIntervalSecs: ow_1.default.optional.number.greaterThan(0),
            taskTimeoutSecs: ow_1.default.optional.number.greaterThanOrEqual(0),
            systemStatusOptions: ow_1.default.optional.object,
            snapshotterOptions: ow_1.default.optional.object,
            log: ow_1.default.optional.object,
            maxTasksPerMinute: ow_1.default.optional.number.integerOrInfinite.greaterThanOrEqual(1),
        }));
        const {
            runTaskFunction,
            isFinishedFunction,
            isTaskReadyFunction,
            maxConcurrency = 200,
            minConcurrency = 1,
            desiredConcurrency,
            desiredConcurrencyRatio = 0.9,
            scaleUpStepRatio = 0.05,
            scaleDownStepRatio = 0.05,
            maybeRunIntervalSecs = 0.5,
            loggingIntervalSecs = 60,
            taskTimeoutSecs = 0,
            autoscaleIntervalSecs = 10,
            systemStatusOptions,
            snapshotterOptions,
            log = log_1.log,
            maxTasksPerMinute = Infinity,
        } = options;
        this.log = log.child({ prefix: 'AutoscaledPool' });
        // Configurable properties.
        this.desiredConcurrencyRatio = desiredConcurrencyRatio;
        this.scaleUpStepRatio = scaleUpStepRatio;
        this.scaleDownStepRatio = scaleDownStepRatio;
        this.maybeRunIntervalMillis = maybeRunIntervalSecs * 1000;
        // An explicit `null` passes validation and yields 0 here, which disables the periodic state log.
        this.loggingIntervalMillis = loggingIntervalSecs * 1000;
        this.autoscaleIntervalMillis = autoscaleIntervalSecs * 1000;
        this.taskTimeoutMillis = taskTimeoutSecs * 1000;
        this.runTaskFunction = runTaskFunction;
        this.isFinishedFunction = isFinishedFunction;
        this.isTaskReadyFunction = isTaskReadyFunction;
        this.maxTasksPerMinute = maxTasksPerMinute;
        // Internal properties.
        this._minConcurrency = minConcurrency;
        this._maxConcurrency = maxConcurrency;
        // Start at the requested concurrency (or the minimum), but never above the maximum.
        this._desiredConcurrency = Math.min(desiredConcurrency ?? minConcurrency, maxConcurrency);
        this._currentConcurrency = 0;
        this.isStopped = false;
        this.resolve = null;
        this.reject = null;
        // These methods are handed to timers as bare callbacks, so they must be bound to the instance.
        this._autoscale = this._autoscale.bind(this);
        this._maybeRunTask = this._maybeRunTask.bind(this);
        this._incrementTasksDonePerSecond = this._incrementTasksDonePerSecond.bind(this);
        // Create instances with correct options.
        // Work on a shallow copy so the caller's `systemStatusOptions` object is not mutated.
        const ssoCopy = { ...systemStatusOptions };
        ssoCopy.snapshotter ?? (ssoCopy.snapshotter = new snapshotter_1.Snapshotter({
            ...snapshotterOptions,
            log: this.log,
            config: this.config,
            client: this.config.getStorageClient(),
        }));
        ssoCopy.config ?? (ssoCopy.config = this.config);
        this.snapshotter = ssoCopy.snapshotter;
        this.systemStatus = new system_status_1.SystemStatus(ssoCopy);
    }
    /**
     * Gets the minimum number of tasks running in parallel.
     */
    get minConcurrency() {
        return this._minConcurrency;
    }
    /**
     * Sets the minimum number of tasks running in parallel.
     *
     * *WARNING:* If you set this value too high with respect to the available system memory and CPU, your code might run extremely slow or crash.
     * If you're not sure, just keep the default value and the concurrency will scale up automatically.
     */
    set minConcurrency(value) {
        (0, ow_1.default)(value, ow_1.default.optional.number.integer.greaterThanOrEqual(1));
        this._minConcurrency = value;
    }
    /**
     * Gets the maximum number of tasks running in parallel.
     */
    get maxConcurrency() {
        return this._maxConcurrency;
    }
    /**
     * Sets the maximum number of tasks running in parallel.
     */
    set maxConcurrency(value) {
        (0, ow_1.default)(value, ow_1.default.optional.number.integer.greaterThanOrEqual(1));
        this._maxConcurrency = value;
    }
    /**
     * Gets the desired concurrency for the pool,
     * which is an estimated number of parallel tasks that the system can currently support.
     */
    get desiredConcurrency() {
        return this._desiredConcurrency;
    }
    /**
     * Sets the desired concurrency for the pool, i.e. the number of tasks that should be running
     * in parallel if there's large enough supply of tasks.
     */
    set desiredConcurrency(value) {
        (0, ow_1.default)(value, ow_1.default.optional.number.integer.greaterThanOrEqual(1));
        this._desiredConcurrency = value;
    }
    /**
     * Gets the number of parallel tasks currently running in the pool.
     */
    get currentConcurrency() {
        return this._currentConcurrency;
    }
    /**
     * Runs the auto-scaled pool. Returns a promise that gets resolved or rejected once
     * all the tasks are finished or one of them fails.
     */
    async run() {
        // The settle callbacks are stored on the instance so that other methods can finish or fail the run.
        const poolPromise = new Promise((resolve, reject) => {
            this.resolve = resolve;
            this.reject = reject;
        });
        await this.snapshotter.start();
        // This interval checks the system status and updates the desired concurrency accordingly.
        this.autoscaleInterval = (0, utilities_1.betterSetInterval)(this._autoscale, this.autoscaleIntervalMillis);
        // This is here because if we scale down to let's say 1, then after each promise is finished
        // this._maybeRunTask() doesn't trigger another one. So if that 1 instance gets stuck it results
        // in the crawler getting stuck and even after scaling up it never triggers another promise.
        this.maybeRunInterval = (0, utilities_1.betterSetInterval)(this._maybeRunTask, this.maybeRunIntervalMillis);
        if (this.maxTasksPerMinute !== Infinity) {
            // Start the interval that resets the counter of tasks per minute.
            this.tasksDonePerSecondInterval = (0, utilities_1.betterSetInterval)(this._incrementTasksDonePerSecond, 1000);
        }
        try {
            await poolPromise;
        }
        finally {
            // If resolve is null, the pool is already destroyed.
            if (this.resolve)
                await this._destroy();
        }
    }
    /**
     * Aborts the run of the auto-scaled pool and destroys it. The promise returned from
     * the {@link AutoscaledPool.run} function will immediately resolve, no more new tasks
     * will be spawned and all running tasks will be left in their current state.
     *
     * Due to the nature of the tasks, auto-scaled pool cannot reliably guarantee abortion
     * of all the running tasks, therefore, no abortion is attempted and some of the tasks
     * may finish, while others may not. Essentially, auto-scaled pool doesn't care about
     * their state after the invocation of `.abort()`, but that does not mean that some
     * parts of their asynchronous chains of commands will not execute.
     */
    async abort() {
        this.isStopped = true;
        if (this.resolve) {
            this.resolve();
            await this._destroy();
        }
    }
    /**
     * Prevents the auto-scaled pool from starting new tasks, but allows the running ones to finish.
     * Used together with {@link AutoscaledPool.resume}
     *
     * The function's promise will resolve once all running tasks have completed and the pool
     * is effectively idle. If the `timeoutSecs` argument is provided, the promise will reject
     * with a timeout error after the `timeoutSecs` seconds.
     *
     * The promise returned from the {@link AutoscaledPool.run} function will not resolve
     * when `.pause()` is invoked (unlike abort, which resolves it).
     *
     * Calling `pause()` on a pool that is already stopped returns immediately.
     *
     * @param timeoutSecs optional maximum time, in seconds, to wait for the running tasks
     */
    async pause(timeoutSecs) {
        if (this.isStopped)
            return;
        this.isStopped = true;
        await new Promise((resolve, reject) => {
            let timeout;
            let interval;
            if (timeoutSecs) {
                timeout = setTimeout(() => {
                    // Stop polling as well, otherwise the interval would keep the process
                    // alive for as long as the remaining tasks do not finish.
                    clearInterval(interval);
                    const err = new Error(`The pool's running tasks did not finish in ${timeoutSecs} secs after pool.pause() invocation.`);
                    reject(err);
                    // setTimeout expects milliseconds, the argument is in seconds.
                }, timeoutSecs * 1000);
            }
            // Poll until every running task is done.
            interval = setInterval(() => {
                if (this._currentConcurrency <= 0) {
                    // Clean up timeout and interval to prevent process hanging.
                    if (timeout)
                        clearTimeout(timeout);
                    clearInterval(interval);
                    resolve();
                }
            }, this.maybeRunIntervalMillis);
        });
    }
    /**
     * Resumes the operation of the autoscaled-pool by allowing more tasks to be run.
     * Used together with {@link AutoscaledPool.pause}
     *
     * Tasks will automatically start running again in `options.maybeRunIntervalSecs`.
     */
    resume() {
        this.isStopped = false;
    }
    /**
     * Explicitly check the queue for new tasks. The AutoscaledPool checks the queue for new tasks periodically,
     * every `maybeRunIntervalSecs` seconds. If you want to trigger the processing immediately, use this method.
     */
    async notify() {
        setImmediate(this._maybeRunTask);
    }
    /**
     * Starts a new task
     * if the number of running tasks (current concurrency) is lower than desired concurrency
     * and the system is not currently overloaded
     * and this.isTaskReadyFunction() returns true.
     *
     * It doesn't allow multiple concurrent runs of this method.
     *
     * @param intervalCallback callback supplied when invoked by the `maybeRunInterval`; it is
     *   called to let the interval schedule its next cycle. Omitted for direct invocations.
     */
    async _maybeRunTask(intervalCallback) {
        this.log.perf('Attempting to run a task.');
        // Check if the function was invoked by the maybeRunInterval and use an empty function if not.
        const done = intervalCallback || (() => { });
        // Prevent starting a new task if:
        // - the pool is paused or aborted
        if (this.isStopped) {
            this.log.perf('Task will not run. AutoscaledPool is stopped.');
            return done();
        }
        // - we are already querying for a task.
        if (this.queryingIsTaskReady) {
            this.log.perf('Task will not run. Waiting for a ready task.');
            return done();
        }
        // - we would exceed desired concurrency.
        if (this._currentConcurrency >= this._desiredConcurrency) {
            this.log.perf('Task will not run. Desired concurrency achieved.');
            return done();
        }
        // - system is overloaded now and we are at or above minConcurrency
        const currentStatus = this.systemStatus.getCurrentStatus();
        const { isSystemIdle } = currentStatus;
        if (!isSystemIdle && this._currentConcurrency >= this._minConcurrency) {
            this.log.perf('Task will not be run. System is overloaded.', currentStatus);
            return done();
        }
        // - a task is ready.
        this.queryingIsTaskReady = true;
        let isTaskReady;
        try {
            this.log.perf('Checking for ready tasks.');
            isTaskReady = await this.isTaskReadyFunction();
        }
        catch (e) {
            const err = e;
            this.log.perf('Checking for ready tasks failed.');
            // We might have already rejected this promise.
            if (this.reject) {
                // No need to log all concurrent errors.
                this.log.exception(err, 'isTaskReadyFunction failed');
                this.reject(err);
            }
        }
        finally {
            this.queryingIsTaskReady = false;
        }
        if (!isTaskReady) {
            this.log.perf('Task will not run. No tasks are ready.');
            done();
            // No tasks could mean that we're finished with all tasks.
            return this._maybeFinish();
        }
        // - we have already reached the maximum tasks per minute
        // we need to check this *after* checking if a task is ready to prevent hanging the pool
        // for an extra minute if there are no more tasks
        if (this._isOverMaxRequestLimit) {
            this.log.perf('Task will not run. Maximum tasks per minute reached.');
            return done();
        }
        try {
            // Everything's fine. Run task.
            this._currentConcurrency++;
            this._tasksPerMinute[0]++;
            // Try to run next task to build up concurrency,
            // but defer it so it doesn't create a cycle.
            setImmediate(this._maybeRunTask);
            // We need to restart interval here, so that it doesn't get blocked by a stalled task.
            done();
            // Execute the current task.
            this.log.perf('Running a task.');
            if (this.taskTimeoutMillis > 0) {
                await (0, timeout_1.addTimeoutToPromise)(async () => this.runTaskFunction(), this.taskTimeoutMillis, `runTaskFunction timed out after ${this.taskTimeoutMillis / 1000} seconds.`);
            }
            else {
                await this.runTaskFunction();
            }
            this.log.perf('Task finished.');
            this._currentConcurrency--;
            // Run task after the previous one finished.
            setImmediate(this._maybeRunTask);
        }
        catch (e) {
            // NOTE(review): on this path `_currentConcurrency` is not decremented. Releasing the slot here
            // would let the setImmediate chain above keep spawning tasks after the run was rejected,
            // so the accounting is deliberately left untouched.
            const err = e;
            this.log.perf('Running a task failed.');
            // We might have already rejected this promise.
            if (this.reject) {
                // No need to log all concurrent errors.
                if (
                // avoid reprinting the same critical error multiple times, as it will be printed by Nodejs at the end anyway
                !(e instanceof errors_1.CriticalError)) {
                    this.log.exception(err, 'runTaskFunction failed.');
                }
                this.reject(err);
            }
        }
        return undefined;
    }
    /**
     * Gets called every `autoscaleIntervalSecs` and evaluates the current system status.
     * If the system IS NOT overloaded and the settings allow it, it scales up.
     * If the system IS overloaded and the settings allow it, it scales down.
     *
     * @param intervalCallback callback that lets the interval schedule its next cycle
     */
    _autoscale(intervalCallback) {
        // Don't scale if paused.
        if (this.isStopped)
            return intervalCallback();
        // Don't scale if we've hit the maximum requests per minute
        if (this._isOverMaxRequestLimit)
            return intervalCallback();
        // Only scale up if:
        // - system has not been overloaded lately.
        const systemStatus = this.systemStatus.getHistoricalStatus();
        const { isSystemIdle } = systemStatus;
        // - we're not already at max concurrency.
        const weAreNotAtMax = this._desiredConcurrency < this._maxConcurrency;
        // - current concurrency reaches at least the given ratio of desired concurrency.
        const minCurrentConcurrency = Math.floor(this._desiredConcurrency * this.desiredConcurrencyRatio);
        const weAreReachingDesiredConcurrency = this._currentConcurrency >= minCurrentConcurrency;
        if (isSystemIdle && weAreNotAtMax && weAreReachingDesiredConcurrency)
            this._scaleUp(systemStatus);
        // Always scale down if:
        // - the system has been overloaded lately.
        const isSystemOverloaded = !isSystemIdle;
        // - we're over min concurrency.
        const weAreNotAtMin = this._desiredConcurrency > this._minConcurrency;
        if (isSystemOverloaded && weAreNotAtMin)
            this._scaleDown(systemStatus);
        // On periodic intervals, print comprehensive log information
        if (this.loggingIntervalMillis > 0) {
            const now = Date.now();
            if (this.lastLoggingTime == null) {
                // First pass only records the reference time; nothing is logged yet.
                this.lastLoggingTime = now;
            }
            else if (now > this.lastLoggingTime + this.loggingIntervalMillis) {
                this.lastLoggingTime = now;
                this.log.info('state', {
                    currentConcurrency: this._currentConcurrency,
                    desiredConcurrency: this._desiredConcurrency,
                    systemStatus,
                });
            }
        }
        // Start a new interval cycle.
        return intervalCallback();
    }
    /**
     * Scales the pool up by increasing
     * the desired concurrency by the scaleUpStepRatio, capped at the maximum concurrency.
     *
     * @param systemStatus for logging
     */
    _scaleUp(systemStatus) {
        // Remember the previous value; it cannot be derived from the new one once the result is capped.
        const oldConcurrency = this._desiredConcurrency;
        const step = Math.ceil(oldConcurrency * this.scaleUpStepRatio);
        this._desiredConcurrency = Math.min(this._maxConcurrency, oldConcurrency + step);
        this.log.debug('scaling up', {
            oldConcurrency,
            newConcurrency: this._desiredConcurrency,
            systemStatus,
        });
    }
    /**
     * Scales the pool down by decreasing
     * the desired concurrency by the scaleDownStepRatio, but never below the minimum concurrency.
     *
     * @param systemStatus for logging
     */
    _scaleDown(systemStatus) {
        // Remember the previous value; it cannot be derived from the new one once the result is floored.
        const oldConcurrency = this._desiredConcurrency;
        const step = Math.ceil(oldConcurrency * this.scaleDownStepRatio);
        this._desiredConcurrency = Math.max(this._minConcurrency, oldConcurrency - step);
        this.log.debug('scaling down', {
            oldConcurrency,
            newConcurrency: this._desiredConcurrency,
            systemStatus,
        });
    }
    /**
     * If there are no running tasks and this.isFinishedFunction() returns true then closes
     * the pool and resolves the pool's promise returned by the run() method.
     *
     * It doesn't allow multiple concurrent runs of this method.
     */
    async _maybeFinish() {
        if (this.queryingIsFinished)
            return;
        if (this._currentConcurrency > 0)
            return;
        this.queryingIsFinished = true;
        try {
            const isFinished = await this.isFinishedFunction();
            if (isFinished && this.resolve)
                this.resolve();
        }
        catch (e) {
            const err = e;
            if (this.reject) {
                // No need to log all concurrent errors.
                this.log.exception(err, 'isFinishedFunction failed.');
                this.reject(err);
            }
        }
        finally {
            this.queryingIsFinished = false;
        }
    }
    /**
     * Cleans up resources: drops the settle callbacks, clears the pool's intervals and stops the snapshotter.
     */
    async _destroy() {
        // Nulling these marks the pool as destroyed for run(), abort() and the error handlers.
        this.resolve = null;
        this.reject = null;
        (0, utilities_1.betterClearInterval)(this.autoscaleInterval);
        (0, utilities_1.betterClearInterval)(this.maybeRunInterval);
        if (this.tasksDonePerSecondInterval)
            (0, utilities_1.betterClearInterval)(this.tasksDonePerSecondInterval);
        if (this.snapshotter)
            await this.snapshotter.stop();
    }
    /**
     * Advances the sliding window of task counters by one slot: a fresh zero counter is put
     * at the front and the oldest counter is dropped, keeping the window length constant.
     *
     * @param intervalCallback callback that lets the interval schedule its next cycle
     */
    _incrementTasksDonePerSecond(intervalCallback) {
        this._tasksPerMinute.unshift(0);
        this._tasksPerMinute.pop();
        return intervalCallback();
    }
    /**
     * Whether the number of tasks counted in the sliding window has reached `maxTasksPerMinute`.
     * Always `false` when no limit is configured (`Infinity`).
     */
    get _isOverMaxRequestLimit() {
        if (this.maxTasksPerMinute === Infinity) {
            return false;
        }
        return this._tasksPerMinute.reduce((acc, curr) => acc + curr, 0) >= this.maxTasksPerMinute;
    }
}
// Publish the class on the CommonJS exports object.
exports.AutoscaledPool = AutoscaledPool;
//# sourceMappingURL=autoscaled_pool.js.map