@crawlee/core

Version:

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs with (but not limited to) headless Chrome and Puppeteer.

crawlee.dev

apify/crawlee

300 lines • 12.9 kB

TypeScript

import type { Log } from '@apify/log';
import { Configuration } from '../configuration';
import type { SnapshotterOptions } from './snapshotter';
import type { SystemInfo, SystemStatusOptions } from './system_status';

/**
 * Options accepted by the {@link AutoscaledPool} constructor.
 * Every property is optional at the type level.
 */
export interface AutoscaledPoolOptions {
    /**
     * A function that performs an asynchronous resource-intensive task.
     * The function must either be labeled `async` or return a promise.
     */
    runTaskFunction?: () => Promise<unknown>;
    /**
     * A function that indicates whether `runTaskFunction` should be called.
     * This function is called every time there is free capacity for a new task and it should
     * indicate whether it should start a new task or not by resolving to either `true` or `false`.
     * Besides its obvious use, it is also useful for task throttling to save resources.
     */
    isTaskReadyFunction?: () => Promise<boolean>;
    /**
     * A function that is called only when there are no tasks to be processed.
     * If it resolves to `true` then the pool's run finishes. Being called only
     * when there are no tasks being processed means that as long as `isTaskReadyFunction()`
     * keeps resolving to `true`, `isFinishedFunction()` will never be called.
     * To abort a run, use the {@link AutoscaledPool.abort} method.
     */
    isFinishedFunction?: () => Promise<boolean>;
    /**
     * The minimum number of tasks running in parallel.
     *
     * *WARNING:* If you set this value too high with respect to the available system memory and CPU,
     * your code might run extremely slow or crash.
     * If you're not sure, just keep the default value and the concurrency will scale up automatically.
     * @default 1
     */
    minConcurrency?: number;
    /**
     * The maximum number of tasks running in parallel.
     * @default 200
     */
    maxConcurrency?: number;
    /**
     * The desired number of tasks that should be running in parallel at the start of the pool,
     * if there is a large enough supply of them.
     * By default, it is `minConcurrency`.
     */
    desiredConcurrency?: number;
    /**
     * Minimum level of desired concurrency to reach before more scaling up is allowed.
     * @default 0.90
     */
    desiredConcurrencyRatio?: number;
    /**
     * Defines the fractional amount of desired concurrency to be added with each scaling up.
     * The minimum scaling step is one.
     * @default 0.05
     */
    scaleUpStepRatio?: number;
    /**
     * Defines the fractional amount of desired concurrency to be subtracted with each scaling down.
     * The minimum scaling step is one.
     * @default 0.05
     */
    scaleDownStepRatio?: number;
    /**
     * Indicates how often the pool should call the `runTaskFunction()` to start a new task, in seconds.
     * This has no effect on starting new tasks immediately after a task completes.
     * @default 0.5
     */
    maybeRunIntervalSecs?: number;
    /**
     * Specifies a period in which the instance logs its state, in seconds.
     * Set to `null` to disable periodic logging.
     * @default 60
     */
    loggingIntervalSecs?: number | null;
    /**
     * Defines in seconds how often the pool should attempt to adjust the desired concurrency
     * based on the latest system status. Setting it lower than 1 might have a severe impact on performance.
     * We suggest using a value from 5 to 20.
     * @default 10
     */
    autoscaleIntervalSecs?: number;
    /**
     * Timeout in which the `runTaskFunction` needs to finish, given in seconds.
     * NOTE(review): the default of 0 presumably means "no timeout" - confirm against the implementation.
     * @default 0
     */
    taskTimeoutSecs?: number;
    /**
     * Options to be passed down to the {@link Snapshotter} constructor. This is useful for fine-tuning
     * the snapshot intervals and history.
     */
    snapshotterOptions?: SnapshotterOptions;
    /**
     * Options to be passed down to the {@link SystemStatus} constructor. This is useful for fine-tuning
     * the system status reports. If a custom snapshotter is set in the options, it will be used
     * by the pool.
     */
    systemStatusOptions?: SystemStatusOptions;
    /**
     * The maximum number of tasks per minute the pool can run.
     * By default, this is set to `Infinity`, but you can pass any positive integer.
     */
    maxTasksPerMinute?: number;
    /**
     * Optional logger instance.
     * NOTE(review): presumably used by the pool in place of a default logger - confirm against the implementation.
     */
    log?: Log;
}

/**
 * Manages a pool of asynchronous resource-intensive tasks that are executed in parallel.
 * The pool only starts new tasks if there is enough free CPU and memory available
 * and the JavaScript event loop is not blocked.
 *
 * The information about the CPU and memory usage is obtained by the {@link Snapshotter} class,
 * which makes regular snapshots of system resources that may be either local
 * or from the Apify cloud infrastructure in case the process is running on the Apify platform.
 * Meaningful data gathered from these snapshots is provided to `AutoscaledPool` by the {@link SystemStatus} class.
 *
 * Before running the pool, you need to implement the following three functions:
 * {@link AutoscaledPoolOptions.runTaskFunction},
 * {@link AutoscaledPoolOptions.isTaskReadyFunction} and
 * {@link AutoscaledPoolOptions.isFinishedFunction}.
 *
 * The auto-scaled pool is started by calling the {@link AutoscaledPool.run} function.
 * The pool periodically queries the {@link AutoscaledPoolOptions.isTaskReadyFunction} function
 * for more tasks, managing optimal concurrency, until the function resolves to `false`. The pool then queries
 * the {@link AutoscaledPoolOptions.isFinishedFunction}. If it resolves to `true`, the run finishes after all running tasks complete.
 * If it resolves to `false`, it assumes there will be more tasks available later and keeps periodically querying for tasks.
 * If any of the tasks throws then the {@link AutoscaledPool.run} function rejects the promise with an error.
 *
 * The pool evaluates whether it should start a new task every time one of the tasks finishes
 * and also in the interval set by the `options.maybeRunIntervalSecs` parameter.
 *
 * **Example usage:**
 *
 * ```javascript
 * const pool = new AutoscaledPool({
 *     maxConcurrency: 50,
 *     runTaskFunction: async () => {
 *         // Run some resource-intensive asynchronous operation here.
 *     },
 *     isTaskReadyFunction: async () => {
 *         // Tell the pool whether more tasks are ready to be processed.
 *         // Return true or false
 *     },
 *     isFinishedFunction: async () => {
 *         // Tell the pool whether it should finish
 *         // or wait for more tasks to become available.
 *         // Return true or false
 *     }
 * });
 *
 * await pool.run();
 * ```
 * @category Scaling
 */
export declare class AutoscaledPool {
    // Private members appear without type annotations because declaration output
    // omits the types of private members; they are not part of the public contract.
    private readonly config;
    private readonly log;
    private readonly desiredConcurrencyRatio;
    private readonly scaleUpStepRatio;
    private readonly scaleDownStepRatio;
    private readonly maybeRunIntervalMillis;
    private readonly loggingIntervalMillis;
    private readonly autoscaleIntervalMillis;
    private readonly taskTimeoutMillis;
    private readonly runTaskFunction;
    private readonly isFinishedFunction;
    private readonly isTaskReadyFunction;
    private readonly maxTasksPerMinute;
    private _minConcurrency;
    private _maxConcurrency;
    private _desiredConcurrency;
    private _currentConcurrency;
    private isStopped;
    private lastLoggingTime?;
    private resolve;
    private reject;
    private snapshotter;
    private systemStatus;
    private autoscaleInterval;
    private maybeRunInterval;
    private queryingIsTaskReady;
    private queryingIsFinished;
    private tasksDonePerSecondInterval?;
    private _tasksPerMinute;
    /**
     * Creates a new pool.
     *
     * @param options Pool settings; see {@link AutoscaledPoolOptions}.
     * @param config Optional {@link Configuration} instance.
     *   NOTE(review): the fallback used when this is omitted is not visible in this declaration - confirm in the implementation.
     */
    constructor(options: AutoscaledPoolOptions, config?: Configuration);
    /**
     * Gets the minimum number of tasks running in parallel.
     */
    get minConcurrency(): number;
    /**
     * Sets the minimum number of tasks running in parallel.
     *
     * *WARNING:* If you set this value too high with respect to the available system memory and CPU,
     * your code might run extremely slow or crash.
     * If you're not sure, just keep the default value and the concurrency will scale up automatically.
     */
    set minConcurrency(value: number);
    /**
     * Gets the maximum number of tasks running in parallel.
     */
    get maxConcurrency(): number;
    /**
     * Sets the maximum number of tasks running in parallel.
     */
    set maxConcurrency(value: number);
    /**
     * Gets the desired concurrency for the pool,
     * which is an estimated number of parallel tasks that the system can currently support.
     */
    get desiredConcurrency(): number;
    /**
     * Sets the desired concurrency for the pool, i.e. the number of tasks that should be running
     * in parallel if there's a large enough supply of tasks.
     */
    set desiredConcurrency(value: number);
    /**
     * Gets the number of parallel tasks currently running in the pool.
     */
    get currentConcurrency(): number;
    /**
     * Runs the auto-scaled pool. Returns a promise that gets resolved or rejected once
     * all the tasks are finished or one of them fails.
     */
    run(): Promise<void>;
    /**
     * Aborts the run of the auto-scaled pool and destroys it. The promise returned from
     * the {@link AutoscaledPool.run} function will immediately resolve, no more new tasks
     * will be spawned and all running tasks will be left in their current state.
     *
     * Due to the nature of the tasks, auto-scaled pool cannot reliably guarantee abortion
     * of all the running tasks, therefore, no abortion is attempted and some of the tasks
     * may finish, while others may not. Essentially, auto-scaled pool doesn't care about
     * their state after the invocation of `.abort()`, but that does not mean that some
     * parts of their asynchronous chains of commands will not execute.
     */
    abort(): Promise<void>;
    /**
     * Prevents the auto-scaled pool from starting new tasks, but allows the running ones to finish
     * (unlike abort, which resolves the run immediately and leaves running tasks in their current state).
     * Used together with {@link AutoscaledPool.resume}.
     *
     * The function's promise will resolve once all running tasks have completed and the pool
     * is effectively idle. If the `timeoutSecs` argument is provided, the promise will reject
     * with a timeout error after the `timeoutSecs` seconds.
     *
     * The promise returned from the {@link AutoscaledPool.run} function will not resolve
     * when `.pause()` is invoked (unlike abort, which resolves it).
     */
    pause(timeoutSecs?: number): Promise<void>;
    /**
     * Resumes the operation of the autoscaled-pool by allowing more tasks to be run.
     * Used together with {@link AutoscaledPool.pause}.
     *
     * Tasks will automatically start running again in `options.maybeRunIntervalSecs`.
     */
    resume(): void;
    /**
     * Explicitly check the queue for new tasks. The AutoscaledPool checks the queue for new tasks periodically,
     * every `maybeRunIntervalSecs` seconds. If you want to trigger the processing immediately, use this method.
     */
    notify(): Promise<void>;
    /**
     * Starts a new task
     * if the number of running tasks (current concurrency) is lower than desired concurrency
     * and the system is not currently overloaded
     * and this.isTaskReadyFunction() returns true.
     *
     * It doesn't allow multiple concurrent runs of this method.
     */
    protected _maybeRunTask(intervalCallback?: () => void): Promise<void>;
    /**
     * Gets called every `autoscaleIntervalSecs` and evaluates the current system status.
     * If the system IS NOT overloaded and the settings allow it, it scales up.
     * If the system IS overloaded and the settings allow it, it scales down.
     */
    protected _autoscale(intervalCallback: () => void): void;
    /**
     * Scales the pool up by increasing
     * the desired concurrency by the scaleUpStepRatio.
     *
     * @param systemStatus for logging
     */
    protected _scaleUp(systemStatus: SystemInfo): void;
    /**
     * Scales the pool down by decreasing
     * the desired concurrency by the scaleDownStepRatio.
     *
     * @param systemStatus for logging
     */
    protected _scaleDown(systemStatus: SystemInfo): void;
    /**
     * If there are no running tasks and this.isFinishedFunction() returns true then closes
     * the pool and resolves the pool's promise returned by the run() method.
     *
     * It doesn't allow multiple concurrent runs of this method.
     */
    protected _maybeFinish(): Promise<void>;
    /**
     * Cleans up resources.
     */
    protected _destroy(): Promise<void>;
    /**
     * NOTE(review): undocumented in the original. Judging by its name and `intervalCallback` parameter,
     * this looks like an interval handler that maintains the task-rate bookkeeping behind
     * `maxTasksPerMinute` - confirm in the implementation.
     */
    protected _incrementTasksDonePerSecond(intervalCallback: () => void): void;
    /**
     * NOTE(review): undocumented in the original. Presumably reports whether the recent task rate
     * has reached the `maxTasksPerMinute` limit - confirm in the implementation.
     */
    protected get _isOverMaxRequestLimit(): boolean;
}
//# sourceMappingURL=autoscaled_pool.d.ts.map