@crawlee/core
Version:
The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.
387 lines • 16.4 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.Snapshotter = void 0;
const tslib_1 = require("tslib");
const utils_1 = require("@crawlee/utils");
const ow_1 = tslib_1.__importDefault(require("ow"));
const utilities_1 = require("@apify/utilities");
const configuration_1 = require("../configuration");
const log_1 = require("../log");
// Fraction of the headroom above `maxUsedMemoryRatio` (i.e. of
// `maxMemoryBytes * (1 - maxUsedMemoryRatio)`) that must also be consumed
// before memory is reported as critically overloaded.
const RESERVE_MEMORY_RATIO = 0.5;
// Index into the storage client's `stats.rateLimitErrors` array that is read
// when snapshotting the client (per the method docs: errors of the 2nd retry).
const CLIENT_RATE_LIMIT_ERROR_RETRY_COUNT = 2;
// Minimum time, in milliseconds, between two consecutive
// "Memory is critically overloaded" warnings.
const CRITICAL_OVERLOAD_RATE_LIMIT_MILLIS = 10000;
/**
 * Creates snapshots of system resources at given intervals and marks the resource
 * as either overloaded or not during the last interval. Keeps a history of the snapshots.
 * It tracks the following resources: Memory, EventLoop, API and CPU.
 * The class is used by the {@link AutoscaledPool} class.
 *
 * CPU and memory readings are taken from the `systemInfo` events emitted by the
 * event manager of the supplied (or global) configuration.
 *
 * CPU becomes overloaded when the `systemInfo` event reports `isCpuOverloaded`.
 *
 * Memory becomes overloaded if its current use exceeds the `maxUsedMemoryRatio` option.
 * The ratio is computed against the `memoryMbytes` configuration value when it is set,
 * otherwise against the total memory reported by the system multiplied by the
 * `availableMemoryRatio` configuration value. Both may be overridden by using the
 * `CRAWLEE_MEMORY_MBYTES` or `CRAWLEE_AVAILABLE_MEMORY_RATIO` environment variable.
 *
 * Event loop becomes overloaded if it slows down by more than the `maxBlockedMillis` option.
 *
 * Client becomes overloaded when rate limit errors (429 - Too Many Requests),
 * typically received from the request queue, exceed the set limit within the set interval.
 * @category Scaling
 */
class Snapshotter {
    /**
     * @param [options] All `Snapshotter` configuration options.
     * @param [options.eventLoopSnapshotIntervalSecs=0.5] Seconds between event loop snapshots.
     * @param [options.clientSnapshotIntervalSecs=1] Seconds between client snapshots.
     * @param [options.snapshotHistorySecs=30] How many seconds of snapshots are kept.
     * @param [options.maxBlockedMillis=50] Event loop delay above which it counts as overloaded.
     * @param [options.maxUsedMemoryRatio=0.9] Memory use ratio above which it counts as overloaded.
     * @param [options.maxClientErrors=3] Rate limit errors per interval above which the client counts as overloaded.
     * @param [options.log] Logger; a child logger with the `Snapshotter` prefix is derived from it.
     * @param [options.client] Storage client; defaults to the one provided by `config`.
     * @param [options.config] Configuration; defaults to the global configuration.
     */
    constructor(options = {}) {
        // Every instance field is created as an own, enumerable, configurable and
        // writable property, in a fixed order, before the options are validated.
        const defineField = (name, value) => {
            Object.defineProperty(this, name, {
                enumerable: true,
                configurable: true,
                writable: true,
                value,
            });
        };
        const unsetFields = [
            'log',
            'client',
            'config',
            'events',
            'eventLoopSnapshotIntervalMillis',
            'clientSnapshotIntervalMillis',
            'snapshotHistoryMillis',
            'maxBlockedMillis',
            'maxUsedMemoryRatio',
            'maxClientErrors',
            'maxMemoryBytes',
        ];
        for (const name of unsetFields) {
            defineField(name, undefined);
        }
        // Each history gets its own fresh array.
        for (const name of ['cpuSnapshots', 'eventLoopSnapshots', 'memorySnapshots', 'clientSnapshots']) {
            defineField(name, []);
        }
        for (const name of ['eventLoopInterval', 'clientInterval', 'lastLoggedCriticalMemoryOverloadAt']) {
            defineField(name, null);
        }
        (0, ow_1.default)(options, ow_1.default.object.exactShape({
            eventLoopSnapshotIntervalSecs: ow_1.default.optional.number,
            clientSnapshotIntervalSecs: ow_1.default.optional.number,
            snapshotHistorySecs: ow_1.default.optional.number,
            maxBlockedMillis: ow_1.default.optional.number,
            maxUsedMemoryRatio: ow_1.default.optional.number,
            maxClientErrors: ow_1.default.optional.number,
            log: ow_1.default.optional.object,
            client: ow_1.default.optional.object,
            config: ow_1.default.optional.object,
        }));
        // `client` defaults to the storage client of whichever `config` ends up being used,
        // so `config` has to be resolved before `client`.
        const {
            eventLoopSnapshotIntervalSecs = 0.5,
            clientSnapshotIntervalSecs = 1,
            snapshotHistorySecs = 30,
            maxBlockedMillis = 50,
            maxUsedMemoryRatio = 0.9,
            maxClientErrors = 3,
            log = log_1.log,
            config = configuration_1.Configuration.getGlobalConfig(),
            client = config.getStorageClient(),
        } = options;
        this.log = log.child({ prefix: 'Snapshotter' });
        this.client = client;
        this.config = config;
        this.events = this.config.getEventManager();
        this.eventLoopSnapshotIntervalMillis = eventLoopSnapshotIntervalSecs * 1000;
        this.clientSnapshotIntervalMillis = clientSnapshotIntervalSecs * 1000;
        this.snapshotHistoryMillis = snapshotHistorySecs * 1000;
        this.maxBlockedMillis = maxBlockedMillis;
        this.maxUsedMemoryRatio = maxUsedMemoryRatio;
        this.maxClientErrors = maxClientErrors;
        // We need to pre-bind those functions to be able to successfully remove listeners.
        this._snapshotCpu = this._snapshotCpu.bind(this);
        this._snapshotMemory = this._snapshotMemory.bind(this);
    }
    /**
     * Starts capturing snapshots at configured intervals.
     *
     * Resolves the memory limit first (from the `memoryMbytes` configuration value, or from
     * the total memory reported by the system scaled by `availableMemoryRatio`), then starts
     * the event loop and client intervals and subscribes to `systemInfo` events.
     */
    async start() {
        const memoryMbytes = this.config.get('memoryMbytes', 0);
        if (memoryMbytes > 0) {
            this.maxMemoryBytes = memoryMbytes * 1024 * 1024;
        }
        else {
            let totalBytes;
            if (this.config.get('systemInfoV2')) {
                const containerized = this.config.get('containerized', await (0, utils_1.isContainerized)());
                const memInfo = await (0, utils_1.getMemoryInfoV2)(containerized);
                totalBytes = memInfo.totalBytes;
            }
            else {
                const memInfo = await (0, utils_1.getMemoryInfo)();
                totalBytes = memInfo.totalBytes;
            }
            this.maxMemoryBytes = Math.ceil(totalBytes * this.config.get('availableMemoryRatio'));
            this.log.debug(`Setting max memory of this run to ${Math.round(this.maxMemoryBytes / 1024 / 1024)} MB. ` +
                'Use the CRAWLEE_MEMORY_MBYTES or CRAWLEE_AVAILABLE_MEMORY_RATIO environment variable to override it.');
        }
        // Start snapshotting.
        this.eventLoopInterval = (0, utilities_1.betterSetInterval)(this._snapshotEventLoop.bind(this), this.eventLoopSnapshotIntervalMillis);
        this.clientInterval = (0, utilities_1.betterSetInterval)(this._snapshotClient.bind(this), this.clientSnapshotIntervalMillis);
        this.events.on("systemInfo" /* EventType.SYSTEM_INFO */, this._snapshotCpu);
        this.events.on("systemInfo" /* EventType.SYSTEM_INFO */, this._snapshotMemory);
    }
    /**
     * Stops all resource capturing: clears both intervals and removes the
     * `systemInfo` listeners registered by `start()`.
     */
    async stop() {
        (0, utilities_1.betterClearInterval)(this.eventLoopInterval);
        (0, utilities_1.betterClearInterval)(this.clientInterval);
        this.events.off("systemInfo" /* EventType.SYSTEM_INFO */, this._snapshotCpu);
        this.events.off("systemInfo" /* EventType.SYSTEM_INFO */, this._snapshotMemory);
        // Yield until the next `setImmediate` callback runs so that work which is
        // already queued gets a chance to finish before stop() resolves.
        await new Promise((resolve) => {
            setImmediate(resolve);
        });
    }
    /**
     * Returns a sample of latest memory snapshots, with the size of the sample defined
     * by the sampleDurationMillis parameter. If omitted, it returns a full snapshot history.
     */
    getMemorySample(sampleDurationMillis) {
        return this._getSample(this.memorySnapshots, sampleDurationMillis);
    }
    /**
     * Returns a sample of latest event loop snapshots, with the size of the sample defined
     * by the sampleDurationMillis parameter. If omitted, it returns a full snapshot history.
     */
    getEventLoopSample(sampleDurationMillis) {
        return this._getSample(this.eventLoopSnapshots, sampleDurationMillis);
    }
    /**
     * Returns a sample of latest CPU snapshots, with the size of the sample defined
     * by the sampleDurationMillis parameter. If omitted, it returns a full snapshot history.
     */
    getCpuSample(sampleDurationMillis) {
        return this._getSample(this.cpuSnapshots, sampleDurationMillis);
    }
    /**
     * Returns a sample of latest Client snapshots, with the size of the sample defined
     * by the sampleDurationMillis parameter. If omitted, it returns a full snapshot history.
     */
    getClientSample(sampleDurationMillis) {
        return this._getSample(this.clientSnapshots, sampleDurationMillis);
    }
    /**
     * Finds the latest snapshots by sampleDurationMillis in the provided array.
     *
     * @param snapshots Snapshot history, ordered from oldest to newest.
     * @param [sampleDurationMillis] Length of the window, measured back from the newest snapshot.
     * @returns The very same `snapshots` array (not a copy) when `sampleDurationMillis` is
     *   falsy, otherwise a new array with the snapshots that fall inside the window.
     */
    _getSample(snapshots, sampleDurationMillis) {
        if (!sampleDurationMillis)
            return snapshots;
        const sample = [];
        let idx = snapshots.length;
        if (!idx)
            return sample;
        const latestTime = snapshots[idx - 1].createdAt;
        // Walk backwards from the newest snapshot and stop at the first one outside the window.
        while (idx--) {
            const snapshot = snapshots[idx];
            if (+latestTime - +snapshot.createdAt <= sampleDurationMillis) {
                sample.unshift(snapshot);
            }
            else {
                break;
            }
        }
        return sample;
    }
    /**
     * Creates a snapshot of current memory usage from a `systemInfo` event.
     * The snapshot is overloaded when `memCurrentBytes / maxMemoryBytes` exceeds
     * `maxUsedMemoryRatio`. Also triggers the critical memory overload check.
     */
    _snapshotMemory(systemInfo) {
        const createdAt = systemInfo.createdAt ? new Date(systemInfo.createdAt) : new Date();
        this._pruneSnapshots(this.memorySnapshots, createdAt);
        const { memCurrentBytes } = systemInfo;
        const snapshot = {
            createdAt,
            isOverloaded: memCurrentBytes / this.maxMemoryBytes > this.maxUsedMemoryRatio,
            usedBytes: memCurrentBytes,
        };
        this.memorySnapshots.push(snapshot);
        this._memoryOverloadWarning(systemInfo);
    }
    /**
     * Checks for critical memory overload and logs it to the console.
     * The warning is logged at most once per `CRITICAL_OVERLOAD_RATE_LIMIT_MILLIS`.
     */
    _memoryOverloadWarning(systemInfo) {
        const { memCurrentBytes } = systemInfo;
        const createdAt = systemInfo.createdAt ? new Date(systemInfo.createdAt) : new Date();
        if (this.lastLoggedCriticalMemoryOverloadAt &&
            +createdAt < +this.lastLoggedCriticalMemoryOverloadAt + CRITICAL_OVERLOAD_RATE_LIMIT_MILLIS)
            return;
        // Critical threshold = desired maximum + a share (RESERVE_MEMORY_RATIO) of the remaining headroom.
        const maxDesiredMemoryBytes = this.maxUsedMemoryRatio * this.maxMemoryBytes;
        const reserveMemory = this.maxMemoryBytes * (1 - this.maxUsedMemoryRatio) * RESERVE_MEMORY_RATIO;
        const criticalOverloadBytes = maxDesiredMemoryBytes + reserveMemory;
        const isCriticalOverload = memCurrentBytes > criticalOverloadBytes;
        if (isCriticalOverload) {
            const usedPercentage = Math.round((memCurrentBytes / this.maxMemoryBytes) * 100);
            const toMb = (bytes) => Math.round(bytes / 1024 ** 2);
            this.log.warning('Memory is critically overloaded. ' +
                `Using ${toMb(memCurrentBytes)} MB of ${toMb(this.maxMemoryBytes)} MB (${usedPercentage}%). Consider increasing available memory.`);
            this.lastLoggedCriticalMemoryOverloadAt = createdAt;
        }
    }
    /**
     * Creates a snapshot of current event loop delay.
     *
     * The delay is the time elapsed since the previous snapshot minus the configured
     * snapshot interval. The first snapshot has nothing to compare against and is
     * therefore never overloaded.
     *
     * @param intervalCallback Callback invoked once the snapshot has been stored.
     */
    _snapshotEventLoop(intervalCallback) {
        const now = new Date();
        this._pruneSnapshots(this.eventLoopSnapshots, now);
        const snapshot = {
            createdAt: now,
            isOverloaded: false,
            exceededMillis: 0,
        };
        const previousSnapshot = this.eventLoopSnapshots[this.eventLoopSnapshots.length - 1];
        if (previousSnapshot) {
            const { createdAt } = previousSnapshot;
            const delta = now.getTime() - +createdAt - this.eventLoopSnapshotIntervalMillis;
            if (delta > this.maxBlockedMillis)
                snapshot.isOverloaded = true;
            snapshot.exceededMillis = Math.max(delta - this.maxBlockedMillis, 0);
        }
        this.eventLoopSnapshots.push(snapshot);
        intervalCallback();
    }
    /**
     * Creates a snapshot of current CPU usage from a `systemInfo` event.
     *
     * `isOverloaded` is taken over from the event's `isCpuOverloaded` flag.
     * `usedRatio` is `cpuCurrentUsage / 100`; the event value is assumed to be
     * a percentage, so 50 is stored as 0.5.
     */
    _snapshotCpu(systemInfo) {
        const { cpuCurrentUsage, isCpuOverloaded } = systemInfo;
        const createdAt = systemInfo.createdAt ? new Date(systemInfo.createdAt) : new Date();
        this._pruneSnapshots(this.cpuSnapshots, createdAt);
        this.cpuSnapshots.push({
            createdAt,
            isOverloaded: isCpuOverloaded,
            // Keep the fraction as-is: rounding it up would turn any non-zero usage into 1.
            usedRatio: cpuCurrentUsage / 100,
        });
    }
    /**
     * Creates a snapshot of current API state by checking for
     * rate limit errors. Only errors produced by a 2nd retry
     * of the API call are considered for snapshotting since
     * earlier errors may just be caused by a random spike in
     * number of requests and do not necessarily signify API
     * overloading.
     *
     * The client is overloaded when the error count grew by more than
     * `maxClientErrors` since the previous snapshot.
     *
     * @param intervalCallback Callback invoked once the snapshot has been stored.
     */
    _snapshotClient(intervalCallback) {
        const now = new Date();
        this._pruneSnapshots(this.clientSnapshots, now);
        const allErrorCounts = this.client.stats?.rateLimitErrors ?? []; // storage client might not support this
        const currentErrCount = allErrorCounts[CLIENT_RATE_LIMIT_ERROR_RETRY_COUNT] || 0;
        // With an empty history there is no previous count to compare with,
        // so the first snapshot is never overloaded.
        const snapshot = {
            createdAt: now,
            isOverloaded: false,
            rateLimitErrorCount: currentErrCount,
        };
        const previousSnapshot = this.clientSnapshots[this.clientSnapshots.length - 1];
        if (previousSnapshot) {
            const { rateLimitErrorCount } = previousSnapshot;
            const delta = currentErrCount - rateLimitErrorCount;
            if (delta > this.maxClientErrors)
                snapshot.isOverloaded = true;
        }
        this.clientSnapshots.push(snapshot);
        intervalCallback();
    }
    /**
     * Removes snapshots that are older than the snapshotHistorySecs option
     * from the array (destructively - in place).
     *
     * @param snapshots Snapshot history, ordered from oldest to newest.
     * @param now Point in time against which the age of the snapshots is measured.
     */
    _pruneSnapshots(snapshots, now) {
        let oldCount = 0;
        // Scanning stops at the first snapshot that is still young enough to keep.
        for (let i = 0; i < snapshots.length; i++) {
            const { createdAt } = snapshots[i];
            if (now.getTime() - new Date(createdAt).getTime() > this.snapshotHistoryMillis)
                oldCount++;
            else
                break;
        }
        snapshots.splice(0, oldCount);
    }
}
exports.Snapshotter = Snapshotter;
//# sourceMappingURL=snapshotter.js.map