UNPKG

@crawlee/core

Version:

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.

454 lines • 17.5 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.Statistics = void 0; const tslib_1 = require("tslib"); const ow_1 = tslib_1.__importDefault(require("ow")); const configuration_1 = require("../configuration"); const log_1 = require("../log"); const key_value_store_1 = require("../storages/key_value_store"); const error_tracker_1 = require("./error_tracker"); /** * @ignore */ class Job { constructor() { Object.defineProperty(this, "lastRunAt", { enumerable: true, configurable: true, writable: true, value: null }); Object.defineProperty(this, "durationMillis", { enumerable: true, configurable: true, writable: true, value: void 0 }); } run() { this.lastRunAt = Date.now(); } finish() { this.durationMillis = Date.now() - this.lastRunAt; return this.durationMillis; } } const errorTrackerConfig = { showErrorCode: true, showErrorName: true, showStackTrace: true, showFullStack: false, showErrorMessage: true, showFullMessage: false, }; /** * The statistics class provides an interface to collecting and logging run * statistics for requests. * * All statistic information is saved on key value store * under the key `SDK_CRAWLER_STATISTICS_*`, persists between * migrations and abort/resurrect * * @category Crawlers */ class Statistics { /** * @internal */ constructor(options = {}) { /** * An error tracker for final retry errors. */ Object.defineProperty(this, "errorTracker", { enumerable: true, configurable: true, writable: true, value: void 0 }); /** * An error tracker for retry errors prior to the final retry. */ Object.defineProperty(this, "errorTrackerRetry", { enumerable: true, configurable: true, writable: true, value: void 0 }); /** * Statistic instance id. */ Object.defineProperty(this, "id", { enumerable: true, configurable: true, writable: true, value: Statistics.id++ }); // assign an id while incrementing so it can be saved/restored from KV /** * Current statistic state used for doing calculations on {@link Statistics.calculate} calls */ Object.defineProperty(this, "state", { enumerable: true, configurable: true, writable: true, value: void 0 }); /** * Contains the current retries histogram. Index 0 means 0 retries, index 2, 2 retries, and so on */ Object.defineProperty(this, "requestRetryHistogram", { enumerable: true, configurable: true, writable: true, value: [] }); /** * Contains the associated Configuration instance */ Object.defineProperty(this, "config", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "keyValueStore", { enumerable: true, configurable: true, writable: true, value: undefined }); Object.defineProperty(this, "persistStateKey", { enumerable: true, configurable: true, writable: true, value: `SDK_CRAWLER_STATISTICS_${this.id}` }); Object.defineProperty(this, "logIntervalMillis", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "logMessage", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "listener", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "requestsInProgress", { enumerable: true, configurable: true, writable: true, value: new Map() }); Object.defineProperty(this, "log", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "instanceStart", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "logInterval", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "events", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "persistenceOptions", { enumerable: true, configurable: true, writable: true, value: void 0 }); (0, ow_1.default)(options, ow_1.default.object.exactShape({ logIntervalSecs: ow_1.default.optional.number, logMessage: ow_1.default.optional.string, log: ow_1.default.optional.object, keyValueStore: ow_1.default.optional.object, config: ow_1.default.optional.object, persistenceOptions: ow_1.default.optional.object, saveErrorSnapshots: ow_1.default.optional.boolean, })); const { logIntervalSecs = 60, logMessage = 'Statistics', keyValueStore, config = configuration_1.Configuration.getGlobalConfig(), persistenceOptions = { enable: true, }, saveErrorSnapshots = false, } = options; this.log = (options.log ?? log_1.log).child({ prefix: 'Statistics' }); this.errorTracker = new error_tracker_1.ErrorTracker({ ...errorTrackerConfig, saveErrorSnapshots }); this.errorTrackerRetry = new error_tracker_1.ErrorTracker({ ...errorTrackerConfig, saveErrorSnapshots }); this.logIntervalMillis = logIntervalSecs * 1000; this.logMessage = logMessage; this.keyValueStore = keyValueStore; this.listener = this.persistState.bind(this); this.events = config.getEventManager(); this.config = config; this.persistenceOptions = persistenceOptions; // initialize by "resetting" this.reset(); } /** * Set the current statistic instance to pristine values */ reset() { this.errorTracker.reset(); this.errorTrackerRetry.reset(); this.state = { requestsFinished: 0, requestsFailed: 0, requestsRetries: 0, requestsFailedPerMinute: 0, requestsFinishedPerMinute: 0, requestMinDurationMillis: Infinity, requestMaxDurationMillis: 0, requestTotalFailedDurationMillis: 0, requestTotalFinishedDurationMillis: 0, crawlerStartedAt: null, crawlerFinishedAt: null, statsPersistedAt: null, crawlerRuntimeMillis: 0, requestsWithStatusCode: {}, errors: this.errorTracker.result, retryErrors: this.errorTrackerRetry.result, }; this.requestRetryHistogram.length = 0; this.requestsInProgress.clear(); this.instanceStart = Date.now(); this._teardown(); } /** * @param options - Override the persistence options provided in the constructor */ async resetStore(options) { if (!this.persistenceOptions.enable && !options?.enable) { return; } if (!this.keyValueStore) { return; } await this.keyValueStore.setValue(this.persistStateKey, null); } /** * Increments the status code counter. */ registerStatusCode(code) { const s = String(code); if (this.state.requestsWithStatusCode[s] === undefined) { this.state.requestsWithStatusCode[s] = 0; } this.state.requestsWithStatusCode[s]++; } /** * Starts a job * @ignore */ startJob(id) { let job = this.requestsInProgress.get(id); if (!job) job = new Job(); job.run(); this.requestsInProgress.set(id, job); } /** * Mark job as finished and sets the state * @ignore */ finishJob(id, retryCount) { const job = this.requestsInProgress.get(id); if (!job) return; const jobDurationMillis = job.finish(); this.state.requestsFinished++; this.state.requestTotalFinishedDurationMillis += jobDurationMillis; this._saveRetryCountForJob(retryCount); if (jobDurationMillis < this.state.requestMinDurationMillis) this.state.requestMinDurationMillis = jobDurationMillis; if (jobDurationMillis > this.state.requestMaxDurationMillis) this.state.requestMaxDurationMillis = jobDurationMillis; this.requestsInProgress.delete(id); } /** * Mark job as failed and sets the state * @ignore */ failJob(id, retryCount) { const job = this.requestsInProgress.get(id); if (!job) return; this.state.requestTotalFailedDurationMillis += job.finish(); this.state.requestsFailed++; this._saveRetryCountForJob(retryCount); this.requestsInProgress.delete(id); } /** * Calculate the current statistics */ calculate() { const { requestsFailed, requestsFinished, requestTotalFailedDurationMillis, requestTotalFinishedDurationMillis, } = this.state; const totalMillis = Date.now() - this.instanceStart; const totalMinutes = totalMillis / 1000 / 60; return { requestAvgFailedDurationMillis: Math.round(requestTotalFailedDurationMillis / requestsFailed) || Infinity, requestAvgFinishedDurationMillis: Math.round(requestTotalFinishedDurationMillis / requestsFinished) || Infinity, requestsFinishedPerMinute: Math.round(requestsFinished / totalMinutes) || 0, requestsFailedPerMinute: Math.floor(requestsFailed / totalMinutes) || 0, requestTotalDurationMillis: requestTotalFinishedDurationMillis + requestTotalFailedDurationMillis, requestsTotal: requestsFailed + requestsFinished, crawlerRuntimeMillis: totalMillis, }; } /** * Initializes the key value store for persisting the statistics, * displaying the current state in predefined intervals */ async startCapturing() { this.keyValueStore ?? (this.keyValueStore = await key_value_store_1.KeyValueStore.open(null, { config: this.config })); if (this.state.crawlerStartedAt === null) { this.state.crawlerStartedAt = new Date(); } if (this.persistenceOptions.enable) { await this._maybeLoadStatistics(); this.events.on("persistState" /* EventType.PERSIST_STATE */, this.listener); } this.logInterval = setInterval(() => { this.log.info(this.logMessage, { ...this.calculate(), retryHistogram: this.requestRetryHistogram, }); }, this.logIntervalMillis); } /** * Stops logging and remove event listeners, then persist */ async stopCapturing() { this._teardown(); this.state.crawlerFinishedAt = new Date(); await this.persistState(); } _saveRetryCountForJob(retryCount) { var _a; if (retryCount > 0) this.state.requestsRetries++; (_a = this.requestRetryHistogram)[retryCount] ?? (_a[retryCount] = 0); this.requestRetryHistogram[retryCount]++; } /** * Persist internal state to the key value store * @param options - Override the persistence options provided in the constructor */ async persistState(options) { if (!this.persistenceOptions.enable && !options?.enable) { return; } // this might be called before startCapturing was called without using await, should not crash if (!this.keyValueStore) { return; } this.log.debug('Persisting state', { persistStateKey: this.persistStateKey }); // use half the interval of `persistState` to avoid race conditions const persistStateIntervalMillis = this.config.get('persistStateIntervalMillis'); const timeoutSecs = persistStateIntervalMillis / 2000; await this.keyValueStore .setValue(this.persistStateKey, this.toJSON(), { timeoutSecs, doNotRetryTimeouts: true, }) .catch((error) => this.log.warning(`Failed to persist the statistics to ${this.persistStateKey}`, { error })); } /** * Loads the current statistic from the key value store if any */ async _maybeLoadStatistics() { // this might be called before startCapturing was called without using await, should not crash if (!this.keyValueStore) { return; } const savedState = await this.keyValueStore.getValue(this.persistStateKey); if (!savedState) return; // We saw a run where the requestRetryHistogram was not iterable and crashed // the crawler. Adding some logging to monitor this problem in the future. if (!Array.isArray(savedState.requestRetryHistogram)) { this.log.warning('Received invalid state from Key-value store.', { persistStateKey: this.persistStateKey, state: savedState, }); } this.log.debug('Recreating state from KeyValueStore', { persistStateKey: this.persistStateKey }); // the `requestRetryHistogram` array might be very large, we could end up with // `RangeError: Maximum call stack size exceeded` if we use `a.push(...b)` savedState.requestRetryHistogram.forEach((idx) => this.requestRetryHistogram.push(idx)); this.state.requestsFinished = savedState.requestsFinished; this.state.requestsFailed = savedState.requestsFailed; this.state.requestsRetries = savedState.requestsRetries; this.state.requestTotalFailedDurationMillis = savedState.requestTotalFailedDurationMillis; this.state.requestTotalFinishedDurationMillis = savedState.requestTotalFinishedDurationMillis; this.state.requestMinDurationMillis = savedState.requestMinDurationMillis; this.state.requestMaxDurationMillis = savedState.requestMaxDurationMillis; // persisted state uses ISO date strings this.state.crawlerFinishedAt = savedState.crawlerFinishedAt ? new Date(savedState.crawlerFinishedAt) : null; this.state.crawlerStartedAt = savedState.crawlerStartedAt ? new Date(savedState.crawlerStartedAt) : null; this.state.statsPersistedAt = savedState.statsPersistedAt ? new Date(savedState.statsPersistedAt) : null; this.state.crawlerRuntimeMillis = savedState.crawlerRuntimeMillis; this.instanceStart = Date.now() - (+this.state.statsPersistedAt - savedState.crawlerLastStartTimestamp); this.log.debug('Loaded from KeyValueStore'); } _teardown() { // this can be called before a call to startCapturing happens (or in a 'finally' block) this.events.off("persistState" /* EventType.PERSIST_STATE */, this.listener); if (this.logInterval) { clearInterval(this.logInterval); this.logInterval = null; } } /** * Make this class serializable when called with `JSON.stringify(statsInstance)` directly * or through `keyValueStore.setValue('KEY', statsInstance)` */ toJSON() { // merge all the current state information that can be used from the outside // without the need to reconstruct for the sake of stats.calculate() // omit duplicated information const result = { ...this.state, crawlerLastStartTimestamp: this.instanceStart, crawlerFinishedAt: this.state.crawlerFinishedAt ? new Date(this.state.crawlerFinishedAt).toISOString() : null, crawlerStartedAt: this.state.crawlerStartedAt ? new Date(this.state.crawlerStartedAt).toISOString() : null, requestRetryHistogram: this.requestRetryHistogram, statsId: this.id, statsPersistedAt: new Date().toISOString(), ...this.calculate(), }; Reflect.deleteProperty(result, 'requestsWithStatusCode'); Reflect.deleteProperty(result, 'errors'); Reflect.deleteProperty(result, 'retryErrors'); result.requestsWithStatusCode = this.state.requestsWithStatusCode; result.errors = this.state.errors; result.retryErrors = this.state.retryErrors; return result; } } exports.Statistics = Statistics; Object.defineProperty(Statistics, "id", { enumerable: true, configurable: true, writable: true, value: 0 }); //# sourceMappingURL=statistics.js.map