@crawlee/core
Version:
The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.
353 lines • 13.3 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.Configuration = void 0;
const tslib_1 = require("tslib");
const node_async_hooks_1 = require("node:async_hooks");
const node_events_1 = require("node:events");
const node_path_1 = require("node:path");
const memory_storage_1 = require("@crawlee/memory-storage");
const fs_extra_1 = require("fs-extra");
const log_1 = tslib_1.__importStar(require("@apify/log"));
const events_1 = require("./events");
const typedefs_1 = require("./typedefs");
/**
* `Configuration` is a value object holding Crawlee configuration. By default, there is a
* global singleton instance of this class available via `Configuration.getGlobalConfig()`.
* Places that depend on a configurable behaviour depend on this class, as they have the global
* instance as the default value.
*
* *Using global configuration:*
* ```js
* import { BasicCrawler, Configuration } from 'crawlee';
*
* // Get the global configuration
* const config = Configuration.getGlobalConfig();
* // Set the 'persistStateIntervalMillis' option
* // of global configuration to 10 seconds
* config.set('persistStateIntervalMillis', 10_000);
*
* // No need to pass the configuration to the crawler,
* // as it's using the global configuration by default
* const crawler = new BasicCrawler();
* ```
*
* *Using custom configuration:*
* ```js
* import { BasicCrawler, Configuration } from 'crawlee';
*
* // Create a new configuration
* const config = new Configuration({ persistStateIntervalMillis: 30_000 });
* // Pass the configuration to the crawler
* const crawler = new BasicCrawler({ ... }, config);
* ```
*
* The configuration provided via environment variables always takes precedence. We can also
* define the `crawlee.json` file in the project root directory which will serve as a baseline,
* so the options provided in constructor will override those. In other words, the precedence is:
*
* ```text
* crawlee.json < constructor options < environment variables
* ```
*
* ## Supported Configuration Options
*
* Key | Environment Variable | Default Value
* ---|---|---
* `memoryMbytes` | `CRAWLEE_MEMORY_MBYTES` | -
* `logLevel` | `CRAWLEE_LOG_LEVEL` | -
* `headless` | `CRAWLEE_HEADLESS` | `true`
* `defaultDatasetId` | `CRAWLEE_DEFAULT_DATASET_ID` | `'default'`
* `defaultKeyValueStoreId` | `CRAWLEE_DEFAULT_KEY_VALUE_STORE_ID` | `'default'`
* `defaultRequestQueueId` | `CRAWLEE_DEFAULT_REQUEST_QUEUE_ID` | `'default'`
* `persistStateIntervalMillis` | `CRAWLEE_PERSIST_STATE_INTERVAL_MILLIS` | `60_000`
* `purgeOnStart` | `CRAWLEE_PURGE_ON_START` | `true`
* `persistStorage` | `CRAWLEE_PERSIST_STORAGE` | `true`
*
* ## Advanced Configuration Options
*
* Key | Environment Variable | Default Value
* ---|---|---
* `inputKey` | `CRAWLEE_INPUT_KEY` | `'INPUT'`
* `xvfb` | `CRAWLEE_XVFB` | -
* `chromeExecutablePath` | `CRAWLEE_CHROME_EXECUTABLE_PATH` | -
* `defaultBrowserPath` | `CRAWLEE_DEFAULT_BROWSER_PATH` | -
* `disableBrowserSandbox` | `CRAWLEE_DISABLE_BROWSER_SANDBOX` | -
* `availableMemoryRatio` | `CRAWLEE_AVAILABLE_MEMORY_RATIO` | `0.25`
* `systemInfoV2` | `CRAWLEE_SYSTEM_INFO_V2` | `true`
* `containerized` | `CRAWLEE_CONTAINERIZED` | -
*/
class Configuration {
    /**
     * Creates new `Configuration` instance with provided options. Env vars will have precedence over those,
     * and the options in turn take precedence over values loaded from `crawlee.json`.
     *
     * @param {object} [options] Instance-level options. The passed object is not modified.
     */
    constructor(options = {}) {
        // Instance-level option values, keyed by option name (populated by `buildOptions`).
        Object.defineProperty(this, "options", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        // Cache of lazily created services (event manager, memory storage instances).
        Object.defineProperty(this, "services", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: new Map()
        });
        Object.defineProperty(this, "storageManagers", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: new Map()
        });
        this.buildOptions(options);
        // Increase the global limit for event emitter memory leak warnings.
        node_events_1.EventEmitter.defaultMaxListeners = 50;
        // set the log level to support CRAWLEE_ prefixed env var too
        const logLevel = this.get('logLevel');
        // Test for presence instead of truthiness: a numeric level of `0` is a legitimate value
        // (in `@apify/log` it is `LogLevel.OFF`) and must not be treated as "not configured".
        // An empty string (e.g. an env var that is set but blank) still counts as not configured.
        if (logLevel != null && logLevel !== '') {
            const level = Number.isFinite(+logLevel)
                ? +logLevel
                : log_1.LogLevel[String(logLevel).toUpperCase()];
            log_1.default.setLevel(level);
        }
    }
    /**
     * Returns configured value. First checks the environment variables, then provided configuration,
     * fallbacks to the `defaultValue` argument if provided, otherwise uses the default value as described
     * in the above section.
     *
     * @param {string} key Option name.
     * @param {*} [defaultValue] Fallback used when neither an env var nor an instance option is present.
     * @returns {*} The resolved value, or `undefined` when nothing is configured and no default exists.
     */
    get(key, defaultValue) {
        // prefer env vars, always iterate through the whole map as there might be duplicate env vars for the same option
        let envValue;
        for (const [k, v] of (0, typedefs_1.entries)(Configuration.ENV_MAP)) {
            if (key === v) {
                envValue = process.env[k];
                // the first non-empty env var wins
                if (envValue) {
                    break;
                }
            }
        }
        if (envValue != null) {
            return this._castEnvValue(key, envValue);
        }
        // check instance level options
        if (this.options.has(key)) {
            return this.options.get(key);
        }
        // fallback to defaults
        return (defaultValue ?? Configuration.DEFAULTS[key] ?? envValue);
    }
    /**
     * Converts a raw environment variable string to the type expected for the given option,
     * based on the `INTEGER_VARS`, `BOOLEAN_VARS` and `COMMA_SEPARATED_LIST_VARS` lists.
     * Values of options that are in none of those lists are returned unchanged.
     *
     * @param {string} key Option name.
     * @param {string} value Raw value read from the environment.
     * @returns {*} The converted value.
     */
    _castEnvValue(key, value) {
        if (Configuration.INTEGER_VARS.includes(key)) {
            return +value;
        }
        if (Configuration.BOOLEAN_VARS.includes(key)) {
            // 0, false and empty string are considered falsy values
            return !['0', 'false', ''].includes(String(value).toLowerCase());
        }
        if (Configuration.COMMA_SEPARATED_LIST_VARS.includes(key)) {
            if (!value)
                return [];
            return String(value)
                .split(',')
                .map((v) => v.trim());
        }
        return value;
    }
    /**
     * Sets value for given option. Only affects this `Configuration` instance, the value will not be propagated down to the env var.
     * To reset a value, we can omit the `value` argument or pass `undefined` there.
     */
    set(key, value) {
        this.options.set(key, value);
    }
    /**
     * Sets value for given option. Only affects the global `Configuration` instance, the value will not be propagated down to the env var.
     * To reset a value, we can omit the `value` argument or pass `undefined` there.
     */
    static set(key, value) {
        this.getGlobalConfig().set(key, value);
    }
    /**
     * Returns cached instance of {@link StorageClient} using options as defined in the environment variables or in
     * this {@link Configuration} instance. Only first call of this method will create the client, following calls will
     * return the same client instance.
     *
     * Caching works based on the `storageClientOptions`, so calling this method with different options will return
     * multiple instances, one for each variant of the options.
     * @internal
     */
    getStorageClient() {
        // an explicitly registered client always wins
        if (this.options.has('storageClient')) {
            return this.options.get('storageClient');
        }
        const options = this.options.get('storageClientOptions');
        return this.createMemoryStorage(options);
    }
    /**
     * Returns the event manager for this instance: the one registered via `useEventManager()` if any,
     * otherwise a `LocalEventManager` that is created on first use and cached.
     */
    getEventManager() {
        if (this.options.has('eventManager')) {
            return this.options.get('eventManager');
        }
        if (this.services.has('eventManager')) {
            return this.services.get('eventManager');
        }
        const eventManager = new events_1.LocalEventManager(this);
        this.services.set('eventManager', eventManager);
        return eventManager;
    }
    /**
     * Creates an instance of MemoryStorage using options as defined in the environment variables or in this `Configuration` instance.
     * Instances are cached per JSON-serialized `options`.
     * @internal
     */
    createMemoryStorage(options = {}) {
        const cacheKey = `MemoryStorage-${JSON.stringify(options)}`;
        if (this.services.has(cacheKey)) {
            return this.services.get(cacheKey);
        }
        const storage = new memory_storage_1.MemoryStorage({
            persistStorage: this.get('persistStorage'),
            // Override persistStorage if user provides it via storageClientOptions
            ...options,
        });
        this.services.set(cacheKey, storage);
        return storage;
    }
    /**
     * Registers the storage client to be returned by `getStorageClient()` of this instance.
     */
    useStorageClient(client) {
        this.options.set('storageClient', client);
    }
    /**
     * Registers the storage client on the global `Configuration` instance.
     */
    static useStorageClient(client) {
        this.getGlobalConfig().useStorageClient(client);
    }
    /**
     * Registers the event manager to be returned by `getEventManager()` of this instance.
     */
    useEventManager(events) {
        this.options.set('eventManager', events);
    }
    /**
     * Returns the global configuration instance. It will respect the environment variables.
     * A configuration held in the `Configuration.storage` async context, if any, takes priority
     * over the lazily created singleton.
     */
    static getGlobalConfig() {
        const scopedConfig = Configuration.storage.getStore();
        if (scopedConfig) {
            return scopedConfig;
        }
        if (Configuration.globalConfig == null) {
            Configuration.globalConfig = new Configuration();
        }
        return Configuration.globalConfig;
    }
    /**
     * Gets default {@link StorageClient} instance.
     */
    static getStorageClient() {
        return this.getGlobalConfig().getStorageClient();
    }
    /**
     * Gets default {@link EventManager} instance.
     */
    static getEventManager() {
        return this.getGlobalConfig().getEventManager();
    }
    /**
     * Resets global configuration instance. The default instance holds configuration based on env vars,
     * if we want to change them, we need to first reset the global state. Used mainly for testing purposes.
     */
    static resetGlobalState() {
        delete this.globalConfig;
    }
    /**
     * Populates `this.options` from `crawlee.json` (if present in the current working directory)
     * overlaid with the options passed to the constructor, so that the documented precedence
     * `crawlee.json < constructor options` holds. The `options` argument is left untouched.
     *
     * @param {object} options Options passed to the constructor.
     */
    buildOptions(options) {
        // try to load configuration from crawlee.json as the baseline
        const path = (0, node_path_1.join)(process.cwd(), 'crawlee.json');
        let optionsFromFileConfig = {};
        if ((0, fs_extra_1.pathExistsSync)(path)) {
            try {
                const file = (0, fs_extra_1.readFileSync)(path);
                optionsFromFileConfig = JSON.parse(file.toString());
            }
            catch {
                // best effort: an unreadable or malformed crawlee.json is ignored
            }
        }
        // later spreads win, so constructor options override the file baseline
        this.options = new Map((0, typedefs_1.entries)({ ...optionsFromFileConfig, ...options }));
    }
}
exports.Configuration = Configuration;
/**
 * Lookup table from environment variable name to the configuration key it
 * controls (e.g. `CRAWLEE_MEMORY_MBYTES` to `memoryMbytes`).
 */
Configuration.ENV_MAP = {
    CRAWLEE_AVAILABLE_MEMORY_RATIO: 'availableMemoryRatio',
    CRAWLEE_PURGE_ON_START: 'purgeOnStart',
    CRAWLEE_MEMORY_MBYTES: 'memoryMbytes',
    CRAWLEE_DEFAULT_DATASET_ID: 'defaultDatasetId',
    CRAWLEE_DEFAULT_KEY_VALUE_STORE_ID: 'defaultKeyValueStoreId',
    CRAWLEE_DEFAULT_REQUEST_QUEUE_ID: 'defaultRequestQueueId',
    CRAWLEE_INPUT_KEY: 'inputKey',
    CRAWLEE_PERSIST_STATE_INTERVAL_MILLIS: 'persistStateIntervalMillis',
    CRAWLEE_HEADLESS: 'headless',
    CRAWLEE_XVFB: 'xvfb',
    CRAWLEE_CHROME_EXECUTABLE_PATH: 'chromeExecutablePath',
    CRAWLEE_DEFAULT_BROWSER_PATH: 'defaultBrowserPath',
    CRAWLEE_DISABLE_BROWSER_SANDBOX: 'disableBrowserSandbox',
    CRAWLEE_LOG_LEVEL: 'logLevel',
    CRAWLEE_PERSIST_STORAGE: 'persistStorage',
    CRAWLEE_SYSTEM_INFO_V2: 'systemInfoV2',
    CRAWLEE_CONTAINERIZED: 'containerized',
};
// Option keys whose environment-variable values `_castEnvValue` converts to booleans.
Configuration.BOOLEAN_VARS = [
    'purgeOnStart',
    'headless',
    'xvfb',
    'disableBrowserSandbox',
    'persistStorage',
    'systemInfoV2',
    'containerized',
];
// Option keys whose environment-variable values `_castEnvValue` converts to numbers.
Configuration.INTEGER_VARS = [
    'memoryMbytes',
    'persistStateIntervalMillis',
    'systemInfoIntervalMillis',
];
// Option keys whose environment-variable values `_castEnvValue` splits on commas
// into a list of trimmed strings. No option uses this format at the moment.
Configuration.COMMA_SEPARATED_LIST_VARS = [];
// Built-in fallback values, consulted by `get()` after environment variables,
// instance options and the caller-supplied `defaultValue` have all come up empty.
Configuration.DEFAULTS = {
    defaultKeyValueStoreId: 'default',
    defaultDatasetId: 'default',
    defaultRequestQueueId: 'default',
    inputKey: 'INPUT',
    maxUsedCpuRatio: 0.95,
    availableMemoryRatio: 0.25,
    storageClientOptions: {},
    purgeOnStart: true,
    headless: true,
    persistStateIntervalMillis: 60000,
    systemInfoIntervalMillis: 1000,
    persistStorage: true,
    systemInfoV2: true,
};
/**
 * Async-context store holding the `Configuration` scoped to the current instance, so it
 * can be reached without threading it through function parameters.
 * @internal
 */
Configuration.storage = new node_async_hooks_1.AsyncLocalStorage();
//# sourceMappingURL=configuration.js.map