UNPKG

@crawlee/core

Version:

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.

549 lines • 25 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.KeyValueStore = exports.maybeStringify = void 0; const tslib_1 = require("tslib"); const promises_1 = require("node:fs/promises"); const node_path_1 = require("node:path"); const json5_1 = tslib_1.__importDefault(require("json5")); const ow_1 = tslib_1.__importStar(require("ow")); const consts_1 = require("@apify/consts"); const log_1 = tslib_1.__importDefault(require("@apify/log")); const utilities_1 = require("@apify/utilities"); const configuration_1 = require("../configuration"); const access_checking_1 = require("./access_checking"); const storage_manager_1 = require("./storage_manager"); const utils_1 = require("./utils"); /** * Helper function to possibly stringify value if options.contentType is not set. * * @ignore */ const maybeStringify = (value, options) => { // If contentType is missing, value will be stringified to JSON if (options.contentType === null || options.contentType === undefined) { options.contentType = 'application/json; charset=utf-8'; try { // Format JSON to simplify debugging, the overheads with compression is negligible value = (0, utilities_1.jsonStringifyExtended)(value, null, 2); } catch (e) { const error = e; // Give more meaningful error message if (error.message?.includes('Invalid string length')) { error.message = 'Object is too large'; } throw new Error(`The "value" parameter cannot be stringified to JSON: ${error.message}`); } if (value === undefined) { throw new Error('The "value" parameter was stringified to JSON and returned undefined. ' + "Make sure you're not trying to stringify an undefined value."); } } return value; }; exports.maybeStringify = maybeStringify; /** * The `KeyValueStore` class represents a key-value store, a simple data storage that is used * for saving and reading data records or files. Each data record is * represented by a unique key and associated with a MIME content type. Key-value stores are ideal * for saving screenshots, crawler inputs and outputs, web pages, PDFs or to persist the state of crawlers. * * Do not instantiate this class directly, use the * {@link KeyValueStore.open} function instead. * * Each crawler run is associated with a default key-value store, which is created exclusively * for the run. By convention, the crawler input and output are stored into the * default key-value store under the `INPUT` and `OUTPUT` key, respectively. * Typically, input and output are JSON files, although it can be any other format. * To access the default key-value store directly, you can use the * {@link KeyValueStore.getValue} and {@link KeyValueStore.setValue} convenience functions. * * To access the input, you can also use the {@link KeyValueStore.getInput} convenience function. * * `KeyValueStore` stores its data on a local disk. * * If the `CRAWLEE_STORAGE_DIR` environment variable is set, the data is stored in * the local directory in the following files: * ``` * {CRAWLEE_STORAGE_DIR}/key_value_stores/{STORE_ID}/{INDEX}.{EXT} * ``` * Note that `{STORE_ID}` is the name or ID of the key-value store. The default key-value store has ID: `default`, * unless you override it by setting the `CRAWLEE_DEFAULT_KEY_VALUE_STORE_ID` environment variable. * The `{KEY}` is the key of the record and `{EXT}` corresponds to the MIME content type of the data value. * * **Example usage:** * * ```javascript * // Get crawler input from the default key-value store. * const input = await KeyValueStore.getInput(); * // Get some value from the default key-value store. * const otherValue = await KeyValueStore.getValue('my-key'); * * // Write crawler output to the default key-value store. * await KeyValueStore.setValue('OUTPUT', { myResult: 123 }); * * // Open a named key-value store * const store = await KeyValueStore.open('some-name'); * * // Write a record. JavaScript object is automatically converted to JSON, * // strings and binary buffers are stored as they are * await store.setValue('some-key', { foo: 'bar' }); * * // Read a record. Note that JSON is automatically parsed to a JavaScript object, * // text data returned as a string and other data is returned as binary buffer * const value = await store.getValue('some-key'); * * // Drop (delete) the store * await store.drop(); * ``` * @category Result Stores */ class KeyValueStore { /** * @internal */ constructor(options, config = configuration_1.Configuration.getGlobalConfig()) { Object.defineProperty(this, "config", { enumerable: true, configurable: true, writable: true, value: config }); Object.defineProperty(this, "id", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "name", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "storageObject", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "client", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "persistStateEventStarted", { enumerable: true, configurable: true, writable: true, value: false }); /** Cache for persistent (auto-saved) values. When we try to set such value, the cache will be updated automatically. */ Object.defineProperty(this, "cache", { enumerable: true, configurable: true, writable: true, value: new Map() }); this.id = options.id; this.name = options.name; this.storageObject = options.storageObject; this.client = options.client.keyValueStore(this.id); } /** * Gets a value from the key-value store. * * The function returns a `Promise` that resolves to the record value, * whose JavaScript type depends on the MIME content type of the record. * Records with the `application/json` * content type are automatically parsed and returned as a JavaScript object. * Similarly, records with `text/plain` content types are returned as a string. * For all other content types, the value is returned as a raw * [`Buffer`](https://nodejs.org/api/buffer.html) instance. * * If the record does not exist, the function resolves to `null`. * * To save or delete a value in the key-value store, use the * {@link KeyValueStore.setValue} function. * * **Example usage:** * * ```javascript * const store = await KeyValueStore.open(); * const buffer = await store.getValue('screenshot1.png'); * ``` * @param key * Unique key of the record. It can be at most 256 characters long and only consist * of the following characters: `a`-`z`, `A`-`Z`, `0`-`9` and `!-_.'()` * @param defaultValue * Fallback that will be returned if no value if present in the storage. * @returns * Returns a promise that resolves to an object, string * or [`Buffer`](https://nodejs.org/api/buffer.html), depending * on the MIME content type of the record, or `null` if the key is missing from the store. */ async getValue(key, defaultValue) { (0, access_checking_1.checkStorageAccess)(); (0, ow_1.default)(key, ow_1.default.string.nonEmpty); const record = await this.client.getRecord(key); return record?.value ?? defaultValue ?? null; } /** * Tests whether a record with the given key exists in the key-value store without retrieving its value. * * @param key The queried record key. * @returns `true` if the record exists, `false` if it does not. */ async recordExists(key) { (0, access_checking_1.checkStorageAccess)(); (0, ow_1.default)(key, ow_1.default.string.nonEmpty); return this.client.recordExists(key); } async getAutoSavedValue(key, defaultValue = {}) { (0, access_checking_1.checkStorageAccess)(); if (this.cache.has(key)) { return this.cache.get(key); } const value = await this.getValue(key, defaultValue); // The await above could have run in parallel with another call to this function. If the other call finished more quickly, // the value will in cache at this point, and returning the new fetched value would introduce two different instances of // the auto-saved object, and only the latter one would be persisted. // Therefore we re-check the cache here, and if such race condition happened, we drop the fetched value and return the cached one. if (this.cache.has(key)) { return this.cache.get(key); } this.cache.set(key, value); this.ensurePersistStateEvent(); return value; } ensurePersistStateEvent() { if (this.persistStateEventStarted) { return; } // use half the interval of `persistState` to avoid race conditions const persistStateIntervalMillis = this.config.get('persistStateIntervalMillis'); const timeoutSecs = persistStateIntervalMillis / 2000; this.config.getEventManager().on('persistState', async () => { const promises = []; for (const [key, value] of this.cache) { promises.push(this.setValue(key, value, { timeoutSecs, doNotRetryTimeouts: true, }).catch((error) => log_1.default.warning(`Failed to persist the state value to ${key}`, { error }))); } await Promise.all(promises); }); this.persistStateEventStarted = true; } /** * Saves or deletes a record in the key-value store. * The function returns a promise that resolves once the record has been saved or deleted. * * **Example usage:** * * ```javascript * const store = await KeyValueStore.open(); * await store.setValue('OUTPUT', { foo: 'bar' }); * ``` * * Beware that the key can be at most 256 characters long and only contain the following characters: `a-zA-Z0-9!-_.'()` * * By default, `value` is converted to JSON and stored with the * `application/json; charset=utf-8` MIME content type. * To store the value with another content type, pass it in the options as follows: * ```javascript * const store = await KeyValueStore.open('my-text-store'); * await store.setValue('RESULTS', 'my text data', { contentType: 'text/plain' }); * ``` * If you set custom content type, `value` must be either a string or * [`Buffer`](https://nodejs.org/api/buffer.html), otherwise an error will be thrown. * * If `value` is `null`, the record is deleted instead. Note that the `setValue()` function succeeds * regardless whether the record existed or not. * * To retrieve a value from the key-value store, use the * {@link KeyValueStore.getValue} function. * * **IMPORTANT:** Always make sure to use the `await` keyword when calling `setValue()`, * otherwise the crawler process might finish before the value is stored! * * @param key * Unique key of the record. It can be at most 256 characters long and only consist * of the following characters: `a`-`z`, `A`-`Z`, `0`-`9` and `!-_.'()` * @param value * Record data, which can be one of the following values: * - If `null`, the record in the key-value store is deleted. * - If no `options.contentType` is specified, `value` can be any JavaScript object and it will be stringified to JSON. * - If `options.contentType` is set, `value` is taken as is and it must be a `String` or [`Buffer`](https://nodejs.org/api/buffer.html). * For any other value an error will be thrown. * @param [options] Record options. */ async setValue(key, value, options = {}) { (0, access_checking_1.checkStorageAccess)(); (0, ow_1.default)(key, 'key', ow_1.default.string.nonEmpty); (0, ow_1.default)(key, ow_1.default.string.validate((k) => ({ validator: ow_1.default.isValid(k, ow_1.default.string.matches(consts_1.KEY_VALUE_STORE_KEY_REGEX)), message: `The "key" argument "${key}" must be at most 256 characters long and only contain the following characters: a-zA-Z0-9!-_.'()`, }))); if (options.contentType && !(ow_1.default.isValid(value, ow_1.default.any(ow_1.default.string, ow_1.default.uint8Array)) || (ow_1.default.isValid(value, ow_1.default.object) && typeof value.pipe === 'function'))) { throw new ow_1.ArgumentError('The "value" parameter must be a String, Buffer or Stream when "options.contentType" is specified.', this.setValue); } (0, ow_1.default)(options, ow_1.default.object.exactShape({ contentType: ow_1.default.optional.string.nonEmpty, timeoutSecs: ow_1.default.optional.number, doNotRetryTimeouts: ow_1.default.optional.boolean, })); // Make copy of options, don't update what user passed. const optionsCopy = { ...options }; // If we try to set the value of a cached state to a different reference, we need to update the cache accordingly. const cachedValue = this.cache.get(key); if (cachedValue && cachedValue !== value) { if (value === null) { // Cached state can be only object, so a propagation of `null` means removing all its properties. Object.keys(cachedValue).forEach((k) => this.cache.delete(k)); } else if (typeof value === 'object') { // We need to remove the keys that are no longer present in the new value. Object.keys(cachedValue) .filter((k) => !(k in value)) .forEach((k) => this.cache.delete(k)); // And update the existing ones + add new ones. Object.assign(cachedValue, value); } } // In this case delete the record. if (value === null) return this.client.deleteRecord(key); value = (0, exports.maybeStringify)(value, optionsCopy); return this.client.setRecord({ key, value, contentType: optionsCopy.contentType, }, { timeoutSecs: optionsCopy.timeoutSecs, doNotRetryTimeouts: optionsCopy.doNotRetryTimeouts, }); } /** * Removes the key-value store either from the Apify cloud storage or from the local directory, * depending on the mode of operation. */ async drop() { (0, access_checking_1.checkStorageAccess)(); await this.client.delete(); const manager = storage_manager_1.StorageManager.getManager(KeyValueStore, this.config); manager.closeStorage(this); } /** @internal */ clearCache() { (0, access_checking_1.checkStorageAccess)(); this.cache.clear(); } /** * Iterates over key-value store keys, yielding each in turn to an `iteratee` function. * Each invocation of `iteratee` is called with three arguments: `(key, index, info)`, where `key` * is the record key, `index` is a zero-based index of the key in the current iteration * (regardless of `options.exclusiveStartKey`) and `info` is an object that contains a single property `size` * indicating size of the record in bytes. * * If the `iteratee` function returns a Promise then it is awaited before the next call. * If it throws an error, the iteration is aborted and the `forEachKey` function throws the error. * * **Example usage** * ```javascript * const keyValueStore = await KeyValueStore.open(); * await keyValueStore.forEachKey(async (key, index, info) => { * console.log(`Key at ${index}: ${key} has size ${info.size}`); * }); * ``` * * @param iteratee A function that is called for every key in the key-value store. * @param [options] All `forEachKey()` parameters. */ async forEachKey(iteratee, options = {}) { (0, access_checking_1.checkStorageAccess)(); return this._forEachKey(iteratee, options); } async _forEachKey(iteratee, options = {}, index = 0) { const { exclusiveStartKey, prefix, collection } = options; (0, ow_1.default)(iteratee, ow_1.default.function); (0, ow_1.default)(options, ow_1.default.object.exactShape({ exclusiveStartKey: ow_1.default.optional.string, prefix: ow_1.default.optional.string, collection: ow_1.default.optional.string, })); const response = await this.client.listKeys({ exclusiveStartKey, prefix, collection }); const { nextExclusiveStartKey, isTruncated, items } = response; for (const item of items) { await iteratee(item.key, index++, { size: item.size }); } return isTruncated ? this._forEachKey(iteratee, { exclusiveStartKey: nextExclusiveStartKey, prefix, collection }, index) : undefined; // [].forEach() returns undefined. } /** * Returns a file URL for the given key. */ getPublicUrl(key) { const name = this.name ?? this.config.get('defaultKeyValueStoreId'); return `file://${process.cwd()}/storage/key_value_stores/${name}/${key}`; } /** * Opens a key-value store and returns a promise resolving to an instance of the {@link KeyValueStore} class. * * Key-value stores are used to store records or files, along with their MIME content type. * The records are stored and retrieved using a unique key. * The actual data is stored either on a local filesystem or in the Apify cloud. * * For more details and code examples, see the {@link KeyValueStore} class. * * @param [storeIdOrName] * ID or name of the key-value store to be opened. If `null` or `undefined`, * the function returns the default key-value store associated with the crawler run. * @param [options] Storage manager options. */ static async open(storeIdOrName, options = {}) { (0, access_checking_1.checkStorageAccess)(); (0, ow_1.default)(storeIdOrName, ow_1.default.optional.any(ow_1.default.string, ow_1.default.null)); (0, ow_1.default)(options, ow_1.default.object.exactShape({ config: ow_1.default.optional.object.instanceOf(configuration_1.Configuration), storageClient: ow_1.default.optional.object, })); options.config ?? (options.config = configuration_1.Configuration.getGlobalConfig()); options.storageClient ?? (options.storageClient = options.config.getStorageClient()); await (0, utils_1.purgeDefaultStorages)({ onlyPurgeOnce: true, client: options.storageClient, config: options.config }); const manager = storage_manager_1.StorageManager.getManager(this, options.config); return manager.openStorage(storeIdOrName, options.storageClient); } /** * Gets a value from the default {@link KeyValueStore} associated with the current crawler run. * * This is just a convenient shortcut for {@link KeyValueStore.getValue}. * For example, calling the following code: * ```javascript * const value = await KeyValueStore.getValue('my-key'); * ``` * * is equivalent to: * ```javascript * const store = await KeyValueStore.open(); * const value = await store.getValue('my-key'); * ``` * * To store the value to the default key-value store, you can use the {@link KeyValueStore.setValue} function. * * For more information, see {@link KeyValueStore.open} * and {@link KeyValueStore.getValue}. * * @param key Unique record key. * @param defaultValue Fallback that will be returned if no value if present in the storage. * @returns * Returns a promise that resolves to an object, string * or [`Buffer`](https://nodejs.org/api/buffer.html), depending * on the MIME content type of the record, or `null` * if the record is missing. * @ignore */ static async getValue(key, defaultValue) { const store = await this.open(); return store.getValue(key, defaultValue); } /** * Tests whether a record with the given key exists in the default {@link KeyValueStore} associated with the current crawler run. * @param key The queried record key. * @returns `true` if the record exists, `false` if it does not. */ static async recordExists(key) { const store = await this.open(); return store.recordExists(key); } static async getAutoSavedValue(key, defaultValue = {}) { const store = await this.open(); return store.getAutoSavedValue(key, defaultValue); } /** * Stores or deletes a value in the default {@link KeyValueStore} associated with the current crawler run. * * This is just a convenient shortcut for {@link KeyValueStore.setValue}. * For example, calling the following code: * ```javascript * await KeyValueStore.setValue('OUTPUT', { foo: "bar" }); * ``` * * is equivalent to: * ```javascript * const store = await KeyValueStore.open(); * await store.setValue('OUTPUT', { foo: "bar" }); * ``` * * To get a value from the default key-value store, you can use the {@link KeyValueStore.getValue} function. * * For more information, see {@link KeyValueStore.open} * and {@link KeyValueStore.getValue}. * * @param key * Unique record key. * @param value * Record data, which can be one of the following values: * - If `null`, the record in the key-value store is deleted. * - If no `options.contentType` is specified, `value` can be any JavaScript object, and it will be stringified to JSON. * - If `options.contentType` is set, `value` is taken as is, and it must be a `String` or [`Buffer`](https://nodejs.org/api/buffer.html). * For any other value an error will be thrown. * @param [options] * @ignore */ static async setValue(key, value, options = {}) { const store = await this.open(); return store.setValue(key, value, options); } /** * Gets the crawler input value from the default {@link KeyValueStore} associated with the current crawler run. * By default, it will try to find root input files (either extension-less, `.json` or `.txt`), * or alternatively read the input from the default {@link KeyValueStore}. * * Note that the `getInput()` function does not cache the value read from the key-value store. * If you need to use the input multiple times in your crawler, * it is far more efficient to read it once and store it locally. * * For more information, see {@link KeyValueStore.open} * and {@link KeyValueStore.getValue}. * * @returns * Returns a promise that resolves to an object, string * or [`Buffer`](https://nodejs.org/api/buffer.html), depending * on the MIME content type of the record, or `null` * if the record is missing. * @ignore */ static async getInput() { const store = await this.open(); const inputKey = store.config.get('inputKey'); const cwd = process.cwd(); const possibleExtensions = ['', '.json', '.txt']; // Attempt to read input from root file instead of key-value store for (const extension of possibleExtensions) { const inputFile = (0, node_path_1.join)(cwd, `${inputKey}${extension}`); let input; // Try getting the file from the file system try { input = await (0, promises_1.readFile)(inputFile); } catch { continue; } // Attempt to parse as JSON, or return the input as is otherwise try { return json5_1.default.parse(input.toString()); } catch { return input; } } return store.getValue(inputKey); } } exports.KeyValueStore = KeyValueStore; //# sourceMappingURL=key_value_store.js.map