@crawlee/core
Version:
The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.
549 lines • 25 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.KeyValueStore = exports.maybeStringify = void 0;
const tslib_1 = require("tslib");
const promises_1 = require("node:fs/promises");
const node_path_1 = require("node:path");
const json5_1 = tslib_1.__importDefault(require("json5"));
const ow_1 = tslib_1.__importStar(require("ow"));
const consts_1 = require("@apify/consts");
const log_1 = tslib_1.__importDefault(require("@apify/log"));
const utilities_1 = require("@apify/utilities");
const configuration_1 = require("../configuration");
const access_checking_1 = require("./access_checking");
const storage_manager_1 = require("./storage_manager");
const utils_1 = require("./utils");
/**
 * Helper function to possibly stringify value if options.contentType is not set.
 *
 * When `options.contentType` is `null` or `undefined`, it is set **on the passed `options` object**
 * to `application/json; charset=utf-8` and `value` is serialized to JSON. When a content type is
 * already present, `value` is returned untouched.
 *
 * @param value Value that is about to be stored.
 * @param options Record options; `contentType` is filled in when it is missing.
 * @returns The JSON string, or the original `value` when a content type was provided.
 * @throws {Error} If the value cannot be stringified to JSON (the original failure is attached as `cause`),
 *   or if the stringification yields `undefined`.
 * @ignore
 */
const maybeStringify = (value, options) => {
    // An explicit content type means the value is stored as is.
    if (options.contentType !== null && options.contentType !== undefined) {
        return value;
    }
    // If contentType is missing, value will be stringified to JSON
    options.contentType = 'application/json; charset=utf-8';
    let serialized;
    try {
        // Format JSON to simplify debugging, the overheads with compression is negligible
        serialized = (0, utilities_1.jsonStringifyExtended)(value, null, 2);
    }
    catch (e) {
        // Anything can be thrown (e.g. from a custom `toJSON`), so do not assume an `Error` instance,
        // and never modify the caught object - it may be owned by the caller.
        const rawMessage = String(e?.message ?? e);
        // Give more meaningful error message
        const reason = rawMessage.includes('Invalid string length') ? 'Object is too large' : rawMessage;
        throw new Error(`The "value" parameter cannot be stringified to JSON: ${reason}`, { cause: e });
    }
    if (serialized === undefined) {
        throw new Error('The "value" parameter was stringified to JSON and returned undefined. ' +
            "Make sure you're not trying to stringify an undefined value.");
    }
    return serialized;
};
exports.maybeStringify = maybeStringify;
/**
 * The `KeyValueStore` class represents a key-value store, a simple data storage that is used
 * for saving and reading data records or files. Each data record is
 * represented by a unique key and associated with a MIME content type. Key-value stores are ideal
 * for saving screenshots, crawler inputs and outputs, web pages, PDFs or to persist the state of crawlers.
 *
 * Do not instantiate this class directly, use the
 * {@link KeyValueStore.open} function instead.
 *
 * Each crawler run is associated with a default key-value store, which is created exclusively
 * for the run. By convention, the crawler input and output are stored into the
 * default key-value store under the `INPUT` and `OUTPUT` key, respectively.
 * Typically, input and output are JSON files, although it can be any other format.
 * To access the default key-value store directly, you can use the
 * {@link KeyValueStore.getValue} and {@link KeyValueStore.setValue} convenience functions.
 *
 * To access the input, you can also use the {@link KeyValueStore.getInput} convenience function.
 *
 * `KeyValueStore` stores its data on a local disk.
 *
 * If the `CRAWLEE_STORAGE_DIR` environment variable is set, the data is stored in
 * the local directory in the following files:
 * ```
 * {CRAWLEE_STORAGE_DIR}/key_value_stores/{STORE_ID}/{KEY}.{EXT}
 * ```
 * Note that `{STORE_ID}` is the name or ID of the key-value store. The default key-value store has ID: `default`,
 * unless you override it by setting the `CRAWLEE_DEFAULT_KEY_VALUE_STORE_ID` environment variable.
 * The `{KEY}` is the key of the record and `{EXT}` corresponds to the MIME content type of the data value.
 *
 * **Example usage:**
 *
 * ```javascript
 * // Get crawler input from the default key-value store.
 * const input = await KeyValueStore.getInput();
 * // Get some value from the default key-value store.
 * const otherValue = await KeyValueStore.getValue('my-key');
 *
 * // Write crawler output to the default key-value store.
 * await KeyValueStore.setValue('OUTPUT', { myResult: 123 });
 *
 * // Open a named key-value store
 * const store = await KeyValueStore.open('some-name');
 *
 * // Write a record. JavaScript object is automatically converted to JSON,
 * // strings and binary buffers are stored as they are
 * await store.setValue('some-key', { foo: 'bar' });
 *
 * // Read a record. Note that JSON is automatically parsed to a JavaScript object,
 * // text data returned as a string and other data is returned as binary buffer
 * const value = await store.getValue('some-key');
 *
 * // Drop (delete) the store
 * await store.drop();
 * ```
 * @category Result Stores
 */
class KeyValueStore {
    /**
     * @param options Must provide `id`, `name`, `storageObject` and a storage `client`
     *   exposing `keyValueStore(id)`.
     * @param config Configuration used by this store; defaults to the global configuration.
     * @internal
     */
    constructor(options, config = configuration_1.Configuration.getGlobalConfig()) {
        Object.defineProperty(this, "config", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: config
        });
        Object.defineProperty(this, "id", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "name", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "storageObject", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "client", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        /** Set once the `persistState` listener has been registered, so it is only registered once per store. */
        Object.defineProperty(this, "persistStateEventStarted", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: false
        });
        /** Cache for persistent (auto-saved) values. When we try to set such value, the cache will be updated automatically. */
        Object.defineProperty(this, "cache", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: new Map()
        });
        this.id = options.id;
        this.name = options.name;
        this.storageObject = options.storageObject;
        this.client = options.client.keyValueStore(this.id);
    }
    /**
     * Gets a value from the key-value store.
     *
     * The function returns a `Promise` that resolves to the record value,
     * whose JavaScript type depends on the MIME content type of the record.
     * Records with the `application/json`
     * content type are automatically parsed and returned as a JavaScript object.
     * Similarly, records with `text/plain` content types are returned as a string.
     * For all other content types, the value is returned as a raw
     * [`Buffer`](https://nodejs.org/api/buffer.html) instance.
     *
     * If the record does not exist, the function resolves to `null`.
     *
     * To save or delete a value in the key-value store, use the
     * {@link KeyValueStore.setValue} function.
     *
     * **Example usage:**
     *
     * ```javascript
     * const store = await KeyValueStore.open();
     * const buffer = await store.getValue('screenshot1.png');
     * ```
     * @param key
     *   Unique key of the record. It can be at most 256 characters long and only consist
     *   of the following characters: `a`-`z`, `A`-`Z`, `0`-`9` and `!-_.'()`
     * @param defaultValue
     *   Fallback that will be returned if no value is present in the storage.
     * @returns
     *   Returns a promise that resolves to an object, string
     *   or [`Buffer`](https://nodejs.org/api/buffer.html), depending
     *   on the MIME content type of the record, or `null` if the key is missing from the store.
     */
    async getValue(key, defaultValue) {
        (0, access_checking_1.checkStorageAccess)();
        (0, ow_1.default)(key, ow_1.default.string.nonEmpty);
        const record = await this.client.getRecord(key);
        return record?.value ?? defaultValue ?? null;
    }
    /**
     * Tests whether a record with the given key exists in the key-value store without retrieving its value.
     *
     * @param key The queried record key.
     * @returns `true` if the record exists, `false` if it does not.
     */
    async recordExists(key) {
        (0, access_checking_1.checkStorageAccess)();
        (0, ow_1.default)(key, ow_1.default.string.nonEmpty);
        return this.client.recordExists(key);
    }
    /**
     * Gets a value that is kept in the in-memory cache of this store and written back to the store
     * on every `persistState` event. Repeated calls with the same key return the same instance.
     *
     * @param key Unique key of the record.
     * @param defaultValue Value to use (and cache) when the record is not present in the storage.
     * @returns The cached instance for the given key.
     */
    async getAutoSavedValue(key, defaultValue = {}) {
        (0, access_checking_1.checkStorageAccess)();
        if (this.cache.has(key)) {
            return this.cache.get(key);
        }
        const value = await this.getValue(key, defaultValue);
        // The await above could have run in parallel with another call to this function. If the other call finished more quickly,
        // the value will be in cache at this point, and returning the new fetched value would introduce two different instances of
        // the auto-saved object, and only the latter one would be persisted.
        // Therefore we re-check the cache here, and if such race condition happened, we drop the fetched value and return the cached one.
        if (this.cache.has(key)) {
            return this.cache.get(key);
        }
        this.cache.set(key, value);
        this.ensurePersistStateEvent();
        return value;
    }
    /**
     * Registers (at most once per store) a `persistState` listener that saves every cached
     * auto-saved value back to the store. A failure to save a value is logged as a warning
     * and does not fail the listener.
     */
    ensurePersistStateEvent() {
        if (this.persistStateEventStarted) {
            return;
        }
        // use half the interval of `persistState` to avoid race conditions
        const persistStateIntervalMillis = this.config.get('persistStateIntervalMillis');
        const timeoutSecs = persistStateIntervalMillis / 2000;
        this.config.getEventManager().on('persistState', async () => {
            const promises = [];
            for (const [key, value] of this.cache) {
                promises.push(this.setValue(key, value, {
                    timeoutSecs,
                    doNotRetryTimeouts: true,
                }).catch((error) => log_1.default.warning(`Failed to persist the state value to ${key}`, { error })));
            }
            await Promise.all(promises);
        });
        this.persistStateEventStarted = true;
    }
    /**
     * Saves or deletes a record in the key-value store.
     * The function returns a promise that resolves once the record has been saved or deleted.
     *
     * **Example usage:**
     *
     * ```javascript
     * const store = await KeyValueStore.open();
     * await store.setValue('OUTPUT', { foo: 'bar' });
     * ```
     *
     * Beware that the key can be at most 256 characters long and only contain the following characters: `a-zA-Z0-9!-_.'()`
     *
     * By default, `value` is converted to JSON and stored with the
     * `application/json; charset=utf-8` MIME content type.
     * To store the value with another content type, pass it in the options as follows:
     * ```javascript
     * const store = await KeyValueStore.open('my-text-store');
     * await store.setValue('RESULTS', 'my text data', { contentType: 'text/plain' });
     * ```
     * If you set custom content type, `value` must be either a string,
     * [`Buffer`](https://nodejs.org/api/buffer.html) or a stream, otherwise an error will be thrown.
     *
     * If `value` is `null`, the record is deleted instead. Note that the `setValue()` function succeeds
     * regardless whether the record existed or not.
     *
     * If the key belongs to an auto-saved value (see {@link KeyValueStore.getAutoSavedValue}) and a different
     * object (or `null`) is passed, the cached instance is updated in place so that existing references
     * to it observe the new state.
     *
     * To retrieve a value from the key-value store, use the
     * {@link KeyValueStore.getValue} function.
     *
     * **IMPORTANT:** Always make sure to use the `await` keyword when calling `setValue()`,
     * otherwise the crawler process might finish before the value is stored!
     *
     * @param key
     *   Unique key of the record. It can be at most 256 characters long and only consist
     *   of the following characters: `a`-`z`, `A`-`Z`, `0`-`9` and `!-_.'()`
     * @param value
     *   Record data, which can be one of the following values:
     *    - If `null`, the record in the key-value store is deleted.
     *    - If no `options.contentType` is specified, `value` can be any JavaScript object and it will be stringified to JSON.
     *    - If `options.contentType` is set, `value` is taken as is and it must be a `String`, [`Buffer`](https://nodejs.org/api/buffer.html) or a stream.
     *   For any other value an error will be thrown.
     * @param [options] Record options.
     */
    async setValue(key, value, options = {}) {
        (0, access_checking_1.checkStorageAccess)();
        (0, ow_1.default)(key, 'key', ow_1.default.string.nonEmpty);
        (0, ow_1.default)(key, ow_1.default.string.validate((k) => ({
            validator: ow_1.default.isValid(k, ow_1.default.string.matches(consts_1.KEY_VALUE_STORE_KEY_REGEX)),
            message: `The "key" argument "${key}" must be at most 256 characters long and only contain the following characters: a-zA-Z0-9!-_.'()`,
        })));
        if (options.contentType &&
            !(ow_1.default.isValid(value, ow_1.default.any(ow_1.default.string, ow_1.default.uint8Array)) ||
                (ow_1.default.isValid(value, ow_1.default.object) && typeof value.pipe === 'function'))) {
            throw new ow_1.ArgumentError('The "value" parameter must be a String, Buffer or Stream when "options.contentType" is specified.', this.setValue);
        }
        (0, ow_1.default)(options, ow_1.default.object.exactShape({
            contentType: ow_1.default.optional.string.nonEmpty,
            timeoutSecs: ow_1.default.optional.number,
            doNotRetryTimeouts: ow_1.default.optional.boolean,
        }));
        // Make copy of options, don't update what user passed.
        const optionsCopy = { ...options };
        // If we try to set the value of a cached state to a different reference, we need to update the cache accordingly.
        // The cached instance itself is modified (never replaced), because callers hold references to it.
        const cachedValue = this.cache.get(key);
        if (cachedValue && cachedValue !== value) {
            if (value === null) {
                // Cached state can be only object, so a propagation of `null` means removing all its properties.
                for (const property of Object.keys(cachedValue)) {
                    delete cachedValue[property];
                }
            }
            else if (typeof value === 'object') {
                // We need to remove the keys that are no longer present in the new value.
                for (const property of Object.keys(cachedValue)) {
                    if (!(property in value)) {
                        delete cachedValue[property];
                    }
                }
                // And update the existing ones + add new ones.
                Object.assign(cachedValue, value);
            }
        }
        // In this case delete the record.
        if (value === null) {
            return this.client.deleteRecord(key);
        }
        value = (0, exports.maybeStringify)(value, optionsCopy);
        return this.client.setRecord({
            key,
            value,
            contentType: optionsCopy.contentType,
        }, {
            timeoutSecs: optionsCopy.timeoutSecs,
            doNotRetryTimeouts: optionsCopy.doNotRetryTimeouts,
        });
    }
    /**
     * Removes the key-value store either from the Apify cloud storage or from the local directory,
     * depending on the mode of operation.
     */
    async drop() {
        (0, access_checking_1.checkStorageAccess)();
        await this.client.delete();
        const manager = storage_manager_1.StorageManager.getManager(KeyValueStore, this.config);
        manager.closeStorage(this);
    }
    /**
     * Empties the in-memory cache of auto-saved values.
     * @internal
     */
    clearCache() {
        (0, access_checking_1.checkStorageAccess)();
        this.cache.clear();
    }
    /**
     * Iterates over key-value store keys, yielding each in turn to an `iteratee` function.
     * Each invocation of `iteratee` is called with three arguments: `(key, index, info)`, where `key`
     * is the record key, `index` is a zero-based index of the key in the current iteration
     * (regardless of `options.exclusiveStartKey`) and `info` is an object that contains a single property `size`
     * indicating size of the record in bytes.
     *
     * If the `iteratee` function returns a Promise then it is awaited before the next call.
     * If it throws an error, the iteration is aborted and the `forEachKey` function throws the error.
     *
     * **Example usage**
     * ```javascript
     * const keyValueStore = await KeyValueStore.open();
     * await keyValueStore.forEachKey(async (key, index, info) => {
     *   console.log(`Key at ${index}: ${key} has size ${info.size}`);
     * });
     * ```
     *
     * @param iteratee A function that is called for every key in the key-value store.
     * @param [options] All `forEachKey()` parameters.
     */
    async forEachKey(iteratee, options = {}) {
        (0, access_checking_1.checkStorageAccess)();
        return this._forEachKey(iteratee, options);
    }
    /**
     * Processes one page of keys and recurses for the next page while the listing is truncated.
     *
     * @param iteratee A function that is called for every key.
     * @param [options] Accepts `exclusiveStartKey`, `prefix` and `collection`.
     * @param [index] Index passed to `iteratee` for the first key of this page.
     */
    async _forEachKey(iteratee, options = {}, index = 0) {
        const { exclusiveStartKey, prefix, collection } = options;
        (0, ow_1.default)(iteratee, ow_1.default.function);
        (0, ow_1.default)(options, ow_1.default.object.exactShape({
            exclusiveStartKey: ow_1.default.optional.string,
            prefix: ow_1.default.optional.string,
            collection: ow_1.default.optional.string,
        }));
        const response = await this.client.listKeys({ exclusiveStartKey, prefix, collection });
        const { nextExclusiveStartKey, isTruncated, items } = response;
        for (const item of items) {
            await iteratee(item.key, index++, { size: item.size });
        }
        return isTruncated
            ? this._forEachKey(iteratee, { exclusiveStartKey: nextExclusiveStartKey, prefix, collection }, index)
            : undefined; // [].forEach() returns undefined.
    }
    /**
     * Returns a file URL for the given key.
     *
     * The URL is built from the current working directory and a fixed `storage/key_value_stores`
     * sub-path, using the store name (or the configured `defaultKeyValueStoreId` when the store has no name).
     */
    getPublicUrl(key) {
        const name = this.name ?? this.config.get('defaultKeyValueStoreId');
        return `file://${process.cwd()}/storage/key_value_stores/${name}/${key}`;
    }
    /**
     * Opens a key-value store and returns a promise resolving to an instance of the {@link KeyValueStore} class.
     *
     * Key-value stores are used to store records or files, along with their MIME content type.
     * The records are stored and retrieved using a unique key.
     * The actual data is stored either on a local filesystem or in the Apify cloud.
     *
     * For more details and code examples, see the {@link KeyValueStore} class.
     *
     * @param [storeIdOrName]
     *   ID or name of the key-value store to be opened. If `null` or `undefined`,
     *   the function returns the default key-value store associated with the crawler run.
     * @param [options] Storage manager options. The passed object is not modified.
     */
    static async open(storeIdOrName, options = {}) {
        (0, access_checking_1.checkStorageAccess)();
        (0, ow_1.default)(storeIdOrName, ow_1.default.optional.any(ow_1.default.string, ow_1.default.null));
        (0, ow_1.default)(options, ow_1.default.object.exactShape({
            config: ow_1.default.optional.object.instanceOf(configuration_1.Configuration),
            storageClient: ow_1.default.optional.object,
        }));
        // Resolve the defaults into locals instead of writing them back to the caller's object.
        const config = options.config ?? configuration_1.Configuration.getGlobalConfig();
        const storageClient = options.storageClient ?? config.getStorageClient();
        await (0, utils_1.purgeDefaultStorages)({ onlyPurgeOnce: true, client: storageClient, config });
        const manager = storage_manager_1.StorageManager.getManager(this, config);
        return manager.openStorage(storeIdOrName, storageClient);
    }
    /**
     * Gets a value from the default {@link KeyValueStore} associated with the current crawler run.
     *
     * This is just a convenient shortcut for {@link KeyValueStore.getValue}.
     * For example, calling the following code:
     * ```javascript
     * const value = await KeyValueStore.getValue('my-key');
     * ```
     *
     * is equivalent to:
     * ```javascript
     * const store = await KeyValueStore.open();
     * const value = await store.getValue('my-key');
     * ```
     *
     * To store the value to the default key-value store, you can use the {@link KeyValueStore.setValue} function.
     *
     * For more information, see {@link KeyValueStore.open}
     * and {@link KeyValueStore.getValue}.
     *
     * @param key Unique record key.
     * @param defaultValue Fallback that will be returned if no value is present in the storage.
     * @returns
     *   Returns a promise that resolves to an object, string
     *   or [`Buffer`](https://nodejs.org/api/buffer.html), depending
     *   on the MIME content type of the record, or `null`
     *   if the record is missing.
     * @ignore
     */
    static async getValue(key, defaultValue) {
        const store = await this.open();
        return store.getValue(key, defaultValue);
    }
    /**
     * Tests whether a record with the given key exists in the default {@link KeyValueStore} associated with the current crawler run.
     * @param key The queried record key.
     * @returns `true` if the record exists, `false` if it does not.
     */
    static async recordExists(key) {
        const store = await this.open();
        return store.recordExists(key);
    }
    /**
     * Gets an auto-saved value from the default {@link KeyValueStore} associated with the current crawler run.
     * Shortcut for opening the default store and calling its `getAutoSavedValue()` method.
     *
     * @param key Unique record key.
     * @param defaultValue Value to use when the record is not present in the storage.
     */
    static async getAutoSavedValue(key, defaultValue = {}) {
        const store = await this.open();
        return store.getAutoSavedValue(key, defaultValue);
    }
    /**
     * Stores or deletes a value in the default {@link KeyValueStore} associated with the current crawler run.
     *
     * This is just a convenient shortcut for {@link KeyValueStore.setValue}.
     * For example, calling the following code:
     * ```javascript
     * await KeyValueStore.setValue('OUTPUT', { foo: "bar" });
     * ```
     *
     * is equivalent to:
     * ```javascript
     * const store = await KeyValueStore.open();
     * await store.setValue('OUTPUT', { foo: "bar" });
     * ```
     *
     * To get a value from the default key-value store, you can use the {@link KeyValueStore.getValue} function.
     *
     * For more information, see {@link KeyValueStore.open}
     * and {@link KeyValueStore.getValue}.
     *
     * @param key
     *   Unique record key.
     * @param value
     *   Record data, which can be one of the following values:
     *    - If `null`, the record in the key-value store is deleted.
     *    - If no `options.contentType` is specified, `value` can be any JavaScript object, and it will be stringified to JSON.
     *    - If `options.contentType` is set, `value` is taken as is, and it must be a `String` or [`Buffer`](https://nodejs.org/api/buffer.html).
     *   For any other value an error will be thrown.
     * @param [options]
     * @ignore
     */
    static async setValue(key, value, options = {}) {
        const store = await this.open();
        return store.setValue(key, value, options);
    }
    /**
     * Gets the crawler input value from the default {@link KeyValueStore} associated with the current crawler run.
     * By default, it will try to find root input files (either extension-less, `.json` or `.txt`),
     * or alternatively read the input from the default {@link KeyValueStore}.
     *
     * Note that the `getInput()` function does not cache the value read from the key-value store.
     * If you need to use the input multiple times in your crawler,
     * it is far more efficient to read it once and store it locally.
     *
     * For more information, see {@link KeyValueStore.open}
     * and {@link KeyValueStore.getValue}.
     *
     * @returns
     *   Returns a promise that resolves to an object, string
     *   or [`Buffer`](https://nodejs.org/api/buffer.html), depending
     *   on the MIME content type of the record, or `null`
     *   if the record is missing.
     * @ignore
     */
    static async getInput() {
        const store = await this.open();
        const inputKey = store.config.get('inputKey');
        const cwd = process.cwd();
        const possibleExtensions = ['', '.json', '.txt'];
        // Attempt to read input from root file instead of key-value store
        for (const extension of possibleExtensions) {
            const inputFile = (0, node_path_1.join)(cwd, `${inputKey}${extension}`);
            let input;
            // Try getting the file from the file system
            try {
                input = await (0, promises_1.readFile)(inputFile);
            }
            catch {
                // The file cannot be read, try the next extension.
                continue;
            }
            // Attempt to parse as JSON, or return the input as is otherwise
            try {
                return json5_1.default.parse(input.toString());
            }
            catch {
                return input;
            }
        }
        return store.getValue(inputKey);
    }
}
exports.KeyValueStore = KeyValueStore;
//# sourceMappingURL=key_value_store.js.map