@crawlee/core

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.Dataset = exports.DATASET_ITERATORS_DEFAULT_LIMIT = void 0; exports.checkAndSerialize = checkAndSerialize; exports.chunkBySize = chunkBySize; const tslib_1 = require("tslib"); const sync_1 = require("csv-stringify/sync"); const ow_1 = tslib_1.__importDefault(require("ow")); const consts_1 = require("@apify/consts"); const configuration_1 = require("../configuration"); const log_1 = require("../log"); const access_checking_1 = require("./access_checking"); const key_value_store_1 = require("./key_value_store"); const storage_manager_1 = require("./storage_manager"); const utils_1 = require("./utils"); /** @internal */ exports.DATASET_ITERATORS_DEFAULT_LIMIT = 10000; const SAFETY_BUFFER_PERCENT = 0.01 / 100; // 0.01% /** * Accepts a JSON serializable object as an input, validates its serializability, * and validates its serialized size against limitBytes. Optionally accepts its index * in an array to provide better error messages. Returns serialized object. * @ignore */ function checkAndSerialize(item, limitBytes, index) { const s = typeof index === 'number' ? ` at index ${index} ` : ' '; const isItemObject = item && typeof item === 'object' && !Array.isArray(item); if (!isItemObject) { throw new Error(`Data item${s}is not an object. You can push only objects into a dataset.`); } let payload; try { payload = JSON.stringify(item); } catch (e) { const err = e; throw new Error(`Data item${s}is not serializable to JSON.\nCause: ${err.message}`); } const bytes = Buffer.byteLength(payload); if (bytes > limitBytes) { throw new Error(`Data item${s}is too large (size: ${bytes} bytes, limit: ${limitBytes} bytes)`); } return payload; } /** * Takes an array of JSONs (payloads) as input and produces an array of JSON strings * where each string is a JSON array of payloads with a maximum size of limitBytes per one * JSON array. Fits as many payloads as possible into a single JSON array and then moves * on to the next, preserving item order. * * The function assumes that none of the items is larger than limitBytes and does not validate. * @ignore */ function chunkBySize(items, limitBytes) { if (!items.length) return []; if (items.length === 1) return items; // Split payloads into buckets of valid size. let lastChunkBytes = 2; // Add 2 bytes for [] wrapper. const chunks = []; for (const payload of items) { const bytes = Buffer.byteLength(payload); if (bytes <= limitBytes && bytes + 2 > limitBytes) { // Handle cases where wrapping with [] would fail, but solo object is fine. chunks.push(payload); lastChunkBytes = bytes; } else if (lastChunkBytes + bytes <= limitBytes) { // ensure array if (!Array.isArray(chunks[chunks.length - 1])) { chunks.push([]); } chunks[chunks.length - 1].push(payload); lastChunkBytes += bytes + 1; // Add 1 byte for ',' separator. } else { chunks.push([payload]); lastChunkBytes = bytes + 2; // Add 2 bytes for [] wrapper. } } // Stringify array chunks. return chunks.map((chunk) => (typeof chunk === 'string' ? chunk : `[${chunk.join(',')}]`)); } /** * The `Dataset` class represents a store for structured data where each object stored has the same attributes, * such as online store products or real estate offers. You can imagine it as a table, * where each object is a row and its attributes are columns. * Dataset is an append-only storage - you can only add new records to it but you cannot modify or remove existing records. * Typically it is used to store crawling results. 
/**
 * The `Dataset` class represents a store for structured data where each object stored has the same attributes,
 * such as online store products or real estate offers. You can imagine it as a table,
 * where each object is a row and its attributes are columns.
 * Dataset is an append-only storage - you can only add new records to it but you cannot modify or remove existing records.
 * Typically it is used to store crawling results.
 *
 * Do not instantiate this class directly, use the
 * {@link Dataset.open} function instead.
 *
 * `Dataset` stores its data either on local disk or in the Apify cloud,
 * depending on whether the `APIFY_LOCAL_STORAGE_DIR` or `APIFY_TOKEN` environment variables are set.
 *
 * If the `APIFY_LOCAL_STORAGE_DIR` environment variable is set, the data is stored in
 * the local directory in the following files:
 * ```
 * {APIFY_LOCAL_STORAGE_DIR}/datasets/{DATASET_ID}/{INDEX}.json
 * ```
 * Note that `{DATASET_ID}` is the name or ID of the dataset. The default dataset has ID: `default`,
 * unless you override it by setting the `APIFY_DEFAULT_DATASET_ID` environment variable.
 * Each dataset item is stored as a separate JSON file, where `{INDEX}` is a zero-based index of the item in the dataset.
 *
 * If the `APIFY_TOKEN` environment variable is set but `APIFY_LOCAL_STORAGE_DIR` is not, the data is stored in the
 * [Apify Dataset](https://docs.apify.com/storage/dataset)
 * cloud storage. Note that you can force usage of the cloud storage also by passing the `forceCloud`
 * option to the {@link Dataset.open} function,
 * even if the `APIFY_LOCAL_STORAGE_DIR` variable is set.
 *
 * **Example usage:**
 *
 * ```javascript
 * // Write a single row to the default dataset
 * await Dataset.pushData({ col1: 123, col2: 'val2' });
 *
 * // Open a named dataset
 * const dataset = await Dataset.open('some-name');
 *
 * // Write a single row
 * await dataset.pushData({ foo: 'bar' });
 *
 * // Write multiple rows
 * await dataset.pushData([
 *     { foo: 'bar2', col2: 'val2' },
 *     { col3: 123 },
 * ]);
 *
 * // Export the entirety of the dataset to one file in the key-value store
 * await dataset.exportToCSV('MY-DATA');
 * ```
 * @category Result Stores
 */
class Dataset {
    /**
     * @internal
     */
    constructor(options, config = configuration_1.Configuration.getGlobalConfig()) {
        Object.defineProperty(this, "config", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: config
        });
        Object.defineProperty(this, "id", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "name", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "client", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "storageObject", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "log", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: log_1.log.child({ prefix: 'Dataset' })
        });
        this.id = options.id;
        this.name = options.name;
        this.client = options.client.dataset(this.id);
        this.storageObject = options.storageObject;
    }
    /**
     * Stores an object or an array of objects to the dataset.
     * The function returns a promise that resolves when the operation finishes.
     * It has no result, but throws on invalid args or other errors.
     *
     * **IMPORTANT**: Make sure to use the `await` keyword when calling `pushData()`,
     * otherwise the crawler process might finish before the data is stored!
     *
     * The size of the data is limited by the receiving API and therefore `pushData()` will only
     * allow objects whose JSON representation is smaller than 9MB. When an array is passed,
     * none of the included objects may be larger than 9MB, but the array itself may be of any size.
     *
     * The function internally chunks the array into separate items and pushes them sequentially.
     * The chunking process is stable (keeps order of data), but it does not provide a transaction
     * safety mechanism. Therefore, in the event of an uploading error (after several automatic retries),
     * the function's Promise will reject and the dataset will be left in a state where some of
     * the items have already been saved to the dataset while other items from the source array were not.
     * To overcome this limitation, the developer may, for example, read the last item saved in the dataset
     * and re-attempt the save of the data from this item onwards to prevent duplicates.
     * @param data Object or array of objects containing data to be stored in the dataset.
     *   The objects must be serializable to JSON and the JSON representation of each object must be smaller than 9MB.
     */
    async pushData(data) {
        (0, access_checking_1.checkStorageAccess)();
        (0, ow_1.default)(data, 'data', ow_1.default.object);
        const dispatch = async (payload) => this.client.pushItems(payload);
        const limit = consts_1.MAX_PAYLOAD_SIZE_BYTES - Math.ceil(consts_1.MAX_PAYLOAD_SIZE_BYTES * SAFETY_BUFFER_PERCENT);
        // Handle singular objects.
        if (!Array.isArray(data)) {
            const payload = checkAndSerialize(data, limit);
            await dispatch(payload);
            return;
        }
        // Handle arrays.
        const payloads = data.map((item, index) => checkAndSerialize(item, limit, index));
        const chunks = chunkBySize(payloads, limit);
        // Invoke client in series to preserve order of data.
        for (const chunk of chunks) {
            await dispatch(chunk);
        }
    }
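    /*
     * A minimal recovery sketch for the failure mode described above. It assumes
     * the dataset was empty before the call and that `items` is the source array;
     * both names are hypothetical:
     *
     * ```javascript
     * try {
     *     await dataset.pushData(items);
     * } catch (err) {
     *     // itemCount reflects how many items were stored before the failure.
     *     const { itemCount } = await dataset.getInfo();
     *     await dataset.pushData(items.slice(itemCount));
     * }
     * ```
     */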
    /**
     * Returns a {@link DatasetContent} object holding the items in the dataset based on the provided parameters.
     */
    async getData(options = {}) {
        (0, access_checking_1.checkStorageAccess)();
        try {
            return await this.client.listItems(options);
        }
        catch (e) {
            const error = e;
            if (error.message.includes('Cannot create a string longer than')) {
                throw new Error('dataset.getData(): The response is too large for parsing. You can fix this by lowering the "limit" option.');
            }
            throw e;
        }
    }
    /**
     * Returns all the data from the dataset. This will iterate through the whole dataset
     * via the `listItems()` client method, which gives you only paginated results.
     */
    async export(options = {}) {
        (0, access_checking_1.checkStorageAccess)();
        const items = [];
        const fetchNextChunk = async (offset = 0) => {
            const limit = 1000;
            const value = await this.client.listItems({ offset, limit, ...options });
            if (value.count === 0) {
                return;
            }
            items.push(...value.items);
            if (value.total > offset + value.count) {
                await fetchNextChunk(offset + value.count);
            }
        };
        await fetchNextChunk();
        return items;
    }
    /**
     * Save the entirety of the dataset's contents into one file within a key-value store.
     *
     * @param key The name of the value to save the data in.
     * @param [options] An optional options object where you can provide the dataset and target KVS name.
     * @param [contentType] Only JSON and CSV are supported currently, defaults to JSON.
     */
    async exportTo(key, options, contentType) {
        const kvStore = await key_value_store_1.KeyValueStore.open(options?.toKVS ?? null, { config: this.config });
        const items = await this.export(options);
        if (contentType === 'text/csv') {
            // Handle empty dataset exports gracefully.
            if (items.length === 0) {
                await kvStore.setValue(key, '', { contentType });
                return items;
            }
            const keys = options?.collectAllKeys
                ? Array.from(new Set(items.flatMap(Object.keys)))
                : Object.keys(items[0]);
            const value = (0, sync_1.stringify)([
                keys,
                ...items.map((item) => {
                    return keys.map((k) => item[k]);
                }),
            ]);
            await kvStore.setValue(key, value, { contentType });
            return items;
        }
        if (contentType === 'application/json') {
            await kvStore.setValue(key, items);
            return items;
        }
        throw new Error(`Unsupported content type: ${contentType}`);
    }
    /**
     * Save the entire dataset's contents into one JSON file within a key-value store.
     *
     * @param key The name of the value to save the data in.
     * @param [options] An optional options object where you can provide the target KVS name.
     */
    async exportToJSON(key, options) {
        await this.exportTo(key, options, 'application/json');
    }
    /**
     * Save the entire dataset's contents into one CSV file within a key-value store.
     *
     * @param key The name of the value to save the data in.
     * @param [options] An optional options object where you can provide the target KVS name.
     */
    async exportToCSV(key, options) {
        await this.exportTo(key, options, 'text/csv');
    }
    /**
     * Save the entire default dataset's contents into one JSON file within a key-value store.
     *
     * @param key The name of the value to save the data in.
     * @param [options] An optional options object where you can provide the dataset and target KVS name.
     */
    static async exportToJSON(key, options) {
        (0, access_checking_1.checkStorageAccess)();
        const dataset = await this.open(options?.fromDataset);
        await dataset.exportToJSON(key, options);
    }
    /**
     * Save the entire default dataset's contents into one CSV file within a key-value store.
     *
     * @param key The name of the value to save the data in.
     * @param [options] An optional options object where you can provide the dataset and target KVS name.
     */
    static async exportToCSV(key, options) {
        (0, access_checking_1.checkStorageAccess)();
        const dataset = await this.open(options?.fromDataset);
        await dataset.exportToCSV(key, options);
    }
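    /*
     * A short usage sketch for the export helpers above; the key and dataset
     * names are hypothetical:
     *
     * ```javascript
     * const dataset = await Dataset.open('my-results');
     * // Union the keys of all items so that sparse columns are not dropped.
     * await dataset.exportTo('OUTPUT', { collectAllKeys: true }, 'text/csv');
     *
     * // Or use the static shortcut to export a named dataset as JSON.
     * await Dataset.exportToJSON('OUTPUT', { fromDataset: 'my-results' });
     * ```
     */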
    /**
     * Returns an object containing general information about the dataset.
     *
     * The function returns the same object as the Apify API Client's
     * [getDataset](https://docs.apify.com/api/apify-client-js/latest#ApifyClient-datasets-getDataset)
     * function, which in turn calls the
     * [Get dataset](https://apify.com/docs/api/v2#/reference/datasets/dataset/get-dataset)
     * API endpoint.
     *
     * **Example:**
     * ```
     * {
     *   id: "WkzbQMuFYuamGv3YF",
     *   name: "my-dataset",
     *   userId: "wRsJZtadYvn4mBZmm",
     *   createdAt: new Date("2015-12-12T07:34:14.202Z"),
     *   modifiedAt: new Date("2015-12-13T08:36:13.202Z"),
     *   accessedAt: new Date("2015-12-14T08:36:13.202Z"),
     *   itemCount: 14,
     * }
     * ```
     */
    async getInfo() {
        (0, access_checking_1.checkStorageAccess)();
        return this.client.get();
    }
    /**
     * Iterates over dataset items, yielding each in turn to an `iteratee` function.
     * Each invocation of `iteratee` is called with two arguments: `(item, index)`.
     *
     * If the `iteratee` function returns a Promise then it is awaited before the next call.
     * If it throws an error, the iteration is aborted and the `forEach` function throws the error.
     *
     * **Example usage**
     * ```javascript
     * const dataset = await Dataset.open('my-results');
     * await dataset.forEach(async (item, index) => {
     *   console.log(`Item at ${index}: ${JSON.stringify(item)}`);
     * });
     * ```
     *
     * @param iteratee A function that is called for every item in the dataset.
     * @param [options] All `forEach()` parameters.
     * @param [index] Specifies the initial index number passed to the `iteratee` function.
     * @default 0
     */
    async forEach(iteratee, options = {}, index = 0) {
        (0, access_checking_1.checkStorageAccess)();
        if (!options.offset)
            options.offset = 0;
        if (options.format && options.format !== 'json')
            throw new Error('Dataset.forEach/map/reduce() support only a "json" format.');
        if (!options.limit)
            options.limit = exports.DATASET_ITERATORS_DEFAULT_LIMIT;
        const { items, total, limit, offset } = await this.getData(options);
        for (const item of items) {
            await iteratee(item, index++);
        }
        const newOffset = offset + limit;
        if (newOffset >= total)
            return;
        const newOpts = { ...options, offset: newOffset };
        await this.forEach(iteratee, newOpts, index);
    }
    /**
     * Produces a new array of values by mapping each value in the list through a transformation function `iteratee()`.
     * Each invocation of `iteratee()` is called with two arguments: `(element, index)`.
     *
     * If `iteratee` returns a `Promise` then it is awaited before the next call.
     *
     * @param iteratee
     * @param [options] All `map()` parameters.
     */
    async map(iteratee, options = {}) {
        (0, access_checking_1.checkStorageAccess)();
        const result = [];
        await this.forEach(async (item, index) => {
            const res = await iteratee(item, index);
            result.push(res);
        }, options);
        return result;
    }
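    /*
     * A short usage sketch for `map()`, assuming the stored items have a
     * hypothetical `url` property:
     *
     * ```javascript
     * const dataset = await Dataset.open('my-results');
     * const urls = await dataset.map((item) => item.url);
     * ```
     */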
    /**
     * Reduces a list of values down to a single value, calling `iteratee()` with
     * three arguments: `(memo, item, index)`. If no `memo` is provided, the first
     * item of the dataset is used as the initial value and iteration starts from
     * the second item.
     *
     * @param iteratee
     * @param memo The initial state of the reduction.
     * @param [options] All `reduce()` parameters.
     */
    async reduce(iteratee, memo, options = {}) {
        (0, access_checking_1.checkStorageAccess)();
        let currentMemo = memo;
        const wrappedFunc = async (item, index) => {
            if (index === 0 && currentMemo === undefined) {
                currentMemo = item;
            }
            else {
                // We are guaranteed that currentMemo is instantiated, since we are either not on
                // the first iteration, or memo was already set by the user.
                currentMemo = await iteratee(currentMemo, item, index);
            }
        };
        await this.forEach(wrappedFunc, options);
        return currentMemo;
    }
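    /*
     * A short usage sketch for `reduce()`, assuming the stored items have a
     * hypothetical numeric `price` property:
     *
     * ```javascript
     * const dataset = await Dataset.open('my-results');
     * // Passing 0 as the initial memo means the first item is reduced as well.
     * const total = await dataset.reduce((sum, item) => sum + item.price, 0);
     * ```
     */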
    /**
     * Removes the dataset either from the Apify cloud storage or from the local directory,
     * depending on the mode of operation.
     */
    async drop() {
        (0, access_checking_1.checkStorageAccess)();
        await this.client.delete();
        const manager = storage_manager_1.StorageManager.getManager(Dataset, this.config);
        manager.closeStorage(this);
    }
    /**
     * Opens a dataset and returns a promise resolving to an instance of the {@link Dataset} class.
     *
     * Datasets are used to store structured data where each object stored has the same attributes,
     * such as online store products or real estate offers.
     * The actual data is stored either on the local filesystem or in the cloud.
     *
     * For more details and code examples, see the {@link Dataset} class.
     *
     * @param [datasetIdOrName]
     *   ID or name of the dataset to be opened. If `null` or `undefined`,
     *   the function returns the default dataset associated with the crawler run.
     * @param [options] Storage manager options.
     */
    static async open(datasetIdOrName, options = {}) {
        (0, access_checking_1.checkStorageAccess)();
        (0, ow_1.default)(datasetIdOrName, ow_1.default.optional.string);
        (0, ow_1.default)(options, ow_1.default.object.exactShape({
            config: ow_1.default.optional.object.instanceOf(configuration_1.Configuration),
            storageClient: ow_1.default.optional.object,
        }));
        options.config ?? (options.config = configuration_1.Configuration.getGlobalConfig());
        options.storageClient ?? (options.storageClient = options.config.getStorageClient());
        await (0, utils_1.purgeDefaultStorages)({ onlyPurgeOnce: true, client: options.storageClient, config: options.config });
        const manager = storage_manager_1.StorageManager.getManager(this, options.config);
        return manager.openStorage(datasetIdOrName, options.storageClient);
    }
    /**
     * Stores an object or an array of objects to the default {@link Dataset} of the current crawler run.
     *
     * This is just a convenient shortcut for {@link Dataset.pushData}.
     * For example, calling the following code:
     * ```javascript
     * await Dataset.pushData({ myValue: 123 });
     * ```
     *
     * is equivalent to:
     * ```javascript
     * const dataset = await Dataset.open();
     * await dataset.pushData({ myValue: 123 });
     * ```
     *
     * For more information, see {@link Dataset.open} and {@link Dataset.pushData}.
     *
     * **IMPORTANT**: Make sure to use the `await` keyword when calling `pushData()`,
     * otherwise the crawler process might finish before the data is stored!
     *
     * @param item Object or array of objects containing data to be stored in the default dataset.
     *   The objects must be serializable to JSON and the JSON representation of each object must be smaller than 9MB.
     * @ignore
     */
    static async pushData(item) {
        const dataset = await this.open();
        return dataset.pushData(item);
    }
    /**
     * Returns a {@link DatasetContent} object holding the items in the dataset based on the provided parameters.
     */
    static async getData(options = {}) {
        const dataset = await this.open();
        return dataset.getData(options);
    }
}
exports.Dataset = Dataset;
//# sourceMappingURL=dataset.js.map