/**
 * @crawlee/core — the scalable web crawling and scraping library for JavaScript/Node.js.
 * Enables development of data extraction and web automation jobs (not only) with
 * headless Chrome and Puppeteer.
 */
import type { DatasetClient, DatasetInfo, Dictionary, PaginatedList, StorageClient } from '@crawlee/types';
import { Configuration } from '../configuration';
import { type Log } from '../log';
import type { Awaitable } from '../typedefs';
import type { StorageManagerOptions } from './storage_manager';
/** @internal */
export declare const DATASET_ITERATORS_DEFAULT_LIMIT = 10000;
/**
* Accepts a JSON serializable object as an input, validates its serializability,
* and validates its serialized size against limitBytes. Optionally accepts its index
* in an array to provide better error messages. Returns serialized object.
* @ignore
*/
export declare function checkAndSerialize<T>(item: T, limitBytes: number, index?: number): string;
/**
* Takes an array of JSONs (payloads) as input and produces an array of JSON strings
* where each string is a JSON array of payloads with a maximum size of limitBytes per one
* JSON array. Fits as many payloads as possible into a single JSON array and then moves
* on to the next, preserving item order.
*
* The function assumes that none of the items is larger than limitBytes and does not validate.
* @ignore
*/
export declare function chunkBySize(items: string[], limitBytes: number): string[];
export interface DatasetDataOptions {
/**
* Number of array elements that should be skipped at the start.
* @default 0
*/
offset?: number;
/**
* Maximum number of array elements to return.
* @default 250000
*/
limit?: number;
/**
* If `true` then the objects are sorted by `createdAt` in descending order.
* Otherwise they are sorted in ascending order.
* @default false
*/
desc?: boolean;
/**
* An array of field names that will be included in the result. If omitted, all fields are included in the results.
*/
fields?: string[];
/**
* Specifies a name of the field in the result objects that will be used to unwind the resulting objects.
* By default, the results are returned as they are.
*/
unwind?: string;
/**
* If `true` then the function returns only non-empty items and skips hidden fields (i.e. fields starting with `#` character).
* Note that the `clean` parameter is a shortcut for `skipHidden: true` and `skipEmpty: true` options.
* @default false
*/
clean?: boolean;
/**
* If `true` then the function doesn't return hidden fields (fields starting with "#" character).
* @default false
*/
skipHidden?: boolean;
/**
* If `true` then the function doesn't return empty items.
* Note that in this case the returned number of items might be lower than limit parameter and pagination must be done using the `limit` value.
* @default false
*/
skipEmpty?: boolean;
}
export interface DatasetExportOptions extends Omit<DatasetDataOptions, 'offset' | 'limit'> {
/**
* If true, includes all unique keys from all dataset items in the CSV export header.
* If omitted or false, only keys from the first item are used.
*/
collectAllKeys?: boolean;
}
export interface DatasetIteratorOptions extends Omit<DatasetDataOptions, 'offset' | 'limit' | 'clean' | 'skipHidden' | 'skipEmpty'> {
/** @internal */
offset?: number;
/**
* @default 10000
* @internal
*/
limit?: number;
/** @internal */
clean?: boolean;
/** @internal */
skipHidden?: boolean;
/** @internal */
skipEmpty?: boolean;
/** @internal */
format?: string;
}
export interface DatasetExportToOptions extends DatasetExportOptions {
fromDataset?: string;
toKVS?: string;
}
/**
* The `Dataset` class represents a store for structured data where each object stored has the same attributes,
* such as online store products or real estate offers. You can imagine it as a table,
* where each object is a row and its attributes are columns.
* Dataset is an append-only storage - you can only add new records to it but you cannot modify or remove existing records.
* Typically it is used to store crawling results.
*
* Do not instantiate this class directly, use the
* {@link Dataset.open} function instead.
*
* `Dataset` stores its data either on local disk or in the Apify cloud,
* depending on whether the `APIFY_LOCAL_STORAGE_DIR` or `APIFY_TOKEN` environment variables are set.
*
* If the `APIFY_LOCAL_STORAGE_DIR` environment variable is set, the data is stored in
* the local directory in the following files:
* ```
* {APIFY_LOCAL_STORAGE_DIR}/datasets/{DATASET_ID}/{INDEX}.json
* ```
* Note that `{DATASET_ID}` is the name or ID of the dataset. The default dataset has ID: `default`,
* unless you override it by setting the `APIFY_DEFAULT_DATASET_ID` environment variable.
* Each dataset item is stored as a separate JSON file, where `{INDEX}` is a zero-based index of the item in the dataset.
*
* If the `APIFY_TOKEN` environment variable is set but `APIFY_LOCAL_STORAGE_DIR` not, the data is stored in the
* [Apify Dataset](https://docs.apify.com/storage/dataset)
* cloud storage. Note that you can force usage of the cloud storage also by passing the `forceCloud`
* option to {@link Dataset.open} function,
* even if the `APIFY_LOCAL_STORAGE_DIR` variable is set.
*
* **Example usage:**
*
* ```javascript
* // Write a single row to the default dataset
* await Dataset.pushData({ col1: 123, col2: 'val2' });
*
* // Open a named dataset
* const dataset = await Dataset.open('some-name');
*
* // Write a single row
* await dataset.pushData({ foo: 'bar' });
*
* // Write multiple rows
* await dataset.pushData([
* { foo: 'bar2', col2: 'val2' },
* { col3: 123 },
* ]);
*
* // Export the entirety of the dataset to one file in the key-value store
* await dataset.exportToCSV('MY-DATA');
* ```
* @category Result Stores
*/
export declare class Dataset<Data extends Dictionary = Dictionary> {
readonly config: Configuration;
id: string;
name?: string;
client: DatasetClient<Data>;
readonly storageObject?: Record<string, unknown>;
log: Log;
/**
* @internal
*/
constructor(options: DatasetOptions, config?: Configuration);
/**
* Stores an object or an array of objects to the dataset.
* The function returns a promise that resolves when the operation finishes.
* It has no result, but throws on invalid args or other errors.
*
* **IMPORTANT**: Make sure to use the `await` keyword when calling `pushData()`,
* otherwise the crawler process might finish before the data is stored!
*
* The size of the data is limited by the receiving API and therefore `pushData()` will only
* allow objects whose JSON representation is smaller than 9MB. When an array is passed,
* none of the included objects
* may be larger than 9MB, but the array itself may be of any size.
*
* The function internally
* chunks the array into separate items and pushes them sequentially.
* The chunking process is stable (keeps order of data), but it does not provide a transaction
* safety mechanism. Therefore, in the event of an uploading error (after several automatic retries),
* the function's Promise will reject and the dataset will be left in a state where some of
* the items have already been saved to the dataset while other items from the source array were not.
* To overcome this limitation, the developer may, for example, read the last item saved in the dataset
* and re-attempt the save of the data from this item onwards to prevent duplicates.
* @param data Object or array of objects containing data to be stored in the default dataset.
* The objects must be serializable to JSON and the JSON representation of each object must be smaller than 9MB.
*/
pushData(data: Data | Data[]): Promise<void>;
/**
* Returns {@link DatasetContent} object holding the items in the dataset based on the provided parameters.
*/
getData(options?: DatasetDataOptions): Promise<DatasetContent<Data>>;
/**
* Returns all the data from the dataset. This will iterate through the whole dataset
* via the `listItems()` client method, which gives you only paginated results.
*/
export(options?: DatasetExportOptions): Promise<Data[]>;
/**
* Save the entirety of the dataset's contents into one file within a key-value store.
*
* @param key The name of the value to save the data in.
* @param [options] An optional options object where you can provide the dataset and target KVS name.
* @param [contentType] Only JSON and CSV are supported currently, defaults to JSON.
*/
exportTo(key: string, options?: DatasetExportToOptions, contentType?: string): Promise<Data[]>;
/**
* Save entire default dataset's contents into one JSON file within a key-value store.
*
* @param key The name of the value to save the data in.
* @param [options] An optional options object where you can provide the target KVS name.
*/
exportToJSON(key: string, options?: Omit<DatasetExportToOptions, 'fromDataset'>): Promise<void>;
/**
* Save entire default dataset's contents into one CSV file within a key-value store.
*
* @param key The name of the value to save the data in.
* @param [options] An optional options object where you can provide the target KVS name.
*/
exportToCSV(key: string, options?: Omit<DatasetExportToOptions, 'fromDataset'>): Promise<void>;
/**
* Save entire default dataset's contents into one JSON file within a key-value store.
*
* @param key The name of the value to save the data in.
* @param [options] An optional options object where you can provide the dataset and target KVS name.
*/
static exportToJSON(key: string, options?: DatasetExportToOptions): Promise<void>;
/**
* Save entire default dataset's contents into one CSV file within a key-value store.
*
* @param key The name of the value to save the data in.
* @param [options] An optional options object where you can provide the dataset and target KVS name.
*/
static exportToCSV(key: string, options?: DatasetExportToOptions): Promise<void>;
/**
* Returns an object containing general information about the dataset.
*
* The function returns the same object as the Apify API Client's
* [getDataset](https://docs.apify.com/api/apify-client-js/latest#ApifyClient-datasets-getDataset)
* function, which in turn calls the
* [Get dataset](https://apify.com/docs/api/v2#/reference/datasets/dataset/get-dataset)
* API endpoint.
*
* **Example:**
* ```
* {
* id: "WkzbQMuFYuamGv3YF",
* name: "my-dataset",
* userId: "wRsJZtadYvn4mBZmm",
* createdAt: new Date("2015-12-12T07:34:14.202Z"),
* modifiedAt: new Date("2015-12-13T08:36:13.202Z"),
* accessedAt: new Date("2015-12-14T08:36:13.202Z"),
* itemCount: 14,
* }
* ```
*/
getInfo(): Promise<DatasetInfo | undefined>;
/**
* Iterates over dataset items, yielding each in turn to an `iteratee` function.
* Each invocation of `iteratee` is called with two arguments: `(item, index)`.
*
* If the `iteratee` function returns a Promise then it is awaited before the next call.
* If it throws an error, the iteration is aborted and the `forEach` function throws the error.
*
* **Example usage**
* ```javascript
* const dataset = await Dataset.open('my-results');
* await dataset.forEach(async (item, index) => {
* console.log(`Item at ${index}: ${JSON.stringify(item)}`);
* });
* ```
*
* @param iteratee A function that is called for every item in the dataset.
* @param [options] All `forEach()` parameters.
* @param [index] Specifies the initial index number passed to the `iteratee` function.
* @default 0
*/
forEach(iteratee: DatasetConsumer<Data>, options?: DatasetIteratorOptions, index?: number): Promise<void>;
/**
* Produces a new array of values by mapping each value in list through a transformation function `iteratee()`.
* Each invocation of `iteratee()` is called with two arguments: `(element, index)`.
*
* If `iteratee` returns a `Promise` then it's awaited before a next call.
*
* @param iteratee
* @param [options] All `map()` parameters.
*/
map<R>(iteratee: DatasetMapper<Data, R>, options?: DatasetIteratorOptions): Promise<R[]>;
/**
* Reduces a list of values down to a single value.
*
* The first element of the dataset is the initial value, with each successive reductions should
* be returned by `iteratee()`. The `iteratee()` is passed three arguments: the `memo`, `value`
* and `index` of the current element being folded into the reduction.
*
* The `iteratee` is first invoked on the second element of the list (`index = 1`), with the
* first element given as the memo parameter. After that, the rest of the elements in the
* dataset is passed to `iteratee`, with the result of the previous invocation as the memo.
*
* If `iteratee()` returns a `Promise` it's awaited before a next call.
*
* If the dataset is empty, reduce will return undefined.
*
* @param iteratee
*/
reduce(iteratee: DatasetReducer<Data, Data>): Promise<Data | undefined>;
/**
* Reduces a list of values down to a single value.
*
* The first element of the dataset is the initial value, with each successive reductions should
* be returned by `iteratee()`. The `iteratee()` is passed three arguments: the `memo`, `value`
* and `index` of the current element being folded into the reduction.
*
* The `iteratee` is first invoked on the second element of the list (`index = 1`), with the
* first element given as the memo parameter. After that, the rest of the elements in the
* dataset is passed to `iteratee`, with the result of the previous invocation as the memo.
*
* If `iteratee()` returns a `Promise` it's awaited before a next call.
*
* If the dataset is empty, reduce will return undefined.
*
* @param iteratee
* @param memo Unset parameter, neccesary to be able to pass options
* @param [options] An object containing extra options for `reduce()`
*/
reduce(iteratee: DatasetReducer<Data, Data>, memo: undefined, options: DatasetIteratorOptions): Promise<Data | undefined>;
/**
* Reduces a list of values down to a single value.
*
* Memo is the initial state of the reduction, and each successive step of it should be returned
* by `iteratee()`. The `iteratee()` is passed three arguments: the `memo`, then the `value` and
* `index` of the iteration.
*
* If `iteratee()` returns a `Promise` then it's awaited before a next call.
*
* @param iteratee
* @param memo Initial state of the reduction.
* @param [options] An object containing extra options for `reduce()`
*/
reduce<T>(iteratee: DatasetReducer<T, Data>, memo: T, options?: DatasetIteratorOptions): Promise<T>;
/**
* Iterates over dataset items using an async generator,
* allowing the use of `for await...of` syntax.
*
* **Example usage:**
* ```javascript
* const dataset = await Dataset.open('my-results');
* for await (const item of dataset.values()) {
* console.log(item);
* }
* ```
*
* @param options Options for the iteration.
*/
values(options?: DatasetIteratorOptions): AsyncIterable<Data> & Promise<PaginatedList<Data>>;
/**
* Iterates over dataset entries (index-value pairs) using an async generator,
* allowing the use of `for await...of` syntax.
*
* **Example usage:**
* ```javascript
* const dataset = await Dataset.open('my-results');
* for await (const [index, item] of dataset.entries()) {
* console.log(`Item at ${index}: ${JSON.stringify(item)}`);
* }
* ```
*
* @param options Options for the iteration.
*/
entries(options?: DatasetIteratorOptions): AsyncIterable<[number, Data]> & Promise<PaginatedList<[number, Data]>>;
/**
* Default async iterator for the dataset, iterating over items.
* Allows using the dataset directly in a `for await...of` loop.
*
* **Example usage:**
* ```javascript
* const dataset = await Dataset.open('my-results');
* for await (const item of dataset) {
* console.log(item);
* }
* ```
*/
[Symbol.asyncIterator](): AsyncGenerator<Data, void, undefined>;
/**
* Removes the dataset either from the Apify cloud storage or from the local directory,
* depending on the mode of operation.
*/
drop(): Promise<void>;
/**
* Opens a dataset and returns a promise resolving to an instance of the {@link Dataset} class.
*
* Datasets are used to store structured data where each object stored has the same attributes,
* such as online store products or real estate offers.
* The actual data is stored either on the local filesystem or in the cloud.
*
* For more details and code examples, see the {@link Dataset} class.
*
* @param [datasetIdOrName]
* ID or name of the dataset to be opened. If `null` or `undefined`,
* the function returns the default dataset associated with the crawler run.
* @param [options] Storage manager options.
*/
static open<Data extends Dictionary = Dictionary>(datasetIdOrName?: string | null, options?: StorageManagerOptions): Promise<Dataset<Data>>;
/**
* Stores an object or an array of objects to the default {@link Dataset} of the current crawler run.
*
* This is just a convenient shortcut for {@link Dataset.pushData}.
* For example, calling the following code:
* ```javascript
* await Dataset.pushData({ myValue: 123 });
* ```
*
* is equivalent to:
* ```javascript
* const dataset = await Dataset.open();
* await dataset.pushData({ myValue: 123 });
* ```
*
* For more information, see {@link Dataset.open} and {@link Dataset.pushData}
*
* **IMPORTANT**: Make sure to use the `await` keyword when calling `pushData()`,
* otherwise the crawler process might finish before the data are stored!
*
* @param item Object or array of objects containing data to be stored in the default dataset.
* The objects must be serializable to JSON and the JSON representation of each object must be smaller than 9MB.
* @ignore
*/
static pushData<Data extends Dictionary = Dictionary>(item: Data | Data[]): Promise<void>;
/**
* Returns {@link DatasetContent} object holding the items in the dataset based on the provided parameters.
*/
static getData<Data extends Dictionary = Dictionary>(options?: DatasetDataOptions): Promise<DatasetContent<Data>>;
}
/**
* User-function used in the `Dataset.forEach()` API.
*/
export interface DatasetConsumer<Data> {
/**
* @param item Current {@link Dataset} entry being processed.
* @param index Position of current {@link Dataset} entry.
*/
(item: Data, index: number): Awaitable<void>;
}
/**
* User-function used in the `Dataset.map()` API.
*/
export interface DatasetMapper<Data, R> {
/**
* User-function used in the `Dataset.map()` API.
* @param item Current {@link Dataset} entry being processed.
* @param index Position of current {@link Dataset} entry.
*/
(item: Data, index: number): Awaitable<R>;
}
/**
* User-function used in the `Dataset.reduce()` API.
*/
export interface DatasetReducer<T, Data> {
/**
* @param memo Previous state of the reduction.
* @param item Current {@link Dataset} entry being processed.
* @param index Position of current {@link Dataset} entry.
*/
(memo: T, item: Data, index: number): Awaitable<T>;
}
export interface DatasetOptions {
id: string;
name?: string;
client: StorageClient;
storageObject?: Record<string, unknown>;
}
export interface DatasetContent<Data> {
/** Total count of entries in the dataset. */
total: number;
/** Count of dataset entries returned in this set. */
count: number;
/** Position of the first returned entry in the dataset. */
offset: number;
/** Maximum number of dataset entries requested. */
limit: number;
/** Dataset entries based on chosen format parameter. */
items: Data[];
/** Should the results be in descending order. */
desc?: boolean;
}
//# sourceMappingURL=dataset.d.ts.map