@crawlee/core
Version: (not captured in this extract)
The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.
315 lines • 14 kB
TypeScript
import type { BatchAddRequestsResult, Dictionary, ProcessedRequest, QueueOperationInfo, RequestQueueClient, RequestQueueInfo, StorageClient } from '@crawlee/types';
import { ListDictionary, LruCache } from '@apify/datastructures';
import type { Log } from '@apify/log';
import { Configuration } from '../configuration';
import type { ProxyConfiguration } from '../proxy_configuration';
import type { InternalSource, RequestOptions, Source } from '../request';
import { Request } from '../request';
import type { IStorage, StorageManagerOptions } from './storage_manager';
/**
 * Any collection of requests accepted by the `addRequests*` methods: a sync or async
 * iterable (or plain array) of {@link Source} objects or URL strings.
 */
export type RequestsLike = AsyncIterable<Source | string> | Iterable<Source | string> | (Source | string)[];
export declare abstract class RequestProvider implements IStorage {
    /** Configuration instance this provider was created with. */
    readonly config: Configuration;
    /** ID of the underlying request queue. */
    id: string;
    /** Name of the request queue, if it was opened under a name. */
    name?: string;
    /** Timeout (in seconds) for queue client operations — NOTE(review): exact scope not visible in this declaration file; confirm in the implementation. */
    timeoutSecs: number;
    /** Key identifying this client instance to the storage backend — presumably used for request locking; verify against the implementation. */
    clientKey: string;
    /** Storage client implementation used to communicate with the underlying request queue. */
    client: RequestQueueClient;
    /** Proxy configuration applied when fetching `requestsFromUrl` remote resources (see {@link RequestProviderOptions.proxyConfiguration}). */
    protected proxyConfiguration?: ProxyConfiguration;
    /** Logger used by this provider. */
    log: Log;
    /** Internal operation timeout, in milliseconds — NOTE(review): precise usage not visible in this declaration file. */
    internalTimeoutMillis: number;
    /** Duration (in seconds) for which a fetched request stays locked to this client — TODO confirm in the implementation. */
    requestLockSecs: number;
    /** Offline approximation of the total number of requests in the queue (see {@link getTotalCount}). */
    assumedTotalCount: number;
    /** Offline approximation of the number of handled requests (see {@link getPendingCount}). */
    assumedHandledCount: number;
    /* Initial counters — presumably snapshots taken when the queue was opened; confirm in the implementation. */
    private initialCount;
    private initialHandledCount;
    /** In-memory cache of request IDs at the head of the queue (see {@link _maybeAddRequestToQueueHead}). */
    protected queueHeadIds: ListDictionary<string>;
    /** LRU cache of request metadata, used to avoid unneeded `addRequest()` calls (see {@link _cacheRequest}). */
    protected requestCache: LruCache<RequestLruItem>;
    /** LRU cache of recently handled requests — presumably keyed by request ID to prevent re-processing; confirm in the implementation. */
    protected recentlyHandledRequestsCache: LruCache<boolean>;
    /** Set while the queue is paused for a platform migration — NOTE(review): exact trigger not visible here. */
    protected queuePausedForMigration: boolean;
    /** Timestamp of the most recent queue activity — NOTE(review): exact update points not visible in this declaration file. */
    protected lastActivity: Date;
    /** Counter of `isFinished()` calls made while the queue head was not empty — TODO confirm semantics in the implementation. */
    protected isFinishedCalledWhileHeadWasNotEmpty: number;
    /** Number of request batches currently being added in the background (see {@link addRequestsBatched}). */
    protected inProgressRequestBatchCount: number;
    constructor(options: InternalRequestProviderOptions, config?: Configuration);
    /**
     * Returns an offline approximation of the total number of requests in the queue (i.e. pending + handled).
     *
     * Survives restarts and Actor migrations.
     */
    getTotalCount(): number;
    /**
     * Returns an offline approximation of the total number of pending requests in the queue.
     *
     * Survives restarts and Actor migrations.
     */
    getPendingCount(): number;
    /**
     * Adds a request to the queue.
     *
     * If a request with the same `uniqueKey` property is already present in the queue,
     * it will not be updated. You can find out whether this happened from the resulting
     * {@link QueueOperationInfo} object.
     *
     * To add multiple requests to the queue by extracting links from a webpage,
     * see the {@link enqueueLinks} helper function.
     *
     * @param requestLike {@link Request} object or vanilla object with request data.
     * Note that the function sets the `uniqueKey` and `id` fields to the passed Request.
     * @param [options] Request queue operation options.
     */
    addRequest(requestLike: Source, options?: RequestQueueOperationOptions): Promise<RequestQueueOperationInfo>;
    /**
     * Adds requests to the queue in batches of 25. This method will wait till all the requests are added
     * to the queue before resolving. You should prefer using `queue.addRequestsBatched()` or `crawler.addRequests()`
     * if you don't want to block the processing, as those methods will only wait for the initial 1000 requests,
     * start processing right after that happens, and continue adding more in the background.
     *
     * If a request passed in is already present due to its `uniqueKey` property being the same,
     * it will not be updated. You can find out whether this happened by finding the request in the resulting
     * {@link BatchAddRequestsResult} object.
     *
     * @param requestsLike {@link Request} objects or vanilla objects with request data.
     * Note that the function sets the `uniqueKey` and `id` fields to the passed requests if missing.
     * @param [options] Request queue operation options.
     */
    addRequests(requestsLike: RequestsLike, options?: RequestQueueOperationOptions): Promise<BatchAddRequestsResult>;
    /**
     * Adds requests to the queue in batches. By default, it will resolve after the initial batch is added, and continue
     * adding the rest in the background. You can configure the batch size via `batchSize` option and the sleep time in between
     * the batches via `waitBetweenBatchesMillis`. If you want to wait for all batches to be added to the queue, you can use
     * the `waitForAllRequestsToBeAdded` promise you get in the response object.
     *
     * @param requests The requests to add
     * @param options Options for the request queue
     */
    addRequestsBatched(requests: RequestsLike, options?: AddRequestsBatchedOptions): Promise<AddRequestsBatchedResult>;
    /**
     * Gets the request from the queue specified by ID.
     *
     * @param id ID of the request.
     * @returns Returns the request object, or `null` if it was not found.
     */
    getRequest<T extends Dictionary = Dictionary>(id: string): Promise<Request<T> | null>;
    /**
     * Returns a next request in the queue to be processed, or `null` if there are no more pending requests.
     *
     * Once you successfully finish processing of the request, you need to call
     * {@link RequestQueue.markRequestHandled}
     * to mark the request as handled in the queue. If there was some error in processing the request,
     * call {@link RequestQueue.reclaimRequest} instead,
     * so that the queue will give the request to some other consumer in another call to the `fetchNextRequest` function.
     *
     * Note that the `null` return value doesn't mean the queue processing finished,
     * it means there are currently no pending requests.
     * To check whether all requests in queue were finished,
     * use {@link RequestQueue.isFinished} instead.
     *
     * @returns
     * Returns the request object or `null` if there are no more pending requests.
     */
    abstract fetchNextRequest<T extends Dictionary = Dictionary>(options?: RequestOptions): Promise<Request<T> | null>;
    /**
     * Marks a request that was previously returned by the
     * {@link RequestQueue.fetchNextRequest}
     * function as handled after successful processing.
     * Handled requests will never again be returned by the `fetchNextRequest` function.
     */
    markRequestHandled(request: Request): Promise<RequestQueueOperationInfo | null>;
    /**
     * Reclaims a failed request back to the queue, so that it can be returned for processing later again
     * by another call to {@link RequestQueue.fetchNextRequest}.
     * The request record in the queue is updated using the provided `request` parameter.
     * For example, this lets you store the number of retries or error messages for the request.
     */
    reclaimRequest(request: Request, options?: RequestQueueOperationOptions): Promise<RequestQueueOperationInfo | null>;
    /** Ensures the local queue head cache is populated — implementation-specific; presumably fetches head request IDs from the remote queue. */
    protected abstract ensureHeadIsNonEmpty(): Promise<void>;
    /**
     * Resolves to `true` if the next call to {@link RequestQueue.fetchNextRequest}
     * would return `null`, otherwise it resolves to `false`.
     * Note that even if the queue is empty, there might be some pending requests currently being processed.
     * If you need to ensure that there is no activity in the queue, use {@link RequestQueue.isFinished}.
     */
    isEmpty(): Promise<boolean>;
    /**
     * Resolves to `true` if all requests were already handled and there are no more left.
     * Due to the nature of distributed storage used by the queue,
     * the function may occasionally return a false negative,
     * but it shall never return a false positive.
     */
    abstract isFinished(): Promise<boolean>;
    /** Resets the provider's internal state — NOTE(review): the exact fields cleared are not visible in this declaration file. */
    protected _reset(): void;
    /**
     * Caches information about request to beware of unneeded addRequest() calls.
     */
    protected _cacheRequest(cacheKey: string, queueOperationInfo: RequestQueueOperationInfo): void;
    /**
     * Adds a request straight to the queueHeadDict, to improve performance.
     */
    protected _maybeAddRequestToQueueHead(requestId: string, forefront: boolean): void;
    /**
     * Removes the queue either from the Apify Cloud storage or from the local database,
     * depending on the mode of operation.
     */
    drop(): Promise<void>;
    /**
     * Returns the number of handled requests.
     *
     * This function is just a convenient shortcut for:
     *
     * ```javascript
     * const { handledRequestCount } = await queue.getInfo();
     * ```
     */
    handledCount(): Promise<number>;
    /**
     * Returns an object containing general information about the request queue.
     *
     * The function returns the same object as the Apify API Client's
     * [getQueue](https://docs.apify.com/api/apify-client-js/latest#ApifyClient-requestQueues)
     * function, which in turn calls the
     * [Get request queue](https://apify.com/docs/api/v2#/reference/request-queues/queue/get-request-queue)
     * API endpoint.
     *
     * **Example:**
     * ```
     * {
     *   id: "WkzbQMuFYuamGv3YF",
     *   name: "my-queue",
     *   userId: "wRsJZtadYvn4mBZmm",
     *   createdAt: new Date("2015-12-12T07:34:14.202Z"),
     *   modifiedAt: new Date("2015-12-13T08:36:13.202Z"),
     *   accessedAt: new Date("2015-12-14T08:36:13.202Z"),
     *   totalRequestCount: 25,
     *   handledRequestCount: 5,
     *   pendingRequestCount: 20,
     * }
     * ```
     */
    getInfo(): Promise<RequestQueueInfo | undefined>;
    /**
     * Fetches URLs from requestsFromUrl and returns them in format of list of requests
     */
    protected _fetchRequestsFromUrl(source: InternalSource): Promise<RequestOptions[]>;
    /**
     * Adds all fetched requests from a URL from a remote resource.
     */
    protected _addFetchedRequests(source: InternalSource, fetchedRequests: RequestOptions[], options: RequestQueueOperationOptions): Promise<ProcessedRequest[]>;
    /**
     * @internal wraps public utility for mocking purposes
     */
    private _downloadListOfUrls;
    /**
     * Opens a request queue and returns a promise resolving to an instance
     * of the {@link RequestQueue} class.
     *
     * {@link RequestQueue} represents a queue of URLs to crawl, which is stored either on local filesystem or in the cloud.
     * The queue is used for deep crawling of websites, where you start with several URLs and then
     * recursively follow links to other pages. The data structure supports both breadth-first
     * and depth-first crawling orders.
     *
     * For more details and code examples, see the {@link RequestQueue} class.
     *
     * @param [queueIdOrName]
     * ID or name of the request queue to be opened. If `null` or `undefined`,
     * the function returns the default request queue associated with the crawler run.
     * @param [options] Open Request Queue options.
     */
    static open(queueIdOrName?: string | null, options?: StorageManagerOptions): Promise<RequestProvider>;
}
/** Entry stored in {@link RequestProvider.requestCache}. Not exported from the module. */
interface RequestLruItem {
    /** The `uniqueKey` of the cached request. */
    uniqueKey: string;
    /** Whether the request has already been handled. */
    isHandled: boolean;
    /** ID of the request in the queue. */
    id: string;
    /** Fully hydrated {@link Request} instance, or `null` when only metadata is cached — TODO confirm in the implementation. */
    hydrated: Request | null;
    /** When the request lock expires, or `null` when not locked — NOTE(review): presumably an epoch timestamp in milliseconds; confirm. */
    lockExpiresAt: number | null;
    /** Whether the request was enqueued to the forefront of the queue. */
    forefront: boolean;
}
export interface RequestProviderOptions {
    /** ID of the request queue. */
    id: string;
    /** Name of the request queue. */
    name?: string;
    /** Storage client implementation backing the queue. */
    client: StorageClient;
    /**
     * Used to pass the proxy configuration for the `requestsFromUrl` objects.
     * Takes advantage of the internal address rotation and authentication process.
     * If undefined, the `requestsFromUrl` requests will be made without proxy.
     */
    proxyConfiguration?: ProxyConfiguration;
}
/**
 * @deprecated Use {@link RequestProviderOptions} instead. This empty subtype is kept
 * only for backwards compatibility with existing callers.
 */
export interface RequestQueueOptions extends RequestProviderOptions {
}
/**
 * Extra options that {@link RequestProvider} subclasses must supply on top of the
 * public {@link RequestProviderOptions}.
 * @internal
 */
export interface InternalRequestProviderOptions extends RequestProviderOptions {
    /** Prefix for the provider's log messages — NOTE(review): exact formatting not visible in this declaration file. */
    logPrefix: string;
    /** Maximum number of entries kept in {@link RequestProvider.requestCache}. */
    requestCacheMaxSize: number;
    /** Maximum number of entries kept in {@link RequestProvider.recentlyHandledRequestsCache}. */
    recentlyHandledRequestsMaxSize: number;
}
export interface RequestQueueOperationOptions {
    /**
     * If set to `true`:
     * - while adding the request to the queue: the request will be added to the foremost position in the queue.
     * - while reclaiming the request: the request will be placed to the beginning of the queue, so that it's returned
     *   in the next call to {@link RequestQueue.fetchNextRequest}.
     *
     * By default, it's put to the end of the queue.
     *
     * In case the request is already present in the queue, this option has no effect.
     *
     * If more requests are added with this option at once, their order in the following `fetchNextRequest` call
     * is arbitrary.
     * @default false
     */
    forefront?: boolean;
    /**
     * Should the requests be added to the local LRU cache?
     * @default false
     * @internal
     */
    cache?: boolean;
}
/**
 * Result of a single queue operation, extended with the fields the provider needs
 * for caching (see {@link RequestProvider._cacheRequest}).
 * @internal
 */
export interface RequestQueueOperationInfo extends QueueOperationInfo {
    /** The `uniqueKey` of the affected request. */
    uniqueKey: string;
    /** Whether the request was placed at the forefront of the queue. */
    forefront: boolean;
}
export interface AddRequestsBatchedOptions extends RequestQueueOperationOptions {
    /**
     * Whether to wait for all the provided requests to be added, instead of waiting just for the initial batch of up to `batchSize`.
     * @default false
     */
    waitForAllRequestsToBeAdded?: boolean;
    /**
     * Number of requests added to the queue in one batch (see {@link RequestProvider.addRequestsBatched}).
     * @default 1000
     */
    batchSize?: number;
    /**
     * How long to sleep, in milliseconds, between adding consecutive batches.
     * @default 1000
     */
    waitBetweenBatchesMillis?: number;
}
export interface AddRequestsBatchedResult {
    /**
     * Requests that have been added to the queue so far — presumably the initial batch(es)
     * processed before the `addRequestsBatched()` promise resolved; confirm against the implementation.
     */
    addedRequests: ProcessedRequest[];
    /**
     * A promise which will resolve with the rest of the requests that were added to the queue.
     *
     * Alternatively, we can set {@link AddRequestsBatchedOptions.waitForAllRequestsToBeAdded|`waitForAllRequestsToBeAdded`} to `true`
     * in the {@link BasicCrawler.addRequests|`crawler.addRequests()`} options.
     *
     * **Example:**
     *
     * ```ts
     * // Assuming `requests` is a list of requests.
     * const result = await crawler.addRequests(requests);
     *
     * // If we want to wait for the rest of the requests to be added to the queue:
     * await result.waitForAllRequestsToBeAdded;
     * ```
     */
    waitForAllRequestsToBeAdded: Promise<ProcessedRequest[]>;
}
export {};
//# sourceMappingURL=request_provider.d.ts.map