UNPKG

@crawlee/core

Version:

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.

128 lines 6.64 kB
import type { Dictionary } from '@crawlee/types';
import { Configuration } from '../configuration';
import type { Request } from '../request';
import type { RequestProviderOptions, RequestQueueOperationInfo } from './request_provider';
import { RequestProvider } from './request_provider';
/**
 * Represents a queue of URLs to crawl, which is used for deep crawling of websites
 * where you start with several URLs and then recursively
 * follow links to other pages. The data structure supports both breadth-first and depth-first crawling orders.
 *
 * Each URL is represented using an instance of the {@link Request} class.
 * The queue can only contain unique URLs. More precisely, it can only contain {@link Request} instances
 * with distinct `uniqueKey` properties. By default, `uniqueKey` is generated from the URL, but it can also be overridden.
 * To add a single URL multiple times to the queue,
 * corresponding {@link Request} objects will need to have different `uniqueKey` properties.
 *
 * Do not instantiate this class directly, use the {@link RequestQueue.open} function instead.
 *
 * `RequestQueue` is used by {@link BasicCrawler}, {@link CheerioCrawler}, {@link PuppeteerCrawler}
 * and {@link PlaywrightCrawler} as a source of URLs to crawl.
 * Unlike {@link RequestList}, `RequestQueue` supports dynamic adding and removing of requests.
 * On the other hand, the queue is not optimized for operations that add or remove a large number of URLs in a batch.
 *
 * `RequestQueue` stores its data either on local disk or in the Apify Cloud,
 * depending on whether the `APIFY_LOCAL_STORAGE_DIR` or `APIFY_TOKEN` environment variable is set.
 *
 * If the `APIFY_LOCAL_STORAGE_DIR` environment variable is set, the queue data is stored in
 * that directory in an SQLite database file.
 *
 * If the `APIFY_TOKEN` environment variable is set but `APIFY_LOCAL_STORAGE_DIR` is not, the data is stored in the
 * [Apify Request Queue](https://docs.apify.com/storage/request-queue)
 * cloud storage. Note that you can force usage of the cloud storage also by passing the `forceCloud`
 * option to {@link RequestQueue.open} function,
 * even if the `APIFY_LOCAL_STORAGE_DIR` variable is set.
 *
 * **Example usage:**
 *
 * ```javascript
 * // Open the default request queue associated with the crawler run
 * const queue = await RequestQueue.open();
 *
 * // Open a named request queue
 * const queueWithName = await RequestQueue.open('some-name');
 *
 * // Enqueue few requests
 * await queue.addRequest({ url: 'http://example.com/aaa' });
 * await queue.addRequest({ url: 'http://example.com/bbb' });
 * await queue.addRequest({ url: 'http://example.com/foo/bar' }, { forefront: true });
 * ```
 * @category Sources
 *
 * @deprecated RequestQueue v1 is deprecated and will be removed in the future. Please use {@link RequestQueue} instead.
 */
declare class RequestQueue extends RequestProvider {
    // Pending promise of an in-flight queue-head query, when one is running.
    // NOTE(review): the type is erased in this declaration file — confirm against the implementation.
    private queryQueueHeadPromise?;
    // Tracking structure for requests that were fetched but not yet handled/reclaimed
    // (its size is exposed via `inProgressCount()` below).
    // NOTE(review): the type is erased in this declaration file — confirm against the implementation.
    private inProgress;
    /**
     * Do not call directly — use {@link RequestQueue.open} instead.
     * @internal
     */
    constructor(options: RequestProviderOptions, config?: Configuration);
    /**
     * Returns the number of requests currently in progress.
     * @internal
     */
    inProgressCount(): number;
    /**
     * Returns a next request in the queue to be processed, or `null` if there are no more pending requests.
     *
     * Once you successfully finish processing of the request, you need to call
     * {@link RequestQueue.markRequestHandled}
     * to mark the request as handled in the queue. If there was some error in processing the request,
     * call {@link RequestQueue.reclaimRequest} instead,
     * so that the queue will give the request to some other consumer in another call to the `fetchNextRequest` function.
     *
     * Note that the `null` return value doesn't mean the queue processing finished,
     * it means there are currently no pending requests.
     * To check whether all requests in queue were finished,
     * use {@link RequestQueue.isFinished} instead.
     *
     * @returns
     *   Returns the request object or `null` if there are no more pending requests.
     */
    fetchNextRequest<T extends Dictionary = Dictionary>(): Promise<Request<T> | null>;
    // Ensures items are available in the queue head before fetching.
    // NOTE(review): presumably delegates to `_ensureHeadIsNonEmpty` below — confirm in the implementation.
    protected ensureHeadIsNonEmpty(): Promise<void>;
    /**
     * We always request more items than is in progress to ensure that something falls into head.
     *
     * @param [ensureConsistency] If true then query for queue head is retried until queueModifiedAt
     *   is older than queryStartedAt by at least API_PROCESSED_REQUESTS_DELAY_MILLIS to ensure that queue
     *   head is consistent. Defaults to `false`.
     * @param [limit] How many queue head items will be fetched.
     * @param [iteration] Used when this function is called recursively to limit the recursion.
     * @returns Indicates if queue head is consistent (true) or inconsistent (false).
     */
    protected _ensureHeadIsNonEmpty(ensureConsistency?: boolean, limit?: number, iteration?: number): Promise<boolean>;
    /**
     * Resolves to `true` once all requests in the queue were processed
     * (see {@link RequestQueue.fetchNextRequest} for the distinction between
     * "no pending requests right now" and "finished").
     */
    isFinished(): Promise<boolean>;
    /**
     * Reclaims a failed request back to the queue, so that it can be returned for processing later again
     * by another call to {@link RequestQueue.fetchNextRequest}.
     * The request record in the queue is updated using the provided `request` parameter.
     * For example, this lets you store the number of retries or error messages for the request.
     */
    reclaimRequest(...args: Parameters<RequestProvider['reclaimRequest']>): Promise<RequestQueueOperationInfo | null>;
    /**
     * @inheritdoc
     */
    markRequestHandled(request: Request): Promise<RequestQueueOperationInfo | null>;
    // Resets internal state of the queue instance.
    // NOTE(review): exact scope of the reset (e.g. in-progress tracking, head cache) is defined
    // in the implementation, not visible in this declaration file.
    protected _reset(): void;
    /**
     * Opens a request queue and returns a promise resolving to an instance
     * of the {@link RequestQueue} class.
     *
     * {@link RequestQueue} represents a queue of URLs to crawl, which is stored either on local filesystem or in the cloud.
     * The queue is used for deep crawling of websites, where you start with several URLs and then
     * recursively follow links to other pages. The data structure supports both breadth-first
     * and depth-first crawling orders.
     *
     * For more details and code examples, see the {@link RequestQueue} class.
     *
     * @param [queueIdOrName]
     *   ID or name of the request queue to be opened. If `null` or `undefined`,
     *   the function returns the default request queue associated with the crawler run.
     * @param [options] Open Request Queue options.
     */
    static open(...args: Parameters<typeof RequestProvider.open>): Promise<RequestQueue>;
}
export { RequestQueue as RequestQueueV1 };
//# sourceMappingURL=request_queue.d.ts.map