@crawlee/core

Version:

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs with (but not limited to) headless Chrome and Puppeteer.

crawlee.dev

apify/crawlee

87 lines • 3.82 kB

TypeScript

View Raw

import type { BatchAddRequestsResult, Dictionary } from '@crawlee/types';
import { Configuration } from '../configuration';
import type { Request, Source } from '../request';
import type { RequestProviderOptions, RequestQueueOperationInfo, RequestQueueOperationOptions, RequestsLike } from './request_provider';
import { RequestProvider } from './request_provider';
/**
 * Represents a queue of URLs to crawl, which is used for deep crawling of websites
 * where you start with several URLs and then recursively
 * follow links to other pages. The data structure supports both breadth-first and depth-first crawling orders.
 *
 * Each URL is represented using an instance of the {@link Request} class.
 * The queue can only contain unique URLs. More precisely, it can only contain {@link Request} instances
 * with distinct `uniqueKey` properties. By default, `uniqueKey` is generated from the URL, but it can also be overridden.
 * To add a single URL multiple times to the queue,
 * the corresponding {@link Request} objects will need to have different `uniqueKey` properties.
 *
 * Do not instantiate this class directly; use the {@link RequestQueue.open} function instead.
 *
 * `RequestQueue` is used by {@link BasicCrawler}, {@link CheerioCrawler}, {@link PuppeteerCrawler}
 * and {@link PlaywrightCrawler} as a source of URLs to crawl.
 * Unlike {@link RequestList}, `RequestQueue` supports dynamic adding and removing of requests.
 * On the other hand, the queue is not optimized for operations that add or remove a large number of URLs in a batch.
 *
 * **Example usage:**
 *
 * ```javascript
 * // Open the default request queue associated with the crawler run
 * const queue = await RequestQueue.open();
 *
 * // Open a named request queue
 * const queueWithName = await RequestQueue.open('some-name');
 *
 * // Enqueue a few requests
 * await queue.addRequest({ url: 'http://example.com/aaa' });
 * await queue.addRequest({ url: 'http://example.com/bbb' });
 * await queue.addRequest({ url: 'http://example.com/foo/bar' }, { forefront: true });
 * ```
 * @category Sources
 */
export declare class RequestQueue extends RequestProvider {
    // Private members below are declared without types: this is a declaration file,
    // so only their names (not their shapes or implementations) are visible here.
    // NOTE(review): names suggest request locking / queue-head bookkeeping — confirm
    // against the implementation source before relying on that.
    private listHeadAndLockPromise;
    private queueHasLockedRequests;
    private shouldCheckForForefrontRequests;
    private dequeuedRequestCount;
    /**
     * @param options Provider options forwarded from {@link RequestProviderOptions}.
     * @param config Optional {@link Configuration} instance.
     */
    constructor(options: RequestProviderOptions, config?: Configuration);
    /**
     * Caches information about a request to avoid unneeded addRequest() calls.
     */
    protected _cacheRequest(cacheKey: string, queueOperationInfo: RequestQueueOperationInfo): void;
    /**
     * @inheritDoc
     */
    addRequest(requestLike: Source, options?: RequestQueueOperationOptions): Promise<RequestQueueOperationInfo>;
    /**
     * @inheritDoc
     */
    addRequests(requestsLike: RequestsLike, options?: RequestQueueOperationOptions): Promise<BatchAddRequestsResult>;
    /**
     * @inheritDoc
     */
    fetchNextRequest<T extends Dictionary = Dictionary>(): Promise<Request<T> | null>;
    /**
     * @inheritDoc
     */
    markRequestHandled(request: Request): Promise<RequestQueueOperationInfo | null>;
    /**
     * @inheritDoc
     */
    isFinished(): Promise<boolean>;
    /**
     * @inheritDoc
     */
    reclaimRequest(...args: Parameters<RequestProvider['reclaimRequest']>): ReturnType<RequestProvider['reclaimRequest']>;
    // NOTE(review): presumably makes sure the queue head holds requests before one is
    // fetched; the implementation is not visible in this declaration — confirm.
    protected ensureHeadIsNonEmpty(): Promise<void>;
    // Private helpers; signatures are stripped in the declaration output.
    private giveUpLock;
    private _listHeadAndLock;
    private getOrHydrateRequest;
    private _prolongRequestLock;
    // NOTE(review): the three protected hooks below are undocumented here; their
    // behavior must be checked in the implementation source (not part of this file).
    protected _reset(): void;
    protected _maybeAddRequestToQueueHead(): void;
    protected _clearPossibleLocks(): Promise<void>;
    /**
     * @inheritDoc
     */
    static open(...args: Parameters<typeof RequestProvider.open>): Promise<RequestQueue>;
}
//# sourceMappingURL=request_queue_v2.d.ts.map