// @crawlee/core — request_queue.d.ts (version unspecified)
// The scalable web crawling and scraping library for JavaScript/Node.js.
// Enables development of data extraction and web automation jobs (not only)
// with headless Chrome and Puppeteer.
// 128 lines • 6.64 kB • TypeScript
import type { Dictionary } from '@crawlee/types';
import { Configuration } from '../configuration';
import type { Request } from '../request';
import type { RequestProviderOptions, RequestQueueOperationInfo } from './request_provider';
import { RequestProvider } from './request_provider';
/**
 * Represents a queue of URLs to crawl, which is used for deep crawling of websites
 * where you start with several URLs and then recursively
 * follow links to other pages. The data structure supports both breadth-first and depth-first crawling orders.
 *
 * Each URL is represented using an instance of the {@link Request} class.
 * The queue can only contain unique URLs. More precisely, it can only contain {@link Request} instances
 * with distinct `uniqueKey` properties. By default, `uniqueKey` is generated from the URL, but it can also be overridden.
 * To add a single URL multiple times to the queue,
 * corresponding {@link Request} objects will need to have different `uniqueKey` properties.
 *
 * Do not instantiate this class directly, use the {@link RequestQueue.open} function instead.
 *
 * `RequestQueue` is used by {@link BasicCrawler}, {@link CheerioCrawler}, {@link PuppeteerCrawler}
 * and {@link PlaywrightCrawler} as a source of URLs to crawl.
 * Unlike {@link RequestList}, `RequestQueue` supports dynamic adding and removing of requests.
 * On the other hand, the queue is not optimized for operations that add or remove a large number of URLs in a batch.
 *
 * `RequestQueue` stores its data either on local disk or in the Apify Cloud,
 * depending on whether the `APIFY_LOCAL_STORAGE_DIR` or `APIFY_TOKEN` environment variable is set.
 *
 * If the `APIFY_LOCAL_STORAGE_DIR` environment variable is set, the queue data is stored in
 * that directory in an SQLite database file.
 *
 * If the `APIFY_TOKEN` environment variable is set but `APIFY_LOCAL_STORAGE_DIR` is not, the data is stored in the
 * [Apify Request Queue](https://docs.apify.com/storage/request-queue)
 * cloud storage. Note that you can force usage of the cloud storage also by passing the `forceCloud`
 * option to {@link RequestQueue.open} function,
 * even if the `APIFY_LOCAL_STORAGE_DIR` variable is set.
 *
 * **Example usage:**
 *
 * ```javascript
 * // Open the default request queue associated with the crawler run
 * const queue = await RequestQueue.open();
 *
 * // Open a named request queue
 * const queueWithName = await RequestQueue.open('some-name');
 *
 * // Enqueue few requests
 * await queue.addRequest({ url: 'http://example.com/aaa' });
 * await queue.addRequest({ url: 'http://example.com/bbb' });
 * await queue.addRequest({ url: 'http://example.com/foo/bar' }, { forefront: true });
 * ```
 * @category Sources
 *
 * @deprecated RequestQueue v1 is deprecated and will be removed in the future.
 * This class is exported as `RequestQueueV1`; please use the current `RequestQueue`
 * implementation (v2) exported by this package instead.
 */
declare class RequestQueue extends RequestProvider {
// Pending promise of an in-flight queue-head query — presumably used to deduplicate
// concurrent head fetches; NOTE(review): confirm against the implementation.
private queryQueueHeadPromise?;
// Tracking of requests currently being processed (fetched but not yet handled or
// reclaimed); its size is reported by `inProgressCount()`.
private inProgress;
/**
 * Do not call directly — use {@link RequestQueue.open} instead.
 *
 * @param options Request provider options (id, name, client, …).
 * @param config Optional configuration; defaults to the global {@link Configuration} when omitted.
 * @internal
 */
constructor(options: RequestProviderOptions, config?: Configuration);
/**
 * Returns the number of requests that are currently being processed,
 * i.e. fetched via {@link RequestQueue.fetchNextRequest} but not yet
 * marked handled or reclaimed.
 * @internal
 */
inProgressCount(): number;
/**
 * Returns a next request in the queue to be processed, or `null` if there are no more pending requests.
 *
 * Once you successfully finish processing of the request, you need to call
 * {@link RequestQueue.markRequestHandled}
 * to mark the request as handled in the queue. If there was some error in processing the request,
 * call {@link RequestQueue.reclaimRequest} instead,
 * so that the queue will give the request to some other consumer in another call to the `fetchNextRequest` function.
 *
 * Note that the `null` return value doesn't mean the queue processing finished,
 * it means there are currently no pending requests.
 * To check whether all requests in queue were finished,
 * use {@link RequestQueue.isFinished} instead.
 *
 * @returns
 * Returns the request object or `null` if there are no more pending requests.
 */
fetchNextRequest<T extends Dictionary = Dictionary>(): Promise<Request<T> | null>;
/**
 * Ensures the cached queue head is populated before a fetch.
 * Thin wrapper around {@link RequestQueue._ensureHeadIsNonEmpty} that discards
 * the consistency result.
 */
protected ensureHeadIsNonEmpty(): Promise<void>;
/**
 * We always request more items than is in progress to ensure that something falls into head.
 *
 * @param [ensureConsistency=false] If true then query for queue head is retried until queueModifiedAt
 * is older than queryStartedAt by at least API_PROCESSED_REQUESTS_DELAY_MILLIS to ensure that queue
 * head is consistent.
 * @param [limit] How many queue head items will be fetched.
 * @param [iteration] Used when this function is called recursively to limit the recursion.
 * @returns Indicates if queue head is consistent (true) or inconsistent (false).
 */
protected _ensureHeadIsNonEmpty(ensureConsistency?: boolean, limit?: number, iteration?: number): Promise<boolean>;
/**
 * Resolves to `true` if all requests in the queue were already handled —
 * unlike a `null` from {@link RequestQueue.fetchNextRequest}, which only
 * means no request is pending right now.
 */
isFinished(): Promise<boolean>;
/**
 * Reclaims a failed request back to the queue, so that it can be returned for processing later again
 * by another call to {@link RequestQueue.fetchNextRequest}.
 * The request record in the queue is updated using the provided `request` parameter.
 * For example, this lets you store the number of retries or error messages for the request.
 *
 * @returns Operation info, or `null` when the request could not be reclaimed.
 */
reclaimRequest(...args: Parameters<RequestProvider['reclaimRequest']>): Promise<RequestQueueOperationInfo | null>;
/**
 * @inheritdoc
 */
markRequestHandled(request: Request): Promise<RequestQueueOperationInfo | null>;
// Resets the queue's internal in-memory state (caches, in-progress tracking).
protected _reset(): void;
/**
 * Opens a request queue and returns a promise resolving to an instance
 * of the {@link RequestQueue} class.
 *
 * {@link RequestQueue} represents a queue of URLs to crawl, which is stored either on local filesystem or in the cloud.
 * The queue is used for deep crawling of websites, where you start with several URLs and then
 * recursively follow links to other pages. The data structure supports both breadth-first
 * and depth-first crawling orders.
 *
 * For more details and code examples, see the {@link RequestQueue} class.
 *
 * @param [queueIdOrName]
 * ID or name of the request queue to be opened. If `null` or `undefined`,
 * the function returns the default request queue associated with the crawler run.
 * @param [options] Open Request Queue options.
 */
static open(...args: Parameters<typeof RequestProvider.open>): Promise<RequestQueue>;
}
export { RequestQueue as RequestQueueV1 };
//# sourceMappingURL=request_queue.d.ts.map