@crawlee/core
Version:
The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.
506 lines • 23.4 kB
TypeScript
import type { Dictionary } from '@crawlee/types';
import { Configuration } from '../configuration';
import type { ProxyConfiguration } from '../proxy_configuration';
import { type InternalSource, Request, type RequestOptions, type Source } from '../request';
/**
* Key name (`"REQUEST_LIST_STATE"`) associated with the persisted `RequestList` state.
* The `RequestList.open()` documentation describes state keys of the form `NAME-REQUEST_LIST_STATE`,
* i.e. the list name is used as a prefix of this constant.
* @internal
*/
export declare const STATE_PERSISTENCE_KEY = "REQUEST_LIST_STATE";
/**
* Key name (`"REQUEST_LIST_REQUESTS"`) associated with the persisted `RequestList` requests.
* NOTE(review): presumably combined with the list name as a prefix in the same way as the
* state key (producing `NAME-REQUEST_LIST_REQUESTS`) - confirm against the implementation.
* @internal
*/
export declare const REQUESTS_PERSISTENCE_KEY = "REQUEST_LIST_REQUESTS";
/**
* Represents a static list of URLs to crawl.
*/
export interface IRequestList {
/**
* Returns the total number of unique requests present in the list.
*/
length(): number;
/**
* Resolves to `true` if all requests were already handled and there are no more left.
*/
isFinished(): Promise<boolean>;
/**
* Resolves to `true` if the next call to {@link IRequestList.fetchNextRequest} function
* would return `null`, otherwise it resolves to `false`.
* Note that even if the list is empty, there might be some pending requests currently being processed.
*/
isEmpty(): Promise<boolean>;
/**
* Returns the number of handled requests.
*/
handledCount(): number;
/**
* Persists the current state of the `IRequestList` into the default {@link KeyValueStore}.
* The state is persisted automatically in regular intervals, but calling this method manually
* is useful in cases where you want to have the most current state available after you pause
* or stop fetching its requests. For example after you pause or abort a crawl. Or just before
* a server migration.
*/
persistState(): Promise<void>;
/**
* Gets the next {@link Request} to process. First, the function gets a request previously reclaimed
* using the {@link IRequestList.reclaimRequest} function, if there is any.
* Otherwise it gets the next request from sources.
*
* The function's `Promise` resolves to `null` if there are no more
* requests to process.
*/
fetchNextRequest(): Promise<Request | null>;
/**
* Can be used to iterate over the `RequestList` instance in a `for await .. of` loop.
* Provides an alternative for the repeated use of `fetchNextRequest`.
*/
[Symbol.asyncIterator](): AsyncGenerator<Request>;
/**
* Reclaims request to the list if its processing failed.
* The request will become available in the next `this.fetchNextRequest()`.
*/
reclaimRequest(request: Request): Promise<void>;
/**
* Marks request as handled after successful processing.
*/
markRequestHandled(request: Request): Promise<void>;
/**
* Set of `uniqueKey`s of requests that were returned by `fetchNextRequest()`.
* @internal
*/
inProgress: Set<string>;
}
/**
* Configuration options accepted by {@link RequestList.open}.
*/
export interface RequestListOptions {
/**
* An array of sources of URLs for the {@link RequestList}. It can be either an array of strings,
* plain objects that define at least the `url` property, or an array of {@link Request} instances.
*
* **IMPORTANT:** The `sources` array will be consumed (left empty) after `RequestList` initializes.
* This is a measure to prevent memory leaks in situations when millions of sources are
* added.
*
* Additionally, the `requestsFromUrl` property may be used instead of `url`,
* which will instruct `RequestList` to download the source URLs from a given remote location.
* The URLs will be parsed from the received response.
*
* ```
* [
* // A single URL
* 'http://example.com/a/b',
*
* // Modify Request options
* { method: 'PUT', url: 'https://example.com/put', payload: { foo: 'bar' }},
*
* // Batch import of URLs from a file hosted on the web,
* // where the URLs should be requested using the HTTP POST request
* { method: 'POST', requestsFromUrl: 'http://example.com/urls.txt' },
*
* // Batch import from remote file, using a specific regular expression to extract the URLs.
* { requestsFromUrl: 'http://example.com/urls.txt', regex: /https:\/\/example.com\/.+/ },
*
* // Get list of URLs from a Google Sheets document. Just add "/gviz/tq?tqx=out:csv" to the Google Sheet URL.
* // For details, see https://help.apify.com/en/articles/2906022-scraping-a-list-of-urls-from-a-google-sheets-document
* { requestsFromUrl: 'https://docs.google.com/spreadsheets/d/1-2mUcRAiBbCTVA5KcpFdEYWflLMLp9DDU3iJutvES4w/gviz/tq?tqx=out:csv' }
* ]
* ```
*/
sources?: RequestListSource[];
/**
* A function that will be called to get the sources for the `RequestList`, but only if `RequestList`
* was not able to fetch their persisted version (see {@link RequestListOptions.persistRequestsKey}).
* It must return a `Promise` resolving to an array of sources in the same format as
* {@link RequestListOptions.sources}.
*
* This is very useful in a scenario when getting the sources is a resource intensive or time consuming
* task, such as fetching URLs from multiple sitemaps or parsing URLs from large datasets. Using the
* `sourcesFunction` in combination with `persistStateKey` and `persistRequestsKey` will allow you to
* fetch and parse those URLs only once, saving valuable time when your crawler migrates or restarts.
*
* If both {@link RequestListOptions.sources} and {@link RequestListOptions.sourcesFunction} are provided,
* the sources returned by the function will be added after the `sources`.
*
* **Example:**
* ```javascript
* // Let's say we want to scrape URLs extracted from sitemaps.
*
* const sourcesFunction = async () => {
* // With super large sitemaps, this operation could take very long
* // and big websites typically have multiple sitemaps.
* const sitemaps = await downloadHugeSitemaps();
* return parseUrlsFromSitemaps(sitemaps);
* };
*
* // Sitemaps can change in real-time, so it's important to persist
* // the URLs we collected. Otherwise we might lose our scraping
* // state in case of a crawler migration / failure / time-out.
* const requestList = await RequestList.open(null, [], {
* // The sourcesFunction is called now and the Requests are persisted.
* // If something goes wrong and we need to start again, RequestList
* // will load the persisted Requests from storage and will NOT
* // call the sourcesFunction again, saving time and resources.
* sourcesFunction,
* persistStateKey: 'state-key',
* persistRequestsKey: 'requests-key',
* })
* ```
*/
sourcesFunction?: RequestListSourcesFunction;
/**
* Used to pass the proxy configuration for the `requestsFromUrl` objects.
* Takes advantage of the internal address rotation and authentication process.
* If undefined, the `requestsFromUrl` requests will be made without proxy.
*/
proxyConfiguration?: ProxyConfiguration;
/**
* Identifies the key in the default key-value store under which `RequestList` periodically stores its
* state (i.e. which URLs were crawled and which not).
* If the crawler is restarted, `RequestList` will read the state
* and continue where it left off.
*
* If `persistStateKey` is not set, `RequestList` will always start from the beginning,
* and all the source URLs will be crawled again.
*/
persistStateKey?: string;
/**
* Identifies the key in the default key-value store under which the `RequestList` persists its
* Requests during the {@link RequestList.initialize} call.
* This is necessary if `persistStateKey` is set and the source URLs might potentially change,
* to ensure consistency of the source URLs and state object. However, it comes with some
* storage and performance overheads.
*
* If `persistRequestsKey` is not set, {@link RequestList.initialize} will always fetch the sources
* from their origin, check that they are consistent with the restored state (if any)
* and throw an error if they are not.
*/
persistRequestsKey?: string;
/**
* The state object that the `RequestList` will be initialized from.
* It is in the form as returned by `RequestList.getState()` (see {@link RequestListState}), such as follows:
*
* ```
* {
* nextIndex: 5,
* nextUniqueKey: 'unique-key-5',
* inProgress: ['unique-key-1', 'unique-key-4'],
* }
* ```
*
* Note that the preferred (and simpler) way to persist the state of crawling of the `RequestList`
* is to use the `persistStateKey` option instead.
*/
state?: RequestListState;
/**
* By default, `RequestList` will deduplicate the provided URLs. Default deduplication is based
* on the `uniqueKey` property of passed source {@link Request} objects.
*
* If the property is not present, it is generated by normalizing the URL. If present, it is kept intact.
* In any case, only one request per `uniqueKey` is added to the `RequestList` resulting in removal
* of duplicate URLs / unique keys.
*
* Setting `keepDuplicateUrls` to `true` will append an additional identifier to the `uniqueKey`
* of each request that does not already include a `uniqueKey`. Therefore, duplicate
* URLs will be kept in the list. It does not protect the user from having duplicates in user set
* `uniqueKey`s however. It is the user's responsibility to ensure uniqueness of their unique keys
* if they wish to keep more than just a single copy in the `RequestList`.
* @default false
*/
keepDuplicateUrls?: boolean;
/** @internal */
config?: Configuration;
}
/**
* Represents a static list of URLs to crawl.
* The URLs can be provided either in code or parsed from a text file hosted on the web.
* `RequestList` is used by {@link BasicCrawler}, {@link CheerioCrawler}, {@link PuppeteerCrawler}
* and {@link PlaywrightCrawler} as a source of URLs to crawl.
*
* Each URL is represented using an instance of the {@link Request} class.
* The list can only contain unique URLs. More precisely, it can only contain `Request` instances
* with distinct `uniqueKey` properties. By default, `uniqueKey` is generated from the URL, but it can also be overridden.
* To add a single URL to the list multiple times, corresponding {@link Request} objects will need to have different
* `uniqueKey` properties. You can use the `keepDuplicateUrls` option to do this for you when initializing the
* `RequestList` from sources.
*
* `RequestList` doesn't have a public constructor, you need to create it with the asynchronous {@link RequestList.open} function. After
* the request list is created, no more URLs can be added to it.
* Unlike {@link RequestQueue}, `RequestList` is static but it can contain even millions of URLs.
* > Note that `RequestList` can be used together with `RequestQueue` by the same crawler.
* > In such cases, each request from `RequestList` is enqueued into `RequestQueue` first and then consumed from the latter.
* > This is necessary to avoid the same URL being processed more than once (from the list first and then possibly from the queue).
* > In practical terms, such a combination can be useful when there is a large number of initial URLs,
* > but more URLs would be added dynamically by the crawler.
*
* `RequestList` has an internal state where it stores information about which requests were already handled,
* which are in progress and which were reclaimed. The state may be automatically persisted to the default
* {@link KeyValueStore} by setting the `persistStateKey` option so that if the Node.js process is restarted,
* the crawling can continue where it left off. The automated persisting is launched upon receiving the `persistState`
* event that is periodically emitted by {@link EventManager}.
*
* The internal state is closely tied to the provided sources (URLs). If the sources change on crawler restart, the state will become corrupted and
* `RequestList` will raise an exception. This typically happens when the sources are a list of URLs downloaded from the web.
* In such case, use the `persistRequestsKey` option in conjunction with `persistStateKey`,
* to make the `RequestList` store the initial sources to the default key-value store and load them after restart,
* which will prevent any issues that a live list of URLs might cause.
*
* **Basic usage:**
* ```javascript
* const requestList = await RequestList.open('my-request-list', [
* 'http://www.example.com/page-1',
* { url: 'http://www.example.com/page-2', method: 'POST', userData: { foo: 'bar' }},
* { requestsFromUrl: 'http://www.example.com/my-url-list.txt', userData: { isFromUrl: true } },
* ]);
* ```
*
* **Advanced usage:**
* ```javascript
* const requestList = await RequestList.open(null, [
* // Separate requests
* { url: 'http://www.example.com/page-1', method: 'GET', headers: { ... } },
* { url: 'http://www.example.com/page-2', userData: { foo: 'bar' }},
*
* // Bulk load of URLs from file `http://www.example.com/my-url-list.txt`
* // Note that all URLs must start with http:// or https://
* { requestsFromUrl: 'http://www.example.com/my-url-list.txt', userData: { isFromUrl: true } },
* ], {
* // Persist the state to avoid re-crawling which can lead to data duplications.
* // Keep in mind that the sources have to be immutable or this will throw an error.
* persistStateKey: 'my-state',
* });
* ```
* @category Sources
*/
export declare class RequestList implements IRequestList {
private log;
/**
* Array of all requests from all sources, in the order as they appeared in sources.
* All requests in the array have distinct uniqueKey!
* @internal
*/
requests: (Request | RequestOptions)[];
/** Index to the next item in requests array to fetch. All previous requests are either handled or in progress. */
private nextIndex;
/** Dictionary, key is Request.uniqueKey, value is corresponding index in the requests array. */
private uniqueKeyToIndex;
/**
* Set of `uniqueKey`s of requests that were returned by fetchNextRequest().
* @internal
*/
inProgress: Set<string>;
/**
* Set of `uniqueKey`s of requests for which reclaimRequest() was called.
* @internal
*/
reclaimed: Set<string>;
/**
* Starts as true because until we handle the first request, the list is effectively persisted by doing nothing.
* @internal
*/
isStatePersisted: boolean;
/**
* Starts as false because we don't know yet and sources might change in the meantime (eg. download from live list).
* @internal
*/
areRequestsPersisted: boolean;
private isLoading;
private isInitialized;
private persistStateKey?;
private persistRequestsKey?;
private initialState?;
private store?;
private keepDuplicateUrls;
private sources;
private sourcesFunction?;
private proxyConfiguration?;
private events;
/**
* To create new instance of `RequestList` we need to use `RequestList.open()` factory method.
* @param options All `RequestList` configuration options
* @internal
*/
private constructor();
/**
* Loads all remote sources of URLs and potentially starts periodic state persistence.
* This function must be called before you can start using the instance in a meaningful way.
* It is private; per the `RequestList.open()` documentation, `open()` resolves to an instance
* that is already initialized.
*/
private initialize;
/**
* Adds previously persisted Requests, as retrieved from the key-value store.
* This needs to be done in a memory efficient way. We should update the input
* to a Stream once apify-client supports streams.
*/
protected _addPersistedRequests(persistedRequests: Buffer): Promise<void>;
/**
* Add Requests from both options.sources and options.sourcesFunction.
* This function is called only when persisted sources were not loaded.
* We need to avoid keeping both sources and requests in memory
* to reduce memory footprint with very large sources.
*/
protected _addRequestsFromSources(): Promise<void>;
/**
* @inheritDoc
*/
persistState(): Promise<void>;
/**
* Unlike persistState(), this is used only internally, since the sources
* are automatically persisted at RequestList initialization (if the persistRequestsKey is set),
* but there's no reason to persist it again afterwards, because RequestList is immutable.
*/
protected _persistRequests(): Promise<void>;
/**
* Restores RequestList state from a state object.
*/
protected _restoreState(state?: RequestListState): void;
/**
* Attempts to load state and requests using the `RequestList` configuration
* and returns a tuple of [state, requests] where each may be null if not loaded.
*
* NOTE(review): the declared return type does not include `null` for either tuple element,
* which disagrees with the sentence above - confirm against the implementation before
* relying on either one.
*/
protected _loadStateAndPersistedRequests(): Promise<[RequestListState, Buffer]>;
/**
* Returns an object representing the internal state of the `RequestList` instance.
* Note that the object's fields can change in future releases.
*/
getState(): RequestListState;
/**
* @inheritDoc
*/
isEmpty(): Promise<boolean>;
/**
* @inheritDoc
*/
isFinished(): Promise<boolean>;
/**
* @inheritDoc
*/
fetchNextRequest(): Promise<Request | null>;
/**
* @inheritDoc
*/
[Symbol.asyncIterator](): AsyncGenerator<Request<Dictionary>, void, unknown>;
private ensureRequest;
/**
* @inheritDoc
*/
markRequestHandled(request: Request): Promise<void>;
/**
* @inheritDoc
*/
reclaimRequest(request: Request): Promise<void>;
/**
* Adds all fetched requests from a URL from a remote resource.
*/
protected _addFetchedRequests(source: InternalSource, fetchedRequests: RequestOptions[]): Promise<void>;
/**
* Loads a previously persisted value stored under the given `key`.
* NOTE(review): presumably reads from the key-value store used for persistence - confirm
* against the implementation.
*/
protected _getPersistedState<T>(key: string): Promise<T>;
/**
* Fetches URLs from requestsFromUrl and returns them in format of list of requests
*/
protected _fetchRequestsFromUrl(source: InternalSource): Promise<RequestOptions[]>;
/**
* Adds given request.
* If the `source` parameter is a string or plain object and not an instance
* of a `Request`, then the function creates a `Request` instance.
*/
protected _addRequest(source: RequestListSource): void;
/**
* Helper function that validates unique key.
* Throws an error if uniqueKey is not a non-empty string.
*/
protected _ensureUniqueKeyValid(uniqueKey: string): void;
/**
* Checks that the request is not reclaimed and throws an error if it is.
* NOTE(review): judging by the method name, it presumably also requires the request
* to be in progress - confirm against the implementation.
*/
protected _ensureInProgressAndNotReclaimed(uniqueKey: string): void;
/**
* Throws an error if request list wasn't initialized.
*/
protected _ensureIsInitialized(): void;
/**
* Returns the total number of unique requests present in the `RequestList`.
*/
length(): number;
/**
* @inheritDoc
*/
handledCount(): number;
/**
* Opens a request list and returns a promise resolving to an instance
* of the {@link RequestList} class that is already initialized.
*
* {@link RequestList} represents a list of URLs to crawl, which is always stored in memory.
* To enable picking up where left off after a process restart, the request list sources
* are persisted to the key-value store at initialization of the list. Then, while crawling,
* a small state object is regularly persisted to keep track of the crawling status.
*
* For more details and code examples, see the {@link RequestList} class.
*
* **Example usage:**
*
* ```javascript
* const sources = [
* 'https://www.example.com',
* 'https://www.google.com',
* 'https://www.bing.com'
* ];
*
* const requestList = await RequestList.open('my-name', sources);
* ```
*
* @param listNameOrOptions
* Name of the request list to be opened, or the options object. Setting a name enables the `RequestList`'s
* state to be persisted in the key-value store. This is useful in case of a restart or migration. Since `RequestList`
* is only stored in memory, a restart or migration wipes it clean. Setting a name will enable the `RequestList`'s
* state to survive those situations and continue where it left off.
*
* The name will be used as a prefix in key-value store, producing keys such as `NAME-REQUEST_LIST_STATE`
* and `NAME-REQUEST_LIST_REQUESTS`.
*
* If `null`, the list will not be persisted and will only be stored in memory. Process restart
* will then cause the list to be crawled again from the beginning. We suggest always using a name.
* @param [sources]
* An array of sources of URLs for the {@link RequestList}. It can be either an array of strings,
* plain objects that define at least the `url` property, or an array of {@link Request} instances.
*
* **IMPORTANT:** The `sources` array will be consumed (left empty) after {@link RequestList} initializes.
* This is a measure to prevent memory leaks in situations when millions of sources are
* added.
*
* Additionally, the `requestsFromUrl` property may be used instead of `url`,
* which will instruct {@link RequestList} to download the source URLs from a given remote location.
* The URLs will be parsed from the received response. In this case you can limit the URLs
* using `regex` parameter containing regular expression pattern for URLs to be included.
*
* For details, see the {@link RequestListOptions.sources}
* @param [options]
* The {@link RequestList} options. Note that the list name passed as `listNameOrOptions` supersedes
* the {@link RequestListOptions.persistStateKey} and {@link RequestListOptions.persistRequestsKey}
* options and the `sources` parameter supersedes the {@link RequestListOptions.sources} option.
*/
static open(listNameOrOptions: string | null | RequestListOptions, sources?: RequestListSource[], options?: RequestListOptions): Promise<RequestList>;
/**
* @internal wraps public utility for mocking purposes
*/
private _downloadListOfUrls;
}
/**
* Represents state of a {@link RequestList}. It can be used to resume a {@link RequestList} which has been previously processed.
* You can obtain the state by calling {@link RequestList.getState} and receive an object with
* the following structure:
*
* ```
* {
* nextIndex: 5,
* nextUniqueKey: 'unique-key-5',
* inProgress: ['unique-key-1', 'unique-key-4'],
* }
* ```
*/
export interface RequestListState {
/** Position of the next request to be processed. */
nextIndex: number;
/** Key of the next request to be processed, or `null`. */
nextUniqueKey: string | null;
/** Array of request keys representing those that are being processed at the moment. */
inProgress: string[];
}
/**
* A single entry accepted as a `RequestList` source: either a plain string or a `Source` object.
* Not exported from this module.
*/
type RequestListSource = string | Source;
/**
* Signature of the function accepted as `RequestListOptions.sourcesFunction`:
* takes no arguments and returns a promise resolving to an array of sources.
*/
export type RequestListSourcesFunction = () => Promise<RequestListSource[]>;
export {};
//# sourceMappingURL=request_list.d.ts.map