// @crawlee/core — crawler_commons type declarations.
// The scalable web crawling and scraping library for JavaScript/Node.js. Enables
// development of data extraction and web automation jobs (not only) with headless
// Chrome and Puppeteer.
import type { BatchAddRequestsResult, Dictionary } from '@crawlee/types';
// @ts-ignore — 'got-scraping' is an optional peer dependency; its type declarations may not be installed.
import type { OptionsInit, Response as GotResponse } from 'got-scraping';
import type { ReadonlyDeep } from 'type-fest';
import type { Configuration } from '../configuration';
import type { EnqueueLinksOptions } from '../enqueue_links/enqueue_links';
import type { Log } from '../log';
import type { ProxyInfo } from '../proxy_configuration';
import type { Request, Source } from '../request';
import type { Session } from '../session_pool/session';
import type { Dataset, RecordOptions, RequestQueueOperationOptions } from '../storages';
import { KeyValueStore } from '../storages';
/**
 * Resolves to `true` only when `T` is `any`, and `false` for every other type.
 *
 * Works because `1 & T` collapses to `1` (or a subtype of it) for any concrete `T`,
 * so `0 extends 1 & T` can only hold when `T` is `any` — there `1 & any` is `any`,
 * and every type (including `0`) extends `any`.
 *
 * @internal
 */
export type IsAny<T> = 0 extends 1 & T ? true : false;
/**
 * Produces `T` with the properties named in `K` made non-optional;
 * all remaining properties of `T` are left untouched.
 *
 * @internal
 */
export type WithRequired<T, K extends keyof T> = T & Required<Pick<T, K>>;
/**
 * A {@link Request} whose page has been loaded: both `id` and `loadedUrl`
 * are guaranteed to be present.
 */
export type LoadedRequest<R extends Request> = WithRequired<R, 'id' | 'loadedUrl'>;
/**
 * A crawling context whose `request` is narrowed to {@link LoadedRequest}
 * (i.e. `id` and `loadedUrl` are guaranteed to be set).
 *
 * The `IsAny` guard returns `Context` unchanged when it is `any`, preventing the
 * `Omit`/intersection below from degenerating an `any` context into a mapped shape.
 *
 * @internal
 */
export type LoadedContext<Context extends RestrictedCrawlingContext> = IsAny<Context> extends true ? Context : {
request: LoadedRequest<Context['request']>;
} & Omit<Context, 'request'>;
export interface RestrictedCrawlingContext<UserData extends Dictionary = Dictionary> extends Record<string & {}, unknown> {
/**
 * The ID of the current crawling context.
 */
id: string;
/**
 * The session bound to the current request, if any.
 */
session?: Session;
/**
 * An object with information about currently used proxy by the crawler
 * and configured by the {@link ProxyConfiguration} class.
 */
proxyInfo?: ProxyInfo;
/**
 * The original {@link Request} object.
 */
request: Request<UserData>;
/**
 * This function allows you to push data to a {@link Dataset} specified by name, or the one currently used by the crawler.
 *
 * Shortcut for `crawler.pushData()`.
 *
 * @param [data] Data to be pushed to the default dataset.
 * @param [datasetIdOrName] ID or name of the target dataset; when omitted, the crawler's default dataset is used.
 */
pushData(data: ReadonlyDeep<Parameters<Dataset['pushData']>[0]>, datasetIdOrName?: string): Promise<void>;
/**
 * This function automatically finds and enqueues links from the current page, adding them to the {@link RequestQueue}
 * currently used by the crawler.
 *
 * Optionally, the function allows you to filter the target links' URLs using an array of globs or regular expressions
 * and override settings of the enqueued {@link Request} objects.
 *
 * Check out the [Crawl a website with relative links](https://crawlee.dev/js/docs/examples/crawl-relative-links) example
 * for more details regarding its usage.
 *
 * **Example usage**
 *
 * ```ts
 * async requestHandler({ enqueueLinks }) {
 * await enqueueLinks({
 * globs: [
 * 'https://www.example.com/handbags/*',
 * ],
 * });
 * },
 * ```
 *
 * @param [options] All `enqueueLinks()` parameters are passed via an options object.
 */
enqueueLinks: (options?: ReadonlyDeep<Omit<EnqueueLinksOptions, 'requestQueue'>>) => Promise<unknown>;
/**
 * Add requests directly to the request queue.
 *
 * @param requestsLike The requests to add — plain URL strings or {@link Source} objects.
 * @param options Options for the request queue operation.
 */
addRequests: (requestsLike: ReadonlyDeep<(string | Source)[]>, options?: ReadonlyDeep<RequestQueueOperationOptions>) => Promise<void>;
/**
 * Returns the state - a piece of mutable persistent data shared across all the request handler runs.
 *
 * @param [defaultValue] Initial value of the state, used when no state has been persisted yet.
 */
useState: <State extends Dictionary = Dictionary>(defaultValue?: State) => Promise<State>;
/**
 * Get a key-value store with given name or id, or the default one for the crawler.
 *
 * @param [idOrName] ID or name of the key-value store; when omitted, the crawler's default store is returned.
 */
getKeyValueStore: (idOrName?: string) => Promise<Pick<KeyValueStore, 'id' | 'name' | 'getValue' | 'getAutoSavedValue' | 'setValue' | 'getPublicUrl'>>;
/**
 * A preconfigured logger for the request handler.
 */
log: Log;
}
export interface CrawlingContext<Crawler = unknown, UserData extends Dictionary = Dictionary> extends RestrictedCrawlingContext<UserData> {
/**
 * The crawler instance that is processing the current request.
 */
crawler: Crawler;
/**
 * This function automatically finds and enqueues links from the current page, adding them to the {@link RequestQueue}
 * currently used by the crawler.
 *
 * Optionally, the function allows you to filter the target links' URLs using an array of globs or regular expressions
 * and override settings of the enqueued {@link Request} objects.
 *
 * Check out the [Crawl a website with relative links](https://crawlee.dev/js/docs/examples/crawl-relative-links) example
 * for more details regarding its usage.
 *
 * **Example usage**
 *
 * ```ts
 * async requestHandler({ enqueueLinks }) {
 * await enqueueLinks({
 * globs: [
 * 'https://www.example.com/handbags/*',
 * ],
 * });
 * },
 * ```
 *
 * @param [options] All `enqueueLinks()` parameters are passed via an options object.
 * @returns Promise that resolves to {@link BatchAddRequestsResult} object.
 */
enqueueLinks(options?: ReadonlyDeep<Omit<EnqueueLinksOptions, 'requestQueue'>> & Pick<EnqueueLinksOptions, 'requestQueue'>): Promise<BatchAddRequestsResult>;
/**
 * Get a key-value store with given name or id, or the default one for the crawler.
 *
 * Unlike {@link RestrictedCrawlingContext.getKeyValueStore}, this resolves to the full
 * {@link KeyValueStore} rather than a restricted subset of its methods.
 *
 * @param [idOrName] ID or name of the key-value store; when omitted, the crawler's default store is returned.
 */
getKeyValueStore: (idOrName?: string) => Promise<KeyValueStore>;
/**
 * Fires HTTP request via [`got-scraping`](https://crawlee.dev/js/docs/guides/got-scraping), allowing to override the request
 * options on the fly.
 *
 * This is handy when you work with a browser crawler but want to execute some requests outside it (e.g. API requests).
 * Check the [Skipping navigations for certain requests](https://crawlee.dev/js/docs/examples/skip-navigation) example for
 * more detailed explanation of how to do that.
 *
 * ```ts
 * async requestHandler({ sendRequest }) {
 * const { body } = await sendRequest({
 * // override headers only
 * headers: { ... },
 * });
 * },
 * ```
 *
 * @param [overrideOptions] `got-scraping` options that override the defaults derived from the current request.
 * @returns Promise that resolves to the `got-scraping` response, with the body typed as `Response`.
 */
sendRequest<Response = string>(overrideOptions?: Partial<OptionsInit>): Promise<GotResponse<Response>>;
}
/**
 * A partial implementation of {@link RestrictedCrawlingContext} that stores parameters of calls to context methods for later inspection.
 *
 * @experimental
 */
export declare class RequestHandlerResult {
private config;
private crawleeStateKey;
// Staged key-value store writes, keyed by store, exposed via `keyValueStoreChanges`.
private _keyValueStoreChanges;
// Captured arguments of `pushData` calls, exposed via `calls`.
private pushDataCalls;
// Captured arguments of `addRequests` calls, exposed via `calls`.
private addRequestsCalls;
/**
 * @param config Crawlee {@link Configuration} used by this result collector.
 * @param crawleeStateKey Key-value store key backing the `useState` state.
 */
constructor(config: Configuration, crawleeStateKey: string);
/**
 * A record of calls to {@link RestrictedCrawlingContext.pushData} and {@link RestrictedCrawlingContext.addRequests} made by a request handler.
 */
get calls(): ReadonlyDeep<{
pushData: Parameters<RestrictedCrawlingContext['pushData']>[];
addRequests: Parameters<RestrictedCrawlingContext['addRequests']>[];
}>;
/**
 * A record of changes made to key-value stores by a request handler.
 */
get keyValueStoreChanges(): ReadonlyDeep<Record<string, Record<string, {
changedValue: unknown;
options?: RecordOptions;
}>>>;
/**
 * Items added to datasets by a request handler.
 */
get datasetItems(): ReadonlyDeep<{
item: Dictionary;
datasetIdOrName?: string;
}[]>;
/**
 * URLs enqueued to the request queue by a request handler, either via {@link RestrictedCrawlingContext.addRequests} or {@link RestrictedCrawlingContext.enqueueLinks}
 */
get enqueuedUrls(): ReadonlyDeep<{
url: string;
label?: string;
}[]>;
/**
 * URL lists enqueued to the request queue by a request handler via {@link RestrictedCrawlingContext.addRequests} using the `requestsFromUrl` option.
 */
get enqueuedUrlLists(): ReadonlyDeep<{
listUrl: string;
label?: string;
}[]>;
pushData: RestrictedCrawlingContext['pushData'];
addRequests: RestrictedCrawlingContext['addRequests'];
useState: RestrictedCrawlingContext['useState'];
getKeyValueStore: RestrictedCrawlingContext['getKeyValueStore'];
// NOTE(review): presumably resolves an optional id/name to the default storage id — confirm in implementation.
private idOrDefault;
// NOTE(review): presumably reads a staged value from `_keyValueStoreChanges` — confirm in implementation.
private getKeyValueStoreChangedValue;
// NOTE(review): presumably stages a value into `_keyValueStoreChanges` — confirm in implementation.
private setKeyValueStoreChangedValue;
}
//# sourceMappingURL=crawler_commons.d.ts.map