UNPKG

@crawlee/core

Version:

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.

249 lines 11.6 kB
import type { BatchAddRequestsResult, Dictionary } from '@crawlee/types';
import { type RobotsTxtFile } from '@crawlee/utils';
import type { SetRequired } from 'type-fest';
import type { Request } from '../request';
import type { AddRequestsBatchedOptions, AddRequestsBatchedResult, RequestProvider, RequestQueueOperationOptions } from '../storages';
import type { GlobInput, PseudoUrlInput, RegExpInput, RequestTransform, SkippedRequestCallback } from './shared';

/**
 * Options accepted by `enqueueLinks`. Every property declared here is optional;
 * the standalone {@link enqueueLinks} function declared in this file additionally
 * requires `urls` and `requestQueue` through its own parameter type.
 */
export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
    /** Limit the number of actually enqueued URLs to this number. Useful for testing across the entire crawling scope. */
    limit?: number;
    /** An array of URLs to enqueue. */
    urls?: readonly string[];
    /** A request queue to which the URLs will be enqueued. */
    requestQueue?: RequestProvider;
    /** A CSS selector matching links to be enqueued. */
    selector?: string;
    /** Sets {@link Request.userData} for newly enqueued requests. */
    userData?: Dictionary;
    /**
     * Sets {@link Request.label} for newly enqueued requests.
     *
     * Note that the request options specified in `globs`, `regexps`, or `pseudoUrls` objects
     * have priority over this option.
     */
    label?: string;
    /**
     * If set to `true`, tells the crawler to skip navigation and process the request directly.
     * @default false
     */
    skipNavigation?: boolean;
    /**
     * A base URL that will be used to resolve relative URLs when using Cheerio. Ignored when using Puppeteer,
     * since the relative URL resolution is done inside the browser automatically.
     */
    baseUrl?: string;
    /**
     * An array of glob pattern strings or plain objects
     * containing glob pattern strings matching the URLs to be enqueued.
     *
     * The plain objects must include at least the `glob` property, which holds the glob pattern string.
     * All remaining keys will be used as request options for the corresponding enqueued {@link Request} objects.
     *
     * The matching is always case-insensitive.
     * If you need case-sensitive matching, use `regexps` property directly.
     *
     * If `globs` is an empty array or `undefined`, and `regexps` are also not defined, then the function
     * enqueues the links with the same subdomain.
     */
    globs?: readonly GlobInput[];
    /**
     * An array of glob pattern strings, regexp patterns or plain objects
     * containing patterns matching URLs that will **never** be enqueued.
     *
     * The plain objects must include either the `glob` property or the `regexp` property.
     *
     * Glob matching is always case-insensitive.
     * If you need case-sensitive matching, provide a regexp.
     */
    exclude?: readonly (GlobInput | RegExpInput)[];
    /**
     * An array of regular expressions or plain objects
     * containing regular expressions matching the URLs to be enqueued.
     *
     * The plain objects must include at least the `regexp` property, which holds the regular expression.
     * All remaining keys will be used as request options for the corresponding enqueued {@link Request} objects.
     *
     * If `regexps` is an empty array or `undefined`, and `globs` are also not defined, then the function
     * enqueues the links with the same subdomain.
     */
    regexps?: readonly RegExpInput[];
    /**
     * *NOTE:* This option will be removed in a future version of the SDK.
     * Please use `globs` or `regexps` instead.
     *
     * An array of {@link PseudoUrl} strings or plain objects
     * containing {@link PseudoUrl} strings matching the URLs to be enqueued.
     *
     * The plain objects must include at least the `purl` property, which holds the pseudo-URL string.
     * All remaining keys will be used as request options for the corresponding enqueued {@link Request} objects.
     *
     * With a pseudo-URL string, the matching is always case-insensitive.
     * If you need case-sensitive matching, use `regexps` property directly.
     *
     * If `pseudoUrls` is an empty array or `undefined`, then the function
     * enqueues the links with the same subdomain.
     *
     * @deprecated prefer using `globs` or `regexps` instead
     */
    pseudoUrls?: readonly PseudoUrlInput[];
    /**
     * Just before a new {@link Request} is constructed and enqueued to the {@link RequestQueue}, this function can be used
     * to remove it or modify its contents such as `userData`, `payload` or, most importantly `uniqueKey`. This is useful
     * when you need to enqueue multiple `Requests` to the queue that share the same URL, but differ in methods or payloads,
     * or to dynamically update or create `userData`.
     *
     * For example: by adding `keepUrlFragment: true` to the `request` object, URL fragments will not be removed
     * when `uniqueKey` is computed.
     *
     * **Example:**
     * ```javascript
     * {
     *     transformRequestFunction: (request) => {
     *         request.userData.foo = 'bar';
     *         request.keepUrlFragment = true;
     *         return request;
     *     }
     * }
     * ```
     *
     * Note that the request options specified in `globs`, `regexps`, or `pseudoUrls` objects
     * have priority over this function. Some request options returned by `transformRequestFunction` may be overwritten by pattern-based options from `globs`, `regexps`, or `pseudoUrls`.
     */
    transformRequestFunction?: RequestTransform;
    /**
     * The strategy to use when enqueueing the urls.
     *
     * Depending on the strategy you select, we will only check certain parts of the URLs found. Here is a diagram of each URL part and their name:
     *
     * ```md
     * Protocol          Domain
     * ┌────┐          ┌─────────┐
     * https://example.crawlee.dev/...
     * │       └─────────────────┤
     * │            Hostname     │
     * │                         │
     * └─────────────────────────┘
     *           Origin
     * ```
     *
     * Accepts either an {@link EnqueueStrategy} member or the equivalent string literal.
     *
     * @default EnqueueStrategy.SameHostname
     */
    strategy?: EnqueueStrategy | 'all' | 'same-domain' | 'same-hostname' | 'same-origin';
    /**
     * By default, only the first batch (1000) of found requests will be added to the queue before resolving the call.
     * You can use this option to wait for adding all of them.
     */
    waitForAllRequestsToBeAdded?: boolean;
    /**
     * RobotsTxtFile instance for the current request that triggered the `enqueueLinks`.
     * If provided, disallowed URLs will be ignored.
     *
     * Only the `isAllowed` member is required, so any object exposing a compatible
     * `isAllowed` method can be supplied.
     */
    robotsTxtFile?: Pick<RobotsTxtFile, 'isAllowed'>;
    /**
     * When a request is skipped for some reason, you can use this callback to act on it.
     * This is currently fired for requests skipped
     * 1. based on robots.txt file,
     * 2. because they don't match enqueueLinks filters,
     * 3. or because the maxRequestsPerCrawl limit has been reached
     */
    onSkippedRequest?: SkippedRequestCallback;
}

/**
 * The different enqueueing strategies available.
 *
 * Depending on the strategy you select, we will only check certain parts of the URLs found. Here is a diagram of each URL part and their name:
 *
 * ```md
 * Protocol          Domain
 * ┌────┐          ┌─────────┐
 * https://example.crawlee.dev/...
 * │       └─────────────────┤
 * │            Hostname     │
 * │                         │
 * └─────────────────────────┘
 *           Origin
 * ```
 *
 * - The `Protocol` is usually `http` or `https`
 * - The `Domain` represents the path without any possible subdomains to a website. For example, `crawlee.dev` is the domain of `https://example.crawlee.dev/`
 * - The `Hostname` is the full path to a website, including any subdomains. For example, `example.crawlee.dev` is the hostname of `https://example.crawlee.dev/`
 * - The `Origin` is the combination of the `Protocol` and `Hostname`. For example, `https://example.crawlee.dev` is the origin of `https://example.crawlee.dev/`
 */
export declare enum EnqueueStrategy {
    /**
     * Matches any URLs found
     */
    All = "all",
    /**
     * Matches any URLs that have the same hostname.
     * For example, `https://wow.example.com/hello` will be matched for a base url of `https://wow.example.com/`, but
     * `https://example.com/hello` will not be matched.
     *
     * > This strategy will match both `http` and `https` protocols regardless of the base URL protocol.
     */
    SameHostname = "same-hostname",
    /**
     * Matches any URLs that have the same domain as the base URL.
     * For example, `https://wow.an.example.com` and `https://example.com` will both be matched for a base url of
     * `https://example.com`.
     *
     * > This strategy will match both `http` and `https` protocols regardless of the base URL protocol.
     */
    SameDomain = "same-domain",
    /**
     * Matches any URLs that have the same hostname and protocol.
     * For example, `https://wow.example.com/hello` will be matched for a base url of `https://wow.example.com/`, but
     * `http://wow.example.com/hello` will not be matched.
     *
     * > This strategy will ensure the protocol of the base URL is the same as the protocol of the URL to be enqueued.
     */
    SameOrigin = "same-origin"
}

/**
 * This function enqueues the urls provided to the {@link RequestQueue} provided. If you want to automatically find and enqueue links,
 * you should use the context-aware `enqueueLinks` function provided on the crawler contexts.
 *
 * Optionally, the function allows you to filter the target links' URLs using an array of globs or regular expressions
 * and override settings of the enqueued {@link Request} objects.
 *
 * Unlike in {@link EnqueueLinksOptions}, both `urls` and `requestQueue` are required here. The `requestQueue`
 * only has to expose an `addRequestsBatched` method with the signature shown in the parameter type.
 *
 * **Example usage**
 *
 * ```javascript
 * await enqueueLinks({
 *   urls: aListOfFoundUrls,
 *   requestQueue,
 *   selector: 'a.product-detail',
 *   globs: [
 *       'https://www.example.com/handbags/*',
 *       'https://www.example.com/purses/*'
 *   ],
 * });
 * ```
 *
 * @param options All `enqueueLinks()` parameters are passed via an options object.
 * @returns Promise that resolves to {@link BatchAddRequestsResult} object.
 */
export declare function enqueueLinks(options: SetRequired<Omit<EnqueueLinksOptions, 'requestQueue'>, 'urls'> & {
    requestQueue: {
        addRequestsBatched: (requests: Request<Dictionary>[], options: AddRequestsBatchedOptions) => Promise<AddRequestsBatchedResult>;
    };
}): Promise<BatchAddRequestsResult>;

/**
 * @internal
 * This method helps resolve the baseUrl that will be used for filtering in {@link enqueueLinks}.
 * - If a user provides a base url, we always return it
 * - If a user specifies {@link EnqueueStrategy.All} strategy, they do not care if the newly found urls are on the original
 *   request domain, or a redirected one
 * - In all other cases, we return the domain of the original request as that's the one we need to use for filtering
 *
 * @returns The base URL to filter against, or `undefined`.
 *   NOTE(review): the declaration does not show which inputs produce `undefined` - confirm against the implementation.
 */
export declare function resolveBaseUrlForEnqueueLinksFiltering({ enqueueStrategy, finalRequestUrl, originalRequestUrl, userProvidedBaseUrl, }: ResolveBaseUrl): string | undefined;

/**
 * Input of {@link resolveBaseUrlForEnqueueLinksFiltering}.
 *
 * @internal
 */
export interface ResolveBaseUrl {
    /** Base URL supplied by the user, if any. */
    userProvidedBaseUrl?: string;
    /** Enqueue strategy in effect; same type as {@link EnqueueLinksOptions.strategy}. */
    enqueueStrategy?: EnqueueLinksOptions['strategy'];
    /** URL of the original request. */
    originalRequestUrl: string;
    /** NOTE(review): presumably the URL the request ended up at after redirects - confirm against the implementation. */
    finalRequestUrl?: string;
}
//# sourceMappingURL=enqueue_links.d.ts.map