UNPKG

@crawlee/core

Version:

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs with headless Chrome and Puppeteer, among other tools.

73 lines 3.38 kB
import type { Awaitable } from '@crawlee/types';
import type { RequestOptions } from '../request';
import { Request } from '../request';
import type { EnqueueLinksOptions } from './enqueue_links';
export { tryAbsoluteURL } from '@crawlee/utils';
/**
 * A URL pattern given as a glob string and/or a regular expression (both are optional),
 * combined with the `method`, `payload`, `label`, `userData` and `headers` fields
 * picked from {@link RequestOptions}.
 */
export type UrlPatternObject = {
    glob?: string;
    regexp?: RegExp;
} & Pick<RequestOptions, 'method' | 'payload' | 'label' | 'userData' | 'headers'>;
/**
 * Object form of a pseudo-URL pattern: the pattern string in `purl`, combined with the
 * `method`, `payload`, `label`, `userData` and `headers` fields picked from {@link RequestOptions}.
 */
export type PseudoUrlObject = {
    purl: string;
} & Pick<RequestOptions, 'method' | 'payload' | 'label' | 'userData' | 'headers'>;
/** A pseudo-URL given either as a plain string or as a {@link PseudoUrlObject}. */
export type PseudoUrlInput = string | PseudoUrlObject;
/**
 * Object form of a glob pattern: the pattern string in `glob`, combined with the
 * `method`, `payload`, `label`, `userData` and `headers` fields picked from {@link RequestOptions}.
 */
export type GlobObject = {
    glob: string;
} & Pick<RequestOptions, 'method' | 'payload' | 'label' | 'userData' | 'headers'>;
/** A glob pattern given either as a plain string or as a {@link GlobObject}. */
export type GlobInput = string | GlobObject;
/**
 * Object form of a regular-expression pattern: the expression in `regexp`, combined with the
 * `method`, `payload`, `label`, `userData` and `headers` fields picked from {@link RequestOptions}.
 */
export type RegExpObject = {
    regexp: RegExp;
} & Pick<RequestOptions, 'method' | 'payload' | 'label' | 'userData' | 'headers'>;
/** A regular expression given either as a bare `RegExp` or as a {@link RegExpObject}. */
export type RegExpInput = RegExp | RegExpObject;
/**
 * Identifier of the reason why a request was skipped; passed as `reason`
 * to a {@link SkippedRequestCallback}.
 */
export type SkippedRequestReason = 'robotsTxt' | 'limit' | 'enqueueLimit' | 'filters' | 'redirect' | 'depth';
/**
 * Callback invoked with the URL of a skipped request and the reason it was skipped.
 *
 * NOTE(review): the return type is `Awaitable<void>`, so presumably both synchronous and
 * promise-returning callbacks are accepted - confirm against `@crawlee/types`.
 */
export type SkippedRequestCallback = (args: {
    url: string;
    reason: SkippedRequestReason;
}) => Awaitable<void>;
/**
 * NOTE(review): only the signature is visible here. Judging by the name, this stores the
 * constructed `pattern` for the given input `item` in a pattern cache - confirm against the
 * implementation.
 * @ignore
 */
export declare function updateEnqueueLinksPatternCache(item: GlobInput | RegExpInput | PseudoUrlInput, pattern: RegExpObject | GlobObject): void;
/**
 * Helper factory used in the `enqueueLinks()` and `enqueueLinksByClickingElements()` functions
 * to construct RegExps from PseudoUrl strings.
 * @ignore
 */
export declare function constructRegExpObjectsFromPseudoUrls(pseudoUrls: readonly PseudoUrlInput[]): RegExpObject[];
/**
 * Helper factory used in the `enqueueLinks()` and `enqueueLinksByClickingElements()` functions
 * to construct Glob objects from Glob pattern strings.
 * @ignore
 */
export declare function constructGlobObjectsFromGlobs(globs: readonly GlobInput[]): GlobObject[];
/**
 * Takes a glob pattern string and returns a string.
 *
 * NOTE(review): the declaration does not show how an invalid pattern is handled (thrown error
 * vs. normalized return value) - check the implementation before relying on either.
 * @internal
 */
export declare function validateGlobPattern(glob: string): string;
/**
 * Helper factory used in the `enqueueLinks()` and `enqueueLinksByClickingElements()` functions
 * to check RegExps input and return valid RegExps.
 * @ignore
 */
export declare function constructRegExpObjectsFromRegExps(regexps: readonly RegExpInput[]): RegExpObject[];
/**
 * Produces {@link Request} instances from a list of URL strings and/or {@link RequestOptions}.
 *
 * NOTE(review): only the signature is visible here. Judging by the parameter names,
 * `urlPatternObjects` and `excludePatternObjects` act as include/exclude filters, `strategy`
 * is the `enqueueLinks` strategy, and `onSkippedUrl` is notified about URLs that were dropped -
 * confirm against the implementation.
 * @ignore
 */
export declare function createRequests(requestOptions: (string | RequestOptions)[], urlPatternObjects?: UrlPatternObject[], excludePatternObjects?: UrlPatternObject[], strategy?: EnqueueLinksOptions['strategy'], onSkippedUrl?: (url: string) => void): Request[];
/**
 * Takes a list of {@link Request} instances plus optional URL patterns and returns a list of
 * {@link Request} instances.
 *
 * NOTE(review): presumably only requests matching one of `patterns` are returned and
 * `onSkippedUrl` is called with the URL of each request that is filtered out; the behavior
 * when `patterns` is omitted or empty is not visible here - confirm against the implementation.
 */
export declare function filterRequestsByPatterns(requests: Request[], patterns?: UrlPatternObject[], onSkippedUrl?: (url: string) => void): Request[];
/**
 * Builds {@link RequestOptions} objects from a list of sources, each being either a URL string
 * or a plain object.
 *
 * NOTE(review): presumably the `label`, `userData`, `baseUrl`, `skipNavigation` and `strategy`
 * values from `options` are applied to the produced request options - confirm against the
 * implementation.
 * @ignore
 */
export declare function createRequestOptions(sources: (string | Record<string, unknown>)[], options?: Pick<EnqueueLinksOptions, 'label' | 'userData' | 'baseUrl' | 'skipNavigation' | 'strategy'>): RequestOptions[];
/**
 * Takes a {@link RequestOptions} object and changes its attributes in a desired way. This user-function is used
 * by {@link enqueueLinks} to modify requests before enqueuing them.
 */
export interface RequestTransform {
    /**
     * @param original Request options to be modified.
     * @returns The modified request options to enqueue. The signature also allows `false`,
     *   `undefined` and `null`; presumably a falsy result means the request should not be
     *   enqueued - confirm against `enqueueLinks`.
     */
    (original: RequestOptions): RequestOptions | false | undefined | null;
}
//# sourceMappingURL=shared.d.ts.map