UNPKG

@crawlee/core

Version:

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs — not only with headless Chrome and Puppeteer.

245 lines • 8.48 kB
import { type ParseSitemapOptions } from '@crawlee/utils';
import { Configuration } from '../configuration';
import type { GlobInput, RegExpInput } from '../enqueue_links';
import { Request } from '../request';
import type { IRequestList } from './request_list';
/**
 * URL filtering constraints applied to every URL parsed from the sitemaps.
 * If neither `globs` nor `regexps` is provided, all sitemap URLs are included
 * (subject to `exclude`).
 */
interface UrlConstraints {
    /**
     * An array of glob pattern strings or plain objects
     * containing glob pattern strings matching the URLs to be enqueued.
     *
     * The plain objects must include at least the `glob` property, which holds the glob pattern string.
     *
     * The matching is always case-insensitive.
     * If you need case-sensitive matching, use `regexps` property directly.
     *
     * If `globs` is an empty array or `undefined`, and `regexps` are also not defined, then the `SitemapRequestList`
     * includes all the URLs from the sitemap.
     */
    globs?: readonly GlobInput[];
    /**
     * An array of glob pattern strings, regexp patterns or plain objects
     * containing patterns matching URLs that will **never** be included.
     *
     * The plain objects must include either the `glob` property or the `regexp` property.
     *
     * Glob matching is always case-insensitive.
     * If you need case-sensitive matching, provide a regexp.
     *
     * `exclude` takes precedence over `globs` / `regexps`.
     */
    exclude?: readonly (GlobInput | RegExp)[];
    /**
     * An array of regular expressions or plain objects
     * containing regular expressions matching the URLs to be enqueued.
     *
     * The plain objects must include at least the `regexp` property, which holds the regular expression.
     *
     * If `regexps` is an empty array or `undefined`, and `globs` are also not defined, then the `SitemapRequestList`
     * includes all the URLs from the sitemap.
     */
    regexps?: readonly RegExpInput[];
}
/**
 * Options for {@apilink SitemapRequestList.open}.
 * Inherits URL filtering options (`globs`, `regexps`, `exclude`) from `UrlConstraints`.
 */
export interface SitemapRequestListOptions extends UrlConstraints {
    /**
     * List of sitemap URLs to parse.
     */
    sitemapUrls: string[];
    /**
     * Proxy URL to be used for sitemap loading.
     */
    proxyUrl?: string;
    /**
     * Key for persisting the state of the request list in the `KeyValueStore`.
     */
    persistStateKey?: string;
    /**
     * Persistence-related options to control how and when crawler's data gets persisted.
     */
    persistenceOptions?: {
        /**
         * Use this flag to disable or enable periodic persistence to key value store.
         * @default true
         */
        enable?: boolean;
    };
    /**
     * Abort signal to be used for sitemap loading.
     */
    signal?: AbortSignal;
    /**
     * Timeout for sitemap loading in milliseconds. If both `signal` and `timeoutMillis` are provided, either of them can abort the loading.
     */
    timeoutMillis?: number;
    /**
     * Maximum number of buffered URLs for the sitemap loading stream.
     * If the buffer is full, the stream will pause until the buffer is drained.
     *
     * @default 200
     */
    maxBufferSize?: number;
    /**
     * Advanced options for the underlying `parseSitemap` call.
     */
    parseSitemapOptions?: Omit<ParseSitemapOptions, 'emitNestedSitemaps' | 'maxDepth'>;
    /**
     * Crawlee configuration
     */
    config?: Configuration;
}
/**
 * A list of URLs to crawl parsed from a sitemap.
 *
 * The loading of the sitemap is performed in the background so that crawling can start before the sitemap is fully loaded.
 */
export declare class SitemapRequestList implements IRequestList {
    /**
     * Set of URLs that were returned by `fetchNextRequest()` and not marked as handled yet.
     * @internal
     */
    inProgress: Set<string>;
    /** Set of URLs for which `reclaimRequest()` was called. */
    private reclaimed;
    /**
     * Map of returned Request objects that have not been marked as handled yet.
     *
     * We use this to persist custom user fields on the in-progress (or reclaimed) requests.
     */
    private requestData;
    /**
     * Object for keeping track of the sitemap parsing progress.
     */
    private sitemapParsingProgress;
    /**
     * Object stream of URLs parsed from the sitemaps.
     * Using `highWaterMark`, this can manage the speed of the sitemap loading.
     *
     * Fetch the next URL to be processed using `fetchNextRequest()`.
     */
    private urlQueueStream;
    /**
     * Indicates whether the request list sitemap loading was aborted.
     *
     * If the loading was aborted before the sitemaps were fully loaded, the request list might be missing some URLs.
     * The `isSitemapFullyLoaded` method can be used to check if the sitemaps were fully loaded.
     *
     * If the loading is aborted and all the requests are handled, `isFinished()` will return `true`.
     */
    private abortLoading;
    /** Number of URLs that were marked as handled */
    private handledUrlCount;
    /** Key under which the list's state is persisted in the `KeyValueStore` (if persistence is enabled). */
    private persistStateKey?;
    /** `KeyValueStore` instance used for state persistence (opened lazily when persistence is enabled). */
    private store?;
    /** Set after `teardown()`; a closed list yields no further requests. */
    private closed;
    /**
     * Proxy URL to be used for sitemap loading.
     */
    private proxyUrl;
    /**
     * Logger instance.
     */
    private log;
    /** Compiled `exclude` patterns used to reject URLs. */
    private urlExcludePatternObjects;
    /** Compiled `globs` / `regexps` patterns used to accept URLs. */
    private urlPatternObjects;
    /** EventManager used to handle persistence */
    private events;
    /** Persistence-related options passed via `SitemapRequestListOptions.persistenceOptions`. */
    private persistenceOptions;
    /** @internal Use the static `open()` factory instead of constructing directly. */
    private constructor();
    /**
     * Creates a new object stream with the specified highWaterMark.
     * @param highWaterMark High water mark for the stream (the maximum number of objects the stream will buffer).
     * @returns A new object stream.
     */
    private createNewStream;
    /**
     * Returns a function that checks whether the provided pattern matches the closure URL.
     * @param url URL to be checked.
     * @returns A matcher function that checks whether the pattern matches the closure URL.
     */
    private matchesUrl;
    /**
     * Checks whether the URL matches the `globs` / `regexps` / `exclude` provided in the `options`.
     * @param url URL to be checked.
     * @returns `true` if the URL matches the patterns, `false` otherwise.
     */
    private isUrlMatchingPatterns;
    /**
     * Adds a URL to the queue of parsed URLs.
     *
     * Blocks if the stream is full until it is drained.
     */
    private pushNextUrl;
    /**
     * Reads the next URL from the queue of parsed URLs.
     *
     * If the stream is empty, blocks until a new URL is pushed.
     * @returns The next URL from the queue or `null` if we have read all URLs.
     */
    private readNextUrl;
    /**
     * Indicates whether the background processing of sitemap contents has successfully finished.
     *
     * If this is `false`, the background processing is either still in progress or was aborted.
     */
    isSitemapFullyLoaded(): boolean;
    /**
     * Start processing the sitemaps and loading the URLs.
     *
     * Resolves once all the sitemaps URLs have been fully loaded (`isSitemapFullyLoaded()` starts returning `true`).
     */
    private load;
    /**
     * Open a sitemap and start processing it.
     *
     * Resolves to a new instance of `SitemapRequestList`, which **might not be fully loaded yet** - i.e. the sitemap might still be loading in the background.
     *
     * Track the loading progress using the `isSitemapFullyLoaded` method.
     */
    static open(options: SitemapRequestListOptions): Promise<SitemapRequestList>;
    /**
     * @inheritDoc
     */
    length(): number;
    /**
     * @inheritDoc
     */
    isFinished(): Promise<boolean>;
    /**
     * @inheritDoc
     */
    isEmpty(): Promise<boolean>;
    /**
     * @inheritDoc
     */
    handledCount(): number;
    /**
     * @inheritDoc
     */
    persistState(): Promise<void>;
    /** Restores a previously persisted state from the `KeyValueStore` (counterpart of `persistState()`). */
    private restoreState;
    /**
     * @inheritDoc
     */
    fetchNextRequest(): Promise<Request | null>;
    /**
     * @inheritDoc
     */
    // @ts-ignore optional peer dependency or compatibility with es2022
    [Symbol.asyncIterator](): AsyncGenerator<Request<import("@crawlee/utils").Dictionary>, void, unknown>;
    /**
     * @inheritDoc
     */
    reclaimRequest(request: Request): Promise<void>;
    /**
     * Aborts the internal sitemap loading, stops the processing of the sitemap contents and drops all the pending URLs.
     *
     * Calling `fetchNextRequest()` after this method will always return `null`.
     */
    teardown(): Promise<void>;
    /**
     * @inheritDoc
     */
    markRequestHandled(request: Request): Promise<void>;
    /** Asserts the request is tracked in `inProgress` and has not been reclaimed; guards `markRequestHandled()` / `reclaimRequest()`. */
    private ensureInProgressAndNotReclaimed;
}
export {};
//# sourceMappingURL=sitemap_request_list.d.ts.map