@crawlee/core
Version:
The scalable web crawling and scraping library for JavaScript/Node.js. It enables the development of data extraction and web automation jobs with (but not limited to) headless Chrome and Puppeteer.
245 lines • 8.48 kB
TypeScript
import { type ParseSitemapOptions } from '@crawlee/utils';
import { Configuration } from '../configuration';
import type { GlobInput, RegExpInput } from '../enqueue_links';
import { Request } from '../request';
import type { IRequestList } from './request_list';
/**
 * Filtering options that decide which URLs parsed from a sitemap are enqueued.
 *
 * When neither `globs` nor `regexps` is provided, every URL found in the sitemap
 * is included; `exclude` patterns are always applied on top of that.
 */
interface UrlConstraints {
/**
* An array of glob pattern strings or plain objects
* containing glob pattern strings matching the URLs to be enqueued.
*
* The plain objects must include at least the `glob` property, which holds the glob pattern string.
*
* The matching is always case-insensitive.
* If you need case-sensitive matching, use `regexps` property directly.
*
* If `globs` is an empty array or `undefined`, and `regexps` are also not defined, then the `SitemapRequestList`
* includes all the URLs from the sitemap.
*/
globs?: readonly GlobInput[];
/**
* An array of glob pattern strings, regexp patterns or plain objects
* containing patterns matching URLs that will **never** be included.
*
* The plain objects must include either the `glob` property or the `regexp` property.
*
* Glob matching is always case-insensitive.
* If you need case-sensitive matching, provide a regexp.
*/
exclude?: readonly (GlobInput | RegExp)[];
/**
* An array of regular expressions or plain objects
* containing regular expressions matching the URLs to be enqueued.
*
* The plain objects must include at least the `regexp` property, which holds the regular expression.
*
* If `regexps` is an empty array or `undefined`, and `globs` are also not defined, then the `SitemapRequestList`
* includes all the URLs from the sitemap.
*/
regexps?: readonly RegExpInput[];
}
/**
 * Options accepted by {@link SitemapRequestList.open}.
 *
 * Extends {@link UrlConstraints}, so `globs`, `regexps` and `exclude`
 * may be used to filter the URLs parsed from the sitemaps.
 */
export interface SitemapRequestListOptions extends UrlConstraints {
/**
* List of sitemap URLs to parse.
*/
sitemapUrls: string[];
/**
* Proxy URL to be used for sitemap loading.
*/
proxyUrl?: string;
/**
* Key for persisting the state of the request list in the `KeyValueStore`.
*/
persistStateKey?: string;
/**
* Persistence-related options to control how and when crawler's data gets persisted.
*/
persistenceOptions?: {
/**
* Use this flag to disable or enable periodic persistence to key value store.
* @default true
*/
enable?: boolean;
};
/**
* Abort signal to be used for sitemap loading.
*/
signal?: AbortSignal;
/**
* Timeout for sitemap loading in milliseconds. If both `signal` and `timeoutMillis` are provided, either of them can abort the loading.
*/
timeoutMillis?: number;
/**
* Maximum number of buffered URLs for the sitemap loading stream.
* If the buffer is full, the stream will pause until the buffer is drained.
*
* @default 200
*/
maxBufferSize?: number;
/**
* Advanced options for the underlying `parseSitemap` call.
*/
parseSitemapOptions?: Omit<ParseSitemapOptions, 'emitNestedSitemaps' | 'maxDepth'>;
/**
* Crawlee configuration
*/
config?: Configuration;
}
/**
* A list of URLs to crawl parsed from a sitemap.
*
* The loading of the sitemap is performed in the background so that crawling can start before the sitemap is fully loaded.
*/
export declare class SitemapRequestList implements IRequestList {
/**
* Set of URLs that were returned by `fetchNextRequest()` and not marked as handled yet.
* @internal
*/
inProgress: Set<string>;
/** Set of URLs for which `reclaimRequest()` was called. */
private reclaimed;
/**
* Map of returned Request objects that have not been marked as handled yet.
*
* We use this to persist custom user fields on the in-progress (or reclaimed) requests.
*/
private requestData;
/**
* Object for keeping track of the sitemap parsing progress.
*/
private sitemapParsingProgress;
/**
* Object stream of URLs parsed from the sitemaps.
* Using `highWaterMark`, this can manage the speed of the sitemap loading.
*
* Fetch the next URL to be processed using `fetchNextRequest()`.
*/
private urlQueueStream;
/**
* Indicates whether the request list sitemap loading was aborted.
*
* If the loading was aborted before the sitemaps were fully loaded, the request list might be missing some URLs.
* The `isSitemapFullyLoaded` method can be used to check if the sitemaps were fully loaded.
*
* If the loading is aborted and all the requests are handled, `isFinished()` will return `true`.
*/
private abortLoading;
/** Number of URLs that were marked as handled */
private handledUrlCount;
/** Key under which the list state is persisted (see {@link SitemapRequestListOptions.persistStateKey}). */
private persistStateKey?;
/** Key-value store used for state persistence — presumably opened from the provided `config`; confirm in implementation. */
private store?;
/** Whether the list was shut down — presumably set by {@link teardown}; confirm in implementation. */
private closed;
/**
* Proxy URL to be used for sitemap loading.
*/
private proxyUrl;
/**
* Logger instance.
*/
private log;
/** Compiled pattern objects built from the `exclude` option, used to reject URLs. */
private urlExcludePatternObjects;
/** Compiled pattern objects built from the `globs` / `regexps` options, used to accept URLs. */
private urlPatternObjects;
/** EventManager used to handle persistence */
private events;
/** Persistence behavior flags (see {@link SitemapRequestListOptions.persistenceOptions}). */
private persistenceOptions;
/**
* The constructor is private — create instances via the static {@link open} method.
* @internal
*/
private constructor();
/**
* Creates a new object stream with the specified highWaterMark.
* @param highWaterMark High water mark for the stream (the maximum number of objects the stream will buffer).
* @returns A new object stream.
*/
private createNewStream;
/**
* Returns a function that checks whether the provided pattern matches the closure URL.
* @param url URL to be checked.
* @returns A matcher function that checks whether the pattern matches the closure URL.
*/
private matchesUrl;
/**
* Checks whether the URL matches the `globs` / `regexps` / `exclude` provided in the `options`.
* @param url URL to be checked.
* @returns `true` if the URL matches the patterns, `false` otherwise.
*/
private isUrlMatchingPatterns;
/**
* Adds a URL to the queue of parsed URLs.
*
* Blocks if the stream is full until it is drained.
*/
private pushNextUrl;
/**
* Reads the next URL from the queue of parsed URLs.
*
* If the stream is empty, blocks until a new URL is pushed.
* @returns The next URL from the queue or `null` if we have read all URLs.
*/
private readNextUrl;
/**
* Indicates whether the background processing of sitemap contents has successfully finished.
*
* If this is `false`, the background processing is either still in progress or was aborted.
*/
isSitemapFullyLoaded(): boolean;
/**
* Start processing the sitemaps and loading the URLs.
*
* Resolves once all the sitemaps URLs have been fully loaded (sets `isSitemapFullyLoaded` to `true`).
*/
private load;
/**
* Open a sitemap and start processing it.
*
* Resolves to a new instance of `SitemapRequestList`, which **might not be fully loaded yet** - i.e. the sitemap might still be loading in the background.
*
* Track the loading progress using the `isSitemapFullyLoaded` property.
*/
static open(options: SitemapRequestListOptions): Promise<SitemapRequestList>;
/**
* @inheritDoc
*/
length(): number;
/**
* @inheritDoc
*/
isFinished(): Promise<boolean>;
/**
* @inheritDoc
*/
isEmpty(): Promise<boolean>;
/**
* @inheritDoc
*/
handledCount(): number;
/**
* @inheritDoc
*/
persistState(): Promise<void>;
/** Restores previously persisted list state from the key-value store (counterpart of {@link persistState}). */
private restoreState;
/**
* @inheritDoc
*/
fetchNextRequest(): Promise<Request | null>;
/**
* @inheritDoc
*/
// @ts-ignore optional peer dependency or compatibility with es2022
[Symbol.asyncIterator](): AsyncGenerator<Request<import("@crawlee/utils").Dictionary>, void, unknown>;
/**
* @inheritDoc
*/
reclaimRequest(request: Request): Promise<void>;
/**
* Aborts the internal sitemap loading, stops the processing of the sitemap contents and drops all the pending URLs.
*
* Calling `fetchNextRequest()` after this method will always return `null`.
*/
teardown(): Promise<void>;
/**
* @inheritDoc
*/
markRequestHandled(request: Request): Promise<void>;
/** Validates that the given request is currently in progress and has not been reclaimed. */
private ensureInProgressAndNotReclaimed;
}
export {};
//# sourceMappingURL=sitemap_request_list.d.ts.map