// @crawlee/utils — sitemap.d.ts
// A set of shared utilities that can be used by crawlers.
// @ts-ignore optional peer dependency or compatibility with es2022
import type { Delays } from 'got-scraping';
/**
 * A single `<url>` entry parsed from an XML sitemap.
 * Field names mirror the sitemap protocol elements.
 */
interface SitemapUrlData {
    /** The page URL (`<loc>` element). */
    loc: string;
    /** Last modification date of the page (`<lastmod>` element), if present. */
    lastmod?: Date;
    /** How frequently the page is expected to change (`<changefreq>` element). */
    changefreq?: 'always' | 'hourly' | 'daily' | 'weekly' | 'monthly' | 'yearly' | 'never';
    /** Relative priority of this URL within the site (`<priority>` element). */
    priority?: number;
}
/**
 * A URL entry yielded by the sitemap parser, together with the URL of the
 * sitemap file it was found in.
 */
export type SitemapUrl = SitemapUrlData & {
    /** URL of the sitemap that contained this entry. */
    originSitemapUrl: string;
};
/**
 * Special object emitted for a reference to another (nested) sitemap when
 * `ParseSitemapOptions.emitNestedSitemaps` is enabled. Distinguished from a
 * regular {@link SitemapUrl} by `originSitemapUrl` being `null`.
 */
interface NestedSitemap {
    /** URL of the referenced nested sitemap. */
    loc: string;
    /** Always `null` — marks this entry as a nested-sitemap reference. */
    originSitemapUrl: null;
}
/**
 * Input for the sitemap parser — either a URL to fetch or raw sitemap content,
 * optionally tagged with the nesting depth it was discovered at.
 */
type SitemapSource = ({
    type: 'url';
    url: string;
} | {
    type: 'raw';
    content: string;
}) & {
    /** Nesting depth of this source — presumably checked against `ParseSitemapOptions.maxDepth`; TODO confirm. */
    depth?: number;
};
/**
 * Options for {@link parseSitemap}.
 */
export interface ParseSitemapOptions {
    /**
     * If set to `true`, elements referring to other sitemaps will be emitted as special objects with `originSitemapUrl` set to `null`.
     */
    // `boolean` is the idiomatic (and type-identical) spelling of `true | false`;
    // the `T['emitNestedSitemaps'] extends true` conditional in `parseSitemap` is unaffected.
    emitNestedSitemaps?: boolean;
    /**
     * Maximum depth of nested sitemaps to follow.
     */
    maxDepth?: number;
    /**
     * Number of retries for fetching sitemaps. The counter resets for each nested sitemap.
     */
    sitemapRetries?: number;
    /**
     * Network timeouts for sitemap fetching. See [Got documentation](https://github.com/sindresorhus/got/blob/main/documentation/6-timeout.md) for more details.
     */
    networkTimeouts?: Delays;
    /**
     * If true, the parser will log a warning if it fails to fetch a sitemap due to a network error
     * @default true
     */
    reportNetworkErrors?: boolean;
}
export declare function parseSitemap<T extends ParseSitemapOptions>(initialSources: SitemapSource[], proxyUrl?: string, options?: T): AsyncIterable<T['emitNestedSitemaps'] extends true ? SitemapUrl | NestedSitemap : SitemapUrl>;
/**
 * Loads one or more sitemaps from given URLs, following references in sitemap index files, and exposes the contained URLs.
 *
 * **Example usage:**
 * ```javascript
 * // Load a sitemap
 * const sitemap = await Sitemap.load(['https://example.com/sitemap.xml', 'https://example.com/sitemap_2.xml.gz']);
 *
 * // Enqueue all the contained URLs (including those from sub-sitemaps from sitemap indexes)
 * await crawler.addRequests(sitemap.urls);
 * ```
 */
export declare class Sitemap {
    /** URLs of the pages contained in the loaded sitemap(s). */
    readonly urls: string[];
    /** @param urls The page URLs this `Sitemap` instance exposes via {@link Sitemap.urls}. */
    constructor(urls: string[]);
    /**
     * Try to load sitemap from the most common locations - `/sitemap.xml` and `/sitemap.txt`.
     * For loading based on `Sitemap` entries in `robots.txt`, the {@link RobotsTxtFile} class should be used.
     * @param url The domain URL to fetch the sitemap for.
     * @param proxyUrl A proxy to be used for fetching the sitemap file.
     */
    static tryCommonNames(url: string, proxyUrl?: string): Promise<Sitemap>;
    /**
     * Fetch sitemap content from given URL or URLs and return URLs of referenced pages.
     * @param urls sitemap URL(s)
     * @param proxyUrl URL of a proxy to be used for fetching sitemap contents
     */
    static load(urls: string | string[], proxyUrl?: string, parseSitemapOptions?: ParseSitemapOptions): Promise<Sitemap>;
    /**
     * Parse XML sitemap content from a string and return URLs of referenced pages. If the sitemap references other sitemaps, they will be loaded via HTTP.
     * @param content XML sitemap content
     * @param proxyUrl URL of a proxy to be used for fetching sitemap contents
     */
    static fromXmlString(content: string, proxyUrl?: string): Promise<Sitemap>;
    /** Shared implementation behind {@link Sitemap.load} and {@link Sitemap.fromXmlString}. */
    protected static parse(sources: SitemapSource[], proxyUrl?: string, parseSitemapOptions?: ParseSitemapOptions): Promise<Sitemap>;
}
/**
 * Given a list of URLs, discover related sitemap files for these domains by checking the `robots.txt` file,
 * the default `sitemap.xml` & `sitemap.txt` files and the URLs themselves.
 * @param urls The list of URLs to discover sitemaps for.
 * @param options Options for sitemap discovery
 * @returns An async iterable with the discovered sitemap URLs.
 */
export declare function discoverValidSitemaps(urls: string[], options?: {
    /**
     * Proxy URL to be used for network requests.
     */
    proxyUrl?: string;
}): AsyncIterable<string>;
export {};
//# sourceMappingURL=sitemap.d.ts.map