// @crawlee/utils — sitemap.d.ts
// A set of shared utilities that can be used by crawlers.
// @ts-ignore optional peer dependency or compatibility with es2022
import type { Delays } from 'got-scraping';
/**
 * A single `<url>` entry parsed from an XML sitemap.
 * Field names mirror the sitemap protocol elements.
 */
interface SitemapUrlData {
    /** The page URL (`<loc>` element). */
    loc: string;
    /** Last modification date of the page (`<lastmod>` element), if present. */
    lastmod?: Date;
    /** How frequently the page is expected to change (`<changefreq>` element). */
    changefreq?: 'always' | 'hourly' | 'daily' | 'weekly' | 'monthly' | 'yearly' | 'never';
    /** Relative priority of this URL within the site (`<priority>` element). */
    priority?: number;
}
/**
 * A URL entry yielded by the sitemap parser, together with the URL of the
 * sitemap file it was found in.
 */
export type SitemapUrl = SitemapUrlData & {
    /** URL of the sitemap that contained this entry. */
    originSitemapUrl: string;
};
/**
 * Special object emitted for a reference to another (nested) sitemap when
 * `ParseSitemapOptions.emitNestedSitemaps` is enabled. Distinguished from a
 * regular {@link SitemapUrl} by `originSitemapUrl` being `null`.
 */
interface NestedSitemap {
    /** URL of the referenced nested sitemap. */
    loc: string;
    /** Always `null` — marks this entry as a nested-sitemap reference. */
    originSitemapUrl: null;
}
/**
 * Input for the sitemap parser — either a URL to fetch or raw sitemap content,
 * optionally tagged with the nesting depth it was discovered at.
 */
type SitemapSource = ({
    type: 'url';
    url: string;
} | {
    type: 'raw';
    content: string;
}) & {
    /** Nesting depth of this source — presumably checked against `ParseSitemapOptions.maxDepth`; TODO confirm. */
    depth?: number;
};
/**
 * Options for {@link parseSitemap}.
 */
export interface ParseSitemapOptions {
    /**
     * If set to `true`, elements referring to other sitemaps will be emitted as special objects with `originSitemapUrl` set to `null`.
     */
    // `boolean` is the idiomatic (and type-identical) spelling of `true | false`;
    // the `T['emitNestedSitemaps'] extends true` conditional in `parseSitemap` is unaffected.
    emitNestedSitemaps?: boolean;
    /**
     * Maximum depth of nested sitemaps to follow.
     */
    maxDepth?: number;
    /**
     * Number of retries for fetching sitemaps. The counter resets for each nested sitemap.
     */
    sitemapRetries?: number;
    /**
     * Network timeouts for sitemap fetching. See [Got documentation](https://github.com/sindresorhus/got/blob/main/documentation/6-timeout.md) for more details.
     */
    networkTimeouts?: Delays;
    /**
     * If true, the parser will log a warning if it fails to fetch a sitemap due to a network error
     * @default true
     */
    reportNetworkErrors?: boolean;
}
export declare function parseSitemap<T extends ParseSitemapOptions>(initialSources: SitemapSource[], proxyUrl?: string, options?: T): AsyncIterable<T['emitNestedSitemaps'] extends true ? SitemapUrl | NestedSitemap : SitemapUrl>;
/**
 * Loads one or more sitemaps from given URLs, following references in sitemap index files, and exposes the contained URLs.
 *
 * **Example usage:**
 * ```javascript
 * // Load a sitemap
 * const sitemap = await Sitemap.load(['https://example.com/sitemap.xml', 'https://example.com/sitemap_2.xml.gz']);
 *
 * // Enqueue all the contained URLs (including those from sub-sitemaps from sitemap indexes)
 * await crawler.addRequests(sitemap.urls);
 * ```
 */
export declare class Sitemap {
    /** URLs of the pages contained in the loaded sitemap(s). */
    readonly urls: string[];
    /** @param urls The page URLs this `Sitemap` instance exposes via {@link Sitemap.urls}. */
    constructor(urls: string[]);
    /**
     * Try to load sitemap from the most common locations - `/sitemap.xml` and `/sitemap.txt`.
     * For loading based on `Sitemap` entries in `robots.txt`, the {@link RobotsTxtFile} class should be used.
     * @param url The domain URL to fetch the sitemap for.
     * @param proxyUrl A proxy to be used for fetching the sitemap file.
     */
    static tryCommonNames(url: string, proxyUrl?: string): Promise<Sitemap>;
    /**
     * Fetch sitemap content from given URL or URLs and return URLs of referenced pages.
     * @param urls sitemap URL(s)
     * @param proxyUrl URL of a proxy to be used for fetching sitemap contents
     */
    static load(urls: string | string[], proxyUrl?: string, parseSitemapOptions?: ParseSitemapOptions): Promise<Sitemap>;
    /**
     * Parse XML sitemap content from a string and return URLs of referenced pages. If the sitemap references other sitemaps, they will be loaded via HTTP.
     * @param content XML sitemap content
     * @param proxyUrl URL of a proxy to be used for fetching sitemap contents
     */
    static fromXmlString(content: string, proxyUrl?: string): Promise<Sitemap>;
    /** Shared implementation behind {@link Sitemap.load} and {@link Sitemap.fromXmlString}. */
    protected static parse(sources: SitemapSource[], proxyUrl?: string, parseSitemapOptions?: ParseSitemapOptions): Promise<Sitemap>;
}
/**
 * Given a list of URLs, discover related sitemap files for these domains by checking the `robots.txt` file,
 * the default `sitemap.xml` & `sitemap.txt` files and the URLs themselves.
 * @param urls The list of URLs to discover sitemaps for.
 * @param options Options for sitemap discovery
 * @returns An async iterable with the discovered sitemap URLs.
 */
export declare function discoverValidSitemaps(urls: string[], options?: {
    /**
     * Proxy URL to be used for network requests.
     */
    proxyUrl?: string;
}): AsyncIterable<string>;
export {};
//# sourceMappingURL=sitemap.d.ts.map