@crawlee/utils
A set of shared utilities that can be used by crawlers
TypeScript
// @ts-ignore optional peer dependency or compatibility with es2022
import type { Delays } from 'got-scraping';
interface SitemapUrlData {
loc: string;
lastmod?: Date;
changefreq?: 'always' | 'hourly' | 'daily' | 'weekly' | 'monthly' | 'yearly' | 'never';
priority?: number;
}
export type SitemapUrl = SitemapUrlData & {
originSitemapUrl: string;
};
interface NestedSitemap {
loc: string;
originSitemapUrl: null;
}
type SitemapSource = ({
type: 'url';
url: string;
} | {
type: 'raw';
content: string;
}) & {
depth?: number;
};
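/**
 * A minimal sketch (illustrative only, not part of the generated declaration) of the two
 * `SitemapSource` variants accepted by `parseSitemap`; the URL and XML content are placeholders:
 * ```javascript
 * const sources = [
 *     { type: 'url', url: 'https://example.com/sitemap.xml' },
 *     { type: 'raw', content: '<urlset><url><loc>https://example.com/page</loc></url></urlset>' },
 * ];
 * ```
 */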
export interface ParseSitemapOptions {
/**
* If set to `true`, elements referring to other sitemaps will be emitted as special objects with `originSitemapUrl` set to `null`.
*/
emitNestedSitemaps?: boolean;
/**
* Maximum depth of nested sitemaps to follow.
*/
maxDepth?: number;
/**
* Number of retries for fetching sitemaps. The counter resets for each nested sitemap.
*/
sitemapRetries?: number;
/**
* Network timeouts for sitemap fetching. See [Got documentation](https://github.com/sindresorhus/got/blob/main/documentation/6-timeout.md) for more details.
*/
networkTimeouts?: Delays;
/**
* If `true`, the parser will log a warning if it fails to fetch a sitemap due to a network error.
* @default true
*/
reportNetworkErrors?: boolean;
}
export declare function parseSitemap<T extends ParseSitemapOptions>(initialSources: SitemapSource[], proxyUrl?: string, options?: T): AsyncIterable<T['emitNestedSitemaps'] extends true ? SitemapUrl | NestedSitemap : SitemapUrl>;
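/**
 * A minimal usage sketch for `parseSitemap` (illustrative only; the import assumes the function
 * is re-exported from the `@crawlee/utils` package root, and the URL and option values are placeholders):
 * ```javascript
 * import { parseSitemap } from '@crawlee/utils';
 *
 * const sources = [{ type: 'url', url: 'https://example.com/sitemap.xml' }];
 * const options = { emitNestedSitemaps: true, maxDepth: 2, sitemapRetries: 3 };
 *
 * for await (const item of parseSitemap(sources, undefined, options)) {
 *     if (item.originSitemapUrl === null) {
 *         // With `emitNestedSitemaps: true`, references to nested sitemaps are emitted
 *         // as special objects with `originSitemapUrl` set to `null` - skip them here.
 *         continue;
 *     }
 *     console.log(item.loc, item.lastmod, item.priority);
 * }
 * ```
 */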
/**
* Loads one or more sitemaps from the given URLs, following references in sitemap index files, and exposes the contained URLs.
*
* **Example usage:**
* ```javascript
* // Load a sitemap
* const sitemap = await Sitemap.load(['https://example.com/sitemap.xml', 'https://example.com/sitemap_2.xml.gz']);
*
* // Enqueue all the contained URLs (including those from sub-sitemaps referenced in sitemap indexes)
* await crawler.addRequests(sitemap.urls);
* ```
*/
export declare class Sitemap {
readonly urls: string[];
constructor(urls: string[]);
/**
* Try to load the sitemap from the most common locations - `/sitemap.xml` and `/sitemap.txt`.
* For loading based on `Sitemap` entries in `robots.txt`, the {@link RobotsTxtFile} class should be used.
* @param url The domain URL to fetch the sitemap for.
* @param proxyUrl A proxy to be used for fetching the sitemap file.
*/
static tryCommonNames(url: string, proxyUrl?: string): Promise<Sitemap>;
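/**
 * A minimal sketch of `tryCommonNames` (illustrative only; the domain is a placeholder):
 * ```javascript
 * const sitemap = await Sitemap.tryCommonNames('https://example.com');
 * console.log(sitemap.urls.length);
 * ```
 */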
/**
* Fetch sitemap content from the given URL or URLs and return the URLs of referenced pages.
* @param urls sitemap URL(s)
* @param proxyUrl URL of a proxy to be used for fetching sitemap contents
*/
static load(urls: string | string[], proxyUrl?: string, parseSitemapOptions?: ParseSitemapOptions): Promise<Sitemap>;
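/**
 * A minimal sketch of `load` combined with `ParseSitemapOptions` (illustrative only; the URL and
 * option values are placeholders, and the `request` key follows Got's timeout options):
 * ```javascript
 * const sitemap = await Sitemap.load(
 *     'https://example.com/sitemap.xml',
 *     undefined, // no proxy
 *     { maxDepth: 1, sitemapRetries: 2, networkTimeouts: { request: 30000 } },
 * );
 * console.log(sitemap.urls);
 * ```
 */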
/**
* Parse XML sitemap content from a string and return URLs of referenced pages. If the sitemap references other sitemaps, they will be loaded via HTTP.
* @param content XML sitemap content
* @param proxyUrl URL of a proxy to be used for fetching sitemap contents
*/
static fromXmlString(content: string, proxyUrl?: string): Promise<Sitemap>;
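/**
 * A minimal sketch of `fromXmlString` (illustrative only; the XML content is a placeholder):
 * ```javascript
 * const xml = `<?xml version="1.0" encoding="UTF-8"?>
 * <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
 *     <url><loc>https://example.com/page</loc></url>
 * </urlset>`;
 * const sitemap = await Sitemap.fromXmlString(xml);
 * console.log(sitemap.urls);
 * ```
 */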
protected static parse(sources: SitemapSource[], proxyUrl?: string, parseSitemapOptions?: ParseSitemapOptions): Promise<Sitemap>;
}
export {};
//# sourceMappingURL=sitemap.d.ts.map