UNPKG

@crawlee/linkedom

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs, not only with headless Chrome and Puppeteer.

import type { IncomingMessage } from 'node:http';
import type { EnqueueLinksOptions, ErrorHandler, GetUserDataFromRequest, HttpCrawlerOptions, InternalHttpCrawlingContext, InternalHttpHook, RequestHandler, RequestProvider, RouterRoutes, SkippedRequestCallback } from '@crawlee/http';
import { HttpCrawler } from '@crawlee/http';
import type { Dictionary } from '@crawlee/types';
import { type CheerioRoot, type RobotsTxtFile } from '@crawlee/utils';
export type LinkeDOMErrorHandler<UserData extends Dictionary = any, // with default to Dictionary we can't use a typed router in untyped crawler
JSONData extends Dictionary = any> = ErrorHandler<LinkeDOMCrawlingContext<UserData, JSONData>>;
export interface LinkeDOMCrawlerOptions<UserData extends Dictionary = any, // with default to Dictionary we can't use a typed router in untyped crawler
JSONData extends Dictionary = any> extends HttpCrawlerOptions<LinkeDOMCrawlingContext<UserData, JSONData>> {
}
export interface LinkeDOMCrawlerEnqueueLinksOptions extends Omit<EnqueueLinksOptions, 'urls' | 'requestQueue'> {
}
export type LinkeDOMHook<UserData extends Dictionary = any, // with default to Dictionary we can't use a typed router in untyped crawler
JSONData extends Dictionary = any> = InternalHttpHook<LinkeDOMCrawlingContext<UserData, JSONData>>;
export interface LinkeDOMCrawlingContext<UserData extends Dictionary = any, // with default to Dictionary we can't use a typed router in untyped crawler
JSONData extends Dictionary = any> extends InternalHttpCrawlingContext<UserData, JSONData, LinkeDOMCrawler> {
    window: Window;
    document: Document;
    /**
     * Waits for an element matching the selector to appear.
     * The timeout defaults to 5s.
     *
     * **Example usage:**
     * ```ts
     * async requestHandler({ waitForSelector, parseWithCheerio }) {
     *     await waitForSelector('article h1');
     *     const $ = await parseWithCheerio();
     *     const title = $('title').text();
     * }
     * ```
     */
    waitForSelector(selector: string, timeoutMs?: number): Promise<void>;
    /**
     * Returns a Cheerio handle, allowing you to work with the data the same way as with {@link CheerioCrawler}.
     * When provided with the `selector` argument, it will first look for the selector with a 5s timeout.
     *
     * **Example usage:**
     * ```javascript
     * async requestHandler({ parseWithCheerio }) {
     *     const $ = await parseWithCheerio();
     *     const title = $('title').text();
     * }
     * ```
     */
    parseWithCheerio(selector?: string, timeoutMs?: number): Promise<CheerioRoot>;
}
export type LinkeDOMRequestHandler<UserData extends Dictionary = any, // with default to Dictionary we can't use a typed router in untyped crawler
JSONData extends Dictionary = any> = RequestHandler<LinkeDOMCrawlingContext<UserData, JSONData>>;
/**
 * Provides a framework for the parallel crawling of web pages using plain HTTP requests and
 * the [LinkeDOM](https://www.npmjs.com/package/linkedom) DOM implementation.
 * The URLs to crawl are fed either from a static list of URLs
 * or from a dynamic queue of URLs, enabling recursive crawling of websites.
 *
 * Since `LinkeDOMCrawler` uses raw HTTP requests to download web pages,
 * it is very fast and efficient on data bandwidth. However, if the target website requires JavaScript
 * to display its content, you might need to use {@link PuppeteerCrawler} or {@link PlaywrightCrawler} instead,
 * because those load the pages using a full-featured headless Chrome browser.
 *
 * **Limitation**:
 * This crawler does not support proxies and cookies yet (each request starts with an empty cookie store), and the user agent is always set to `Chrome`.
 *
 * `LinkeDOMCrawler` downloads each URL using a plain HTTP request,
 * parses the HTML content using [LinkeDOM](https://www.npmjs.com/package/linkedom)
 * and then invokes the user-provided {@link LinkeDOMCrawlerOptions.requestHandler} to extract page data
 * using the `window` object.
 *
 * The source URLs are represented using {@link Request} objects that are fed from
 * {@link RequestList} or {@link RequestQueue} instances provided by the {@link LinkeDOMCrawlerOptions.requestList}
 * or {@link LinkeDOMCrawlerOptions.requestQueue} constructor options, respectively.
 *
 * If both {@link LinkeDOMCrawlerOptions.requestList} and {@link LinkeDOMCrawlerOptions.requestQueue} are used,
 * the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them
 * to the {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
 *
 * The crawler finishes when there are no more {@link Request} objects to crawl.
 *
 * We can use the `preNavigationHooks` to adjust `gotOptions`:
 *
 * ```
 * preNavigationHooks: [
 *     (crawlingContext, gotOptions) => {
 *         // ...
 *     },
 * ]
 * ```
 *
 * By default, `LinkeDOMCrawler` only processes web pages with the `text/html`
 * and `application/xhtml+xml` MIME content types (as reported by the `Content-Type` HTTP header),
 * and skips pages with other content types. If you want the crawler to process other content types,
 * use the {@link LinkeDOMCrawlerOptions.additionalMimeTypes} constructor option.
 * Beware that the parsing behavior differs for HTML, XML, JSON and other types of content.
 * For more details, see {@link LinkeDOMCrawlerOptions.requestHandler}.
 *
 * New requests are only dispatched when there is enough free CPU and memory available,
 * using the functionality provided by the {@link AutoscaledPool} class.
 * All {@link AutoscaledPool} configuration options can be passed to the `autoscaledPoolOptions`
 * parameter of the `LinkeDOMCrawler` constructor. For user convenience, the `minConcurrency` and `maxConcurrency`
 * {@link AutoscaledPool} options are available directly in the `LinkeDOMCrawler` constructor.
 *
 * **Example usage:**
 *
 * ```javascript
 * const crawler = new LinkeDOMCrawler({
 *     async requestHandler({ request, window }) {
 *         await Dataset.pushData({
 *             url: request.url,
 *             title: window.document.title,
 *         });
 *     },
 * });
 *
 * await crawler.run([
 *     'http://crawlee.dev',
 * ]);
 * ```
 * @category Crawlers
 */
export declare class LinkeDOMCrawler extends HttpCrawler<LinkeDOMCrawlingContext> {
    private static parser;
    protected _parseHTML(response: IncomingMessage, isXml: boolean, crawlingContext: LinkeDOMCrawlingContext): Promise<{
        window: Window & typeof globalThis;
        readonly body: string;
        readonly document: Document;
        // @ts-ignore optional peer dependency or compatibility with es2022
        enqueueLinks: (enqueueOptions?: LinkeDOMCrawlerEnqueueLinksOptions) => Promise<import("@crawlee/types").BatchAddRequestsResult>;
    }>;
    _runRequestHandler(context: LinkeDOMCrawlingContext): Promise<void>;
}
interface EnqueueLinksInternalOptions {
    options?: LinkeDOMCrawlerEnqueueLinksOptions;
    window: Window | null;
    requestQueue: RequestProvider;
    robotsTxtFile?: RobotsTxtFile;
    onSkippedRequest?: SkippedRequestCallback;
    originalRequestUrl: string;
    finalRequestUrl?: string;
}
/** @internal */
// @ts-ignore optional peer dependency or compatibility with es2022
export declare function linkedomCrawlerEnqueueLinks({ options, window, requestQueue, robotsTxtFile, onSkippedRequest, originalRequestUrl, finalRequestUrl, }: EnqueueLinksInternalOptions): Promise<import("@crawlee/types").BatchAddRequestsResult>;
/**
 * Creates a new {@link Router} instance that works based on request labels.
 * This instance can then serve as the `requestHandler` of your {@link LinkeDOMCrawler}.
 * Defaults to the {@link LinkeDOMCrawlingContext}.
 *
 * > Serves as a shortcut for using `Router.create<LinkeDOMCrawlingContext>()`.
 *
 * ```ts
 * import { LinkeDOMCrawler, createLinkeDOMRouter } from 'crawlee';
 *
 * const router = createLinkeDOMRouter();
 * router.addHandler('label-a', async (ctx) => {
 *     ctx.log.info('...');
 * });
 * router.addDefaultHandler(async (ctx) => {
 *     ctx.log.info('...');
 * });
 *
 * const crawler = new LinkeDOMCrawler({
 *     requestHandler: router,
 * });
 * await crawler.run();
 * ```
 */
// @ts-ignore optional peer dependency or compatibility with es2022
export declare function createLinkeDOMRouter<Context extends LinkeDOMCrawlingContext = LinkeDOMCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest<Context['request']>>(routes?: RouterRoutes<Context, UserData>): import("@crawlee/http").RouterHandler<Context>;
export {};
//# sourceMappingURL=linkedom-crawler.d.ts.map
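
For orientation, here is a minimal sketch of how the context members declared above (waitForSelector, parseWithCheerio, enqueueLinks) and the minConcurrency/maxConcurrency shortcuts could fit together in one crawler. It is not part of the type declarations: the selectors, the start URL, and the 'same-domain' strategy are illustrative choices, and it assumes the crawlee meta-package is installed (which re-exports LinkeDOMCrawler and Dataset, as in the router example above).

// A minimal usage sketch; names below that do not appear in the declarations are illustrative.
import { LinkeDOMCrawler, Dataset } from 'crawlee';

const crawler = new LinkeDOMCrawler({
    // minConcurrency/maxConcurrency are the AutoscaledPool shortcuts mentioned in the class docs.
    minConcurrency: 5,
    maxConcurrency: 50,
    async requestHandler({ request, window, waitForSelector, parseWithCheerio, enqueueLinks, log }) {
        // Wait (up to the default 5s timeout) for the element to appear in the parsed DOM.
        await waitForSelector('article h1');

        // Work with the parsed document through a Cheerio handle, CheerioCrawler-style.
        const $ = await parseWithCheerio();
        await Dataset.pushData({
            url: request.url,
            title: window.document.title,
            heading: $('article h1').first().text().trim(),
        });

        // Follow links discovered in the document; 'same-domain' is one possible strategy.
        await enqueueLinks({ strategy: 'same-domain' });
        log.info(`Processed ${request.url}`);
    },
});

await crawler.run(['https://crawlee.dev']);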