@crawlee/linkedom
The scalable web crawling and scraping library for JavaScript/Node.js. It enables the development of data extraction and web automation jobs, not only with headless Chrome and Puppeteer.
TypeScript
import type { IncomingMessage } from 'node:http';
import type { EnqueueLinksOptions, ErrorHandler, GetUserDataFromRequest, HttpCrawlerOptions, InternalHttpCrawlingContext, InternalHttpHook, RequestHandler, RequestProvider, RouterRoutes, SkippedRequestCallback } from '@crawlee/http';
import { HttpCrawler } from '@crawlee/http';
import type { Dictionary } from '@crawlee/types';
import { type CheerioRoot, type RobotsTxtFile } from '@crawlee/utils';
export type LinkeDOMErrorHandler<UserData extends Dictionary = any, // with a default of Dictionary we can't use a typed router in an untyped crawler
JSONData extends Dictionary = any> = ErrorHandler<LinkeDOMCrawlingContext<UserData, JSONData>>;
export interface LinkeDOMCrawlerOptions<UserData extends Dictionary = any, // with a default of Dictionary we can't use a typed router in an untyped crawler
JSONData extends Dictionary = any> extends HttpCrawlerOptions<LinkeDOMCrawlingContext<UserData, JSONData>> {
}
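/**
* Options for `enqueueLinks()` as exposed on the {@link LinkeDOMCrawlingContext}.
* The `urls` and `requestQueue` options are omitted because the crawler supplies them itself.
*
* A minimal usage sketch inside a request handler (the selector and label values are illustrative,
* not part of this API):
* ```ts
* async requestHandler({ enqueueLinks }) {
*     await enqueueLinks({
*         selector: 'a.product-link', // hypothetical link selector
*         label: 'DETAIL',
*     });
* },
* ```
*/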
export interface LinkeDOMCrawlerEnqueueLinksOptions extends Omit<EnqueueLinksOptions, 'urls' | 'requestQueue'> {
}
export type LinkeDOMHook<UserData extends Dictionary = any, // with a default of Dictionary we can't use a typed router in an untyped crawler
JSONData extends Dictionary = any> = InternalHttpHook<LinkeDOMCrawlingContext<UserData, JSONData>>;
export interface LinkeDOMCrawlingContext<UserData extends Dictionary = any, // with a default of Dictionary we can't use a typed router in an untyped crawler
JSONData extends Dictionary = any> extends InternalHttpCrawlingContext<UserData, JSONData, LinkeDOMCrawler> {
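/** The `window` object of the page, as parsed from the downloaded HTML by LinkeDOM. */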
window: Window;
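/** The parsed `Document`, equivalent to `window.document`. */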
document: Document;
/**
* Wait for an element matching the selector to appear.
* Timeout defaults to 5s.
*
* **Example usage:**
* ```ts
* async requestHandler({ waitForSelector, parseWithCheerio }) {
*     await waitForSelector('article h1');
*     const $ = await parseWithCheerio();
*     const title = $('title').text();
* },
* ```
*/
waitForSelector(selector: string, timeoutMs?: number): Promise<void>;
/**
* Returns a Cheerio handle, allowing you to work with the data the same way as with {@link CheerioCrawler}.
* When the `selector` argument is provided, it first waits for a matching element to appear, with a 5s timeout.
*
* **Example usage:**
* ```javascript
* async requestHandler({ parseWithCheerio }) {
*     const $ = await parseWithCheerio();
*     const title = $('title').text();
* },
* ```
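*
* A sketch of the `selector` variant, which waits for the element before parsing (the selector and
* timeout values below are illustrative):
* ```javascript
* async requestHandler({ parseWithCheerio }) {
*     const $ = await parseWithCheerio('article h1', 10_000);
*     const heading = $('article h1').text();
* },
* ```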
*/
parseWithCheerio(selector?: string, timeoutMs?: number): Promise<CheerioRoot>;
}
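/**
* Type of the user-provided `requestHandler` for {@link LinkeDOMCrawler}.
*
* A minimal sketch showing how the `UserData` type parameter flows into the handler context
* (`ProductUserData` is a hypothetical shape, not part of this API):
* ```ts
* interface ProductUserData {
*     category: string;
* }
*
* const requestHandler: LinkeDOMRequestHandler<ProductUserData> = async ({ request, document, log }) => {
*     // `request.userData` is typed as ProductUserData here
*     log.info(`${request.userData.category}: ${document.title}`);
* };
* ```
*/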
export type LinkeDOMRequestHandler<UserData extends Dictionary = any, // with a default of Dictionary we can't use a typed router in an untyped crawler
JSONData extends Dictionary = any> = RequestHandler<LinkeDOMCrawlingContext<UserData, JSONData>>;
/**
* Provides a framework for the parallel crawling of web pages using plain HTTP requests and
* the [LinkeDOM](https://www.npmjs.com/package/linkedom) DOM implementation.
* The URLs to crawl are fed either from a static list of URLs
* or from a dynamic queue of URLs enabling recursive crawling of websites.
*
* Since `LinkeDOMCrawler` uses raw HTTP requests to download web pages,
* it is very fast and efficient on data bandwidth. However, if the target website requires JavaScript
* to display the content, you might need to use {@link PuppeteerCrawler} or {@link PlaywrightCrawler} instead,
* because they load the pages using a full-featured headless browser.
*
* **Limitation**:
* This crawler does not support proxies and cookies yet (each request starts with an empty cookie store), and the user agent is always set to `Chrome`.
*
* `LinkeDOMCrawler` downloads each URL using a plain HTTP request,
* parses the HTML content using [LinkeDOM](https://www.npmjs.com/package/linkedom)
* and then invokes the user-provided {@link LinkeDOMCrawlerOptions.requestHandler} to extract page data
* using the `window` object.
*
* The source URLs are represented using {@link Request} objects that are fed from
* {@link RequestList} or {@link RequestQueue} instances provided by the {@link LinkeDOMCrawlerOptions.requestList}
* or {@link LinkeDOMCrawlerOptions.requestQueue} constructor options, respectively.
*
* If both {@link LinkeDOMCrawlerOptions.requestList} and {@link LinkeDOMCrawlerOptions.requestQueue} are used,
* the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them
* to {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
*
* The crawler finishes when there are no more {@link Request} objects to crawl.
*
* We can use the `preNavigationHooks` to adjust `gotOptions`:
*
* ```
* preNavigationHooks: [
*     (crawlingContext, gotOptions) => {
*         // ...
*     },
* ]
* ```
*
* By default, `LinkeDOMCrawler` only processes web pages with the `text/html`
* and `application/xhtml+xml` MIME content types (as reported by the `Content-Type` HTTP header),
* and skips pages with other content types. If you want the crawler to process other content types,
* use the {@link LinkeDOMCrawlerOptions.additionalMimeTypes} constructor option.
* Beware that the parsing behavior differs for HTML, XML, JSON and other types of content.
* For more details, see {@link LinkeDOMCrawlerOptions.requestHandler}.
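*
* For example, a sketch of allowing XML responses as well (the MIME type shown is illustrative):
* ```javascript
* const crawler = new LinkeDOMCrawler({
*     additionalMimeTypes: ['application/xml'],
*     // ...
* });
* ```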
*
* New requests are only dispatched when there is enough free CPU and memory available,
* using the functionality provided by the {@link AutoscaledPool} class.
* All {@link AutoscaledPool} configuration options can be passed to the `autoscaledPoolOptions`
* parameter of the `LinkeDOMCrawler` constructor. For user convenience, the `minConcurrency` and `maxConcurrency`
* {@link AutoscaledPool} options are available directly in the `LinkeDOMCrawler` constructor.
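*
* For example, a sketch of setting the concurrency limits directly on the crawler (the values are illustrative):
* ```javascript
* const crawler = new LinkeDOMCrawler({
*     minConcurrency: 5,
*     maxConcurrency: 50,
*     // other options, e.g. requestHandler
* });
* ```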
*
* **Example usage:**
*
* ```javascript
* const crawler = new LinkeDOMCrawler({
*     async requestHandler({ request, window }) {
*         await Dataset.pushData({
*             url: request.url,
*             title: window.document.title,
*         });
*     },
* });
*
* await crawler.run([
*     'http://crawlee.dev',
* ]);
* ```
* @category Crawlers
*/
export declare class LinkeDOMCrawler extends HttpCrawler<LinkeDOMCrawlingContext> {
private static parser;
protected _parseHTML(response: IncomingMessage, isXml: boolean, crawlingContext: LinkeDOMCrawlingContext): Promise<{
window: Window & typeof globalThis;
readonly body: string;
readonly document: Document;
// @ts-ignore optional peer dependency or compatibility with es2022
enqueueLinks: (enqueueOptions?: LinkeDOMCrawlerEnqueueLinksOptions) => Promise<import("@crawlee/types").BatchAddRequestsResult>;
}>;
_runRequestHandler(context: LinkeDOMCrawlingContext): Promise<void>;
}
interface EnqueueLinksInternalOptions {
options?: LinkeDOMCrawlerEnqueueLinksOptions;
window: Window | null;
requestQueue: RequestProvider;
robotsTxtFile?: RobotsTxtFile;
onSkippedRequest?: SkippedRequestCallback;
originalRequestUrl: string;
finalRequestUrl?: string;
}
/** @internal */
// @ts-ignore optional peer dependency or compatibility with es2022
export declare function linkedomCrawlerEnqueueLinks({ options, window, requestQueue, robotsTxtFile, onSkippedRequest, originalRequestUrl, finalRequestUrl, }: EnqueueLinksInternalOptions): Promise<import("@crawlee/types").BatchAddRequestsResult>;
/**
* Creates a new {@link Router} instance that works based on request labels.
* This instance can then serve as the `requestHandler` of your {@link LinkeDOMCrawler}.
* The context type defaults to {@link LinkeDOMCrawlingContext}.
*
* > Serves as a shortcut for using `Router.create<LinkeDOMCrawlingContext>()`.
*
* ```ts
* import { LinkeDOMCrawler, createLinkeDOMRouter } from 'crawlee';
*
* const router = createLinkeDOMRouter();
* router.addHandler('label-a', async (ctx) => {
*     ctx.log.info('...');
* });
* router.addDefaultHandler(async (ctx) => {
*     ctx.log.info('...');
* });
*
* const crawler = new LinkeDOMCrawler({
*     requestHandler: router,
* });
* await crawler.run();
* ```
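*
* A typed variant can narrow the context (a sketch; `UserData` here is a hypothetical shape, not part
* of this API):
* ```ts
* interface UserData {
*     label: string;
* }
*
* const typedRouter = createLinkeDOMRouter<LinkeDOMCrawlingContext<UserData>>();
* ```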
*/
// @ts-ignore optional peer dependency or compatibility with es2022
export declare function createLinkeDOMRouter<Context extends LinkeDOMCrawlingContext = LinkeDOMCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest<Context['request']>>(routes?: RouterRoutes<Context, UserData>): import("@crawlee/http").RouterHandler<Context>;
export {};
//# sourceMappingURL=linkedom-crawler.d.ts.map