@crawlee/http

Version:

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs with (but not limited to) headless Chrome and Puppeteer.

crawlee.dev

apify/crawlee

129 lines • 7.33 kB

TypeScript

import { Transform } from 'node:stream';
import type { Dictionary } from '@crawlee/types';
// @ts-ignore optional peer dependency or compatibility with es2022
import type { Request } from 'got-scraping';
import type { ErrorHandler, GetUserDataFromRequest, HttpCrawlerOptions, InternalHttpCrawlingContext, InternalHttpHook, RequestHandler, RouterRoutes } from '../index';
import { HttpCrawler } from '../index';

/**
 * Error handler signature for {@link FileDownload}: an `ErrorHandler` specialised
 * to the {@link FileDownloadCrawlingContext}.
 */
export type FileDownloadErrorHandler<
    UserData extends Dictionary = any, // with default to Dictionary we can't use a typed router in untyped crawler
    JSONData extends Dictionary = any
> = ErrorHandler<FileDownloadCrawlingContext<UserData, JSONData>>;

/**
 * Context object handed to a `streamHandler`.
 *
 * It is the {@link FileDownloadCrawlingContext} with the `body`, `parseWithCheerio`, `json`,
 * `addRequests` and `contentType` members removed, plus a `stream` member typed as
 * the `got-scraping` `Request`.
 */
export type StreamHandlerContext = Omit<FileDownloadCrawlingContext, 'body' | 'parseWithCheerio' | 'json' | 'addRequests' | 'contentType'> & {
    stream: Request;
};

/**
 * Callback that receives a {@link StreamHandlerContext}; it may be synchronous or return a promise.
 */
type StreamHandler = (context: StreamHandlerContext) => void | Promise<void>;

/**
 * Constructor options of {@link FileDownload}.
 *
 * These are the `HttpCrawlerOptions` (minus their own `requestHandler`) combined with
 * exactly one way of handling a download: the union forbids passing both
 * `requestHandler` and `streamHandler` at the same time.
 */
export type FileDownloadOptions<
    UserData extends Dictionary = any, // with default to Dictionary we can't use a typed router in untyped crawler
    JSONData extends Dictionary = any
> = (Omit<HttpCrawlerOptions<FileDownloadCrawlingContext<UserData, JSONData>>, 'requestHandler'> & {
    requestHandler?: never;
    streamHandler?: StreamHandler;
}) | (Omit<HttpCrawlerOptions<FileDownloadCrawlingContext<UserData, JSONData>>, 'requestHandler'> & {
    requestHandler: FileDownloadRequestHandler;
    streamHandler?: never;
});

/**
 * Hook signature for {@link FileDownload}: an `InternalHttpHook` specialised
 * to the {@link FileDownloadCrawlingContext}.
 */
export type FileDownloadHook<
    UserData extends Dictionary = any, // with default to Dictionary we can't use a typed router in untyped crawler
    JSONData extends Dictionary = any
> = InternalHttpHook<FileDownloadCrawlingContext<UserData, JSONData>>;

/**
 * Crawling context used by {@link FileDownload}. It adds no members of its own;
 * it only binds `InternalHttpCrawlingContext` to the {@link FileDownload} crawler type.
 */
export interface FileDownloadCrawlingContext<
    UserData extends Dictionary = any, // with default to Dictionary we can't use a typed router in untyped crawler
    JSONData extends Dictionary = any
> extends InternalHttpCrawlingContext<UserData, JSONData, FileDownload> {
}

/**
 * Request handler signature for {@link FileDownload}: a `RequestHandler` specialised
 * to the {@link FileDownloadCrawlingContext}.
 */
export type FileDownloadRequestHandler<
    UserData extends Dictionary = any, // with default to Dictionary we can't use a typed router in untyped crawler
    JSONData extends Dictionary = any
> = RequestHandler<FileDownloadCrawlingContext<UserData, JSONData>>;

/**
 * Creates a transform stream that throws an error if the source data speed is below the specified minimum speed.
 * This `Transform` checks the amount of data every `checkProgressInterval` milliseconds.
 * If the stream has received less than `minSpeedKbps * historyLengthMs / 1000` bytes in the last `historyLengthMs` milliseconds,
 * it will throw an error.
 *
 * Can be used e.g. to abort a download if the network speed is too slow.
 * @returns Transform stream that monitors the speed of the incoming data.
 */
export declare function MinimumSpeedStream({ minSpeedKbps, historyLengthMs, checkProgressInterval: checkProgressIntervalMs, }: {
    minSpeedKbps: number;
    historyLengthMs?: number;
    checkProgressInterval?: number;
}): Transform;

/**
 * Creates a transform stream that logs the progress of the incoming data.
 * This `Transform` calls the `logTransferredBytes` function every `loggingInterval` milliseconds with the number of bytes received so far.
 *
 * Can be used e.g. to log the progress of a download.
 * @returns Transform stream logging the progress of the incoming data.
 */
export declare function ByteCounterStream({ logTransferredBytes, loggingInterval, }: {
    logTransferredBytes: (transferredBytes: number) => void;
    loggingInterval?: number;
}): Transform;

/**
 * Provides a framework for downloading files in parallel using plain HTTP requests. The URLs to download are fed either from a static list of URLs or they can be added on the fly from another crawler.
 *
 * Since `FileDownload` uses raw HTTP requests to download the files, it is very fast and bandwidth-efficient.
 * However, it doesn't parse the content - if you need to e.g. extract data from the downloaded files,
 * you might need to use {@link CheerioCrawler}, {@link PuppeteerCrawler} or {@link PlaywrightCrawler} instead.
 *
 * `FileDownload` downloads each URL using a plain HTTP request and then invokes the user-provided {@link FileDownloadOptions.requestHandler} where the user can specify what to do with the downloaded data.
 *
 * The source URLs are represented using {@link Request} objects that are fed from {@link RequestList} or {@link RequestQueue} instances provided by the {@link FileDownloadOptions.requestList} or {@link FileDownloadOptions.requestQueue} constructor options, respectively.
 *
 * If both {@link FileDownloadOptions.requestList} and {@link FileDownloadOptions.requestQueue} are used, the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them to {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
 *
 * The crawler finishes when there are no more {@link Request} objects to crawl.
 *
 * We can use the `preNavigationHooks` to adjust `gotOptions`:
 *
 * ```
 * preNavigationHooks: [
 *     (crawlingContext, gotOptions) => {
 *         // ...
 *     },
 * ]
 * ```
 *
 * New requests are only dispatched when there is enough free CPU and memory available, using the functionality provided by the {@link AutoscaledPool} class. All {@link AutoscaledPool} configuration options can be passed to the `autoscaledPoolOptions` parameter of the `FileDownload` constructor. For user convenience, the `minConcurrency` and `maxConcurrency` {@link AutoscaledPool} options are available directly in the `FileDownload` constructor.
 *
 * ## Example usage
 *
 * ```ts
 * const crawler = new FileDownload({
 *     requestHandler({ body, request }) {
 *         writeFileSync(request.url.replace(/[^a-z0-9\.]/gi, '_'), body);
 *     },
 * });
 *
 * await crawler.run([
 *     'http://www.example.com/document.pdf',
 *     'http://www.example.com/sound.mp3',
 *     'http://www.example.com/video.mkv',
 * ]);
 * ```
 */
export declare class FileDownload extends HttpCrawler<FileDownloadCrawlingContext> {
    private streamHandler?;
    constructor(options?: FileDownloadOptions);
    protected _runRequestHandler(context: FileDownloadCrawlingContext): Promise<void>;
    private streamRequestHandler;
}

/**
 * Creates new {@link Router} instance that works based on request labels.
 * This instance can then serve as a `requestHandler` of your {@link FileDownload}.
 * Defaults to the {@link FileDownloadCrawlingContext}.
 *
 * > Serves as a shortcut for using `Router.create<FileDownloadCrawlingContext>()`.
 *
 * ```ts
 * import { FileDownload, createFileRouter } from 'crawlee';
 *
 * const router = createFileRouter();
 * router.addHandler('label-a', async (ctx) => {
 *    ctx.log.info('...');
 * });
 * router.addDefaultHandler(async (ctx) => {
 *    ctx.log.info('...');
 * });
 *
 * const crawler = new FileDownload({
 *     requestHandler: router,
 * });
 * await crawler.run();
 * ```
 */
// NOTE(review): the `import("packages/core/dist/router")` specifier below looks like a
// build-tree path rather than a published module id - confirm it resolves for consumers.
// The declaration is kept on a single line so the `@ts-ignore` directive still covers it.
// @ts-ignore optional peer dependency or compatibility with es2022
export declare function createFileRouter<Context extends FileDownloadCrawlingContext = FileDownloadCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest<Context['request']>>(routes?: RouterRoutes<Context, UserData>): import("packages/core/dist/router").RouterHandler<Context>;
export {};
//# sourceMappingURL=file-download.d.ts.map