@crawlee/http
The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.
TypeScript
import { Transform } from 'node:stream';
import type { Dictionary } from '@crawlee/types';
// @ts-ignore optional peer dependency or compatibility with es2022
import type { Request } from 'got-scraping';
import type { ErrorHandler, GetUserDataFromRequest, HttpCrawlerOptions, InternalHttpCrawlingContext, InternalHttpHook, RequestHandler, RouterRoutes } from '../index';
import { HttpCrawler } from '../index';
export type FileDownloadErrorHandler<UserData extends Dictionary = any, // with a default of Dictionary we can't use a typed router in an untyped crawler
JSONData extends Dictionary = any> = ErrorHandler<FileDownloadCrawlingContext<UserData, JSONData>>;
export type StreamHandlerContext = Omit<FileDownloadCrawlingContext, 'body' | 'parseWithCheerio' | 'json' | 'addRequests' | 'contentType'> & {
    stream: Request;
};
type StreamHandler = (context: StreamHandlerContext) => void | Promise<void>;
export type FileDownloadOptions<UserData extends Dictionary = any, // with a default of Dictionary we can't use a typed router in an untyped crawler
JSONData extends Dictionary = any> = (Omit<HttpCrawlerOptions<FileDownloadCrawlingContext<UserData, JSONData>>, 'requestHandler'> & {
    requestHandler?: never;
    streamHandler?: StreamHandler;
}) | (Omit<HttpCrawlerOptions<FileDownloadCrawlingContext<UserData, JSONData>>, 'requestHandler'> & {
    requestHandler: FileDownloadRequestHandler;
    streamHandler?: never;
});
export type FileDownloadHook<UserData extends Dictionary = any, // with a default of Dictionary we can't use a typed router in an untyped crawler
JSONData extends Dictionary = any> = InternalHttpHook<FileDownloadCrawlingContext<UserData, JSONData>>;
export interface FileDownloadCrawlingContext<UserData extends Dictionary = any, // with a default of Dictionary we can't use a typed router in an untyped crawler
JSONData extends Dictionary = any> extends InternalHttpCrawlingContext<UserData, JSONData, FileDownload> {
}
export type FileDownloadRequestHandler<UserData extends Dictionary = any, // with a default of Dictionary we can't use a typed router in an untyped crawler
JSONData extends Dictionary = any> = RequestHandler<FileDownloadCrawlingContext<UserData, JSONData>>;
/**
* Creates a transform stream that throws an error if the source data speed is below the specified minimum speed.
* This `Transform` checks the amount of data every `checkProgressInterval` milliseconds.
* If the stream has received less than `minSpeedKbps * historyLengthMs / 1000` bytes in the last `historyLengthMs` milliseconds,
* it will throw an error.
*
* Can be used e.g. to abort a download if the network speed is too slow.
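*
* For example, a minimal sketch of aborting slow downloads from a `streamHandler`
* (this assumes `FileDownload` and `MinimumSpeedStream` are both importable from the
* package index):
*
* ```ts
* import { createWriteStream } from 'node:fs';
* import { pipeline } from 'node:stream/promises';
* import { FileDownload, MinimumSpeedStream } from '@crawlee/http';
*
* const crawler = new FileDownload({
*     async streamHandler({ stream }) {
*         await pipeline(
*             stream,
*             // throws if the average speed over the last 10 s drops below ~100 kbps
*             // (see the formula above)
*             MinimumSpeedStream({ minSpeedKbps: 100, historyLengthMs: 10_000 }),
*             createWriteStream('./download.bin'),
*         );
*     },
* });
* ```
*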
* @returns Transform stream that monitors the speed of the incoming data.
*/
export declare function MinimumSpeedStream({ minSpeedKbps, historyLengthMs, checkProgressInterval: checkProgressIntervalMs, }: {
    minSpeedKbps: number;
    historyLengthMs?: number;
    checkProgressInterval?: number;
}): Transform;
/**
* Creates a transform stream that logs the progress of the incoming data.
* This `Transform` calls the `logTransferredBytes` callback every `loggingInterval` milliseconds with the total number of bytes received so far.
*
* Can be used e.g. to log the progress of a download.
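*
* A short sketch (the callback and interval values here are illustrative):
*
* ```ts
* const byteCounter = ByteCounterStream({
*     logTransferredBytes: (bytes) => console.log(`${(bytes / 1024).toFixed(1)} kB so far`),
*     loggingInterval: 5_000,
* });
* // e.g. from a streamHandler: await pipeline(stream, byteCounter, destination);
* ```
*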
* @returns Transform stream logging the progress of the incoming data.
*/
export declare function ByteCounterStream({ logTransferredBytes, loggingInterval, }: {
    logTransferredBytes: (transferredBytes: number) => void;
    loggingInterval?: number;
}): Transform;
/**
* Provides a framework for downloading files in parallel using plain HTTP requests. The URLs to download are either fed from a static list or added on the fly by another crawler.
*
* Since `FileDownload` uses raw HTTP requests to download the files, it is very fast and bandwidth-efficient.
* However, it doesn't parse the content. If you need to e.g. extract data from the downloaded files,
* you might need to use {@link CheerioCrawler}, {@link PuppeteerCrawler} or {@link PlaywrightCrawler} instead.
*
* `FileDownload` downloads each URL using a plain HTTP request and then invokes the user-provided {@link FileDownloadOptions.requestHandler} where the user can specify what to do with the downloaded data.
*
* The source URLs are represented using {@link Request} objects that are fed from {@link RequestList} or {@link RequestQueue} instances provided by the {@link FileDownloadOptions.requestList} or {@link FileDownloadOptions.requestQueue} constructor options, respectively.
*
* If both {@link FileDownloadOptions.requestList} and {@link FileDownloadOptions.requestQueue} are used, the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them to {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
*
* The crawler finishes when there are no more {@link Request} objects to crawl.
*
* We can use the `preNavigationHooks` to adjust `gotOptions`:
*
* ```ts
* preNavigationHooks: [
*     (crawlingContext, gotOptions) => {
*         // ...
*     },
* ]
* ```
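*
* For instance, a sketch that raises the request timeout (this assumes the standard
* `got` timeout option shape, which `got-scraping` inherits):
*
* ```ts
* preNavigationHooks: [
*     (_crawlingContext, gotOptions) => {
*         // give slow file servers a full minute before timing out
*         gotOptions.timeout = { request: 60_000 };
*     },
* ]
* ```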
*
* New requests are only dispatched when there is enough free CPU and memory available, using the functionality provided by the {@link AutoscaledPool} class. All {@link AutoscaledPool} configuration options can be passed to the `autoscaledPoolOptions` parameter of the `FileDownload` constructor. For user convenience, the `minConcurrency` and `maxConcurrency` {@link AutoscaledPool} options are available directly in the `FileDownload` constructor.
*
* ## Example usage
*
* ```ts
* import { writeFileSync } from 'node:fs';
*
* const crawler = new FileDownload({
*     requestHandler({ body, request }) {
*         writeFileSync(request.url.replace(/[^a-z0-9\.]/gi, '_'), body);
*     },
* });
*
* await crawler.run([
*     'http://www.example.com/document.pdf',
*     'http://www.example.com/sound.mp3',
*     'http://www.example.com/video.mkv',
* ]);
* ```
*/
export declare class FileDownload extends HttpCrawler<FileDownloadCrawlingContext> {
    private streamHandler?;
    constructor(options?: FileDownloadOptions);
    protected _runRequestHandler(context: FileDownloadCrawlingContext): Promise<void>;
    private streamRequestHandler;
}
/**
* Creates a new {@link Router} instance that routes requests based on their labels.
* This instance can then serve as the `requestHandler` of your {@link FileDownload}.
* The handler context defaults to {@link FileDownloadCrawlingContext}.
*
* > Serves as a shortcut for using `Router.create<FileDownloadCrawlingContext>()`.
*
* ```ts
* import { FileDownload, createFileRouter } from 'crawlee';
*
* const router = createFileRouter();
* router.addHandler('label-a', async (ctx) => {
*     ctx.log.info('...');
* });
* router.addDefaultHandler(async (ctx) => {
*     ctx.log.info('...');
* });
*
* const crawler = new FileDownload({
*     requestHandler: router,
* });
* await crawler.run();
* ```
*/
// @ts-ignore optional peer dependency or compatibility with es2022
export declare function createFileRouter<Context extends FileDownloadCrawlingContext = FileDownloadCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest<Context['request']>>(routes?: RouterRoutes<Context, UserData>): import("@crawlee/core").RouterHandler<Context>;
export {};
//# sourceMappingURL=file-download.d.ts.map