UNPKG

@crawlee/http

Version:

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.

158 lines 6.5 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.FileDownload = void 0;
// NOTE: this assignment relies on function-declaration hoisting of `createFileRouter` below.
exports.createFileRouter = createFileRouter;
const promises_1 = require("node:stream/promises");
const types_1 = require("node:util/types");
const index_1 = require("../index");
/**
 * Provides a framework for downloading files in parallel using plain HTTP requests.
 * The URLs to download are fed either from a static list of URLs or they can be
 * added on the fly from another crawler.
 *
 * Since `FileDownload` uses raw HTTP requests to download the files, it is very
 * fast and bandwidth-efficient. However, it doesn't parse the content - if you
 * need to e.g. extract data from the downloaded files, you might need to use
 * {@apilink CheerioCrawler}, {@apilink PuppeteerCrawler} or
 * {@apilink PlaywrightCrawler} instead.
 *
 * `FileDownload` downloads each URL using a plain HTTP request and then invokes
 * the user-provided {@apilink FileDownloadOptions.requestHandler} where the user
 * can specify what to do with the downloaded data.
 *
 * The source URLs are represented using {@apilink Request} objects that are fed
 * from {@apilink RequestList} or {@apilink RequestQueue} instances provided by the
 * {@apilink FileDownloadOptions.requestList} or
 * {@apilink FileDownloadOptions.requestQueue} constructor options, respectively.
 *
 * If both {@apilink FileDownloadOptions.requestList} and
 * {@apilink FileDownloadOptions.requestQueue} are used, the instance first
 * processes URLs from the {@apilink RequestList} and automatically enqueues all of
 * them to {@apilink RequestQueue} before it starts their processing. This ensures
 * that a single URL is not crawled multiple times.
 *
 * The crawler finishes when there are no more {@apilink Request} objects to crawl.
 *
 * We can use the `preNavigationHooks` to adjust `gotOptions`:
 *
 * ```
 * preNavigationHooks: [
 *     (crawlingContext, gotOptions) => {
 *         // ...
 *     },
 * ]
 * ```
 *
 * New requests are only dispatched when there is enough free CPU and memory
 * available, using the functionality provided by the {@apilink AutoscaledPool}
 * class. All {@apilink AutoscaledPool} configuration options can be passed to the
 * `autoscaledPoolOptions` parameter of the `FileDownload` constructor. For user
 * convenience, the `minConcurrency` and `maxConcurrency` {@apilink AutoscaledPool}
 * options are available directly in the `FileDownload` constructor.
 *
 * ## Example usage
 *
 * ```ts
 * const crawler = new FileDownload({
 *     requestHandler({ body, request }) {
 *         writeFileSync(request.url.replace(/[^a-z0-9\.]/gi, '_'), body);
 *     },
 * });
 *
 * await crawler.run([
 *     'http://www.example.com/document.pdf',
 *     'http://www.example.com/sound.mp3',
 *     'http://www.example.com/video.mkv',
 * ]);
 * ```
 */
class FileDownload extends index_1.HttpCrawler {
    /**
     * @param options Crawler options; `streamHandler` is extracted here and the
     *     remaining options are forwarded to the `HttpCrawler` constructor.
     */
    constructor(options = {}) {
        const { streamHandler } = options;
        // The base class does not know about `streamHandler`, so strip it before `super()`.
        delete options.streamHandler;
        if (streamHandler) {
            // For streams, the navigation is done in the request handler,
            // so the request-handler timeout must cover the whole download.
            options.requestHandlerTimeoutSecs = options.navigationTimeoutSecs ?? 120;
        }
        super(options);
        // Class-field declaration as emitted by the TypeScript compiler.
        Object.defineProperty(this, "streamHandler", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        this.streamHandler = streamHandler;
        if (this.streamHandler) {
            // In streaming mode, route every request through the internal streaming handler.
            this.requestHandler = this.streamRequestHandler;
        }
        // The base HttpCrawler class only supports a handful of text based mime types.
        // With the FileDownload crawler, we want to download any file type.
        this.supportedMimeTypes = new Set(['*/*']);
    }
    /**
     * In streaming mode the HTTP request is issued inside `streamRequestHandler`,
     * so the base class must skip its own navigation step for the request.
     */
    async _runRequestHandler(context) {
        if (this.streamHandler) {
            context.request.skipNavigation = true;
        }
        await super._runRequestHandler(context);
    }
    /**
     * Internal request handler used when a `streamHandler` was provided.
     * Opens an HTTP stream for the request URL, hands the stream to the user's
     * `streamHandler`, and resolves once both the handler and the download finish.
     */
    async streamRequestHandler(context) {
        const { log, request: { url }, } = context;
        // No overall request timeout: large downloads may legitimately take a long time.
        const response = await this.httpClient.stream({
            url,
            timeout: { request: undefined },
            proxyUrl: context.proxyInfo?.url,
        });
        let pollingInterval;
        // Teardown is safe to call more than once: clearing an already-cleared
        // interval and destroying an already-destroyed stream are both no-ops.
        const cleanUp = () => {
            clearInterval(pollingInterval);
            response.stream.destroy();
        };
        const downloadPromise = new Promise((resolve, reject) => {
            // Log download progress every 5 seconds.
            pollingInterval = setInterval(() => {
                const { total, transferred } = response.downloadProgress;
                if (transferred > 0) {
                    log.debug(`Downloaded ${transferred} bytes of ${total ?? 0} bytes from ${url}.`);
                }
            }, 5000);
            response.stream.on('error', async (error) => {
                cleanUp();
                reject(error);
            });
            let streamHandlerResult;
            try {
                context.stream = response.stream;
                streamHandlerResult = this.streamHandler(context);
            }
            catch (e) {
                cleanUp();
                reject(e);
            }
            // The user-provided handler may be synchronous or return a promise; support both.
            if ((0, types_1.isPromise)(streamHandlerResult)) {
                streamHandlerResult
                    .then(() => {
                    resolve();
                })
                    .catch((e) => {
                    cleanUp();
                    reject(e);
                });
            }
            else {
                // NOTE(review): if a synchronous handler threw above, this `resolve()`
                // is a no-op because the promise has already been rejected.
                resolve();
            }
        });
        // Wait for both the user handler and the underlying byte stream to finish.
        await Promise.all([downloadPromise, (0, promises_1.finished)(response.stream)]);
        cleanUp();
    }
}
exports.FileDownload = FileDownload;
/**
 * Creates new {@apilink Router} instance that works based on request labels.
 * This instance can then serve as a `requestHandler` of your {@apilink FileDownload}.
 * Defaults to the {@apilink FileDownloadCrawlingContext}.
 *
 * > Serves as a shortcut for using `Router.create<FileDownloadCrawlingContext>()`.
* * ```ts * import { FileDownload, createFileRouter } from 'crawlee'; * * const router = createFileRouter(); * router.addHandler('label-a', async (ctx) => { * ctx.log.info('...'); * }); * router.addDefaultHandler(async (ctx) => { * ctx.log.info('...'); * }); * * const crawler = new FileDownload({ * requestHandler: router, * }); * await crawler.run(); * ``` */ function createFileRouter(routes) { return index_1.Router.create(routes); } //# sourceMappingURL=file-download.js.map