@crawlee/http
The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.FileDownload = void 0;
exports.MinimumSpeedStream = MinimumSpeedStream;
exports.ByteCounterStream = ByteCounterStream;
exports.createFileRouter = createFileRouter;
const node_stream_1 = require("node:stream");
const promises_1 = require("node:stream/promises");
const types_1 = require("node:util/types");
const index_1 = require("../index");
/**
* Creates a transform stream that errors out if the source data speed drops below the specified minimum.
* This `Transform` checks the amount of received data every `checkProgressInterval` milliseconds.
* If the stream has received fewer than `minSpeedKbps * historyLengthMs / 1000` bytes over the last `historyLengthMs` milliseconds,
* it emits an `error` event and stops checking.
*
* Can be used e.g. to abort a download if the network speed is too slow.
* @returns Transform stream that monitors the speed of the incoming data.
*/
function MinimumSpeedStream({ minSpeedKbps, historyLengthMs = 10e3, checkProgressInterval: checkProgressIntervalMs = 5e3, }) {
let snapshots = [];
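// Every `checkProgressIntervalMs`, drop snapshots that fell out of the history window
// and verify that the average speed over that window stays above `minSpeedKbps`.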
const checkInterval = setInterval(() => {
const now = Date.now();
snapshots = snapshots.filter((snapshot) => now - snapshot.timestamp < historyLengthMs);
const totalBytes = snapshots.reduce((acc, snapshot) => acc + snapshot.bytes, 0);
const elapsed = (now - (snapshots[0]?.timestamp ?? 0)) / 1000;
if (totalBytes / 1024 / elapsed < minSpeedKbps) {
clearInterval(checkInterval);
stream.emit('error', new Error(`Stream speed too slow, aborting...`));
}
}, checkProgressIntervalMs);
const stream = new node_stream_1.Transform({
transform: (chunk, _, callback) => {
snapshots.push({ timestamp: Date.now(), bytes: chunk.length });
callback(null, chunk);
},
final: (callback) => {
clearInterval(checkInterval);
callback();
},
});
return stream;
}
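/**
* A minimal usage sketch (not part of the module itself): guarding a download pipeline
* with `MinimumSpeedStream`. The `sourceStream` and output path are hypothetical; any
* `Readable` works as input. If throughput drops below the threshold, the pipeline
* rejects with the "Stream speed too slow" error emitted above.
*
* ```ts
* import { createWriteStream } from 'node:fs';
* import { pipeline } from 'node:stream/promises';
*
* const speedGuard = MinimumSpeedStream({ minSpeedKbps: 100 });
* await pipeline(
*     sourceStream, // e.g. an HTTP response stream
*     speedGuard,
*     createWriteStream('./download.bin'),
* );
* ```
*/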
/**
* Creates a transform stream that logs the progress of the incoming data.
* This `Transform` calls the `logTransferredBytes` function every `loggingInterval` milliseconds with the total number of bytes received so far.
*
* Can be used e.g. to log the progress of a download.
* @returns Transform stream logging the progress of the incoming data.
*/
function ByteCounterStream({ logTransferredBytes, loggingInterval = 5000, }) {
let transferredBytes = 0;
let lastLogTime = Date.now();
return new node_stream_1.Transform({
transform: (chunk, _, callback) => {
transferredBytes += chunk.length;
if (Date.now() - lastLogTime > loggingInterval) {
lastLogTime = Date.now();
logTransferredBytes(transferredBytes);
}
callback(null, chunk);
},
flush: (callback) => {
logTransferredBytes(transferredBytes);
callback();
},
});
}
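/**
* A minimal usage sketch (not part of the module itself): reporting progress with
* `ByteCounterStream`, reusing the imports and `sourceStream` from the sketch above.
* `console.log` stands in for a real logger.
*
* ```ts
* const counter = ByteCounterStream({
*     logTransferredBytes: (bytes) => console.log(`Transferred ${bytes} bytes so far`),
*     loggingInterval: 2000,
* });
* await pipeline(sourceStream, counter, createWriteStream('./download.bin'));
* ```
*/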
/**
* Provides a framework for downloading files in parallel using plain HTTP requests. The URLs to download are fed either from a static list or added on the fly from another crawler.
*
* Since `FileDownload` uses raw HTTP requests to download the files, it is very fast and bandwidth-efficient.
* However, it doesn't parse the content - if you need to e.g. extract data from the downloaded files,
* you might need to use {@link CheerioCrawler}, {@link PuppeteerCrawler} or {@link PlaywrightCrawler} instead.
*
* `FileDownload` downloads each URL using a plain HTTP request and then invokes the user-provided {@link FileDownloadOptions.requestHandler}, where the user can specify what to do with the downloaded data.
*
* The source URLs are represented using {@link Request} objects that are fed from {@link RequestList} or {@link RequestQueue} instances provided by the {@link FileDownloadOptions.requestList} or {@link FileDownloadOptions.requestQueue} constructor options, respectively.
*
* If both {@link FileDownloadOptions.requestList} and {@link FileDownloadOptions.requestQueue} are used, the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them to {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
*
* The crawler finishes when there are no more {@link Request} objects to crawl.
*
* We can use the `preNavigationHooks` to adjust `gotOptions`:
*
* ```ts
* preNavigationHooks: [
* (crawlingContext, gotOptions) => {
* // ...
* },
* ]
* ```
*
* New requests are only dispatched when there is enough free CPU and memory available, using the functionality provided by the {@link AutoscaledPool} class. All {@link AutoscaledPool} configuration options can be passed to the `autoscaledPoolOptions` parameter of the `FileDownload` constructor. For user convenience, the `minConcurrency` and `maxConcurrency` {@link AutoscaledPool} options are available directly in the `FileDownload` constructor.
*
* ## Example usage
*
* ```ts
* const crawler = new FileDownload({
* requestHandler({ body, request }) {
* writeFileSync(request.url.replace(/[^a-z0-9\.]/gi, '_'), body);
* },
* });
*
* await crawler.run([
* 'http://www.example.com/document.pdf',
* 'http://www.example.com/sound.mp3',
* 'http://www.example.com/video.mkv',
* ]);
* ```
*/
class FileDownload extends index_1.HttpCrawler {
constructor(options = {}) {
const { streamHandler } = options;
delete options.streamHandler;
if (streamHandler) {
// For streams, the navigation is done in the request handler.
options.requestHandlerTimeoutSecs = options.navigationTimeoutSecs ?? 120;
}
super(options);
Object.defineProperty(this, "streamHandler", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
this.streamHandler = streamHandler;
if (this.streamHandler) {
this.requestHandler = this.streamRequestHandler;
}
// The base HttpCrawler class only supports a handful of text-based MIME types.
// With the FileDownload crawler, we want to download any file type.
this.supportedMimeTypes = new Set(['*/*']);
}
async _runRequestHandler(context) {
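// With a streamHandler, skip the built-in navigation; streamRequestHandler below
// performs the HTTP request itself so the body can be consumed as a stream.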
if (this.streamHandler) {
context.request.skipNavigation = true;
}
await super._runRequestHandler(context);
}
async streamRequestHandler(context) {
const { log, request: { url }, } = context;
const response = await this.httpClient.stream({
url,
timeout: { request: undefined },
proxyUrl: context.proxyInfo?.url,
});
let pollingInterval;
const cleanUp = () => {
clearInterval(pollingInterval);
response.stream.destroy();
};
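// Resolves once the user-provided streamHandler finishes; rejects if the response
// stream errors or the handler throws.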
const downloadPromise = new Promise((resolve, reject) => {
pollingInterval = setInterval(() => {
const { total, transferred } = response.downloadProgress;
if (transferred > 0) {
log.debug(`Downloaded ${transferred} bytes of ${total ?? 0} bytes from ${url}.`);
}
}, 5000);
response.stream.on('error', async (error) => {
cleanUp();
reject(error);
});
let streamHandlerResult;
try {
context.stream = response.stream;
context.response = response;
streamHandlerResult = this.streamHandler(context);
}
catch (e) {
cleanUp();
reject(e);
}
if ((0, types_1.isPromise)(streamHandlerResult)) {
streamHandlerResult
.then(() => {
resolve();
})
.catch((e) => {
cleanUp();
reject(e);
});
}
else {
resolve();
}
});
await Promise.all([downloadPromise, (0, promises_1.finished)(response.stream)]);
cleanUp();
}
}
exports.FileDownload = FileDownload;
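/**
* A minimal usage sketch (not part of the module itself): downloading large files with
* the `streamHandler` option instead of `requestHandler`, so response bodies are piped
* to disk rather than buffered in memory. The file-naming scheme is hypothetical.
*
* ```ts
* import { createWriteStream } from 'node:fs';
* import { pipeline } from 'node:stream/promises';
* import { FileDownload } from '@crawlee/http';
*
* const crawler = new FileDownload({
*     streamHandler: async ({ stream, request }) => {
*         const fileName = request.url.replace(/[^a-z0-9.]/gi, '_');
*         await pipeline(stream, createWriteStream(fileName));
*     },
* });
*
* await crawler.run(['http://www.example.com/video.mkv']);
* ```
*/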
/**
* Creates a new {@link Router} instance that works based on request labels.
* This instance can then serve as the `requestHandler` of your {@link FileDownload}.
* The handler context type defaults to {@link FileDownloadCrawlingContext}.
*
* > Serves as a shortcut for using `Router.create<FileDownloadCrawlingContext>()`.
*
* ```ts
* import { FileDownload, createFileRouter } from 'crawlee';
*
* const router = createFileRouter();
* router.addHandler('label-a', async (ctx) => {
* ctx.log.info('...');
* });
* router.addDefaultHandler(async (ctx) => {
* ctx.log.info('...');
* });
*
* const crawler = new FileDownload({
* requestHandler: router,
* });
* await crawler.run();
* ```
*/
function createFileRouter(routes) {
return index_1.Router.create(routes);
}
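/**
* A minimal usage sketch (not part of the module itself): dispatching requests to the
* labeled handlers registered above. A request's `label` selects the handler; requests
* without a label fall through to the default handler.
*
* ```ts
* await crawler.run([
*     { url: 'http://www.example.com/a.pdf', label: 'label-a' },
*     'http://www.example.com/b.pdf', // no label: handled by the default handler
* ]);
* ```
*/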
//# sourceMappingURL=file-download.js.map