@crawlee/http
The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.FileDownload = void 0;
exports.MinimumSpeedStream = MinimumSpeedStream;
exports.ByteCounterStream = ByteCounterStream;
exports.createFileRouter = createFileRouter;
const node_stream_1 = require("node:stream");
const promises_1 = require("node:stream/promises");
const types_1 = require("node:util/types");
const index_1 = require("../index");
/**
* Creates a transform stream that errors out if the source data speed drops below the specified minimum.
* This `Transform` checks the amount of received data every `checkProgressInterval` milliseconds.
* If the stream has received fewer than `minSpeedKbps * historyLengthMs / 1000` bytes over the last `historyLengthMs` milliseconds,
* it emits an `error` event and stops checking.
*
* Can be used e.g. to abort a download if the network speed is too slow.
* @returns Transform stream that monitors the speed of the incoming data.
*/
function MinimumSpeedStream({ minSpeedKbps, historyLengthMs = 10e3, checkProgressInterval: checkProgressIntervalMs = 5e3, }) {
let snapshots = [];
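// Every `checkProgressIntervalMs`, drop snapshots that fell out of the history window
// and verify that the average speed over that window stays above `minSpeedKbps`.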
const checkInterval = setInterval(() => {
const now = Date.now();
snapshots = snapshots.filter((snapshot) => now - snapshot.timestamp < historyLengthMs);
const totalBytes = snapshots.reduce((acc, snapshot) => acc + snapshot.bytes, 0);
const elapsed = (now - (snapshots[0]?.timestamp ?? 0)) / 1000;
if (totalBytes / 1024 / elapsed < minSpeedKbps) {
clearInterval(checkInterval);
stream.emit('error', new Error(`Stream speed too slow, aborting...`));
}
}, checkProgressIntervalMs);
const stream = new node_stream_1.Transform({
transform: (chunk, _, callback) => {
snapshots.push({ timestamp: Date.now(), bytes: chunk.length });
callback(null, chunk);
},
final: (callback) => {
clearInterval(checkInterval);
callback();
},
});
return stream;
}
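/**
* A minimal usage sketch (not part of the module itself): guarding a download pipeline
* with `MinimumSpeedStream`. The `sourceStream` and output path are hypothetical; any
* `Readable` works as input. If throughput drops below the threshold, the pipeline
* rejects with the "Stream speed too slow" error emitted above.
*
* ```ts
* import { createWriteStream } from 'node:fs';
* import { pipeline } from 'node:stream/promises';
*
* const speedGuard = MinimumSpeedStream({ minSpeedKbps: 100 });
* await pipeline(
*     sourceStream, // e.g. an HTTP response stream
*     speedGuard,
*     createWriteStream('./download.bin'),
* );
* ```
*/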
/**
* Creates a transform stream that logs the progress of the incoming data.
* This `Transform` calls the `logTransferredBytes` function every `loggingInterval` milliseconds with the total number of bytes received so far.
*
* Can be used e.g. to log the progress of a download.
* @returns Transform stream logging the progress of the incoming data.
*/
function ByteCounterStream({ logTransferredBytes, loggingInterval = 5000, }) {
let transferredBytes = 0;
let lastLogTime = Date.now();
return new node_stream_1.Transform({
transform: (chunk, _, callback) => {
transferredBytes += chunk.length;
if (Date.now() - lastLogTime > loggingInterval) {
lastLogTime = Date.now();
logTransferredBytes(transferredBytes);
}
callback(null, chunk);
},
flush: (callback) => {
logTransferredBytes(transferredBytes);
callback();
},
});
}
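/**
* A minimal usage sketch (not part of the module itself): reporting progress with
* `ByteCounterStream`, reusing the imports and `sourceStream` from the sketch above.
* `console.log` stands in for a real logger.
*
* ```ts
* const counter = ByteCounterStream({
*     logTransferredBytes: (bytes) => console.log(`Transferred ${bytes} bytes so far`),
*     loggingInterval: 2000,
* });
* await pipeline(sourceStream, counter, createWriteStream('./download.bin'));
* ```
*/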
/**
* Provides a framework for downloading files in parallel using plain HTTP requests. The URLs to download are fed either from a static list or added on the fly from another crawler.
*
* Since `FileDownload` uses raw HTTP requests to download the files, it is very fast and bandwidth-efficient.
* However, it doesn't parse the content - if you need to e.g. extract data from the downloaded files,
* you might need to use {@link CheerioCrawler}, {@link PuppeteerCrawler} or {@link PlaywrightCrawler} instead.
*
* `FileDownload` downloads each URL using a plain HTTP request and then invokes the user-provided {@link FileDownloadOptions.requestHandler}, where the user can specify what to do with the downloaded data.
*
* The source URLs are represented using {@link Request} objects that are fed from {@link RequestList} or {@link RequestQueue} instances provided by the {@link FileDownloadOptions.requestList} or {@link FileDownloadOptions.requestQueue} constructor options, respectively.
*
* If both {@link FileDownloadOptions.requestList} and {@link FileDownloadOptions.requestQueue} are used, the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them to {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
*
* The crawler finishes when there are no more {@link Request} objects to crawl.
*
* We can use the `preNavigationHooks` to adjust `gotOptions`:
*
* ```ts
* preNavigationHooks: [
* (crawlingContext, gotOptions) => {
* // ...
* },
* ]
* ```
*
* New requests are only dispatched when there is enough free CPU and memory available, using the functionality provided by the {@link AutoscaledPool} class. All {@link AutoscaledPool} configuration options can be passed to the `autoscaledPoolOptions` parameter of the `FileDownload` constructor. For user convenience, the `minConcurrency` and `maxConcurrency` {@link AutoscaledPool} options are available directly in the `FileDownload` constructor.
*
* ## Example usage
*
* ```ts
* const crawler = new FileDownload({
* requestHandler({ body, request }) {
* writeFileSync(request.url.replace(/[^a-z0-9\.]/gi, '_'), body);
* },
* });
*
* await crawler.run([
* 'http://www.example.com/document.pdf',
* 'http://www.example.com/sound.mp3',
* 'http://www.example.com/video.mkv',
* ]);
* ```
*/
class FileDownload extends index_1.HttpCrawler {
constructor(options = {}) {
const { streamHandler } = options;
delete options.streamHandler;
if (streamHandler) {
// For streams, the navigation is done in the request handler.
options.requestHandlerTimeoutSecs = options.navigationTimeoutSecs ?? 120;
}
super(options);
Object.defineProperty(this, "streamHandler", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
this.streamHandler = streamHandler;
if (this.streamHandler) {
this.requestHandler = this.streamRequestHandler;
}
// The base HttpCrawler class only supports a handful of text-based MIME types.
// With the FileDownload crawler, we want to download any file type.
this.supportedMimeTypes = new Set(['*/*']);
}
async _runRequestHandler(context) {
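// With a streamHandler, skip the built-in navigation; streamRequestHandler below
// performs the HTTP request itself so the body can be consumed as a stream.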
if (this.streamHandler) {
context.request.skipNavigation = true;
}
await super._runRequestHandler(context);
}
async streamRequestHandler(context) {
const { log, request: { url }, } = context;
const response = await this.httpClient.stream({
url,
timeout: { request: undefined },
proxyUrl: context.proxyInfo?.url,
});
let pollingInterval;
const cleanUp = () => {
clearInterval(pollingInterval);
response.stream.destroy();
};
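// Resolves once the user-provided streamHandler finishes; rejects if the response
// stream errors or the handler throws.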
const downloadPromise = new Promise((resolve, reject) => {
pollingInterval = setInterval(() => {
const { total, transferred } = response.downloadProgress;
if (transferred > 0) {
log.debug(`Downloaded ${transferred} bytes of ${total ?? 0} bytes from ${url}.`);
}
}, 5000);
response.stream.on('error', async (error) => {
cleanUp();
reject(error);
});
let streamHandlerResult;
try {
context.stream = response.stream;
context.response = response;
streamHandlerResult = this.streamHandler(context);
}
catch (e) {
cleanUp();
reject(e);
}
if ((0, types_1.isPromise)(streamHandlerResult)) {
streamHandlerResult
.then(() => {
resolve();
})
.catch((e) => {
cleanUp();
reject(e);
});
}
else {
resolve();
}
});
await Promise.all([downloadPromise, (0, promises_1.finished)(response.stream)]);
cleanUp();
}
}
exports.FileDownload = FileDownload;
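/**
* A minimal usage sketch (not part of the module itself): downloading large files with
* the `streamHandler` option instead of `requestHandler`, so response bodies are piped
* to disk rather than buffered in memory. The file-naming scheme is hypothetical.
*
* ```ts
* import { createWriteStream } from 'node:fs';
* import { pipeline } from 'node:stream/promises';
* import { FileDownload } from '@crawlee/http';
*
* const crawler = new FileDownload({
*     streamHandler: async ({ stream, request }) => {
*         const fileName = request.url.replace(/[^a-z0-9.]/gi, '_');
*         await pipeline(stream, createWriteStream(fileName));
*     },
* });
*
* await crawler.run(['http://www.example.com/video.mkv']);
* ```
*/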
/**
* Creates a new {@link Router} instance that works based on request labels.
* This instance can then serve as the `requestHandler` of your {@link FileDownload}.
* The handler context type defaults to {@link FileDownloadCrawlingContext}.
*
* > Serves as a shortcut for using `Router.create<FileDownloadCrawlingContext>()`.
*
* ```ts
* import { FileDownload, createFileRouter } from 'crawlee';
*
* const router = createFileRouter();
* router.addHandler('label-a', async (ctx) => {
* ctx.log.info('...');
* });
* router.addDefaultHandler(async (ctx) => {
* ctx.log.info('...');
* });
*
* const crawler = new FileDownload({
* requestHandler: router,
* });
* await crawler.run();
* ```
*/
function createFileRouter(routes) {
return index_1.Router.create(routes);
}
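/**
* A minimal usage sketch (not part of the module itself): dispatching requests to the
* labeled handlers registered above. A request's `label` selects the handler; requests
* without a label fall through to the default handler.
*
* ```ts
* await crawler.run([
*     { url: 'http://www.example.com/a.pdf', label: 'label-a' },
*     'http://www.example.com/b.pdf', // no label: handled by the default handler
* ]);
* ```
*/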
//# sourceMappingURL=file-download.js.map