@crawlee/http
Version:
The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.
158 lines • 6.5 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.FileDownload = void 0;
exports.createFileRouter = createFileRouter;
const promises_1 = require("node:stream/promises");
const types_1 = require("node:util/types");
const index_1 = require("../index");
/**
* Provides a framework for downloading files in parallel using plain HTTP requests. The URLs to download are fed either from a static list of URLs or they can be added on the fly from another crawler.
*
* Since `FileDownload` uses raw HTTP requests to download the files, it is very fast and bandwidth-efficient.
* However, it doesn't parse the content - if you need to e.g. extract data from the downloaded files,
* you might need to use {@apilink CheerioCrawler}, {@apilink PuppeteerCrawler} or {@apilink PlaywrightCrawler} instead.
*
* `FileDownload` downloads each URL using a plain HTTP request and then invokes the user-provided {@apilink FileDownloadOptions.requestHandler} where the user can specify what to do with the downloaded data.
*
* The source URLs are represented using {@apilink Request} objects that are fed from {@apilink RequestList} or {@apilink RequestQueue} instances provided by the {@apilink FileDownloadOptions.requestList} or {@apilink FileDownloadOptions.requestQueue} constructor options, respectively.
*
* If both {@apilink FileDownloadOptions.requestList} and {@apilink FileDownloadOptions.requestQueue} are used, the instance first processes URLs from the {@apilink RequestList} and automatically enqueues all of them to {@apilink RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
*
* The crawler finishes when there are no more {@apilink Request} objects to crawl.
*
* We can use the `preNavigationHooks` to adjust `gotOptions`:
*
* ```
* preNavigationHooks: [
* (crawlingContext, gotOptions) => {
* // ...
* },
* ]
* ```
*
* New requests are only dispatched when there is enough free CPU and memory available, using the functionality provided by the {@apilink AutoscaledPool} class. All {@apilink AutoscaledPool} configuration options can be passed to the `autoscaledPoolOptions` parameter of the `FileDownload` constructor. For user convenience, the `minConcurrency` and `maxConcurrency` {@apilink AutoscaledPool} options are available directly in the `FileDownload` constructor.
*
* ## Example usage
*
* ```ts
* const crawler = new FileDownload({
* requestHandler({ body, request }) {
* writeFileSync(request.url.replace(/[^a-z0-9\.]/gi, '_'), body);
* },
* });
*
* await crawler.run([
* 'http://www.example.com/document.pdf',
* 'http://www.example.com/sound.mp3',
* 'http://www.example.com/video.mkv',
* ]);
* ```
*/
class FileDownload extends index_1.HttpCrawler {
    /**
     * @param options Crawler options. In addition to the options accepted by the
     *   parent `HttpCrawler`, an optional `streamHandler` function may be supplied;
     *   when present it replaces the regular request handler and receives the
     *   crawling context with the response stream attached as `context.stream`.
     */
    constructor(options = {}) {
        // Work on a shallow copy so the caller's object is never mutated. The same
        // options object can then be reused to build several crawlers without
        // losing its `streamHandler`.
        const { streamHandler, ...crawlerOptions } = options;
        if (streamHandler) {
            // For streams, the navigation is done in the request handler.
            crawlerOptions.requestHandlerTimeoutSecs = options.navigationTimeoutSecs ?? 120;
        }
        super(crawlerOptions);
        Object.defineProperty(this, "streamHandler", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        this.streamHandler = streamHandler;
        if (this.streamHandler) {
            this.requestHandler = this.streamRequestHandler;
        }
        // The base HttpCrawler class only supports a handful of text based mime types.
        // With the FileDownload crawler, we want to download any file type.
        this.supportedMimeTypes = new Set(['*/*']);
    }
    /**
     * Flags the request with `skipNavigation = true` when a stream handler is
     * configured (the download then happens inside `streamRequestHandler`), and
     * delegates to the parent implementation.
     *
     * @param context The crawling context of the request being processed.
     */
    async _runRequestHandler(context) {
        if (this.streamHandler) {
            context.request.skipNavigation = true;
        }
        await super._runRequestHandler(context);
    }
    /**
     * Request handler used when a `streamHandler` was provided. Opens a streaming
     * HTTP request for `context.request.url`, exposes the stream as
     * `context.stream`, invokes the user's stream handler, and waits for both the
     * handler and the stream to finish.
     *
     * Download progress is logged at debug level every 5 seconds. The progress
     * timer is always cleared and the stream is always destroyed before this
     * method settles, whether it succeeds or fails.
     *
     * @param context The crawling context of the request being processed.
     * @throws Rethrows any error raised by the stream or by the stream handler.
     */
    async streamRequestHandler(context) {
        const { log, request: { url }, } = context;
        const response = await this.httpClient.stream({
            url,
            timeout: { request: undefined },
            proxyUrl: context.proxyInfo?.url,
        });
        const pollingInterval = setInterval(() => {
            const { total, transferred } = response.downloadProgress;
            if (transferred > 0) {
                log.debug(`Downloaded ${transferred} bytes of ${total ?? 0} bytes from ${url}.`);
            }
        }, 5000);
        // Safe to call multiple times: both operations are no-ops when repeated.
        const cleanUp = () => {
            clearInterval(pollingInterval);
            response.stream.destroy();
        };
        const downloadPromise = new Promise((resolve, reject) => {
            response.stream.on('error', (error) => {
                cleanUp();
                reject(error);
            });
            let streamHandlerResult;
            try {
                context.stream = response.stream;
                streamHandlerResult = this.streamHandler(context);
            }
            catch (e) {
                cleanUp();
                reject(e);
                // Nothing more to wait for once the handler has thrown synchronously.
                return;
            }
            if ((0, types_1.isPromise)(streamHandlerResult)) {
                streamHandlerResult
                    .then(() => {
                        resolve();
                    })
                    .catch((e) => {
                        cleanUp();
                        reject(e);
                    });
            }
            else {
                resolve();
            }
        });
        try {
            await Promise.all([downloadPromise, (0, promises_1.finished)(response.stream)]);
        }
        finally {
            // Guarantees the interval is cleared and the stream released even when
            // `finished()` rejects without the stream emitting an 'error' event
            // (e.g. a premature close).
            cleanUp();
        }
    }
}
exports.FileDownload = FileDownload;
/**
 * Builds a label-based {@apilink Router} that can be passed as the
 * `requestHandler` of a {@apilink FileDownload} crawler.
 * The context type defaults to {@apilink FileDownloadCrawlingContext}.
 *
 * > Shortcut for `Router.create<FileDownloadCrawlingContext>()`.
 *
 * ```ts
 * import { FileDownload, createFileRouter } from 'crawlee';
 *
 * const router = createFileRouter();
 * router.addHandler('label-a', async (ctx) => {
 *     ctx.log.info('...');
 * });
 * router.addDefaultHandler(async (ctx) => {
 *     ctx.log.info('...');
 * });
 *
 * const crawler = new FileDownload({
 *     requestHandler: router,
 * });
 * await crawler.run();
 * ```
 *
 * @param routes Optional initial routes, forwarded unchanged to `Router.create`.
 * @returns The router instance produced by `Router.create`.
 */
function createFileRouter(routes) {
    const { Router } = index_1;
    return Router.create(routes);
}
//# sourceMappingURL=file-download.js.map