UNPKG

@crawlee/core

Version:

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs with (but not limited to) headless Chrome and Puppeteer.

179 lines 5.69 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.Router = void 0; const errors_1 = require("./errors"); const defaultRoute = Symbol('default-route'); /** * Simple router that works based on request labels. This instance can then serve as a `requestHandler` of your crawler. * * ```ts * import { Router, CheerioCrawler, CheerioCrawlingContext } from 'crawlee'; * * const router = Router.create<CheerioCrawlingContext>(); * * // we can also use factory methods for specific crawling contexts, the above equals to: * // import { createCheerioRouter } from 'crawlee'; * // const router = createCheerioRouter(); * * router.addHandler('label-a', async (ctx) => { * ctx.log.info('...'); * }); * router.addDefaultHandler(async (ctx) => { * ctx.log.info('...'); * }); * * const crawler = new CheerioCrawler({ * requestHandler: router, * }); * await crawler.run(); * ``` * * Alternatively we can use the default router instance from crawler object: * * ```ts * import { CheerioCrawler } from 'crawlee'; * * const crawler = new CheerioCrawler(); * * crawler.router.addHandler('label-a', async (ctx) => { * ctx.log.info('...'); * }); * crawler.router.addDefaultHandler(async (ctx) => { * ctx.log.info('...'); * }); * * await crawler.run(); * ``` * * For convenience, we can also define the routes right when creating the router: * * ```ts * import { CheerioCrawler, createCheerioRouter } from 'crawlee'; * const crawler = new CheerioCrawler({ * requestHandler: createCheerioRouter({ * 'label-a': async (ctx) => { ... }, * 'label-b': async (ctx) => { ... }, * })}, * }); * await crawler.run(); * ``` * * Middlewares are also supported via the `router.use` method. There can be multiple * middlewares for a single router, they will be executed sequentially in the same * order as they were registered. * * ```ts * crawler.router.use(async (ctx) => { * ctx.log.info('...'); * }); * ``` */ class Router { /** * use Router.create() instead! 
* @ignore */ constructor() { Object.defineProperty(this, "routes", { enumerable: true, configurable: true, writable: true, value: new Map() }); Object.defineProperty(this, "middlewares", { enumerable: true, configurable: true, writable: true, value: [] }); } /** * Registers new route handler for given label. */ addHandler(label, handler) { this.validate(label); this.routes.set(label, handler); } /** * Registers default route handler. */ addDefaultHandler(handler) { this.validate(defaultRoute); this.routes.set(defaultRoute, handler); } /** * Registers a middleware that will be fired before the matching route handler. * Multiple middlewares can be registered, they will be fired in the same order. */ use(middleware) { this.middlewares.push(middleware); } /** * Returns route handler for given label. If no label is provided, the default request handler will be returned. */ getHandler(label) { if (label && this.routes.has(label)) { return this.routes.get(label); } if (this.routes.has(defaultRoute)) { return this.routes.get(defaultRoute); } throw new errors_1.MissingRouteError(`Route not found for label '${String(label)}'.` + ' You must set up a route for this label or a default route.' + ' Use `requestHandler`, `router.addHandler` or `router.addDefaultHandler`.'); } /** * Throws when the label already exists in our registry. */ validate(label) { if (this.routes.has(label)) { const message = label === defaultRoute ? `Default route is already defined!` : `Route for label '${String(label)}' is already defined!`; throw new Error(message); } } /** * Creates new router instance. This instance can then serve as a `requestHandler` of your crawler. 
* * ```ts * import { Router, CheerioCrawler, CheerioCrawlingContext } from 'crawlee'; * * const router = Router.create<CheerioCrawlingContext>(); * router.addHandler('label-a', async (ctx) => { * ctx.log.info('...'); * }); * router.addDefaultHandler(async (ctx) => { * ctx.log.info('...'); * }); * * const crawler = new CheerioCrawler({ * requestHandler: router, * }); * await crawler.run(); * ``` */ static create(routes) { const router = new Router(); const obj = Object.create(Function.prototype); obj.addHandler = router.addHandler.bind(router); obj.addDefaultHandler = router.addDefaultHandler.bind(router); obj.getHandler = router.getHandler.bind(router); obj.use = router.use.bind(router); for (const [label, handler] of Object.entries(routes ?? {})) { router.addHandler(label, handler); } const func = async function (context) { const { url, loadedUrl, label } = context.request; context.log.debug('Page opened.', { label, url: loadedUrl ?? url }); for (const middleware of router.middlewares) { await middleware(context); } return router.getHandler(label)(context); }; Object.setPrototypeOf(func, obj); return func; } } exports.Router = Router; //# sourceMappingURL=router.js.map