@crawlee/linkedom

The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.LinkeDOMCrawler = void 0; exports.linkedomCrawlerEnqueueLinks = linkedomCrawlerEnqueueLinks; exports.createLinkeDOMRouter = createLinkeDOMRouter; const tslib_1 = require("tslib"); const http_1 = require("@crawlee/http"); const utils_1 = require("@crawlee/utils"); const cheerio = tslib_1.__importStar(require("cheerio")); // @ts-expect-error This throws a compilation error due to TypeScript not inferring the module has CJS versions too const cached_1 = require("linkedom/cached"); const utilities_1 = require("@apify/utilities"); /** * Provides a framework for the parallel crawling of web pages using plain HTTP requests and * [linkedom](https://www.npmjs.com/package/linkedom) LinkeDOM implementation. * The URLs to crawl are fed either from a static list of URLs * or from a dynamic queue of URLs enabling recursive crawling of websites. * * Since `LinkeDOMCrawler` uses raw HTTP requests to download web pages, * it is very fast and efficient on data bandwidth. However, if the target website requires JavaScript * to display the content, you might need to use {@link PuppeteerCrawler} or {@link PlaywrightCrawler} instead, * because it loads the pages using full-featured headless Chrome browser. * * **Limitation**: * This crawler does not support proxies and cookies yet (each open starts with empty cookie store), and the user agent is always set to `Chrome`. * * `LinkeDOMCrawler` downloads each URL using a plain HTTP request, * parses the HTML content using [LinkeDOM](https://www.npmjs.com/package/linkedom) * and then invokes the user-provided {@link LinkeDOMCrawlerOptions.requestHandler} to extract page data * using the `window` object. * * The source URLs are represented using {@link Request} objects that are fed from * {@link RequestList} or {@link RequestQueue} instances provided by the {@link LinkeDOMCrawlerOptions.requestList} * or {@link LinkeDOMCrawlerOptions.requestQueue} constructor options, respectively. * * If both {@link LinkeDOMCrawlerOptions.requestList} and {@link LinkeDOMCrawlerOptions.requestQueue} are used, * the instance first processes URLs from the {@link RequestList} and automatically enqueues all of them * to {@link RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times. * * The crawler finishes when there are no more {@link Request} objects to crawl. * * We can use the `preNavigationHooks` to adjust `gotOptions`: * * ``` * preNavigationHooks: [ * (crawlingContext, gotOptions) => { * // ... * }, * ] * ``` * * By default, `LinkeDOMCrawler` only processes web pages with the `text/html` * and `application/xhtml+xml` MIME content types (as reported by the `Content-Type` HTTP header), * and skips pages with other content types. If you want the crawler to process other content types, * use the {@link LinkeDOMCrawlerOptions.additionalMimeTypes} constructor option. * Beware that the parsing behavior differs for HTML, XML, JSON and other types of content. * For more details, see {@link LinkeDOMCrawlerOptions.requestHandler}. * * New requests are only dispatched when there is enough free CPU and memory available, * using the functionality provided by the {@link AutoscaledPool} class. * All {@link AutoscaledPool} configuration options can be passed to the `autoscaledPoolOptions` * parameter of the `CheerioCrawler` constructor. 
 * For user convenience, the `minConcurrency` and `maxConcurrency`
 * {@link AutoscaledPool} options are available directly in the `LinkeDOMCrawler` constructor.
 *
 * **Example usage:**
 *
 * ```javascript
 * const crawler = new LinkeDOMCrawler({
 *     async requestHandler({ request, window }) {
 *         await Dataset.pushData({
 *             url: request.url,
 *             title: window.document.title,
 *         });
 *     },
 * });
 *
 * await crawler.run([
 *     'http://crawlee.dev',
 * ]);
 * ```
 * @category Crawlers
 */
class LinkeDOMCrawler extends http_1.HttpCrawler {
    // Parses the downloaded body with LinkeDOM's cached DOMParser and exposes
    // `window`, `body`, `document` and `enqueueLinks` on the crawling context.
    async _parseHTML(response, isXml, crawlingContext) {
        const body = await (0, utilities_1.concatStreamToBuffer)(response);
        const document = LinkeDOMCrawler.parser.parseFromString(body.toString(), isXml ? 'text/xml' : 'text/html');
        return {
            window: document.defaultView,
            get body() {
                return document.documentElement.outerHTML;
            },
            get document() {
                // See comment about typing in LinkeDOMCrawlingContext definition
                return document;
            },
            enqueueLinks: async (enqueueOptions) => {
                return linkedomCrawlerEnqueueLinks({
                    options: enqueueOptions,
                    window: document.defaultView,
                    requestQueue: await this.getRequestQueue(),
                    robotsTxtFile: await this.getRobotsTxtFileForUrl(crawlingContext.request.url),
                    onSkippedRequest: this.onSkippedRequest,
                    originalRequestUrl: crawlingContext.request.url,
                    finalRequestUrl: crawlingContext.request.loadedUrl,
                });
            },
        };
    }
    // Adds the `waitForSelector` and `parseWithCheerio` helpers (both backed by
    // re-parsing the raw body with Cheerio) before delegating to the parent implementation.
    async _runRequestHandler(context) {
        context.waitForSelector = async (selector, timeoutMs = 5000) => {
            const $ = cheerio.load(context.body);
            if ($(selector).get().length === 0) {
                if (timeoutMs) {
                    await (0, utils_1.sleep)(50);
                    await context.waitForSelector(selector, Math.max(timeoutMs - 50, 0));
                    return;
                }
                throw new Error(`Selector '${selector}' not found.`);
            }
        };
        context.parseWithCheerio = async (selector, _timeoutMs = 5000) => {
            const $ = cheerio.load(context.body);
            if (selector && $(selector).get().length === 0) {
                throw new Error(`Selector '${selector}' not found.`);
            }
            return $;
        };
        await super._runRequestHandler(context);
    }
}
exports.LinkeDOMCrawler = LinkeDOMCrawler;
// Shared DOMParser instance from `linkedom/cached`, attached as a static property.
Object.defineProperty(LinkeDOMCrawler, "parser", {
    enumerable: true,
    configurable: true,
    writable: true,
    value: new cached_1.DOMParser()
});
/** @internal */
async function linkedomCrawlerEnqueueLinks({ options, window, requestQueue, robotsTxtFile, onSkippedRequest, originalRequestUrl, finalRequestUrl, }) {
    if (!window) {
        throw new Error('Cannot enqueue links because the DOM is not available.');
    }
    const baseUrl = (0, http_1.resolveBaseUrlForEnqueueLinksFiltering)({
        enqueueStrategy: options?.strategy,
        finalRequestUrl,
        originalRequestUrl,
        userProvidedBaseUrl: options?.baseUrl,
    });
    const urls = extractUrlsFromWindow(window, options?.selector ?? 'a', options?.baseUrl ?? finalRequestUrl ?? originalRequestUrl);
    return (0, http_1.enqueueLinks)({
        requestQueue,
        robotsTxtFile,
        onSkippedRequest,
        urls,
        baseUrl,
        ...options,
    });
}
/**
 * Extracts URLs from a given Window object.
 * @ignore
 */
function extractUrlsFromWindow(window, selector, baseUrl) {
    return Array.from(window.document.querySelectorAll(selector))
        .map((e) => e.href)
        .filter((href) => href !== undefined && href !== '')
        .map((href) => {
            if (href === undefined) {
                return undefined;
            }
            return (0, http_1.tryAbsoluteURL)(href, baseUrl);
        })
        .filter((href) => href !== undefined && href !== '');
}
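/*
 * Usage sketch for the context helpers added in `_runRequestHandler` above.
 * The selector and handler body are illustrative assumptions, not part of this module.
 *
 *     const crawler = new LinkeDOMCrawler({
 *         async requestHandler({ request, waitForSelector, parseWithCheerio }) {
 *             // Resolves once the selector matches, or throws after ~5 s by default.
 *             await waitForSelector('.product');
 *             // Returns a Cheerio handle loaded from the raw response body.
 *             const $ = await parseWithCheerio();
 *             const names = $('.product .name').toArray().map((el) => $(el).text());
 *         },
 *     });
 */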
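/*
 * Usage sketch for `enqueueLinks` as exposed on the crawling context by `_parseHTML`,
 * which delegates to `linkedomCrawlerEnqueueLinks` above. The selector and strategy
 * values are illustrative assumptions; `selector` falls back to 'a' when omitted.
 *
 *     const crawler = new LinkeDOMCrawler({
 *         async requestHandler({ enqueueLinks }) {
 *             await enqueueLinks({
 *                 selector: 'a.pagination__next',
 *                 strategy: 'same-domain',
 *             });
 *         },
 *     });
 */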
/**
 * Creates a new {@link Router} instance that routes requests based on their labels.
 * This instance can then serve as a `requestHandler` of your {@link LinkeDOMCrawler}.
 * The handler context defaults to {@link LinkeDOMCrawlingContext}.
 *
 * > Serves as a shortcut for using `Router.create<LinkeDOMCrawlingContext>()`.
 *
 * ```ts
 * import { LinkeDOMCrawler, createLinkeDOMRouter } from 'crawlee';
 *
 * const router = createLinkeDOMRouter();
 * router.addHandler('label-a', async (ctx) => {
 *     ctx.log.info('...');
 * });
 * router.addDefaultHandler(async (ctx) => {
 *     ctx.log.info('...');
 * });
 *
 * const crawler = new LinkeDOMCrawler({
 *     requestHandler: router,
 * });
 * await crawler.run();
 * ```
 */
function createLinkeDOMRouter(routes) {
    return http_1.Router.create(routes);
}
//# sourceMappingURL=linkedom-crawler.js.map