@crawlee/linkedom
The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.LinkeDOMCrawler = void 0;
exports.linkedomCrawlerEnqueueLinks = linkedomCrawlerEnqueueLinks;
exports.createLinkeDOMRouter = createLinkeDOMRouter;
const tslib_1 = require("tslib");
const http_1 = require("@crawlee/http");
const utils_1 = require("@crawlee/utils");
const cheerio = tslib_1.__importStar(require("cheerio"));
// @ts-expect-error This throws a compilation error due to TypeScript not inferring the module has CJS versions too
const cached_1 = require("linkedom/cached");
const utilities_1 = require("@apify/utilities");
/**
 * Provides a framework for the parallel crawling of web pages using plain HTTP requests and
 * the [LinkeDOM](https://www.npmjs.com/package/linkedom) DOM implementation.
* The URLs to crawl are fed either from a static list of URLs
* or from a dynamic queue of URLs enabling recursive crawling of websites.
*
* Since `LinkeDOMCrawler` uses raw HTTP requests to download web pages,
* it is very fast and efficient on data bandwidth. However, if the target website requires JavaScript
* to display the content, you might need to use {@link PuppeteerCrawler} or {@link PlaywrightCrawler} instead,
 * because those crawlers load the pages in a full-featured headless browser.
*
* **Limitation**:
 * This crawler does not support proxies and cookies yet (each request starts with an empty cookie store), and the user agent is always set to `Chrome`.
*
* `LinkeDOMCrawler` downloads each URL using a plain HTTP request,
* parses the HTML content using [LinkeDOM](https://www.npmjs.com/package/linkedom)
* and then invokes the user-provided {@link LinkeDOMCrawlerOptions.requestHandler} to extract page data
* using the `window` object.
*
* The source URLs are represented using {@link Request} objects that are fed from
* {@link RequestList} or {@link RequestQueue} instances provided by the {@link LinkeDOMCrawlerOptions.requestList}
* or {@link LinkeDOMCrawlerOptions.requestQueue} constructor options, respectively.
*
* If both {@link LinkeDOMCrawlerOptions.requestList} and {@link LinkeDOMCrawlerOptions.requestQueue} are used,
 * the instance first enqueues all URLs from the {@link RequestList} into the {@link RequestQueue}
 * before it starts processing them. This ensures that a single URL is not crawled multiple times.
*
* The crawler finishes when there are no more {@link Request} objects to crawl.
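 *
 * For example, a request queue can be opened and pre-populated explicitly before the crawler
 * starts (a minimal sketch, assuming `RequestQueue` is imported from `crawlee` alongside the crawler):
 *
 * ```javascript
 * const requestQueue = await RequestQueue.open();
 * await requestQueue.addRequest({ url: 'https://crawlee.dev' });
 *
 * const crawler = new LinkeDOMCrawler({
 *     requestQueue,
 *     async requestHandler({ request, window }) {
 *         // Extract data from the statically parsed DOM here.
 *     },
 * });
 *
 * await crawler.run();
 * ```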
*
* We can use the `preNavigationHooks` to adjust `gotOptions`:
*
* ```
* preNavigationHooks: [
* (crawlingContext, gotOptions) => {
* // ...
* },
* ]
* ```
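 *
 * For instance, a hook can tweak the underlying `got-scraping` request options before navigation
 * (a sketch; the particular timeout and header values below are only illustrative):
 *
 * ```
 * preNavigationHooks: [
 *     (crawlingContext, gotOptions) => {
 *         gotOptions.timeout = { request: 30_000 };
 *         gotOptions.headers = {
 *             ...gotOptions.headers,
 *             'accept-language': 'en-US,en;q=0.9',
 *         };
 *     },
 * ],
 * ```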
*
* By default, `LinkeDOMCrawler` only processes web pages with the `text/html`
* and `application/xhtml+xml` MIME content types (as reported by the `Content-Type` HTTP header),
* and skips pages with other content types. If you want the crawler to process other content types,
* use the {@link LinkeDOMCrawlerOptions.additionalMimeTypes} constructor option.
* Beware that the parsing behavior differs for HTML, XML, JSON and other types of content.
* For more details, see {@link LinkeDOMCrawlerOptions.requestHandler}.
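 *
 * For example, to also let the crawler process XML feeds (a sketch; the MIME type is chosen only for illustration):
 *
 * ```javascript
 * const crawler = new LinkeDOMCrawler({
 *     additionalMimeTypes: ['application/xml'],
 *     async requestHandler({ request, document }) {
 *         // For XML content types, the body should be parsed in XML mode
 *         // (see the `isXml` flag in `_parseHTML` below).
 *     },
 * });
 * ```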
*
* New requests are only dispatched when there is enough free CPU and memory available,
* using the functionality provided by the {@link AutoscaledPool} class.
* All {@link AutoscaledPool} configuration options can be passed to the `autoscaledPoolOptions`
 * parameter of the `LinkeDOMCrawler` constructor. For user convenience, the `minConcurrency` and `maxConcurrency`
 * {@link AutoscaledPool} options are available directly in the `LinkeDOMCrawler` constructor.
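 *
 * For example (a sketch; the concrete limits are arbitrary):
 *
 * ```javascript
 * const crawler = new LinkeDOMCrawler({
 *     minConcurrency: 10,
 *     maxConcurrency: 50,
 *     // ...
 * });
 * ```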
*
* **Example usage:**
*
* ```javascript
* const crawler = new LinkeDOMCrawler({
* async requestHandler({ request, window }) {
* await Dataset.pushData({
* url: request.url,
* title: window.document.title,
* });
* },
* });
*
* await crawler.run([
* 'http://crawlee.dev',
* ]);
* ```
* @category Crawlers
*/
class LinkeDOMCrawler extends http_1.HttpCrawler {
async _parseHTML(response, isXml, crawlingContext) {
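        // Buffer the whole HTTP response and parse it with LinkeDOM's cached DOMParser;
        // the returned object exposes the parsed `window`/`document` plus a context-bound `enqueueLinks`.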
const body = await (0, utilities_1.concatStreamToBuffer)(response);
const document = LinkeDOMCrawler.parser.parseFromString(body.toString(), isXml ? 'text/xml' : 'text/html');
return {
window: document.defaultView,
get body() {
return document.documentElement.outerHTML;
},
get document() {
// See comment about typing in LinkeDOMCrawlingContext definition
return document;
},
enqueueLinks: async (enqueueOptions) => {
return linkedomCrawlerEnqueueLinks({
options: enqueueOptions,
window: document.defaultView,
requestQueue: await this.getRequestQueue(),
robotsTxtFile: await this.getRobotsTxtFileForUrl(crawlingContext.request.url),
onSkippedRequest: this.onSkippedRequest,
originalRequestUrl: crawlingContext.request.url,
finalRequestUrl: crawlingContext.request.loadedUrl,
});
},
};
}
async _runRequestHandler(context) {
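        // The downloaded DOM is static (no client-side JavaScript runs), so `waitForSelector`
        // can only re-check the already fetched body: it re-parses it with cheerio and polls in
        // 50 ms steps until the selector matches or the timeout budget is exhausted.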
context.waitForSelector = async (selector, timeoutMs = 5000) => {
const $ = cheerio.load(context.body);
if ($(selector).get().length === 0) {
if (timeoutMs) {
await (0, utils_1.sleep)(50);
await context.waitForSelector(selector, Math.max(timeoutMs - 50, 0));
return;
}
throw new Error(`Selector '${selector}' not found.`);
}
};
context.parseWithCheerio = async (selector, _timeoutMs = 5000) => {
const $ = cheerio.load(context.body);
if (selector && $(selector).get().length === 0) {
throw new Error(`Selector '${selector}' not found.`);
}
return $;
};
await super._runRequestHandler(context);
}
}
exports.LinkeDOMCrawler = LinkeDOMCrawler;
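// Shared DOMParser instance used by all LinkeDOMCrawler instances
// (the down-levelled form of a `static parser` class property).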
Object.defineProperty(LinkeDOMCrawler, "parser", {
enumerable: true,
configurable: true,
writable: true,
value: new cached_1.DOMParser()
});
/** @internal */
async function linkedomCrawlerEnqueueLinks({ options, window, requestQueue, robotsTxtFile, onSkippedRequest, originalRequestUrl, finalRequestUrl, }) {
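    // Resolve the base URL for the configured enqueue strategy, extract candidate hrefs from the
    // parsed DOM and delegate filtering and deduplication to the shared `enqueueLinks` helper.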
if (!window) {
throw new Error('Cannot enqueue links because the DOM is not available.');
}
const baseUrl = (0, http_1.resolveBaseUrlForEnqueueLinksFiltering)({
enqueueStrategy: options?.strategy,
finalRequestUrl,
originalRequestUrl,
userProvidedBaseUrl: options?.baseUrl,
});
const urls = extractUrlsFromWindow(window, options?.selector ?? 'a', options?.baseUrl ?? finalRequestUrl ?? originalRequestUrl);
return (0, http_1.enqueueLinks)({
requestQueue,
robotsTxtFile,
onSkippedRequest,
urls,
baseUrl,
...options,
});
}
/**
* Extracts URLs from a given Window object.
* @ignore
*/
function extractUrlsFromWindow(window, selector, baseUrl) {
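    // Collect the `href` of every element matching the selector, drop empty values,
    // and resolve the rest to absolute URLs against `baseUrl` (unresolvable ones are filtered out).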
return Array.from(window.document.querySelectorAll(selector))
.map((e) => e.href)
.filter((href) => href !== undefined && href !== '')
.map((href) => {
if (href === undefined) {
return undefined;
}
return (0, http_1.tryAbsoluteURL)(href, baseUrl);
})
.filter((href) => href !== undefined && href !== '');
}
/**
 * Creates a new {@link Router} instance that works based on request labels.
 * This instance can then serve as the `requestHandler` of your {@link LinkeDOMCrawler}.
 * The routing context defaults to {@link LinkeDOMCrawlingContext}.
*
* > Serves as a shortcut for using `Router.create<LinkeDOMCrawlingContext>()`.
*
* ```ts
* import { LinkeDOMCrawler, createLinkeDOMRouter } from 'crawlee';
*
* const router = createLinkeDOMRouter();
* router.addHandler('label-a', async (ctx) => {
* ctx.log.info('...');
* });
* router.addDefaultHandler(async (ctx) => {
* ctx.log.info('...');
* });
*
* const crawler = new LinkeDOMCrawler({
* requestHandler: router,
* });
* await crawler.run();
* ```
*/
function createLinkeDOMRouter(routes) {
return http_1.Router.create(routes);
}
//# sourceMappingURL=linkedom-crawler.js.map