UNPKG

@crawlee/utils

Version:

A set of shared utilities that can be used by crawlers

42 lines 2.05 kB
import type { CheerioAPI, load } from 'cheerio';

/**
 * The type of the value produced by cheerio's `load` function,
 * i.e. a parsed HTML document.
 */
export type CheerioRoot = ReturnType<typeof load>;

/**
 * Converts an HTML document to plain text.
 *
 * The output resembles what you would get by pressing Ctrl+A and Ctrl+C
 * on the page in a web browser. The function does not try to preserve
 * formatting or to be perfectly correct with respect to the HTML
 * specifications. It does, however, attempt to emit newlines and
 * whitespace in and around HTML elements so that distinct pieces of text
 * are not merged together, which keeps data such as phone numbers
 * extractable from the result.
 *
 * The HTML is parsed with [cheerio](https://www.npmjs.com/package/cheerio).
 * To avoid parsing the same HTML twice, an already-parsed Cheerio object
 * may be passed instead of the HTML text. That HTML should have been parsed
 * with the `decodeEntities` option set to `true`.
 *
 * @example
 * ```javascript
 * const text = htmlToText('<html><body>Some text</body></html>');
 * console.log(text);
 * ```
 *
 * @example
 * ```javascript
 * import * as cheerio from 'cheerio';
 * const html = '<html><body>Some text</body></html>';
 * const text = htmlToText(cheerio.load(html, { decodeEntities: true }));
 * ```
 *
 * @param htmlOrCheerioElement HTML text, or parsed HTML represented by a
 *   [cheerio](https://www.npmjs.com/package/cheerio) function.
 * @returns Plain text
 */
export declare function htmlToText(htmlOrCheerioElement: string | CheerioRoot): string;

/**
 * Extracts URLs from a given Cheerio object.
 *
 * @param $ the Cheerio object to extract URLs from
 * @param selector a CSS selector for matching link elements
 * @param baseUrl a URL for resolving relative links
 * @throws when a relative URL is encountered with no baseUrl set
 * @returns An array of absolute URLs
 */
export declare function extractUrlsFromCheerio(
    $: CheerioAPI,
    selector?: string,
    baseUrl?: string,
): string[];
//# sourceMappingURL=cheerio.d.ts.map