@crawlee/utils
Version:
A set of shared utilities that can be used by crawlers
42 lines • 2.05 kB
TypeScript
import type { CheerioAPI, load } from 'cheerio';
/**
 * Type of the value returned by the `load` function of
 * [cheerio](https://www.npmjs.com/package/cheerio), derived with `ReturnType<typeof load>`
 * so it always follows the installed cheerio typings.
 *
 * `htmlToText` accepts a value of this type as an alternative to a raw HTML string.
 */
export type CheerioRoot = ReturnType<typeof load>;
/**
 * Converts an HTML document to plain text.
 *
 * The generated plain text is similar to the text captured by pressing Ctrl+A and Ctrl+C
 * on a page loaded in a web browser.
 * The function does not aspire to preserve the formatting or to be perfectly correct with respect
 * to the HTML specifications. However, it attempts to generate newlines and whitespace in and around
 * HTML elements to avoid merging distinct parts of the text, and thus enables extraction of data
 * from the text (e.g. phone numbers).
 *
 * **Example usage**
 * ```javascript
 * const text = htmlToText('<html><body>Some text</body></html>');
 * console.log(text);
 * ```
 *
 * Note that the function uses [cheerio](https://www.npmjs.com/package/cheerio) to parse the HTML.
 * Optionally, to avoid parsing the same HTML twice and thus improve performance, you can pass
 * an existing Cheerio object to the function instead of the HTML text. The HTML should be parsed
 * with the `decodeEntities` option set to `true`. For example:
 *
 * ```javascript
 * import * as cheerio from 'cheerio';
 * const html = '<html><body>Some text</body></html>';
 * const text = htmlToText(cheerio.load(html, { decodeEntities: true }));
 * ```
 *
 * @param htmlOrCheerioElement Either the HTML source as a string, or already-parsed HTML
 *   represented by a [cheerio](https://www.npmjs.com/package/cheerio) function
 *   (the value returned by `cheerio.load()`).
 * @returns The plain-text representation of the document.
 */
export declare function htmlToText(htmlOrCheerioElement: string | CheerioRoot): string;
/**
 * Extracts URLs from a given Cheerio object.
 *
 * @param $ The Cheerio object to extract URLs from.
 * @param selector A CSS selector for matching link elements. Optional; the value used when
 *   it is omitted is defined by the implementation and is not visible in this declaration.
 * @param baseUrl A URL for resolving relative links. Optional, but see `@throws` below.
 * @throws When a relative URL is encountered with no `baseUrl` set.
 * @returns An array of absolute URLs.
 */
export declare function extractUrlsFromCheerio($: CheerioAPI, selector?: string, baseUrl?: string): string[];
//# sourceMappingURL=cheerio.d.ts.map