UNPKG

@crawlee/utils

Version:

A set of shared utilities that can be used by crawlers

118 lines 5.37 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.htmlToText = htmlToText; exports.extractUrlsFromCheerio = extractUrlsFromCheerio; const tslib_1 = require("tslib"); const cheerio = tslib_1.__importStar(require("cheerio")); const extract_urls_1 = require("./extract-urls"); // NOTE: We are skipping 'noscript' since it's content is evaluated as text, instead of HTML elements. That damages the results. const SKIP_TAGS_REGEX = /^(script|style|canvas|svg|noscript)$/i; const BLOCK_TAGS_REGEX = /^(p|h1|h2|h3|h4|h5|h6|ol|ul|li|pre|address|blockquote|dl|div|fieldset|form|table|tr|select|option)$/i; /** * The function converts a HTML document to a plain text. * * The plain text generated by the function is similar to a text captured * by pressing Ctrl+A and Ctrl+C on a page when loaded in a web browser. * The function doesn't aspire to preserve the formatting or to be perfectly correct with respect to HTML specifications. * However, it attempts to generate newlines and whitespaces in and around HTML elements * to avoid merging distinct parts of text and thus enable extraction of data from the text (e.g. phone numbers). * * **Example usage** * ```javascript * const text = htmlToText('<html><body>Some text</body></html>'); * console.log(text); * ``` * * Note that the function uses [cheerio](https://www.npmjs.com/package/cheerio) to parse the HTML. * Optionally, to avoid duplicate parsing of HTML and thus improve performance, you can pass * an existing Cheerio object to the function instead of the HTML text. The HTML should be parsed * with the `decodeEntities` option set to `true`. For example: * * ```javascript * import * as cheerio from 'cheerio'; * const html = '<html><body>Some text</body></html>'; * const text = htmlToText(cheerio.load(html, { decodeEntities: true })); * ``` * @param htmlOrCheerioElement HTML text or parsed HTML represented using a [cheerio](https://www.npmjs.com/package/cheerio) function. * @return Plain text */ function htmlToText(htmlOrCheerioElement) { if (!htmlOrCheerioElement) return ''; const $ = typeof htmlOrCheerioElement === 'function' ? htmlOrCheerioElement : cheerio.load(htmlOrCheerioElement, { decodeEntities: true }); let text = ''; const process = (elems) => { const len = elems ? elems.length : 0; for (let i = 0; i < len; i++) { const elem = elems[i]; if (elem.type === 'text') { // Compress spaces, unless we're inside <pre> element let compr; if (elem.parent && elem.parent.tagName === 'pre') compr = elem.data; else compr = elem.data.replace(/\s+/g, ' '); // If text is empty or ends with a whitespace, don't add the leading whitespace if (compr.startsWith(' ') && /(^|\s)$/.test(text)) compr = compr.substring(1); text += compr; } else if (elem.type === 'comment' || SKIP_TAGS_REGEX.test(elem.tagName)) { // Skip comments and special elements } else if (elem.tagName === 'br') { text += '\n'; } else if (elem.tagName === 'td') { process(elem.children); text += '\t'; } else { // Block elements must be surrounded by newlines (unless beginning of text) const isBlockTag = BLOCK_TAGS_REGEX.test(elem.tagName); if (isBlockTag && !/(^|\n)$/.test(text)) text += '\n'; process(elem.children); if (isBlockTag && !text.endsWith('\n')) text += '\n'; } } }; // If HTML document has body, only convert that, otherwise convert the entire HTML const $body = $('body'); process($body.length > 0 ? $body : $.root()); return text.trim(); } /** * Extracts URLs from a given Cheerio object. * * @param $ the Cheerio object to extract URLs from * @param selector a CSS selector for matching link elements * @param baseUrl a URL for resolving relative links * @throws when a relative URL is encountered with no baseUrl set * @return An array of absolute URLs */ function extractUrlsFromCheerio($, selector = 'a', baseUrl = '') { const base = $('base').attr('href'); const absoluteBaseUrl = base && (0, extract_urls_1.tryAbsoluteURL)(base, baseUrl); if (absoluteBaseUrl) { baseUrl = absoluteBaseUrl; } return $(selector) .map((_i, el) => $(el).attr('href')) .get() .filter(Boolean) .map((href) => { // Throw a meaningful error when only a relative URL would be extracted instead of waiting for the Request to fail later. const isHrefAbsolute = /^[a-z][a-z0-9+.-]*:/.test(href); // Grabbed this in 'is-absolute-url' package. if (!isHrefAbsolute && !baseUrl) { throw new Error(`An extracted URL: ${href} is relative and baseUrl is not set. ` + 'Provide a baseUrl to automatically resolve relative URLs.'); } return baseUrl ? (0, extract_urls_1.tryAbsoluteURL)(href, baseUrl) : href; }) .filter(Boolean); } //# sourceMappingURL=cheerio.js.map