UNPKG

@divriots/cheerio

Version:

The fast, flexible & elegant library for parsing and manipulating HTML and XML.

290 lines 13.5 kB
"use strict";
/**
 * @file Batteries-included version of Cheerio. This module includes several
 * convenience methods for loading documents from various sources.
 */

/*
 * Downlevel helpers. Each one reuses an implementation already present on
 * `this` (if any) and otherwise falls back to the local definition below.
 */

// Shallow-merges own enumerable properties of every later argument into the
// first one; uses `Object.assign` when available.
var __assign = (this && this.__assign) || function () {
    __assign = Object.assign || function(t) {
        for (var s, i = 1, n = arguments.length; i < n; i++) {
            s = arguments[i];
            for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p))
                t[p] = s[p];
        }
        return t;
    };
    return __assign.apply(this, arguments);
};
// Re-exposes `m[k]` on `o` as `k2` (defaults to `k`), through a getter where
// property descriptors are available.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
      desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
// Sets `o.default` to `v`.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
// Copies every binding of `m` except "default" onto `exports`, without
// overwriting names `exports` already owns.
var __exportStar = (this && this.__exportStar) || function(m, exports) {
    for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
};
// Returns `mod` untouched when it is flagged `__esModule`; otherwise builds a
// namespace object with the module's own properties plus `default = mod`.
var __importStar = (this && this.__importStar) || function (mod) {
    if (mod && mod.__esModule) return mod;
    var result = {};
    if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
    __setModuleDefault(result, mod);
    return result;
};
// Drives a generator-like object to completion, resolving each yielded value
// as a promise, and returns a promise for the final result.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
// State-machine emulation of a generator: `body` is called repeatedly with the
// state object `_` and returns `[opcode, value]` instructions.
var __generator = (this && this.__generator) || function (thisArg, body) {
    var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g;
    return g = { next: verb(0), "throw": verb(1), "return": verb(2) }, typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
    function verb(n) { return function (v) { return step([n, v]); }; }
    function step(op) {
        if (f) throw new TypeError("Generator is already executing.");
        while (g && (g = 0, op[0] && (_ = 0)), _) try {
            if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
            if (y = 0, t) op = [op[0] & 2, t.value];
            switch (op[0]) {
                case 0: case 1: t = op; break;
                case 4: _.label++; return { value: op[1], done: false };
                case 5: _.label++; y = op[1]; op = [0]; continue;
                case 7: op = _.ops.pop(); _.trys.pop(); continue;
                default:
                    if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
                    if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
                    if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
                    if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
                    if (t[2]) _.ops.pop();
                    _.trys.pop(); continue;
            }
            op = body.call(thisArg, _);
        } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
        if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
    }
};
// Returns a copy of `s` without the keys listed in `e` (string keys and
// enumerable symbol keys).
var __rest = (this && this.__rest) || function (s, e) {
    var t = {};
    for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p) && e.indexOf(p) < 0)
        t[p] = s[p];
    if (s != null && typeof Object.getOwnPropertySymbols === "function")
        for (var i = 0, p = Object.getOwnPropertySymbols(s); i < p.length; i++) {
            if (e.indexOf(p[i]) < 0 && Object.prototype.propertyIsEnumerable.call(s, p[i]))
                t[p[i]] = s[p[i]];
        }
    return t;
};
// Wraps a module that is not flagged `__esModule` as `{ default: mod }`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.fromURL = exports.decodeStream = exports.stringStream = exports.loadBuffer = void 0;
__exportStar(require("./index.js"), exports);
var index_js_1 = require("./index.js");
var options_js_1 = require("./options.js");
var parse5_htmlparser2_tree_adapter_1 = require("parse5-htmlparser2-tree-adapter");
var htmlparser2 = __importStar(require("htmlparser2"));
var parse5_parser_stream_1 = require("parse5-parser-stream");
var encoding_sniffer_1 = require("encoding-sniffer");
var undici = __importStar(require("undici"));
var whatwg_mimetype_1 = __importDefault(require("whatwg-mimetype"));
var node_stream_1 = require("node:stream");
/**
 * Sniffs the encoding of a buffer, then creates a querying function bound to a
 * document created from the buffer.
 *
 * @category Loading
 * @example
 *
 * ```js
 * import * as cheerio from 'cheerio';
 * import * as fs from 'fs';
 *
 * const buffer = fs.readFileSync('index.html');
 * const $ = cheerio.loadBuffer(buffer);
 * ```
 *
 * @param buffer - The buffer to sniff the encoding of.
 * @param options - The options to pass to Cheerio. `options.encoding` is
 *   forwarded to the encoding sniffer and overrides the default encoding
 *   chosen here.
 * @returns The loaded document.
 */
function loadBuffer(buffer, options) {
    if (options === void 0) { options = {}; }
    var opts = (0, options_js_1.flattenOptions)(options);
    // Fall back to UTF-8 in XML mode and windows-1252 otherwise; entries of
    // `options.encoding` are merged last and therefore win.
    var str = (0, encoding_sniffer_1.decodeBuffer)(buffer, __assign({ defaultEncoding: (opts === null || opts === void 0 ? void 0 : opts.xmlMode) ? 'utf8' : 'windows-1252' }, options.encoding));
    return (0, index_js_1.load)(str, opts);
}
exports.loadBuffer = loadBuffer;
/**
 * Builds the writable stream behind `stringStream` and `decodeStream`.
 *
 * When `options._useHtmlParser2` is truthy the chunks are fed to an
 * htmlparser2 document stream; otherwise a parse5 `ParserStream` is used.
 *
 * NOTE: on the parse5 path `treeAdapter` and `scriptingEnabled` are written
 * onto the `options` object that was passed in.
 *
 * @param options - Already-flattened Cheerio options (may be nullish).
 * @param cb - Called with `(err, $)` once the document has been parsed.
 * @returns The writable stream to push string chunks into.
 */
function _stringStream(options, cb) {
    var _a;
    if (options === null || options === void 0 ? void 0 : options._useHtmlParser2) {
        var parser_1 = htmlparser2.createDocumentStream(function (err, document) { return cb(err, (0, index_js_1.load)(document)); }, options);
        return new node_stream_1.Writable({
            // Keep string chunks as strings instead of converting to Buffers.
            decodeStrings: false,
            write: function (chunk, _encoding, callback) {
                if (typeof chunk !== 'string') {
                    throw new TypeError('Expected a string');
                }
                parser_1.write(chunk);
                callback();
            },
            final: function (callback) {
                parser_1.end();
                callback();
            },
        });
    }
    options !== null && options !== void 0 ? options : (options = {});
    (_a = options.treeAdapter) !== null && _a !== void 0 ? _a : (options.treeAdapter = parse5_htmlparser2_tree_adapter_1.adapter);
    // Scripting is enabled unless it was explicitly set to `false`.
    if (options.scriptingEnabled !== false) {
        options.scriptingEnabled = true;
    }
    var stream = new parse5_parser_stream_1.ParserStream(options);
    (0, node_stream_1.finished)(stream, function (err) { return cb(err, (0, index_js_1.load)(stream.document)); });
    return stream;
}
/**
 * Creates a stream that parses a sequence of strings into a document.
 *
 * The stream is a `Writable` stream that accepts strings. When the stream is
 * finished, the callback is called with the loaded document.
 *
 * @category Loading
 * @example
 *
 * ```js
 * import * as cheerio from 'cheerio';
 * import * as fs from 'fs';
 *
 * const writeStream = cheerio.stringStream({}, (err, $) => {
 *   if (err) {
 *     // Handle error
 *   }
 *
 *   console.log($('h1').text());
 *   // Output: Hello, world!
 * });
 *
 * fs.createReadStream('my-document.html', { encoding: 'utf8' }).pipe(
 *   writeStream
 * );
 * ```
 *
 * @param options - The options to pass to Cheerio.
 * @param cb - The callback to call when the stream is finished.
 * @returns The writable stream.
 */
function stringStream(options, cb) {
    return _stringStream((0, options_js_1.flattenOptions)(options), cb);
}
exports.stringStream = stringStream;
/**
 * Parses a stream of buffers into a document.
 *
 * The stream is a `Writable` stream that accepts buffers. When the stream is
 * finished, the callback is called with the loaded document.
 *
 * The caller's `options.encoding` object is never modified; the default
 * encoding is applied to a private copy.
 *
 * @category Loading
 * @param options - The options to pass to Cheerio. `options.encoding` is
 *   handed to the encoding sniffer's `DecodeStream`.
 * @param cb - The callback to call when the stream is finished.
 * @returns The writable stream.
 */
function decodeStream(options, cb) {
    var cheerioOptions = __rest(options, ["encoding"]);
    var opts = (0, options_js_1.flattenOptions)(cheerioOptions);
    // Copy before filling in defaults so an `encoding` object that the caller
    // reuses across calls does not keep a default chosen for an earlier call.
    var encoding = __assign({}, options.encoding);
    // Set the default encoding to UTF-8 for XML mode
    if (encoding.defaultEncoding == null) {
        encoding.defaultEncoding = (opts === null || opts === void 0 ? void 0 : opts.xmlMode) ? 'utf8' : 'windows-1252';
    }
    var decoder = new encoding_sniffer_1.DecodeStream(encoding);
    var loadStream = _stringStream(opts, cb);
    decoder.pipe(loadStream);
    return decoder;
}
exports.decodeStream = decodeStream;
/** Request options used by `fromURL` when the caller supplies none. */
var defaultRequestOptions = {
    method: 'GET',
    // Allow redirects by default
    maxRedirections: 5,
    // NOTE: `throwOnError` currently doesn't work https://github.com/nodejs/undici/issues/1753
    throwOnError: true,
    // Set an Accept header
    headers: {
        accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    },
};
/**
 * `fromURL` loads a document from a URL.
 *
 * By default, redirects are allowed and non-2xx responses are rejected
 * (the latter relies on undici's `throwOnError`; see the note on
 * `defaultRequestOptions`).
 *
 * Neither `options.requestOptions` nor `options.encoding` is modified; default
 * headers and the response charset are applied to copies.
 *
 * @category Loading
 * @example
 *
 * ```js
 * import * as cheerio from 'cheerio';
 *
 * const $ = await cheerio.fromURL('https://example.com');
 * ```
 *
 * @param url - The URL to load the document from.
 * @param options - The options to pass to Cheerio. `requestOptions` is passed
 *   to `undici.stream`, `encoding` to the decode stream.
 * @returns The loaded document.
 * @throws {RangeError} (as a rejection) when the response content type is
 *   neither HTML nor XML.
 */
function fromURL(url, options) {
    if (options === void 0) { options = {}; }
    return __awaiter(this, void 0, void 0, function () {
        var _b, requestOptions, _c, encoding, cheerioOptions, effectiveRequestOptions, undiciStream, promise;
        return __generator(this, function (_d) {
            switch (_d.label) {
                case 0:
                    _b = options.requestOptions, requestOptions = _b === void 0 ? defaultRequestOptions : _b, _c = options.encoding, encoding = _c === void 0 ? {} : _c, cheerioOptions = __rest(options, ["requestOptions", "encoding"]);
                    // Add headers if none were supplied, without writing to the
                    // caller's object.
                    effectiveRequestOptions = requestOptions.headers == null
                        ? __assign(__assign({}, requestOptions), { headers: defaultRequestOptions.headers })
                        : requestOptions;
                    promise = new Promise(function (resolve, reject) {
                        undiciStream = undici.stream(url, effectiveRequestOptions, function (res) {
                            var _a, _b;
                            var contentType = (_a = res.headers['content-type']) !== null && _a !== void 0 ? _a : 'text/html';
                            var mimeType = new whatwg_mimetype_1.default(Array.isArray(contentType) ? contentType[0] : contentType);
                            if (!mimeType.isHTML() && !mimeType.isXML()) {
                                throw new RangeError("The content-type \"".concat(contentType, "\" is neither HTML nor XML."));
                            }
                            // Forward the charset from the header to the decodeStream.
                            var responseEncoding = __assign(__assign({}, encoding), { transportLayerEncodingLabel: mimeType.parameters.get('charset') });
                            /*
                             * If we allow redirects, we will have entries in the history.
                             * The last entry will be the final URL.
                             */
                            var history = (_b = res.context) === null || _b === void 0 ? void 0 : _b.history;
                            var opts = __assign({
                                encoding: responseEncoding,
                                // Set XML mode based on the MIME type.
                                xmlMode: mimeType.isXML(),
                                // Set the `baseURL` to the final URL.
                                baseURL: history ? history[history.length - 1] : url
                            }, cheerioOptions);
                            return decodeStream(opts, function (err, $) { return (err ? reject(err) : resolve($)); });
                        });
                    });
                    // Let's make sure the request is completed before returning the promise.
                    return [4 /*yield*/, undiciStream];
                case 1:
                    // Let's make sure the request is completed before returning the promise.
                    _d.sent();
                    return [2 /*return*/, promise];
            }
        });
    });
}
exports.fromURL = fromURL;
//# sourceMappingURL=batteries.js.map