/*
 * @divriots/cheerio
 * Version: (not specified)
 * The fast, flexible & elegant library for parsing and manipulating HTML and XML.
 * 290 lines • 13.5 kB
 * JavaScript
 */
;
/**
* @file Batteries-included version of Cheerio. This module includes several
* convenience methods for loading documents from various sources.
*/
/**
 * Compiler helper behind object spread: copies the properties of every source
 * argument onto the first argument and returns that first argument.
 *
 * The wrapper is a one-shot bootstrap. On its first call it replaces itself
 * with `Object.assign` when available, otherwise with a small fallback that
 * copies own properties found by `for...in`, and then forwards the call.
 */
var __assign = (this && this.__assign) || function () {
    __assign = Object.assign || function (target) {
        var total = arguments.length;
        // Index 0 is the target itself; sources start at 1.
        for (var index = 1; index < total; index++) {
            var source = arguments[index];
            for (var key in source) {
                if (Object.prototype.hasOwnProperty.call(source, key)) {
                    target[key] = source[key];
                }
            }
        }
        return target;
    };
    return __assign.apply(this, arguments);
};
/**
 * Compiler helper that re-exports `source[key]` on `target` under `alias`
 * (defaulting to `key`).
 *
 * When `Object.create` exists, the binding is installed with
 * `Object.defineProperty`. The source's own descriptor is reused unless it is
 * missing, is an accessor on a non-`__esModule` object, or is a writable or
 * configurable data property; in those cases an enumerable getter that reads
 * `source[key]` on every access is installed instead. Without `Object.create`
 * the current value is simply copied.
 */
var __createBinding = (this && this.__createBinding) || (Object.create ? (function (target, source, key, alias) {
    if (alias === undefined) {
        alias = key;
    }
    var descriptor = Object.getOwnPropertyDescriptor(source, key);
    var needsLiveGetter = !descriptor ||
        ("get" in descriptor ? !source.__esModule : descriptor.writable || descriptor.configurable);
    if (needsLiveGetter) {
        descriptor = {
            enumerable: true,
            get: function () { return source[key]; }
        };
    }
    Object.defineProperty(target, alias, descriptor);
}) : (function (target, source, key, alias) {
    if (alias === undefined) {
        alias = key;
    }
    target[alias] = source[key];
}));
/**
 * Compiler helper that stores `value` as the `default` member of `target`.
 * With `Object.create` available the member is defined as an enumerable data
 * property through `Object.defineProperty`; otherwise it is plainly assigned.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function (target, value) {
        Object.defineProperty(target, "default", { value: value, enumerable: true });
    }
    : function (target, value) {
        target["default"] = value;
    });
/**
 * Compiler helper for `export * from`: binds every enumerable member of `m`
 * onto `exports`, skipping `default` and any name `exports` already owns.
 */
var __exportStar = (this && this.__exportStar) || function (m, exports) {
    for (var name in m) {
        if (name === "default") {
            continue;
        }
        if (Object.prototype.hasOwnProperty.call(exports, name)) {
            continue;
        }
        __createBinding(exports, m, name);
    }
};
/**
 * Compiler helper for `import * as ns`: ES modules (flagged `__esModule`) are
 * returned untouched. Anything else is wrapped in a fresh namespace object
 * that re-binds each own member except `default`, and exposes the original
 * module as `default`.
 */
var __importStar = (this && this.__importStar) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    var namespace = {};
    if (mod != null) {
        for (var key in mod) {
            if (key !== "default" && Object.prototype.hasOwnProperty.call(mod, key)) {
                __createBinding(namespace, mod, key);
            }
        }
    }
    __setModuleDefault(namespace, mod);
    return namespace;
};
/**
 * Compiler helper that runs a generator function as an async function.
 *
 * `generator` is invoked with `thisArg` and `_arguments`, and the resulting
 * iterator is driven to completion. Each yielded value is wrapped in the
 * promise constructor `P` (defaulting to `Promise`); its fulfilment is fed
 * back through `next`, its rejection through `throw`.
 *
 * Returns a `P` that resolves with the iterator's final value, or rejects
 * with any error thrown while stepping the iterator.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Pass instances of `P` through unchanged; wrap anything else in a resolved `P`.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the iterator with a fulfilled value; a synchronous throw rejects the outer promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Resume the iterator by throwing the rejection reason into it.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// Settle when the iterator is done, otherwise wait on the yielded value.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// `generator` is rebound from the generator function to the iterator it returns.
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
/**
 * Compiler helper that emulates a generator with a hand-rolled state machine.
 *
 * `body` is called repeatedly with the state object `_` and returns an
 * instruction tuple `[opcode, value]`. The state object carries:
 * - `label`: the `switch` case that `body` should resume at,
 * - `sent()`: returns the value passed to the last resume, or throws it when
 *   the resume was a `throw`,
 * - `trys`: a stack of try-region descriptors,
 * - `ops`: a stack of pending instructions.
 *
 * The returned object exposes `next`, `throw` and `return`, which map to
 * opcodes 0, 1 and 2, and is its own iterator when `Symbol` is available.
 */
var __generator = (this && this.__generator) || function (thisArg, body) {
var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g;
return g = { next: verb(0), "throw": verb(1), "return": verb(2) }, typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
function verb(n) { return function (v) { return step([n, v]); }; }
function step(op) {
// `f` is set while a step is running, so re-entrant calls are rejected.
if (f) throw new TypeError("Generator is already executing.");
while (g && (g = 0, op[0] && (_ = 0)), _) try {
// While an inner iterator `y` is active (set by opcode 5), forward the operation to it first.
if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
if (y = 0, t) op = [op[0] & 2, t.value];
switch (op[0]) {
// 0/1: resume with a value / an error; remembered in `t` for `_.sent()`.
case 0: case 1: t = op; break;
// 4: yield `op[1]` to the consumer and advance to the next label.
case 4: _.label++; return { value: op[1], done: false };
// 5: start delegating to the iterator in `op[1]`.
case 5: _.label++; y = op[1]; op = [0]; continue;
// 7: pop the pending instruction and the current try region (presumably end-of-finally — confirm against tslib).
case 7: op = _.ops.pop(); _.trys.pop(); continue;
default:
// Remaining opcodes (2 return, 3 jump to label, 6 caught exception) are routed through the active try region, if any.
if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
if (t[2]) _.ops.pop();
_.trys.pop(); continue;
}
op = body.call(thisArg, _);
// An exception thrown by `body` becomes opcode 6 and is processed on the next loop pass.
} catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
// Once the loop has ended: rethrow for throw-type opcodes, otherwise report completion.
if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
}
};
/**
 * Compiler helper for object rest (`const { a, ...rest } = s`).
 *
 * Returns a new object holding every own enumerable property of `s`, both
 * string-keyed and symbol-keyed, whose key is not listed in `e`. A `null` or
 * `undefined` source yields an empty object.
 */
var __rest = (this && this.__rest) || function (s, e) {
    var remainder = {};
    for (var key in s) {
        if (Object.prototype.hasOwnProperty.call(s, key) && e.indexOf(key) < 0) {
            remainder[key] = s[key];
        }
    }
    if (s != null && typeof Object.getOwnPropertySymbols === "function") {
        var symbols = Object.getOwnPropertySymbols(s);
        for (var index = 0; index < symbols.length; index++) {
            var symbol = symbols[index];
            // Symbol keys need an explicit enumerability check; `for...in` never visits them.
            if (e.indexOf(symbol) < 0 && Object.prototype.propertyIsEnumerable.call(s, symbol)) {
                remainder[symbol] = s[symbol];
            }
        }
    }
    return remainder;
};
/**
 * Compiler helper for default imports: ES modules (flagged `__esModule`) are
 * returned as-is; any other value is wrapped as `{ default: value }`.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.fromURL = exports.decodeStream = exports.stringStream = exports.loadBuffer = void 0;
__exportStar(require("./index.js"), exports);
var index_js_1 = require("./index.js");
var options_js_1 = require("./options.js");
var parse5_htmlparser2_tree_adapter_1 = require("parse5-htmlparser2-tree-adapter");
var htmlparser2 = __importStar(require("htmlparser2"));
var parse5_parser_stream_1 = require("parse5-parser-stream");
var encoding_sniffer_1 = require("encoding-sniffer");
var undici = __importStar(require("undici"));
var whatwg_mimetype_1 = __importDefault(require("whatwg-mimetype"));
var node_stream_1 = require("node:stream");
/**
 * Sniffs the encoding of a buffer, then creates a querying function bound to a
 * document created from the buffer.
 *
 * When sniffing is inconclusive the fallback encoding is `utf8` if the
 * flattened options enable `xmlMode`, and `windows-1252` otherwise. Values
 * supplied in `options.encoding` take precedence over that fallback.
 *
 * @category Loading
 * @example
 *
 * ```js
 * import * as cheerio from 'cheerio';
 * import * as fs from 'fs';
 *
 * const buffer = fs.readFileSync('index.html');
 * const $ = cheerio.loadBuffer(buffer);
 * ```
 *
 * @param buffer - The buffer to sniff the encoding of.
 * @param options - The options to pass to Cheerio. May include an `encoding`
 *   object that is forwarded to the encoding sniffer.
 * @returns The loaded document.
 */
function loadBuffer(buffer, options) {
    // Treat an explicit `null` like an omitted argument so `options.encoding` is safe to read.
    if (options == null) {
        options = {};
    }
    var opts = (0, options_js_1.flattenOptions)(options);
    var xmlMode = opts === null || opts === void 0 ? void 0 : opts.xmlMode;
    // The computed default comes first so caller-supplied sniffer options override it.
    var sniffOptions = __assign({ defaultEncoding: xmlMode ? 'utf8' : 'windows-1252' }, options.encoding);
    var str = (0, encoding_sniffer_1.decodeBuffer)(buffer, sniffOptions);
    return (0, index_js_1.load)(str, opts);
}
exports.loadBuffer = loadBuffer;
/**
 * Builds the writable stream shared by `stringStream` and `decodeStream`.
 *
 * With `options._useHtmlParser2` set, chunks are fed to an htmlparser2
 * document stream through a `Writable` wrapper. Otherwise a parse5
 * `ParserStream` is returned directly, using the htmlparser2 tree adapter
 * unless another adapter is given, and with scripting enabled unless
 * `scriptingEnabled` is explicitly `false`.
 *
 * NOTE(review): `load` receives only the parsed document here; the parser
 * options are not forwarded to it. Confirm this is intended.
 *
 * @param options - Flattened (internal) Cheerio options; may be `undefined`.
 *   The object is never modified.
 * @param cb - Called as `cb(err, $)` once parsing has finished.
 * @returns The writable stream that accepts string chunks.
 */
function _stringStream(options, cb) {
    if (options === null || options === void 0 ? void 0 : options._useHtmlParser2) {
        var parser_1 = htmlparser2.createDocumentStream(function (err, document) { return cb(err, (0, index_js_1.load)(document)); }, options);
        return new node_stream_1.Writable({
            // Keep chunks as strings instead of letting the stream convert them to buffers.
            decodeStrings: false,
            write: function (chunk, _encoding, callback) {
                if (typeof chunk !== 'string') {
                    throw new TypeError('Expected a string');
                }
                parser_1.write(chunk);
                callback();
            },
            final: function (callback) {
                parser_1.end();
                callback();
            },
        });
    }
    // Apply the parse5 defaults to a shallow copy: `options` may be the very
    // object the caller passed in, and it must not be modified behind their back.
    var parserOptions = __assign({}, options);
    if (parserOptions.treeAdapter == null) {
        parserOptions.treeAdapter = parse5_htmlparser2_tree_adapter_1.adapter;
    }
    if (parserOptions.scriptingEnabled !== false) {
        parserOptions.scriptingEnabled = true;
    }
    var stream = new parse5_parser_stream_1.ParserStream(parserOptions);
    (0, node_stream_1.finished)(stream, function (err) { return cb(err, (0, index_js_1.load)(stream.document)); });
    return stream;
}
/**
 * Returns a `Writable` stream that consumes string chunks and assembles them
 * into a document.
 *
 * Once the stream has finished, `cb` is invoked with a possible error and the
 * loaded document.
 *
 * @category Loading
 * @example
 *
 * ```js
 * import * as cheerio from 'cheerio';
 * import * as fs from 'fs';
 *
 * const writeStream = cheerio.stringStream({}, (err, $) => {
 *   if (err) {
 *     // Handle error
 *   }
 *
 *   console.log($('h1').text());
 *   // Output: Hello, world!
 * });
 *
 * fs.createReadStream('my-document.html', { encoding: 'utf8' }).pipe(
 *   writeStream
 * );
 * ```
 *
 * @param options - The options to pass to Cheerio.
 * @param cb - The callback to call when the stream is finished.
 * @returns The writable stream.
 */
function stringStream(options, cb) {
    // Normalise the public options into the internal shape the parser stream expects.
    var flattened = (0, options_js_1.flattenOptions)(options);
    return _stringStream(flattened, cb);
}
exports.stringStream = stringStream;
/**
 * Parses a stream of buffers into a document.
 *
 * The stream is a `Writable` stream that accepts buffers. When the stream is
 * finished, the callback is called with the loaded document.
 *
 * Incoming bytes are decoded by an encoding-sniffing stream and piped into
 * the string parser stream. Unless `options.encoding.defaultEncoding` is set,
 * the fallback encoding is `utf8` in XML mode and `windows-1252` otherwise.
 * The `options` object and its `encoding` member are not modified.
 *
 * @category Loading
 * @param options - The options to pass to Cheerio. The optional `encoding`
 *   member is forwarded to the decoder; everything else goes to Cheerio.
 * @param cb - The callback to call when the stream is finished.
 * @returns The writable stream.
 */
function decodeStream(options, cb) {
    // Treat a missing options object like an empty one.
    if (options == null) {
        options = {};
    }
    var cheerioOptions = __rest(options, ["encoding"]);
    // Copy the decoder settings: the default below must not leak into an
    // object the caller may reuse for another document (e.g. HTML, then XML).
    var encoding = __assign({}, options.encoding);
    var opts = (0, options_js_1.flattenOptions)(cheerioOptions);
    // Set the default encoding to UTF-8 for XML mode
    if (encoding.defaultEncoding == null) {
        encoding.defaultEncoding = (opts === null || opts === void 0 ? void 0 : opts.xmlMode) ? 'utf8' : 'windows-1252';
    }
    var decoder = new encoding_sniffer_1.DecodeStream(encoding);
    var loadStream = _stringStream(opts, cb);
    decoder.pipe(loadStream);
    return decoder;
}
exports.decodeStream = decodeStream;
/**
 * Request options `fromURL` falls back to when the caller supplies no
 * `requestOptions`. Its `headers` are also used on their own when
 * caller-supplied request options carry no headers.
 */
var defaultRequestOptions = {
    method: "GET",
    // Redirects are followed unless the caller overrides this.
    maxRedirections: 5,
    // NOTE: `throwOnError` currently doesn't work https://github.com/nodejs/undici/issues/1753
    throwOnError: true,
    headers: {
        // Ask for HTML/XHTML first, then XML, then anything else.
        accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    },
};
/**
 * `fromURL` loads a document from a URL.
 *
 * By default, redirects are allowed and non-2xx responses are rejected.
 *
 * The response's `content-type` header (defaulting to `text/html`) must be an
 * HTML or XML MIME type; otherwise a `RangeError` is thrown from the response
 * callback. The header's `charset` is forwarded to the decoder, XML mode is
 * derived from the MIME type, and `baseURL` is set to the last entry of the
 * redirect history, or to `url` when there is none. Options supplied by the
 * caller override these derived values.
 *
 * The `options` object, and its `requestOptions` and `encoding` members, are
 * not modified, so the same object can be reused across calls.
 *
 * @category Loading
 * @example
 *
 * ```js
 * import * as cheerio from 'cheerio';
 *
 * const $ = await cheerio.fromURL('https://example.com');
 * ```
 *
 * @param url - The URL to load the document from.
 * @param options - The options to pass to Cheerio. May also carry
 *   `requestOptions` (passed to undici) and `encoding` (passed to the decoder).
 * @returns The loaded document.
 */
function fromURL(url, options) {
    // Treat an explicit `null` like an omitted argument.
    if (options == null) {
        options = {};
    }
    return __awaiter(this, void 0, void 0, function () {
        var requestOptions, encoding, cheerioOptions, undiciStream, promise;
        return __generator(this, function (_d) {
            switch (_d.label) {
                case 0:
                    requestOptions = options.requestOptions == null ? defaultRequestOptions : options.requestOptions;
                    cheerioOptions = __rest(options, ["requestOptions", "encoding"]);
                    // Copy the decoder settings so per-response values never leak
                    // into an object the caller may reuse for another request.
                    encoding = __assign({}, options.encoding);
                    // Add headers if none were supplied, on a copy rather than on the caller's object.
                    if (requestOptions.headers == null) {
                        requestOptions = __assign(__assign({}, requestOptions), { headers: defaultRequestOptions.headers });
                    }
                    promise = new Promise(function (resolve, reject) {
                        undiciStream = undici.stream(url, requestOptions, function (res) {
                            var headerValue = res.headers['content-type'];
                            var contentType = headerValue !== null && headerValue !== void 0 ? headerValue : 'text/html';
                            var mimeType = new whatwg_mimetype_1.default(Array.isArray(contentType) ? contentType[0] : contentType);
                            if (!mimeType.isHTML() && !mimeType.isXML()) {
                                throw new RangeError("The content-type \"".concat(contentType, "\" is neither HTML nor XML."));
                            }
                            // Forward the charset from the header to the decodeStream.
                            var responseEncoding = __assign(__assign({}, encoding), { transportLayerEncodingLabel: mimeType.parameters.get('charset') });
                            /*
                             * If we allow redirects, we will have entries in the history.
                             * The last entry will be the final URL. An absent or empty
                             * history falls back to the requested URL.
                             */
                            var context = res.context;
                            var history = context === null || context === void 0 ? void 0 : context.history;
                            var finalURL = history && history.length > 0 ? history[history.length - 1] : url;
                            var opts = __assign({ encoding: responseEncoding,
                                // Set XML mode based on the MIME type.
                                xmlMode: mimeType.isXML(),
                                // Set the `baseURL` to the final URL.
                                baseURL: finalURL }, cheerioOptions);
                            return decodeStream(opts, function (err, $) { return (err ? reject(err) : resolve($)); });
                        });
                    });
                    // Let's make sure the request is completed before returning the promise.
                    return [4 /*yield*/, undiciStream];
                case 1:
                    // Let's make sure the request is completed before returning the promise.
                    _d.sent();
                    return [2 /*return*/, promise];
            }
        });
    });
}
exports.fromURL = fromURL;
//# sourceMappingURL=batteries.js.map