UNPKG

@tuplo/fletcher

Version:

Web scraping HTTP request library

733 lines (716 loc) 22 kB
"use strict"; var __create = Object.create; var __defProp = Object.defineProperty; var __getOwnPropDesc = Object.getOwnPropertyDescriptor; var __getOwnPropNames = Object.getOwnPropertyNames; var __getProtoOf = Object.getPrototypeOf; var __hasOwnProp = Object.prototype.hasOwnProperty; var __export = (target, all) => { for (var name in all) __defProp(target, name, { get: all[name], enumerable: true }); }; var __copyProps = (to, from, except, desc) => { if (from && typeof from === "object" || typeof from === "function") { for (let key of __getOwnPropNames(from)) if (!__hasOwnProp.call(to, key) && key !== except) __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable }); } return to; }; var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps( // If the importer is in node compatibility mode or this is not an ESM // file that has been converted to a CommonJS file using a Babel- // compatible transform (i.e. "__esModule" has not been set), then set // "default" to the CommonJS "module.exports" for node compatibility. isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target, mod )); var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod); // src/index.ts var src_exports = {}; __export(src_exports, { default: () => src_default }); module.exports = __toCommonJS(src_exports); var $4 = __toESM(require("cheerio")); var import_deepmerge = __toESM(require("deepmerge")); // src/services/browser.ts var $3 = __toESM(require("cheerio")); var import_puppeteer_core = __toESM(require("puppeteer-core")); // src/helpers/md5.ts var import_node_crypto = __toESM(require("crypto")); function md5(input) { return import_node_crypto.default.createHash("md5").update(input).digest("hex"); } // src/helpers/sort-object.ts function sortObject(obj) { if (Array.isArray(obj)) { return obj.map((item) => sortObject(item)); } if (typeof obj === "object" && obj !== null) { const keys = Object.keys(obj); const o = {}; for (const key of keys.sort()) { o[key] = sortObject(obj[key]); } return o; } return obj; } // src/options/cache.ts var Cache = class { constructor() { this.db = /* @__PURE__ */ new Map(); this.hit = (params) => { const { options } = params; if (!(options == null ? void 0 : options.cache)) { return; } const key = this.key(params); const { cacheMethods } = options; if (cacheMethods == null ? void 0 : cacheMethods.hit) { return cacheMethods.hit(key); } if (this.db.has(key)) { return this.db.get(key); } return; }; this.key = (params) => { const { format, options, url } = params; const { cacheMethods } = options || {}; if (cacheMethods == null ? void 0 : cacheMethods.key) { return cacheMethods.key(params); } const seed = [format, url, JSON.stringify(sortObject(params))].filter(Boolean).join(""); return md5(seed); }; this.write = (params) => { const { options } = params; if (!(options == null ? void 0 : options.cache)) return; const { payload, ...cacheParams } = params; const key = this.key(cacheParams); const { cacheMethods } = options; if (cacheMethods == null ? void 0 : cacheMethods.write) { cacheMethods.write(key, payload); return; } this.db.set(key, payload); }; } }; // src/options/json-ld.ts var $ = __toESM(require("cheerio")); function getJsonLd($page) { return $page.find('script[type="application/ld+json"]').map((_, el) => { const $el = $.load(el); const src = ($el.text() || "").split("\n").join(" "); try { return JSON.parse(src); } catch { return {}; } }); } // src/options/script.ts var import_node_vm = __toESM(require("vm")); var $2 = __toESM(require("cheerio")); function getScript($page, userOptions = {}) { const { scriptFindFn, scriptPath, scriptSandbox } = userOptions; if (!scriptPath && !scriptFindFn) { throw new Error("fletch.script: scriptPath or scriptFindFn are required"); } let $el; if (scriptPath) { $el = $page.find(scriptPath); } else if (scriptFindFn) { const elScript = $page.find("script").toArray().find(scriptFindFn); $el = elScript ? $2.load(elScript).root() : void 0; } if (!$el || ($el == null ? void 0 : $el.length) === 0) { throw new Error("fletch.script: script element not found"); } const src = $el.text() || ""; const code = new import_node_vm.default.Script(src); const sandbox = scriptSandbox || {}; import_node_vm.default.createContext(sandbox); code.runInContext(sandbox, { displayErrors: false }); return sandbox; } // src/services/browser.ts var cache = new Cache(); var browser; async function request(executor, options = {}) { const { proxy, userAgent } = options; const { browser: browserOptions } = options; if (!browser) { browser = (browserOptions == null ? void 0 : browserOptions.endpoint) ? await import_puppeteer_core.default.connect({ browserWSEndpoint: browserOptions.endpoint }) : await import_puppeteer_core.default.launch({ args: ["--no-sandbox", "--disable-gpu"], headless: true }); } if (!browser) throw new Error("Can't launch puppeteer"); const page = await browser.newPage(); if (userAgent) { page.setUserAgent(userAgent); } if (proxy) { const { password = "", username = "" } = proxy; await page.authenticate({ password, username }); } const { blockedResourceTypes } = browserOptions; const shouldBlockResourceTypes = blockedResourceTypes === void 0 || Array.isArray(blockedResourceTypes); if (shouldBlockResourceTypes) { const blockResourceTypes = blockedResourceTypes || [ "stylesheet", "image", "font", "media" ]; const rgBlockedResourceTypesRg = new RegExp(blockResourceTypes.join("|")); await page.setRequestInterception(true); page.on("request", async (req) => { if (rgBlockedResourceTypesRg.test(req.resourceType())) { req.abort(); } else { req.continue(); } }); } const res = await executor(page); await page.close(); return res; } async function html(url, options = {}) { const { browser: browserOptions } = options; if (options == null ? void 0 : options.log) { console.error(url); } const cacheParams = { format: "html", options, url }; const executor = async (page) => { await page.goto(url, { timeout: 0, waitUntil: "networkidle0" }); if (browserOptions == null ? void 0 : browserOptions.screenshot) { await page.screenshot(browserOptions.screenshot); } if (browserOptions == null ? void 0 : browserOptions.waitForSelector) { await page.waitForSelector(browserOptions.waitForSelector); } const src = await page.content(); cache.write({ ...cacheParams, payload: src }); return $3.load(src).root(); }; const hit = cache.hit(cacheParams); if (hit) return $3.load(hit).root(); return request(executor, options); } async function json(pageUrl, requestUrl, options = {}) { const cacheParams = { format: "json", options: { requestUrl, ...options }, url: pageUrl }; const executor = (page) => new Promise((resolve) => { const store = new Proxy( { data: void 0 }, { set: (obj, prop, value) => { cache.write({ ...cacheParams, payload: JSON.stringify(value) }); resolve(value); return true; } } ); page.on("response", async (response) => { const url = response.url(); if (requestUrl instanceof RegExp && !requestUrl.test(url)) return; if (typeof requestUrl === "string" && url !== requestUrl) return; store.data = await response.json(); }); page.goto(pageUrl).then(async () => { const { onPageReady } = options.browser || {}; if (onPageReady) { await onPageReady(page); } }); }); const hit = cache.hit(cacheParams); if (hit) return JSON.parse(hit); return request(executor, options); } async function script(url, options) { const $page = await html(url, options); return getScript($page, options); } async function jsonld(url, options) { const $page = await html(url, options); return getJsonLd($page); } async function close() { if (!browser) return; await browser.close(); browser = void 0; } var browser_default = { close, html, json, jsonld, script }; // src/services/request.ts var import_node_http = require("http"); var import_node_https = __toESM(require("https")); var import_axios = __toESM(require("axios")); var import_hpagent = require("hpagent"); function toAxiosOptions(fletcherOptions) { const { body, encoding, headers: headers2, maxRedirections = 999, // follow all redirects by default method = "GET", proxy, rejectUnauthorized, timeout = 3e4, validateStatus } = fletcherOptions || {}; const options = { maxRedirects: maxRedirections, method, responseType: "text", timeout }; if (body) { options.data = body; } if (encoding) { options.responseEncoding = encoding; } if (headers2) { options.headers = headers2; } if (validateStatus) { options.validateStatus = validateStatus; } if (options.maxRedirects === 0) { options.validateStatus = validateStatus || ((statusCode) => statusCode >= 200 && statusCode < 400); } if (rejectUnauthorized !== void 0 && !proxy) { options.httpsAgent = new import_node_https.default.Agent({ rejectUnauthorized: rejectUnauthorized != null ? rejectUnauthorized : false }); } if (proxy) { const { host, password, port, protocol = "http", username } = proxy; const auth = `${username}:${password}`; options.httpsAgent = new import_hpagent.HttpsProxyAgent({ proxy: `${protocol}://${auth}@${host}:${port}`, rejectUnauthorized: rejectUnauthorized != null ? rejectUnauthorized : false }); } return options; } async function request2(url, userOptions) { try { const uri = new URL(url); const options = toAxiosOptions(userOptions); const response = await (0, import_axios.default)(uri.href, options); const { data, headers: headers2, status: statusCode, statusText: statusMessage } = response; if (userOptions == null ? void 0 : userOptions.onAfterRequest) { const r = userOptions.onAfterRequest({ response }); await Promise.resolve(r); } return { headers: headers2, statusCode, statusMessage, text: async () => data }; } catch (error_) { const error = error_; const { cause, response = {} } = error; const { headers: headers2, status: statusCode } = response; const statusMessage = response.statusText || error.message || import_node_http.STATUS_CODES[statusCode]; return { headers: headers2, statusCode: statusCode || error.code, statusMessage: `${statusMessage} - ${url}`, text: async () => JSON.stringify({ cause, statusCode, statusMessage }) }; } } // src/helpers/async-retry.ts var import_retry = __toESM(require("retry")); function retry(fn, opts) { function run(resolve, reject) { const options = opts || {}; if (!("randomize" in options)) { options.randomize = true; } const op = import_retry.default.operation(options); function bail(err) { reject(err || new Error("Aborted")); } function onError(err, num) { if (err.bail) { bail(err); return; } if (!op.retry(err)) { reject(op.mainError()); } else if (options.onRetry) { options.onRetry(err, num); } } function runAttempt(num) { let val; try { val = fn(bail, num); } catch (error) { onError(error, num); return; } Promise.resolve(val).then(resolve).catch((error) => { onError(error, num); }); } op.attempt(runAttempt); } return new Promise(run); } // src/helpers/cookie-jar.ts var import_cookie = __toESM(require("cookie")); var CookieJar = class { constructor() { this.cookies = []; } find(key) { return this.cookies.find((cookie) => cookie.key === key); } getCookies() { return this.cookies; } getCookieString() { return this.cookies.map((cookie) => { const { key, value } = cookie; return `${key}=${value}`; }).join("; "); } parseCookieFromString(cookieStr) { const [, key] = /^([^=]+)/.exec(cookieStr) || []; const cookie = import_cookie.default.parse(cookieStr); const value = cookie[key]; return { ...cookie, key, value }; } setCookie(cookie) { const c = typeof cookie === "string" ? this.parseCookieFromString(cookie) : cookie; this.cookies.push(c); } setCookies(cookies_) { const cookies2 = Array.isArray(cookies_) ? cookies_ : typeof cookies_ === "string" ? cookies_.split("; ") : []; for (const cookie of cookies2) { this.setCookie(cookie); } } }; // src/helpers/delay.ts function delay(ms, fn) { return new Promise((resolve) => { setTimeout(() => { resolve(fn()); }, ms); }); } // src/helpers/text2json.ts function text2json(input) { let json3; try { json3 = JSON.parse(input); return json3; } catch { return structuredClone(input); } } // src/options/index.ts function getDefaultOptions(url = "http://foo.com") { return { cache: false, delay: process.env.NODE_ENV === "test" ? 0 : 1e3, encoding: "utf8", headers: { referer: new URL(url).origin }, method: "GET", retry: { factor: 2, maxTimeout: Infinity, minTimeout: 1e3, randomize: true, retries: 10 }, timeout: 3e4, validateStatus: (status) => status >= 200 && status < 400 }; } function toFletcherOptions(url, options) { const opts = { ...getDefaultOptions(url), url }; for (const entry of Object.entries(options || {})) { const [key, value] = entry; switch (key) { case "cache": { opts.cache = Boolean(value); break; } case "delay": { opts.delay = Number(value); break; } case "encoding": { opts.encoding = value; break; } case "formData": case "formUrlEncoded": { opts.method = "POST"; opts.headers = { ...opts.headers, "content-type": "application/x-www-form-urlencoded" }; const sp = new URLSearchParams(value); opts.body = sp.toString(); break; } case "headers": { opts.headers = { ...opts.headers, ...value || {} }; break; } case "jsonData": { opts.headers = { ...opts.headers, "content-type": "application/json" }; opts.method = "POST"; opts.body = JSON.stringify(value); break; } case "maxRedirections": { opts.maxRedirections = Number(value); break; } case "method": { opts.method = value.toString(); break; } case "onAfterRequest": { opts.onAfterRequest = value; break; } case "proxy": { opts.proxy = value; break; } case "rejectUnauthorized": { opts.rejectUnauthorized = Boolean(value); break; } case "retry": { if (value === false) { opts.retry = { retries: 0 }; } else if (typeof value === "number") { opts.retry = { ...opts.retry, retries: value }; } else if (typeof value === "object") { opts.retry = value; } break; } case "timeout": { opts.timeout = Number(value); break; } case "urlSearchParams": { const newUrl = new URL(url); newUrl.search = new URLSearchParams(value).toString(); opts.url = newUrl.href; break; } case "userAgent": { opts.headers = { ...opts.headers, "user-agent": value.toString() }; break; } case "validateStatus": { const validateFn = value; opts.validateStatus = validateFn != null ? validateFn : opts.validateStatus; break; } default: } } return opts; } // src/options/embedded-json.ts function getEmbeddedJson($page, userOptions = {}) { const { embeddedJsonSelector } = userOptions; if (!embeddedJsonSelector) { throw new Error("fletch.embeddedJson: embeddedJsonSelector is required"); } const $el = $page.find(embeddedJsonSelector); if (!$el || ($el == null ? void 0 : $el.length) === 0) { throw new Error("fletch.embeddedJson: script element not found"); } const src = $el.html() || ""; let json3; try { json3 = JSON.parse(src); } catch (error) { throw new Error(`fletch.embeddedJson: ${error.message}`); } return json3; } // src/index.ts var cache2 = new Cache(); function fletcher(userUrl, userOptions) { const options = toFletcherOptions(userUrl, userOptions); const { delay: delayMs = 0, retry: retryOptions, url, validateStatus } = options; if (userOptions == null ? void 0 : userOptions.log) { console.error(url); } return delay( delayMs, () => retry(async () => { let res; try { res = await request2(url, options); if (!validateStatus(res.statusCode)) { throw new Error(`${res.statusCode}: ${res.statusMessage}`); } return res; } catch (error) { if (userOptions == null ? void 0 : userOptions.log) { console.error(error); } if (!res) { throw new Error(error); } if (!validateStatus(res.statusCode)) { throw new Error(`${res.statusCode}: ${res.statusMessage}`); } return res; } }, retryOptions) ); } async function text(userUrl, userOptions) { const cacheParams = { format: "text", options: userOptions, url: userUrl }; const hit = cache2.hit(cacheParams); if (hit) { return hit; } const data = await fletcher(userUrl, userOptions).then((res) => res.text()); cache2.write({ ...cacheParams, payload: data }); return data; } async function html2(userUrl, userOptions) { const cacheParams = { format: "html", options: userOptions, url: userUrl }; const hit = cache2.hit(cacheParams); if (hit) { return $4.load(hit).root(); } const src = await fletcher(userUrl, userOptions).then((res) => res.text()); cache2.write({ ...cacheParams, payload: src }); return $4.load(src).root(); } async function json2(userUrl, userOptions) { const cacheParams = { format: "json", options: userOptions, url: userUrl }; const hit = cache2.hit(cacheParams); if (hit) { return JSON.parse(hit); } const raw = await fletcher(userUrl, userOptions).then((res) => res.text()); const src = text2json(raw); cache2.write({ ...cacheParams, payload: JSON.stringify(src) }); return src; } async function script2(userUrl, userOptions) { const $page = await html2(userUrl, userOptions); return getScript($page, userOptions); } async function jsonld2(userUrl, userOptions) { const $page = await html2(userUrl, userOptions); return getJsonLd($page); } async function headers(url, userOptions) { const res = await fletcher(url, userOptions); return res.headers; } async function cookies(url, userOptions) { const res = await fletcher(url, userOptions); const { "set-cookie": setCookies } = res.headers; const jar = new CookieJar(); jar.setCookies(setCookies); return jar; } async function embeddedJson(userUrl, userOptions) { const $page = await html2(userUrl, userOptions); return getEmbeddedJson($page, userOptions); } function create(defaultOptions = {}) { return { browser: { close: () => browser_default.close(), html: (url, options = {}) => browser_default.html(url, (0, import_deepmerge.default)(defaultOptions, options)), json: (pageUrl, requestUrl, options = {}) => browser_default.json(pageUrl, requestUrl, (0, import_deepmerge.default)(defaultOptions, options)), jsonld: (url, options = {}) => browser_default.jsonld(url, (0, import_deepmerge.default)(defaultOptions, options)), script: (url, options = {}) => browser_default.script(url, (0, import_deepmerge.default)(defaultOptions, options)) }, cookies: (url, options = {}) => cookies(url, (0, import_deepmerge.default)(defaultOptions, options)), embeddedJson: (url, options = {}) => embeddedJson(url, (0, import_deepmerge.default)(defaultOptions, options)), headers: (url, options = {}) => headers(url, (0, import_deepmerge.default)(defaultOptions, options)), html: (url, options = {}) => html2(url, (0, import_deepmerge.default)(defaultOptions, options)), json: (url, options = {}) => json2(url, (0, import_deepmerge.default)(defaultOptions, options)), jsonld: (url, options = {}) => jsonld2(url, (0, import_deepmerge.default)(defaultOptions, options)), response: (url, options = {}) => fletcher(url, (0, import_deepmerge.default)(defaultOptions, options)), script: (url, options = {}) => script2(url, (0, import_deepmerge.default)(defaultOptions, options)), text: (url, options = {}) => text(url, (0, import_deepmerge.default)(defaultOptions, options)) }; } var src_default = Object.assign(fletcher, { browser: browser_default, cookies, create, embeddedJson, headers, html: html2, json: json2, jsonld: jsonld2, response: fletcher, script: script2, text });