// @tuplo/fletcher
// Version:
// Web scraping HTTP request library
// 733 lines (716 loc) • 22 kB
// JavaScript
;
// Short aliases for the Object reflection primitives used by the bundler
// runtime helpers below.
var {
  create: __create,
  defineProperty: __defProp,
  getOwnPropertyDescriptor: __getOwnPropDesc,
  getOwnPropertyNames: __getOwnPropNames,
  getPrototypeOf: __getProtoOf
} = Object;
var __hasOwnProp = Object.prototype.hasOwnProperty;
/**
 * Installs one enumerable getter on `target` per enumerable key of `all`.
 * Each value in `all` is used directly as the getter function, so the
 * exported value is looked up every time the property is read.
 */
var __export = (target, all) => {
  for (var name in all) {
    const getter = all[name];
    __defProp(target, name, { get: getter, enumerable: true });
  }
};
/**
 * Mirrors the own properties of `from` onto `to` as live getters.
 *
 * Only objects and functions are copied from; any other `from` leaves `to`
 * untouched. Keys that `to` already owns, and the key named by `except`, are
 * skipped. A copied key keeps the enumerability of its source descriptor.
 * The `desc` parameter is only used as scratch storage.
 *
 * @returns `to`
 */
var __copyProps = (to, from, except, desc) => {
  const isObject = from && typeof from === "object";
  if (!isObject && typeof from !== "function") {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      enumerable: !desc || desc.enumerable
    });
  }
  return to;
};
/**
 * Wraps a required module in an ESM-style namespace object.
 *
 * The namespace inherits from the module's prototype (or is a plain object
 * when `mod` is null/undefined) and receives live getters for the module's
 * own properties via __copyProps.
 */
var __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  // If the importer is in node compatibility mode or this is not an ESM
  // file that has been converted to a CommonJS file using a Babel-
  // compatible transform (i.e. "__esModule" has not been set), then set
  // "default" to the CommonJS "module.exports" for node compatibility.
  const needsDefault = isNodeMode || !mod || !mod.__esModule;
  const base = needsDefault
    ? __defProp(target, "default", { value: mod, enumerable: true })
    : target;
  return __copyProps(base, mod);
};
/**
 * Builds the CommonJS export object for an ESM namespace: a fresh object
 * tagged with a non-enumerable `__esModule: true` that exposes the
 * namespace's own properties through __copyProps.
 */
var __toCommonJS = (mod) => {
  const tagged = __defProp({}, "__esModule", { value: true });
  return __copyProps(tagged, mod);
};
// src/index.ts
// Export namespace of the bundle; __export fills it with getters.
var src_exports = {};
__export(src_exports, {
// The getter reads `src_default` only when `default` is accessed, so it can
// be registered here even though `src_default` is assigned at the end of the
// file. Reading `default` before that assignment runs yields undefined.
default: () => src_default
});
// Publish the namespace as the CommonJS exports (tagged with `__esModule`).
module.exports = __toCommonJS(src_exports);
var $4 = __toESM(require("cheerio"));
var import_deepmerge = __toESM(require("deepmerge"));
// src/services/browser.ts
var $3 = __toESM(require("cheerio"));
var import_puppeteer_core = __toESM(require("puppeteer-core"));
// src/helpers/md5.ts
var import_node_crypto = __toESM(require("crypto"));
/**
 * Computes the MD5 digest of `input`.
 *
 * @param input Data passed as-is to `Hash#update`.
 * @returns The digest as a lowercase hexadecimal string.
 */
function md5(input) {
  const hasher = import_node_crypto.default.createHash("md5");
  hasher.update(input);
  return hasher.digest("hex");
}
// src/helpers/sort-object.ts
/**
 * Returns a copy of `obj` whose object keys are inserted in sorted order
 * (default `Array#sort` ordering), applied recursively.
 *
 * Arrays keep their element order but have each element processed; anything
 * that is not an array or a non-null object is returned unchanged.
 */
function sortObject(obj) {
  if (Array.isArray(obj)) {
    return obj.map((item) => sortObject(item));
  }
  if (obj === null || typeof obj !== "object") {
    return obj;
  }
  const sorted = {};
  Object.keys(obj)
    .sort()
    .forEach((name) => {
      sorted[name] = sortObject(obj[name]);
    });
  return sorted;
}
// src/options/cache.ts
/**
 * Response cache keyed by request parameters.
 *
 * Caching is active only when `params.options.cache` is truthy. Callers can
 * override any of the three operations through `options.cacheMethods`
 * (`hit`, `key`, `write`); otherwise entries live in the in-memory `db` map.
 * The three operations are instance properties (arrow functions), so they
 * stay bound to the instance when passed around.
 */
var Cache = class {
  constructor() {
    this.db = /* @__PURE__ */ new Map();

    // Looks up a cached payload; returns undefined when caching is off or
    // nothing is stored under the computed key.
    this.hit = (params) => {
      const { options } = params;
      if (options == null || !options.cache) {
        return void 0;
      }
      const key = this.key(params);
      const custom = options.cacheMethods;
      if (custom != null && custom.hit) {
        return custom.hit(key);
      }
      return this.db.has(key) ? this.db.get(key) : void 0;
    };

    // Derives the cache key: a user-supplied `cacheMethods.key(params)` wins,
    // otherwise the MD5 of format + url + key-sorted JSON of the params.
    this.key = (params) => {
      const custom = (params.options || {}).cacheMethods;
      if (custom != null && custom.key) {
        return custom.key(params);
      }
      const { format, url } = params;
      const parts = [format, url, JSON.stringify(sortObject(params))];
      return md5(parts.filter(Boolean).join(""));
    };

    // Stores `params.payload`; the payload itself is excluded from the
    // parameters used to compute the key.
    this.write = (params) => {
      const { options } = params;
      if (options == null || !options.cache) {
        return;
      }
      const { payload, ...cacheParams } = params;
      const key = this.key(cacheParams);
      const custom = options.cacheMethods;
      if (custom != null && custom.write) {
        custom.write(key, payload);
        return;
      }
      this.db.set(key, payload);
    };
  }
};
// src/options/json-ld.ts
var $ = __toESM(require("cheerio"));
/**
 * Extracts the JSON-LD payloads embedded in a page.
 *
 * Every `<script type="application/ld+json">` under `$page` has its text
 * flattened onto a single line and parsed as JSON. A script whose text does
 * not parse contributes an empty object (deliberate best-effort).
 *
 * @param $page Selection exposing `find`, presumably a cheerio selection.
 * @returns Whatever `.map()` on the matched selection returns.
 */
function getJsonLd($page) {
  const $scripts = $page.find('script[type="application/ld+json"]');
  return $scripts.map((_, el) => {
    const raw = $.load(el).text() || "";
    const singleLine = raw.split("\n").join(" ");
    try {
      return JSON.parse(singleLine);
    } catch {
      return {};
    }
  });
}
// src/options/script.ts
var import_node_vm = __toESM(require("vm"));
var $2 = __toESM(require("cheerio"));
/**
 * Runs an inline script from the page inside a `vm` context and returns the
 * resulting context object, so globals the script defines can be read.
 *
 * The script element is located either by selector (`scriptPath`) or by
 * running `scriptFindFn` over every `<script>` element of the page.
 *
 * NOTE(review): the script text comes from the fetched page and is executed
 * with `vm`; confirm callers only use this on content they trust.
 *
 * @param $page Selection exposing `find`.
 * @param userOptions `scriptPath` or `scriptFindFn` (one is required), plus
 *   an optional `scriptSandbox` object, which is contextified and returned.
 * @returns The sandbox object after the script has run.
 * @throws Error when neither locator is given or no element is found.
 */
function getScript($page, userOptions = {}) {
  const { scriptFindFn, scriptPath, scriptSandbox } = userOptions;
  if (!(scriptPath || scriptFindFn)) {
    throw new Error("fletch.script: scriptPath or scriptFindFn are required");
  }

  const locate = () => {
    if (scriptPath) {
      return $page.find(scriptPath);
    }
    const match = $page.find("script").toArray().find(scriptFindFn);
    return match ? $2.load(match).root() : void 0;
  };

  const $el = locate();
  if (!$el || $el.length === 0) {
    throw new Error("fletch.script: script element not found");
  }

  const compiled = new import_node_vm.default.Script($el.text() || "");
  const context = scriptSandbox || {};
  import_node_vm.default.createContext(context);
  compiled.runInContext(context, { displayErrors: false });
  return context;
}
// src/services/browser.ts
// Cache instance used by the browser-based `html` and `json` helpers below.
var cache = new Cache();
// Browser handle shared across calls: `request` assigns it on first use and
// `close` resets it to undefined.
var browser;
/**
 * Opens a page in the shared browser, hands it to `executor` and closes the
 * page again, whether or not the executor succeeds.
 *
 * The browser is created on first use: connected to `options.browser.endpoint`
 * when given, launched locally otherwise. It is then kept in the module-level
 * `browser` variable for later calls.
 *
 * Resource blocking: when `options.browser.blockedResourceTypes` is left
 * undefined, stylesheets, images, fonts and media are aborted. A non-empty
 * array replaces that list. An empty array or any non-array value disables
 * request interception entirely.
 *
 * @param executor Async function receiving the page; its result is returned.
 * @param options `browser`, `proxy` (only `username`/`password` are used,
 *   for page authentication) and `userAgent`.
 * @returns The value produced by `executor`.
 * @throws Error when no browser could be obtained; rethrows executor errors.
 */
async function request(executor, options = {}) {
  const { browser: browserOptions, proxy, userAgent } = options;
  if (!browser) {
    browser = browserOptions && browserOptions.endpoint
      ? await import_puppeteer_core.default.connect({
          browserWSEndpoint: browserOptions.endpoint
        })
      : await import_puppeteer_core.default.launch({
          args: ["--no-sandbox", "--disable-gpu"],
          headless: true
        });
  }
  if (!browser) throw new Error("Can't launch puppeteer");

  const page = await browser.newPage();
  try {
    if (userAgent) {
      await page.setUserAgent(userAgent);
    }
    if (proxy) {
      const { password = "", username = "" } = proxy;
      await page.authenticate({ password, username });
    }

    // `options.browser` is optional, so fall back to an empty object instead
    // of destructuring undefined.
    const { blockedResourceTypes } = browserOptions || {};
    const blockResourceTypes = blockedResourceTypes === void 0
      ? ["stylesheet", "image", "font", "media"]
      : blockedResourceTypes;
    // An empty list must not install interception: the pattern built from it
    // would match every resource type and abort all requests.
    if (Array.isArray(blockResourceTypes) && blockResourceTypes.length > 0) {
      const blockedPattern = new RegExp(blockResourceTypes.join("|"));
      await page.setRequestInterception(true);
      page.on("request", async (req) => {
        if (blockedPattern.test(req.resourceType())) {
          req.abort();
        } else {
          req.continue();
        }
      });
    }

    return await executor(page);
  } finally {
    // Always release the page, including when setup or the executor throws.
    await page.close();
  }
}
/**
 * Loads `url` in the browser and returns the rendered document as a cheerio
 * root selection.
 *
 * A cached payload (see `Cache`) is returned without opening a page.
 * Otherwise the page is navigated with no navigation timeout until the
 * network is idle, then optionally screenshotted
 * (`options.browser.screenshot`) and held until
 * `options.browser.waitForSelector` matches. The final markup is written to
 * the cache before being parsed.
 *
 * @param url Page address; logged to stderr when `options.log` is set.
 * @param options Fletcher options, including the optional `browser` section.
 */
async function html(url, options = {}) {
  const { browser: browserOptions } = options;
  if (options.log) {
    console.error(url);
  }

  const cacheParams = { format: "html", options, url };
  const cached = cache.hit(cacheParams);
  if (cached) return $3.load(cached).root();

  return request(async (page) => {
    await page.goto(url, { timeout: 0, waitUntil: "networkidle0" });
    if (browserOptions != null && browserOptions.screenshot) {
      await page.screenshot(browserOptions.screenshot);
    }
    if (browserOptions != null && browserOptions.waitForSelector) {
      await page.waitForSelector(browserOptions.waitForSelector);
    }
    const markup = await page.content();
    cache.write({ ...cacheParams, payload: markup });
    return $3.load(markup).root();
  }, options);
}
/**
 * Opens `pageUrl` in the browser and resolves with the JSON body of the
 * first network response whose URL matches `requestUrl`.
 *
 * `requestUrl` may be a RegExp (tested against the response URL) or a string
 * (exact match); any other value matches every response. The captured value
 * is written to the cache as a JSON string, and a cached entry short-circuits
 * the whole browser round trip.
 *
 * Failures are reported through the returned promise: a rejected navigation,
 * a throwing `options.browser.onPageReady` hook, or a matching response whose
 * body cannot be read as JSON all reject the call instead of leaving it
 * pending with an unhandled rejection.
 *
 * @param pageUrl Page to navigate to.
 * @param requestUrl RegExp or string identifying the response to capture.
 * @param options Fletcher options; `browser.onPageReady(page)` runs after
 *   navigation completes.
 * @returns The parsed JSON payload.
 */
async function json(pageUrl, requestUrl, options = {}) {
  const cacheParams = {
    format: "json",
    options: { requestUrl, ...options },
    url: pageUrl
  };
  const hit = cache.hit(cacheParams);
  if (hit) return JSON.parse(hit);

  const isWanted = (url) => {
    if (requestUrl instanceof RegExp) return requestUrl.test(url);
    if (typeof requestUrl === "string") return url === requestUrl;
    return true;
  };

  const executor = (page) => new Promise((resolve, reject) => {
    page.on("response", async (response) => {
      if (!isWanted(response.url())) return;
      try {
        const value = await response.json();
        cache.write({ ...cacheParams, payload: JSON.stringify(value) });
        resolve(value);
      } catch (error) {
        // Settle the call instead of letting the handler's rejection float.
        reject(error);
      }
    });
    page
      .goto(pageUrl)
      .then(async () => {
        const { onPageReady } = options.browser || {};
        if (onPageReady) {
          await onPageReady(page);
        }
      })
      .catch(reject);
  });

  return request(executor, options);
}
/**
 * Renders `url` in the browser, then evaluates one of its inline scripts via
 * `getScript` with the same options and returns the resulting sandbox.
 */
async function script(url, options) {
  return html(url, options).then(($page) => getScript($page, options));
}
/**
 * Renders `url` in the browser and returns the JSON-LD payloads that
 * `getJsonLd` finds in the resulting document.
 */
async function jsonld(url, options) {
  return html(url, options).then(($page) => getJsonLd($page));
}
/**
 * Closes the shared browser, if one is open.
 *
 * The module-level handle is cleared before the close is awaited. A failing
 * or concurrent `close()` therefore never leaves a dead browser behind for
 * the next `request` to reuse, and the same instance is never closed twice.
 *
 * @throws Whatever the browser's own `close()` rejects with.
 */
async function close() {
  if (!browser) return;
  const instance = browser;
  browser = void 0;
  await instance.close();
}
// Public surface of the browser service: exposed as `browser` on the default
// export and wrapped per-instance by `create`.
var browser_default = { close, html, json, jsonld, script };
// src/services/request.ts
var import_node_http = require("http");
var import_node_https = __toESM(require("https"));
var import_axios = __toESM(require("axios"));
var import_hpagent = require("hpagent");
/**
 * Builds the proxy URL handed to the proxy agent.
 * Credentials and port are included only when they are actually provided, so
 * a bare `{ host }` no longer yields `undefined:undefined@host:undefined`.
 */
function toProxyUrl(proxy) {
  const { host, password, port, protocol = "http", username } = proxy;
  const isSet = (value) => value !== void 0 && value !== null && value !== "";
  const credentials = isSet(username) || isSet(password)
    ? `${isSet(username) ? username : ""}:${isSet(password) ? password : ""}@`
    : "";
  const portSuffix = isSet(port) ? `:${port}` : "";
  return `${protocol}://${credentials}${host}${portSuffix}`;
}

/**
 * Translates fletcher options into an axios request configuration.
 *
 * Responses are always requested as text. Redirects are followed by default
 * (`maxRedirections` defaults to 999); when redirects are disabled and no
 * `validateStatus` is given, statuses in [200, 400) count as success so that
 * 3xx responses are returned instead of rejected.
 *
 * An HTTPS agent is attached when `rejectUnauthorized` is set or a proxy is
 * configured. With a proxy, certificate verification stays off unless
 * `rejectUnauthorized` is given explicitly.
 * NOTE(review): only `httpsAgent` is set for proxies, so plain-http targets
 * do not appear to be routed through the proxy; confirm this is intended.
 *
 * @param fletcherOptions Options as produced by `toFletcherOptions`; may be
 *   omitted.
 * @returns The axios configuration object.
 */
function toAxiosOptions(fletcherOptions) {
  const {
    body,
    encoding,
    headers: headers2,
    maxRedirections = 999,
    // follow all redirects by default
    method = "GET",
    proxy,
    rejectUnauthorized,
    timeout = 3e4,
    validateStatus
  } = fletcherOptions || {};
  const options = {
    maxRedirects: maxRedirections,
    method,
    responseType: "text",
    timeout
  };
  if (body) {
    options.data = body;
  }
  if (encoding) {
    options.responseEncoding = encoding;
  }
  if (headers2) {
    options.headers = headers2;
  }
  if (validateStatus) {
    options.validateStatus = validateStatus;
  }
  if (options.maxRedirects === 0) {
    options.validateStatus = validateStatus || ((statusCode) => statusCode >= 200 && statusCode < 400);
  }
  if (rejectUnauthorized !== void 0 && !proxy) {
    options.httpsAgent = new import_node_https.default.Agent({
      rejectUnauthorized: rejectUnauthorized != null ? rejectUnauthorized : false
    });
  }
  if (proxy) {
    options.httpsAgent = new import_hpagent.HttpsProxyAgent({
      proxy: toProxyUrl(proxy),
      rejectUnauthorized: rejectUnauthorized != null ? rejectUnauthorized : false
    });
  }
  return options;
}
/**
 * Performs one HTTP request with axios and normalises the outcome.
 *
 * This function does not reject: any failure (invalid URL, network error,
 * status rejected by axios, a throwing `onAfterRequest` hook) is turned into
 * a response-shaped object. Its `statusCode` falls back to the error's
 * `code`, and its `text()` yields a JSON description of the failure.
 *
 * @param url Absolute URL; it is normalised through `new URL(url).href`.
 * @param userOptions Fletcher options. `onAfterRequest({ response })` is
 *   awaited after a successful request.
 * @returns `{ headers, statusCode, statusMessage, text }`.
 */
async function request2(url, userOptions) {
  try {
    const { href } = new URL(url);
    const axiosOptions = toAxiosOptions(userOptions);
    const response = await (0, import_axios.default)(href, axiosOptions);
    // Read the fields before the hook runs, as the hook receives the same
    // response object.
    const { data, headers: responseHeaders, status, statusText } = response;
    if (userOptions != null && userOptions.onAfterRequest) {
      await Promise.resolve(userOptions.onAfterRequest({ response }));
    }
    return {
      headers: responseHeaders,
      statusCode: status,
      statusMessage: statusText,
      text: async () => data
    };
  } catch (error) {
    const { cause, response = {} } = error;
    const statusCode = response.status;
    const statusMessage = response.statusText || error.message || import_node_http.STATUS_CODES[statusCode];
    const describeFailure = async () => JSON.stringify({ cause, statusCode, statusMessage });
    return {
      headers: response.headers,
      statusCode: statusCode || error.code,
      statusMessage: `${statusMessage} - ${url}`,
      text: describeFailure
    };
  }
}
// src/helpers/async-retry.ts
var import_retry = __toESM(require("retry"));
/**
 * Runs `fn` until it succeeds, retrying failures under the `retry`
 * package's operation schedule.
 *
 * `fn(bail, attemptNumber)` may return a value or a promise. Calling `bail`
 * (or throwing an error with a truthy `bail` property) rejects immediately
 * without further attempts. When the schedule is exhausted the promise
 * rejects with the operation's main error.
 *
 * The caller's `opts` object is never modified: the `randomize: true`
 * default is applied to a private copy.
 *
 * @param fn Work to attempt.
 * @param opts Options for `retry.operation`, plus an optional
 *   `onRetry(err, attemptNumber)` called when another attempt is scheduled.
 * @returns Promise of `fn`'s eventual result.
 */
function retry(fn, opts) {
  function run(resolve, reject) {
    // Copy first so the default below does not leak into the caller's object.
    const options = { randomize: true, ...opts };
    const op = import_retry.default.operation(options);
    function bail(err) {
      reject(err || new Error("Aborted"));
    }
    function onError(err, num) {
      if (err.bail) {
        bail(err);
        return;
      }
      if (!op.retry(err)) {
        reject(op.mainError());
      } else if (options.onRetry) {
        options.onRetry(err, num);
      }
    }
    function runAttempt(num) {
      let val;
      try {
        val = fn(bail, num);
      } catch (error) {
        onError(error, num);
        return;
      }
      Promise.resolve(val).then(resolve).catch((error) => {
        onError(error, num);
      });
    }
    op.attempt(runAttempt);
  }
  return new Promise(run);
}
// src/helpers/cookie-jar.ts
var import_cookie = __toESM(require("cookie"));
/**
 * Minimal ordered collection of cookies.
 *
 * Each stored cookie is an object carrying at least `key` and `value`;
 * cookies parsed from strings also keep every attribute returned by
 * `cookie.parse` (for example `Path`). Cookies are kept in insertion order
 * and adding the same key twice keeps both entries.
 */
var CookieJar = class {
  constructor() {
    this.cookies = [];
  }

  /** Returns the first stored cookie whose `key` equals `key`, if any. */
  find(key) {
    return this.cookies.find((cookie) => cookie.key === key);
  }

  /** Returns the backing array of stored cookies (not a copy). */
  getCookies() {
    return this.cookies;
  }

  /** Serialises the jar as `key=value` pairs joined by "; ". */
  getCookieString() {
    return this.cookies.map((cookie) => {
      const { key, value } = cookie;
      return `${key}=${value}`;
    }).join("; ");
  }

  /**
   * Parses one cookie string such as `sid=abc; Path=/`.
   * The cookie's name is the text before the first "=", trimmed so it
   * matches the keys produced by the parser even when the input has
   * surrounding whitespace.
   */
  parseCookieFromString(cookieStr) {
    const [, rawKey] = /^([^=]+)/.exec(cookieStr) || [];
    const key = typeof rawKey === "string" ? rawKey.trim() : rawKey;
    const cookie = import_cookie.default.parse(cookieStr);
    const value = cookie[key];
    return {
      ...cookie,
      key,
      value
    };
  }

  /**
   * Adds one cookie, given either as a cookie object or as a string.
   * Strings that contain no cookie name (empty or blank input) are ignored
   * instead of being stored as an `undefined=undefined` entry.
   */
  setCookie(cookie) {
    if (typeof cookie !== "string") {
      this.cookies.push(cookie);
      return;
    }
    const parsed = this.parseCookieFromString(cookie);
    if (!parsed.key) {
      return;
    }
    this.cookies.push(parsed);
  }

  /**
   * Adds several cookies. Accepts an array (each item handled by
   * `setCookie`) or a single `a=1; b=2` string, which is split on ";" with
   * optional surrounding whitespace. Any other input adds nothing.
   */
  setCookies(cookies_) {
    let cookies2 = [];
    if (Array.isArray(cookies_)) {
      cookies2 = cookies_;
    } else if (typeof cookies_ === "string") {
      cookies2 = cookies_.split(";").map((part) => part.trim()).filter(Boolean);
    }
    for (const cookie of cookies2) {
      this.setCookie(cookie);
    }
  }
};
// src/helpers/delay.ts
/**
 * Waits `ms` milliseconds, then runs `fn` and settles with its outcome.
 *
 * The returned promise resolves with `fn`'s return value (adopting it if it
 * is a promise) and rejects if `fn` throws synchronously. Without that, the
 * exception would escape the timer callback as an uncaught error and the
 * promise would never settle.
 *
 * @param ms Delay in milliseconds.
 * @param fn Function to invoke after the delay.
 */
function delay(ms, fn) {
  return new Promise((resolve, reject) => {
    setTimeout(() => {
      try {
        resolve(fn());
      } catch (error) {
        reject(error);
      }
    }, ms);
  });
}
// src/helpers/text2json.ts
/**
 * Parses `input` as JSON when possible.
 *
 * @param input Text to parse.
 * @returns The parsed value, or a structured clone of `input` itself when it
 *   is not valid JSON.
 */
function text2json(input) {
  try {
    return JSON.parse(input);
  } catch {
    return structuredClone(input);
  }
}
// src/options/index.ts
/**
 * Produces a fresh set of default fletcher options for `url`.
 *
 * The `referer` header is the origin of `url`. The pre-request delay is one
 * second, or zero when `NODE_ENV` is "test". Retries default to 10 attempts
 * with randomised exponential backoff starting at one second, and statuses
 * in [200, 400) are treated as valid.
 *
 * @param url Absolute URL the request targets.
 * @throws TypeError when `url` cannot be parsed by `URL`.
 */
function getDefaultOptions(url = "http://foo.com") {
  const isTestRun = process.env.NODE_ENV === "test";
  const { origin } = new URL(url);
  const retryDefaults = {
    factor: 2,
    maxTimeout: Infinity,
    minTimeout: 1e3,
    randomize: true,
    retries: 10
  };
  const isAcceptableStatus = (status) => status >= 200 && status < 400;
  return {
    cache: false,
    delay: isTestRun ? 0 : 1e3,
    encoding: "utf8",
    headers: { referer: origin },
    method: "GET",
    retry: retryDefaults,
    timeout: 3e4,
    validateStatus: isAcceptableStatus
  };
}
/**
 * Merges user options over the defaults for `url` and normalises them into
 * the internal option set consumed by `fletcher` and `toAxiosOptions`.
 *
 * Recognised keys: `cache`, `delay`, `encoding`, `formData` /
 * `formUrlEncoded` (url-encoded POST body), `headers` (merged over the
 * defaults), `jsonData` (JSON POST body), `maxRedirections`, `method`,
 * `onAfterRequest`, `proxy`, `rejectUnauthorized`, `retry` (false = no
 * retries, number = retry count, object = full retry options), `timeout`,
 * `urlSearchParams` (replaces the query string of `url`), `userAgent` and
 * `validateStatus`. Unknown keys are ignored. Keys are applied in the
 * iteration order of `options`, so later keys win where they overlap.
 *
 * An option whose value is `undefined` is treated as not provided. Applying
 * it would turn numbers into NaN, call `.toString()` on undefined, wipe the
 * query string, or silently set `rejectUnauthorized` to false. A null
 * `method` or `userAgent` is ignored for the same reason.
 *
 * @param url Absolute request URL.
 * @param options User-supplied options; may be omitted.
 * @returns The normalised options, including the final `url`.
 */
function toFletcherOptions(url, options) {
  const opts = {
    ...getDefaultOptions(url),
    url
  };
  for (const [key, value] of Object.entries(options || {})) {
    if (value === void 0) {
      continue;
    }
    switch (key) {
      case "cache": {
        opts.cache = Boolean(value);
        break;
      }
      case "delay": {
        opts.delay = Number(value);
        break;
      }
      case "encoding": {
        opts.encoding = value;
        break;
      }
      case "formData":
      case "formUrlEncoded": {
        opts.method = "POST";
        opts.headers = {
          ...opts.headers,
          "content-type": "application/x-www-form-urlencoded"
        };
        opts.body = new URLSearchParams(value).toString();
        break;
      }
      case "headers": {
        opts.headers = {
          ...opts.headers,
          ...value || {}
        };
        break;
      }
      case "jsonData": {
        opts.headers = {
          ...opts.headers,
          "content-type": "application/json"
        };
        opts.method = "POST";
        opts.body = JSON.stringify(value);
        break;
      }
      case "maxRedirections": {
        opts.maxRedirections = Number(value);
        break;
      }
      case "method": {
        if (value !== null) {
          opts.method = value.toString();
        }
        break;
      }
      case "onAfterRequest": {
        opts.onAfterRequest = value;
        break;
      }
      case "proxy": {
        opts.proxy = value;
        break;
      }
      case "rejectUnauthorized": {
        opts.rejectUnauthorized = Boolean(value);
        break;
      }
      case "retry": {
        if (value === false) {
          opts.retry = {
            retries: 0
          };
        } else if (typeof value === "number") {
          opts.retry = {
            ...opts.retry,
            retries: value
          };
        } else if (typeof value === "object") {
          opts.retry = value;
        }
        break;
      }
      case "timeout": {
        opts.timeout = Number(value);
        break;
      }
      case "urlSearchParams": {
        const newUrl = new URL(url);
        newUrl.search = new URLSearchParams(value).toString();
        opts.url = newUrl.href;
        break;
      }
      case "userAgent": {
        if (value !== null) {
          opts.headers = {
            ...opts.headers,
            "user-agent": value.toString()
          };
        }
        break;
      }
      case "validateStatus": {
        opts.validateStatus = value != null ? value : opts.validateStatus;
        break;
      }
      default:
        break;
    }
  }
  return opts;
}
// src/options/embedded-json.ts
/**
 * Reads and parses a JSON document embedded in the page.
 *
 * @param $page Selection exposing `find`.
 * @param userOptions Must provide `embeddedJsonSelector`, the selector of
 *   the element whose inner HTML holds the JSON text.
 * @returns The parsed JSON value.
 * @throws Error (prefixed `fletch.embeddedJson:`) when the selector is
 *   missing, nothing matches it, or the content is not valid JSON.
 */
function getEmbeddedJson($page, userOptions = {}) {
  const selector = userOptions.embeddedJsonSelector;
  if (!selector) {
    throw new Error("fletch.embeddedJson: embeddedJsonSelector is required");
  }

  const $el = $page.find(selector);
  if (!$el || $el.length === 0) {
    throw new Error("fletch.embeddedJson: script element not found");
  }

  const source = $el.html() || "";
  try {
    return JSON.parse(source);
  } catch (error) {
    throw new Error(`fletch.embeddedJson: ${error.message}`);
  }
}
// src/index.ts
// Cache instance for the plain-HTTP helpers below (`text`, `html2`, `json2`);
// it is separate from the `cache` instance used by the browser service.
var cache2 = new Cache();
/**
 * Performs an HTTP request with the configured delay, retries and status
 * validation, and resolves with the response object from `request2`.
 *
 * After waiting `delay` milliseconds the request is attempted under `retry`.
 * An attempt fails with an `Error("<status>: <message>")` when
 * `validateStatus` rejects the status code. Any other error raised during an
 * attempt is rethrown unchanged, so its type, stack and properties (such as
 * a `bail` flag understood by `retry`) are preserved.
 *
 * @param userUrl Absolute URL to request.
 * @param userOptions Fletcher options. When `log` is truthy, the final URL
 *   and every attempt's error are written to stderr.
 * @returns Promise of `{ headers, statusCode, statusMessage, text }`.
 */
function fletcher(userUrl, userOptions) {
  const options = toFletcherOptions(userUrl, userOptions);
  const {
    delay: delayMs = 0,
    retry: retryOptions,
    url,
    validateStatus
  } = options;
  const shouldLog = Boolean(userOptions == null ? void 0 : userOptions.log);
  if (shouldLog) {
    console.error(url);
  }

  const attempt = async () => {
    try {
      const res = await request2(url, options);
      if (!validateStatus(res.statusCode)) {
        throw new Error(`${res.statusCode}: ${res.statusMessage}`);
      }
      return res;
    } catch (error) {
      if (shouldLog) {
        console.error(error);
      }
      // Rethrow as-is so `retry` sees the original error.
      throw error;
    }
  };

  return delay(delayMs, () => retry(attempt, retryOptions));
}
/**
 * Fetches `userUrl` and returns the response body as text.
 * A truthy cache hit is returned directly; otherwise the body is fetched via
 * `fletcher` and handed to the cache before being returned.
 */
async function text(userUrl, userOptions) {
  const cacheParams = { format: "text", options: userOptions, url: userUrl };
  const cached = cache2.hit(cacheParams);
  if (cached) {
    return cached;
  }
  const res = await fletcher(userUrl, userOptions);
  const body = await res.text();
  cache2.write({ ...cacheParams, payload: body });
  return body;
}
/**
 * Fetches `userUrl` over HTTP and returns the body parsed by cheerio as a
 * root selection. The raw markup is what gets cached, so a cache hit is
 * parsed again on the way out.
 */
async function html2(userUrl, userOptions) {
  const cacheParams = { format: "html", options: userOptions, url: userUrl };
  const cached = cache2.hit(cacheParams);
  if (cached) {
    return $4.load(cached).root();
  }
  const res = await fletcher(userUrl, userOptions);
  const markup = await res.text();
  cache2.write({ ...cacheParams, payload: markup });
  return $4.load(markup).root();
}
/**
 * Fetches `userUrl` over HTTP and returns the body converted by `text2json`.
 * The value is cached in its JSON-stringified form and parsed again when
 * served from the cache.
 */
async function json2(userUrl, userOptions) {
  const cacheParams = { format: "json", options: userOptions, url: userUrl };
  const cached = cache2.hit(cacheParams);
  if (cached) {
    return JSON.parse(cached);
  }
  const res = await fletcher(userUrl, userOptions);
  const raw = await res.text();
  const parsed = text2json(raw);
  cache2.write({ ...cacheParams, payload: JSON.stringify(parsed) });
  return parsed;
}
/**
 * Fetches `userUrl` over HTTP, then evaluates one of the page's inline
 * scripts via `getScript` with the same options and returns the sandbox.
 */
async function script2(userUrl, userOptions) {
  return html2(userUrl, userOptions).then(($page) => getScript($page, userOptions));
}
/**
 * Fetches `userUrl` over HTTP and returns the JSON-LD payloads that
 * `getJsonLd` finds in the page.
 */
async function jsonld2(userUrl, userOptions) {
  return html2(userUrl, userOptions).then(($page) => getJsonLd($page));
}
/**
 * Requests `url` and returns only the response headers.
 */
async function headers(url, userOptions) {
  const { headers: responseHeaders } = await fletcher(url, userOptions);
  return responseHeaders;
}
/**
 * Requests `url` and returns a `CookieJar` filled from the response's
 * `set-cookie` header.
 */
async function cookies(url, userOptions) {
  const res = await fletcher(url, userOptions);
  const setCookieHeader = res.headers["set-cookie"];
  const jar = new CookieJar();
  jar.setCookies(setCookieHeader);
  return jar;
}
/**
 * Fetches `userUrl` over HTTP and returns the JSON embedded in the element
 * selected by `userOptions.embeddedJsonSelector` (see `getEmbeddedJson`).
 */
async function embeddedJson(userUrl, userOptions) {
  return html2(userUrl, userOptions).then(($page) => getEmbeddedJson($page, userOptions));
}
/**
 * Creates a fletcher instance whose methods apply `defaultOptions` to every
 * call: per-call options are deep-merged over the defaults with `deepmerge`.
 *
 * @param defaultOptions Options shared by all calls made through the
 *   returned instance.
 * @returns An object mirroring the module's API (`text`, `html`, `json`,
 *   `jsonld`, `script`, `headers`, `cookies`, `embeddedJson`, `response`)
 *   plus the `browser` variants.
 */
function create(defaultOptions = {}) {
  const withDefaults = (options) => (0, import_deepmerge.default)(defaultOptions, options);
  return {
    browser: {
      close: () => browser_default.close(),
      html: (url, options = {}) => browser_default.html(url, withDefaults(options)),
      json: (pageUrl, requestUrl, options = {}) => browser_default.json(pageUrl, requestUrl, withDefaults(options)),
      jsonld: (url, options = {}) => browser_default.jsonld(url, withDefaults(options)),
      script: (url, options = {}) => browser_default.script(url, withDefaults(options))
    },
    cookies: (url, options = {}) => cookies(url, withDefaults(options)),
    embeddedJson: (url, options = {}) => embeddedJson(url, withDefaults(options)),
    headers: (url, options = {}) => headers(url, withDefaults(options)),
    html: (url, options = {}) => html2(url, withDefaults(options)),
    json: (url, options = {}) => json2(url, withDefaults(options)),
    jsonld: (url, options = {}) => jsonld2(url, withDefaults(options)),
    response: (url, options = {}) => fletcher(url, withDefaults(options)),
    script: (url, options = {}) => script2(url, withDefaults(options)),
    text: (url, options = {}) => text(url, withDefaults(options))
  };
}
// Default export: the `fletcher` function itself (callable directly), with
// the helper API attached as properties. `response` is an alias for
// `fletcher`; `html`, `json`, `jsonld` and `script` point at the plain-HTTP
// implementations, while the browser-based ones live under `browser`.
var src_default = Object.assign(fletcher, {
browser: browser_default,
cookies,
create,
embeddedJson,
headers,
html: html2,
json: json2,
jsonld: jsonld2,
response: fletcher,
script: script2,
text
});