/*
 * webhead
 *
 * An easy-to-use Node web crawler storing cookies, following redirects,
 * traversing pages and submitting forms.
 *
 * (Listing metadata from the original source page: JavaScript,
 * 264 lines (221 loc), 6.95 kB.)
 */
import cheerio from 'cheerio';
import FormData from 'form-data';
import fs from 'fs-extra';
import nodeFetch from 'node-fetch';
import param from 'jquery-param';
import toughCookie from 'tough-cookie';
const { CookieJar } = toughCookie;
/**
 * Creates a crawler session that keeps cookies in a jar, follows HTTP
 * redirects itself (requests are sent with `redirect: 'manual'`) and exposes
 * helpers for reading the last response and submitting forms found in it.
 *
 * @param {object} [opts]
 * @param {string} [opts.jarFile] - JSON file cookies are loaded from when the
 *   session is created and written back to whenever a response sets cookies.
 * @param {string} [opts.userAgent] - `User-Agent` header used when a request
 *   does not provide its own.
 * @param {boolean} [opts.verbose] - Log every request and response with
 *   `console.log`.
 * @param {Function} [opts.beforeSend] - `(parameters, session)` hook invoked
 *   before every request, redirect hops included. It may return replacement
 *   parameters, mutate the given ones in place, and may be async.
 * @param {Function} [opts.complete] - `(parameters, session, webhead)` hook
 *   invoked once the final (non-redirect) response has been stored.
 * @param {number} [opts.maxRedirects=20] - Maximum number of redirect hops
 *   followed for one call before the request is rejected.
 * @returns {object} The session object with `get`, `post`, `put`, `patch`,
 *   `delete`, `head`, `options`, `text`, `json`, `$`, `submit` and
 *   `clearCookies`, plus `url`, `cookie` and `response` describing the last
 *   completed request.
 */
const Webhead = (opts) => {
  const {
    jarFile,
    userAgent,
    verbose,
    beforeSend,
    complete,
    maxRedirects = 20,
  } = opts || {};

  const webhead = {};
  const session = {};
  let cookieJar;
  let cachedCheerio;
  let cachedJSON;

  // Resolves a string or URL against the URL of the last completed request.
  const toURL = (url) => {
    if (url === undefined || url === null) {
      throw new TypeError('webhead: a URL is required');
    }
    return new URL(url instanceof URL ? url.href : url, webhead.url);
  };

  // Normalizes header names to Title-Case and flattens array values, except
  // for Set-Cookie which must stay a list of individual cookies.
  const toHeaders = (object) =>
    Object.entries(object || {}).reduce((headers, [key, value]) => {
      const name = key.replace(/\b./g, (c) => c.toUpperCase());
      headers[name] =
        key.toLowerCase() !== 'set-cookie' && Array.isArray(value)
          ? value.join('; ')
          : value;
      return headers;
    }, {});

  // Returns a shallow copy with normalized headers so the object handed in by
  // the caller is never modified.
  const toOptions = (object) => ({
    ...object,
    headers: toHeaders(object && object.headers),
  });

  // Cookie lookups ignore the query string.
  const toCookieUrl = (url) => url.replace(/\?.*/, '');

  // Builds the Cookie header value. getCookieStringSync yields only
  // `name=value` pairs; joining Cookie objects would also serialize their
  // attributes (Path, Expires, ...) into the request header.
  const getCookie = (url) => cookieJar.getCookieStringSync(toCookieUrl(url));

  // Writes the jar to `jarFile`, keeping the file's existing shape: an object
  // gets its `cookies` key replaced, anything else is overwritten by the bare
  // cookie list.
  const persistCookies = () => {
    const { cookies } = cookieJar.toJSON();
    let json = fs.pathExistsSync(jarFile) ? fs.readJsonSync(jarFile) : {};
    if (json !== null && typeof json === 'object' && !Array.isArray(json)) {
      json.cookies = cookies;
    } else {
      json = cookies;
    }
    fs.writeFileSync(jarFile, JSON.stringify(json, null, 2));
  };

  // Stores cookies from the response and works out whether it is a redirect.
  // `url` is the href that was actually requested.
  const handleResponse = async (method, url, options, response) => {
    const statusCode = response.status;
    const data = await response.text();
    const headers = toHeaders(response.headers.raw());
    verbose && console.log({ statusCode, data, headers });

    const setCookies = headers['Set-Cookie'];
    if (setCookies) {
      const cookieUrl = toCookieUrl(url);
      setCookies.forEach((cookie) => {
        // A malformed or foreign-domain cookie is skipped instead of
        // aborting the whole request.
        cookieJar.setCookieSync(cookie, cookieUrl, { ignoreError: true });
      });
      if (jarFile) {
        persistCookies();
      }
    }

    let redirect;
    const location = headers['Location'];
    // A 3xx without Location (e.g. 304 Not Modified) is a final response.
    if (statusCode >= 300 && statusCode < 400 && location) {
      const dropsBody = statusCode <= 303;
      // Exclude every body-producing option, not just `data`, when the hop is
      // turned into a GET; the rest (headers etc.) is carried over in a copy.
      const { data: _data, multiPartData: _parts, json: _json, ...rest } =
        options;
      redirect = {
        method: dropsBody ? 'GET' : method,
        // Relative targets are resolved against the request that redirected,
        // not against the previously completed page.
        url: new URL(location, url),
        options: dropsBody ? rest : options,
      };
    }

    cachedCheerio = undefined;
    cachedJSON = undefined;
    return {
      response: { statusCode, data, headers },
      redirect,
    };
  };

  // Performs a single HTTP exchange (no redirect following).
  const send = async ({ method, url, options }) => {
    const { headers, data, multiPartData, json } = options;
    const cookie = getCookie(url.href);
    const target = new URL(url.href);
    const fetchOptions = {
      method,
      headers: { ...headers, Host: url.host },
      redirect: 'manual',
    };

    if (cookie.length) {
      fetchOptions.headers['Cookie'] = cookie;
    }
    if (!fetchOptions.headers['User-Agent'] && userAgent) {
      fetchOptions.headers['User-Agent'] = userAgent;
    }

    if (data) {
      const encoded = param(data);
      if (method === 'GET') {
        // Extend the query through the URL object so the parameters land
        // before any #fragment.
        if (encoded) {
          target.search = target.search
            ? `${target.search}&${encoded}`
            : encoded;
        }
      } else {
        if (!fetchOptions.headers['Content-Type']) {
          fetchOptions.headers['Content-Type'] =
            'application/x-www-form-urlencoded';
        }
        fetchOptions.body = encoded;
      }
    } else if (multiPartData) {
      const form = new FormData();
      multiPartData.forEach((part) => {
        if (part.file) {
          form.append(part.name, fs.createReadStream(part.file));
        } else {
          const hasValue = Object.prototype.hasOwnProperty.call(part, 'value');
          form.append(part.name, hasValue ? part.value : part.contents);
        }
      });
      fetchOptions.body = form;
      fetchOptions.headers = {
        ...fetchOptions.headers,
        ...form.getHeaders(),
      };
    }

    // A JSON payload takes precedence over `data` / `multiPartData`.
    if (json) {
      fetchOptions.headers['Content-Type'] = 'application/json';
      fetchOptions.body = JSON.stringify(json);
    }

    const href = target.href;
    verbose && console.log(method, href, fetchOptions);
    const response = await nodeFetch(href, fetchOptions);
    return handleResponse(method, href, options, response);
  };

  // Sends a request, follows redirects and records the final response.
  const request = async (method, url, options, redirectCount = 0) => {
    let parameters = {
      method: method.toUpperCase(),
      url: toURL(url),
      options: toOptions(options),
    };

    if (beforeSend) {
      // Accept hooks that mutate in place and return nothing, and async hooks.
      const replacement = await beforeSend(parameters, session);
      if (replacement) {
        parameters = replacement;
      }
      parameters.method = parameters.method.toUpperCase();
      parameters.url = toURL(parameters.url);
      parameters.options = toOptions(parameters.options);
    }

    const { response, redirect } = await send(parameters);
    if (redirect) {
      if (redirectCount >= maxRedirects) {
        throw new Error(
          `webhead: maximum redirect count (${maxRedirects}) reached at ${parameters.url.href}`
        );
      }
      return request(
        redirect.method,
        redirect.url,
        redirect.options,
        redirectCount + 1
      );
    }

    webhead.url = parameters.url;
    webhead.cookie = getCookie(webhead.url.href);
    webhead.response = response;
    if (complete) {
      complete(parameters, session, webhead);
    }
    return response;
  };

  // One method per HTTP verb: webhead.get(url, options), webhead.post(...), ...
  `get post put patch delete head options`.split(' ').forEach((method) => {
    webhead[method] = (url, options) => request(method, url, options);
  });

  // Body of the last response, or '' before the first request.
  webhead.text = () => (webhead.response ? webhead.response.data : '');

  // Parsed body of the last response when its Content-Type mentions json;
  // undefined otherwise. The parsed value is cached until the next response.
  webhead.json = () => {
    if (cachedJSON === undefined && webhead.response) {
      const { data, headers } = webhead.response;
      if (data && String(headers['Content-Type']).match('json')) {
        cachedJSON = JSON.parse(data);
      }
    }
    return cachedJSON;
  };

  // Queries the last response with cheerio when it is HTML or XML; yields an
  // empty array when there is nothing to query.
  webhead.$ = (...args) => {
    if (!cachedCheerio && webhead.response) {
      const { data, headers } = webhead.response;
      const match = String(headers['Content-Type']).match(/(html|xml)/);
      if (match) {
        cachedCheerio = cheerio.load(data, { xmlMode: match[1] === 'xml' });
      }
    }
    return cachedCheerio ? cachedCheerio(...args) : [];
  };

  // Submits the form matched by `selector`, with `data` overriding the form's
  // own field values. Resolves to undefined when no form matches.
  webhead.submit = async (selector, data, options) => {
    const form = webhead.$(selector);
    if (!form.length) {
      return undefined;
    }
    const action = form.attr('action');
    // A form without an action attribute is submitted to the current page.
    const url = action === undefined ? webhead.url : action;
    const method = form.attr('method') || 'GET';
    const fields = form.serializeArray().reduce((values, { name, value }) => {
      values[name] = value;
      return values;
    }, {});
    return request(method, url, {
      ...options,
      data: { ...fields, ...data },
    });
  };

  // Empties the in-memory cookie jar.
  webhead.clearCookies = () => {
    cookieJar.removeAllCookiesSync();
  };

  // Restore cookies from the jar file when one is configured and present. The
  // file may hold either `{ cookies: [...] }` or a bare cookie array.
  if (jarFile && fs.pathExistsSync(jarFile)) {
    const json = fs.readJsonSync(jarFile);
    const cookies = (json && json.cookies) || json;
    cookieJar = CookieJar.fromJSON({
      cookies: Array.isArray(cookies) ? cookies : [],
    });
  } else {
    cookieJar = new CookieJar();
  }

  return webhead;
};
// Expose the Webhead factory as this module's default export.
export default Webhead;