UNPKG

webscrape

Version:

Scrape web pages. Uses and returns promises

234 lines (191 loc) 8.24 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports["default"] = void 0; var _request = _interopRequireDefault(require("request")); var _cheerio = _interopRequireDefault(require("cheerio")); var _path = _interopRequireDefault(require("path")); var _fs = _interopRequireDefault(require("fs")); var _url = _interopRequireDefault(require("url")); var _zlib = _interopRequireDefault(require("zlib")); var _util = require("util"); function _interopRequireDefault(obj) { return obj && obj.__esModule ? obj : { "default": obj }; } function ownKeys(object, enumerableOnly) { var keys = Object.keys(object); if (Object.getOwnPropertySymbols) { var symbols = Object.getOwnPropertySymbols(object); if (enumerableOnly) { symbols = symbols.filter(function (sym) { return Object.getOwnPropertyDescriptor(object, sym).enumerable; }); } keys.push.apply(keys, symbols); } return keys; } function _objectSpread(target) { for (var i = 1; i < arguments.length; i++) { var source = arguments[i] != null ? 
arguments[i] : {}; if (i % 2) { ownKeys(Object(source), true).forEach(function (key) { _defineProperty(target, key, source[key]); }); } else if (Object.getOwnPropertyDescriptors) { Object.defineProperties(target, Object.getOwnPropertyDescriptors(source)); } else { ownKeys(Object(source)).forEach(function (key) { Object.defineProperty(target, key, Object.getOwnPropertyDescriptor(source, key)); }); } } return target; } function _defineProperty(obj, key, value) { if (key in obj) { Object.defineProperty(obj, key, { value: value, enumerable: true, configurable: true, writable: true }); } else { obj[key] = value; } return obj; } // base options var BASE_OPTIONS = { headers: { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36", "Cache-Control": "no-cache", "Pragma": "no-cache" } }; // fix unicode in JSON response var UNICODE_HEADER = /\\x([0-9a-fA-F]{2})/g; // adds additional functionality like automatic gunzipping / deflating and 303 redirects // into mikeal's request. 
/**
 * Performs a request through `request`, buffers the whole response and hands
 * the decoded body to `callback`.
 *
 * On top of plain `request` it:
 *  - gunzips / inflates the body when `content-encoding` is `gzip` / `deflate`;
 *  - for responses without one of those encodings, reports a 302 as an
 *    `UnexpectedRedirectError` and follows a 303 by re-issuing the request
 *    against the `Location` header.
 *
 * @param {string|Object} options - a URI string or a `request` options object
 * @param {Function} callback - called as `(err, res, body)`; `body` is a string
 */
function betterRequest(options, callback) {
  // util.debuglog(section) only *creates* a logger; the message has to be
  // passed to the returned function. Output is enabled with NODE_DEBUG=webscrape.
  var debug = (0, _util.debuglog)('webscrape');

  // request does not do the decoding / 303 handling below by itself
  var req = (0, _request["default"])(options);

  // adapted from http://nickfishman.com/post/49533681471/nodejs-http-requests-with-gzip-deflate-compression
  // TODO: Consider a streamed approach next time
  req.on('response', function (res) {
    var chunks = [];

    res.on('data', function (chunk) {
      chunks.push(chunk);
    });

    res.on('end', function () {
      var buffer = Buffer.concat(chunks);
      var encoding = res.headers['content-encoding'];

      try {
        if (encoding === 'gzip') {
          debug('Content is gzipped');
          _zlib["default"].gunzip(buffer, function (err, decoded) {
            return callback(err, res, decoded && decoded.toString());
          });
        } else if (encoding === 'deflate') {
          debug('Content is deflated');
          _zlib["default"].inflate(buffer, function (err, decoded) {
            return callback(err, res, decoded && decoded.toString());
          });
        } else {
          // very special case, although this should really be a 303.
          if (res.statusCode === 302) {
            var err = new Error("Unexpected Redirect to ".concat(res.headers.location));
            err.name = 'UnexpectedRedirectError';
            return callback(err);
          }

          // manually handle 303: repeat the request against the new location,
          // keeping every other option of the original request
          if (res.statusCode === 303) {
            var forwardOptions = typeof options === 'string' ? {
              uri: res.headers.location
            } : _objectSpread(_objectSpread({}, options), {}, {
              uri: res.headers.location
            });
            return betterRequest(forwardOptions, callback);
          } else {
            return callback(null, res, buffer && buffer.toString());
          }
        }
      } catch (e) {
        callback(e);
      }
    });
  });

  req.on('error', callback);
}

/**
 * Builds the `Error` reported for a failed request.
 *
 * @param {string|Object} options - the URI string or options object of the request
 * @param {Object} resp - the response; `statusCode` is read only when `options` is an object
 * @param {string} body - the response body, appended to the message when `options` is an object
 * @returns {Error} an error whose message is `ERROR <uri>` for string options, or
 *   `<method> ERROR <uri> HttpCode <status>` followed by a newline and the body otherwise
 */
function constructError(options, resp, body) {
  var error = new Error();

  if (typeof options === 'string') {
    error.message = "ERROR ".concat(options);
  } else {
    // the method falls back to GET when the options do not name one
    error.message = "".concat(options.method || 'GET', " ERROR ").concat(options.uri, " HttpCode ").concat(resp.statusCode, "\n").concat(body);
  }

  return error;
}

// TODO: This could throw errors. Deal with it.
/**
 * Wraps a response into the result object handed back to callers.
 *
 * The result always carries `body` and `headers`. Depending on the media type
 * of the `content-type` header it is augmented with:
 *  - `$`    - a cheerio document, for `text/html`;
 *  - `json` - the parsed body, for `application/json` (literal `\xNN` escape
 *             sequences are replaced by the character they denote first).
 *
 * @param {Object} resp - the response; only `headers` is read
 * @param {string} body - the decoded response body
 * @returns {Object} the result object
 * @throws {SyntaxError} when the body of a JSON response cannot be parsed
 */
function constructResult(resp, body) {
  var result = {
    body: body,
    headers: resp.headers
  };
  var contentType = resp.headers['content-type'];
  // media types are case-insensitive and may be followed by whitespace before
  // the parameters ("Text/HTML ; charset=utf-8"), so normalise before comparing
  var mimeType = contentType && contentType.split(';')[0].trim().toLowerCase();

  // augment the result
  switch (mimeType) {
    case 'text/html':
      result.$ = _cheerio["default"].load(body, {
        lowerCaseTags: true
      });
      break;

    case 'application/json':
      result.json = JSON.parse(body.replace(UNICODE_HEADER, function (m, n) {
        return String.fromCharCode(parseInt(n, 16));
      }));
      break;
  }

  return result;
}

/**
 * Translates the settings accepted by this library into a `request` options object.
 *
 * @param {string} uri - the target URI
 * @param {Object} [_ref] - request settings
 * @param {Object} [_ref.headers] - extra headers, merged over `BASE_OPTIONS.headers`
 * @param {Object} [_ref.query] - query-string parameters
 * @param {*} [_ref.body] - request body; sent as a form unless a content type header is given
 * @param {*} [_ref.jar] - cookie jar passed through to `request`
 * @param {Object} [_ref.agentOptions] - agent options passed through to `request`
 * @param {string} [_ref.method='GET'] - HTTP method
 * @param {boolean} [_ref.indicies=true] - serialise query arrays with indices (`a[0]=1`)
 *   when true, as repeated keys (`a=1&a=2`) when false
 * @returns {Object} the options object for `request`
 */
function constructOptionsWithJar(uri, _ref) {
  // tolerate a missing settings object instead of failing on the property reads
  var settings = _ref == null ? {} : _ref;
  var headers = settings.headers;
  var query = settings.query;
  var body = settings.body;
  var jar = settings.jar;
  var agentOptions = settings.agentOptions;
  var method = settings.method === void 0 ? 'GET' : settings.method;
  var indicies = settings.indicies === void 0 ? true : settings.indicies;

  var options = {
    uri: uri,
    jar: jar,
    method: method
  };
  options.headers = Object.assign({}, BASE_OPTIONS.headers, headers);

  if (query !== undefined) {
    options.qs = query;
    options.qsStringifyOptions = {
      // qs spells this format 'indices'; the option of this library keeps its
      // historical name `indicies`
      arrayFormat: indicies ? 'indices' : 'repeat'
    };
  }

  if (agentOptions) {
    options.agentOptions = agentOptions;
  }

  // TODO: this logic may change later, since it is not obvious
  if (body !== undefined) {
    var _headers = options.headers;
    // header names are case-insensitive, so look the content type up by lower-cased key
    var contentTypeSet = Object.keys(_headers).map(function (key) {
      return {
        key: key.toLowerCase(),
        value: _headers[key]
      };
    }).filter(function (pair) {
      return pair.key === 'content-type';
    });

    if (contentTypeSet.length === 1) {
      // since there is a content type, we assume this is not a HTTP form.
      // NOTE: as a result, the user must do encoding manually.
      options.json = contentTypeSet[0].value.toLowerCase().startsWith('application/json');
      options.body = body;
    } else {
      options.form = body;
    }
  }

  return options;
}

/**
 * Works out the file name a download of `uri` should be written to.
 *
 * The base name is the last path segment of `uri`, or `unknown` when there is
 * none. When `filename` is given and is an existing directory, the base name
 * is appended to it; any other `filename` is used as is; without `filename`
 * the base name alone is used.
 *
 * @param {string} uri - the URI being downloaded
 * @param {string} [filename] - target file or existing target directory
 * @returns {Promise<string>} resolves with the file name to use
 */
function determineFilename(uri, filename) {
  return new Promise(function (resolve, reject) {
    var baseFilename;

    try {
      var pathname = _url["default"].parse(uri, true).pathname;
      var matchResult = pathname ? /[^/]+$/.exec(pathname) : null;
      baseFilename = matchResult ? matchResult[0] : 'unknown';
    } catch (err) {
      // util.debuglog(section) returns the logger; the message goes to the
      // returned function. Output is enabled with NODE_DEBUG=webscrape.
      (0, _util.debuglog)('webscrape')("WARNING Unable to determine base filename for ".concat(uri, ", using \"unknown\""));
      baseFilename = 'unknown';
    }

    // why is this the first condition? because we may need baseFilename if filename is a folder
    if (!filename && !baseFilename) {
      return reject(new Error("DOWNLOAD ".concat(uri, " - Filename not given and cannot determine base name"))); // TODO: Nicer error
    } else if (filename) {
      // if the filename is actually a folder that already exists, then download to the folder using the baseFilename
      _fs["default"].stat(filename, function (err, result) {
        try {
          if (err || !result.isDirectory()) {
            return resolve(filename); // just carry on using the filename
          } else {
            // we append the basefilename to the directory
            return resolve(_path["default"].join(filename, baseFilename));
          }
        } catch (e) {
          return reject(e);
        }
      });
    } else {
      // no filename, but we have a baseFilename
      return resolve(baseFilename);
    }
  });
}

var _default = {
  betterRequest: betterRequest,
  constructError: constructError,
  constructResult: constructResult,
  constructOptionsWithJar: constructOptionsWithJar,
  determineFilename: determineFilename
};
exports["default"] = _default;