webscrape
Version:
Scrapes web pages. The API uses and returns promises.
234 lines (191 loc) • 8.24 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", {
value: true
});
exports["default"] = void 0;
var _request = _interopRequireDefault(require("request"));
var _cheerio = _interopRequireDefault(require("cheerio"));
var _path = _interopRequireDefault(require("path"));
var _fs = _interopRequireDefault(require("fs"));
var _url = _interopRequireDefault(require("url"));
var _zlib = _interopRequireDefault(require("zlib"));
var _util = require("util");
/**
 * Babel interop helper: normalises the result of `require()` so that the
 * imported value can always be read from the `default` property.
 *
 * @param {*} obj - Value returned by `require()`.
 * @returns {*} `obj` itself when it is truthy and flagged `__esModule`,
 *   otherwise a fresh `{ default: obj }` wrapper.
 */
function _interopRequireDefault(obj) {
  if (obj && obj.__esModule) {
    return obj;
  }
  return {
    "default": obj
  };
}
/**
 * Babel helper: lists an object's own enumerable string keys followed by its
 * own symbol keys.
 *
 * @param {Object} object - Object to inspect.
 * @param {boolean} [enumerableOnly] - When truthy, non-enumerable symbols are
 *   left out; otherwise every own symbol is included.
 * @returns {Array<string|symbol>} String keys first, then symbols.
 */
function ownKeys(object, enumerableOnly) {
  var names = Object.keys(object);
  // Environments without symbol support only have the string keys.
  if (!Object.getOwnPropertySymbols) {
    return names;
  }
  var allSymbols = Object.getOwnPropertySymbols(object);
  for (var index = 0; index < allSymbols.length; index += 1) {
    var symbol = allSymbols[index];
    if (!enumerableOnly || Object.getOwnPropertyDescriptor(object, symbol).enumerable) {
      names.push(symbol);
    }
  }
  return names;
}
/**
 * Babel helper implementing object spread: copies properties from every
 * argument after `target` onto `target`.
 *
 * Arguments at odd positions are copied value-by-value (own enumerable keys,
 * through `_defineProperty`); arguments at even positions are copied with
 * their full property descriptors. `null` / `undefined` sources are treated
 * as empty objects.
 *
 * @param {Object} target - Object that receives the properties.
 * @returns {Object} The same `target` object.
 */
function _objectSpread(target) {
  // Odd-positioned sources: plain value copy of own enumerable keys.
  var copyValues = function (from) {
    ownKeys(Object(from), true).forEach(function (key) {
      _defineProperty(target, key, from[key]);
    });
  };
  // Even-positioned sources: copy property descriptors verbatim.
  var copyDescriptors = function (from) {
    if (Object.getOwnPropertyDescriptors) {
      Object.defineProperties(target, Object.getOwnPropertyDescriptors(from));
      return;
    }
    ownKeys(Object(from)).forEach(function (key) {
      Object.defineProperty(target, key, Object.getOwnPropertyDescriptor(from, key));
    });
  };
  for (var position = 1; position < arguments.length; position += 1) {
    var current = arguments[position] == null ? {} : arguments[position];
    if (position % 2 === 1) {
      copyValues(current);
    } else {
      copyDescriptors(current);
    }
  }
  return target;
}
/**
 * Babel helper: sets `obj[key] = value`.
 *
 * When the key is already reachable on the object (own or inherited) it is
 * (re)defined as a plain enumerable, configurable, writable data property;
 * otherwise a normal assignment is used.
 *
 * @param {Object} obj - Object to modify.
 * @param {string|symbol} key - Property key.
 * @param {*} value - Value to store.
 * @returns {Object} The same `obj`.
 */
function _defineProperty(obj, key, value) {
  if (!(key in obj)) {
    obj[key] = value;
    return obj;
  }
  var descriptor = {
    value: value,
    enumerable: true,
    configurable: true,
    writable: true
  };
  Object.defineProperty(obj, key, descriptor);
  return obj;
}
// Base options: default headers that constructOptionsWithJar copies into every
// request's header map. Caller-supplied headers with the same key are applied
// afterwards and therefore take precedence.
var BASE_OPTIONS = {
headers: {
// Identify as a desktop Chrome browser.
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36",
// Ask for an uncached response (HTTP/1.1 and HTTP/1.0 forms respectively).
"Cache-Control": "no-cache",
"Pragma": "no-cache"
}
};
// Fix unicode in JSON responses: matches a literal backslash-x escape followed
// by two hex digits (e.g. \x41) and captures the digits. constructResult uses
// it to rewrite such escapes before handing the body to JSON.parse.
var UNICODE_HEADER = /\\x([0-9a-fA-F]{2})/g;
// betterRequest (below) adds additional functionality like automatic
// gunzipping / deflating and 303 redirects into mikeal's request.
/**
 * Thin wrapper around mikeal's `request` that buffers the whole response and
 * adds transparent gunzip / inflate plus manual handling of 303 redirects.
 * Adapted from
 * http://nickfishman.com/post/49533681471/nodejs-http-requests-with-gzip-deflate-compression
 *
 * Behaviour by response:
 *  - `content-encoding: gzip` / `deflate`: the body is decompressed and passed
 *    to the callback as a string (redirect status codes are NOT inspected on
 *    this path, matching the historical behaviour).
 *  - status 302: the callback receives an `UnexpectedRedirectError`.
 *  - status 303: the request is re-issued against the `Location` header, with
 *    all other options carried over unchanged.
 *  - anything else: the buffered body is passed to the callback as a string.
 *
 * NOTE(review): a 303 re-issues the request with the original method and
 * body; HTTP semantics would normally switch to GET. Left unchanged because
 * callers may depend on it - confirm.
 *
 * TODO: Consider a streamed approach next time.
 *
 * @param {string|Object} options - URI string or `request` options object.
 * @param {function(?Error, Object=, string=)} callback - Node-style callback
 *   receiving `(err, response, body)`. It is invoked at most once per
 *   completed response; exceptions thrown by the callback itself are no
 *   longer caught and fed back into the callback a second time.
 */
function betterRequest(options, callback) {
  // util.debuglog(section) returns the logger function; the message has to be
  // passed to that logger. Enable the output with NODE_DEBUG=webscrape.
  var debug = (0, _util.debuglog)('webscrape');
  var req = (0, _request["default"])(options);

  req.on('response', function (res) {
    var chunks = [];
    res.on('data', function (chunk) {
      chunks.push(chunk);
    });
    res.on('end', function () {
      var buffer = Buffer.concat(chunks);
      var encoding = res.headers['content-encoding'];

      if (encoding === 'gzip' || encoding === 'deflate') {
        var inflater = encoding === 'gzip' ? 'gunzip' : 'inflate';
        debug(encoding === 'gzip' ? 'Content is gzipped' : 'Content is deflated');
        try {
          // zlib invokes its callback asynchronously, so this try only guards
          // the synchronous start of the decompression.
          _zlib["default"][inflater](buffer, function (err, decoded) {
            return callback(err, res, decoded && decoded.toString());
          });
        } catch (e) {
          callback(e);
        }
        return;
      }

      // very special case, although this should really be a 303.
      if (res.statusCode === 302) {
        var redirectError = new Error("Unexpected Redirect to ".concat(res.headers.location));
        redirectError.name = 'UnexpectedRedirectError';
        callback(redirectError);
        return;
      }

      // manually handle 303... bah
      if (res.statusCode === 303) {
        var forwardOptions = typeof options === 'string' ? {
          uri: res.headers.location
        } : Object.assign({}, options, {
          uri: res.headers.location
        });
        try {
          // Starting the follow-up request can throw synchronously (e.g. a
          // missing Location header); the callback has not run yet at this
          // point, so reporting the failure here cannot double-invoke it.
          betterRequest(forwardOptions, callback);
        } catch (e) {
          callback(e);
        }
        return;
      }

      // Deliberately outside any try/catch: an exception thrown by the
      // callback must propagate instead of re-entering the callback.
      callback(null, res, buffer.toString());
    });
  });
  req.on('error', callback);
}
/**
 * Builds the Error reported for a failed request.
 *
 * - String `options` (a bare URI): message is `ERROR <uri>`.
 * - Otherwise: message is `<METHOD> ERROR <uri> HttpCode <status>\n<body>`,
 *   where the method defaults to `GET`.
 *
 * Never throws: a missing `options` object or a missing response (e.g. when
 * the failure happened before any response arrived) is tolerated so that
 * building the error cannot mask the original failure. A missing response is
 * reported as `HttpCode unknown`.
 *
 * @param {string|Object} options - URI string or request options; `uri` is
 *   used, falling back to request's `url` alias.
 * @param {Object} [resp] - Response object; only `statusCode` is read.
 * @param {*} [body] - Response body appended to the message.
 * @returns {Error} The constructed error.
 */
function constructError(options, resp, body) {
  var error = new Error();
  if (typeof options === 'string') {
    error.message = "ERROR ".concat(options);
    return error;
  }
  var settings = options || {};
  var target = settings.uri !== undefined ? settings.uri : settings.url;
  var statusCode = resp ? resp.statusCode : 'unknown';
  error.message = "".concat(settings.method || 'GET', " ERROR ").concat(target, " HttpCode ").concat(statusCode, "\n").concat(body);
  return error;
}
/**
 * Wraps a response into the result object handed back to callers and augments
 * it according to the response's MIME type:
 *  - `text/html`: adds `$`, the body loaded into cheerio.
 *  - `application/json`: adds `json`, the parsed body.
 *
 * The MIME type is taken from the `content-type` header up to the first `;`,
 * trimmed and lower-cased, so `Application/JSON ; charset=utf-8` is
 * recognised as JSON.
 *
 * Before parsing JSON, `\xNN` escapes (which are not valid JSON) are rewritten
 * to the equivalent `\u00NN` escapes. Substituting the raw character instead
 * would corrupt the document whenever the escape encodes a quote, a backslash
 * or a control character.
 *
 * TODO: This could throw errors (malformed JSON, non-string body). Deal with it.
 *
 * @param {Object} resp - Response; only `headers` is read.
 * @param {string} body - Response body.
 * @returns {{body: string, headers: Object, $: (Object|undefined), json: *}}
 *   The result; `$` / `json` are present only for the matching MIME types.
 * @throws {SyntaxError} When an `application/json` body is not valid JSON.
 */
function constructResult(resp, body) {
  var result = {
    body: body,
    headers: resp.headers
  };
  var contentType = resp.headers['content-type'];
  var mimeType = typeof contentType === 'string' ? contentType.split(';')[0].trim().toLowerCase() : undefined;

  // augment the result
  switch (mimeType) {
    case 'text/html':
      result.$ = _cheerio["default"].load(body, {
        lowerCaseTags: true
      });
      break;
    case 'application/json':
      result.json = JSON.parse(body.replace(UNICODE_HEADER, function (match, hex) {
        return "\\u00".concat(hex);
      }));
      break;
  }
  return result;
}
/**
 * Translates the scraper's request settings into an options object for the
 * `request` library.
 *
 * @param {string} uri - Target URI.
 * @param {Object} [_ref] - Settings; may be omitted entirely.
 * @param {Object} [_ref.headers] - Extra headers, merged over BASE_OPTIONS.headers.
 * @param {Object} [_ref.query] - Query-string parameters (sent as `qs`).
 * @param {*} [_ref.body] - Request body. Sent as an HTML form unless a
 *   Content-Type header is present (see below).
 * @param {Object} [_ref.jar] - Cookie jar, passed through as `jar`.
 * @param {Object} [_ref.agentOptions] - Passed through when truthy.
 * @param {string} [_ref.method='GET'] - HTTP method.
 * @param {boolean} [_ref.indicies=true] - Array query parameters are
 *   serialised with indices (`a[0]=1`) when truthy, repeated keys (`a=1&a=2`)
 *   otherwise. The historical spelling of the setting name is kept.
 * @returns {Object} Options for `request`.
 */
function constructOptionsWithJar(uri, _ref) {
  // Tolerate a missing settings argument instead of throwing on destructuring.
  var settings = _ref == null ? {} : _ref;
  var headers = settings.headers;
  var query = settings.query;
  var body = settings.body;
  var agentOptions = settings.agentOptions;
  var method = settings.method === undefined ? 'GET' : settings.method;
  var indicies = settings.indicies === undefined ? true : settings.indicies;

  var options = {
    uri: uri,
    jar: settings.jar,
    method: method
  };
  options.headers = Object.assign({}, BASE_OPTIONS.headers, headers);

  if (query !== undefined) {
    options.qs = query;
    options.qsStringifyOptions = {
      // qs only recognises the spelling 'indices'; the previous 'indicies'
      // value was an unknown format that merely happened to fall back to the
      // library default.
      arrayFormat: indicies ? 'indices' : 'repeat'
    };
  }
  if (agentOptions) {
    options.agentOptions = agentOptions;
  }

  // TODO: this logic may change later, since it is not obvious
  if (body !== undefined) {
    // Header names are case-insensitive; use the first Content-Type found.
    var contentTypeKey = Object.keys(options.headers).filter(function (key) {
      return key.toLowerCase() === 'content-type';
    })[0];
    if (contentTypeKey !== undefined) {
      // since there is a content type, we assume this is not a HTTP form.
      // NOTE: as a result, the user must do encoding manually.
      var contentType = String(options.headers[contentTypeKey]).trim().toLowerCase();
      options.json = contentType.startsWith('application/json');
      options.body = body;
    } else {
      options.form = body;
    }
  }
  return options;
}
/**
 * Works out where a download of `uri` should be written.
 *
 * The base name is the last path segment of the URI (query string excluded);
 * it falls back to `unknown` when the URI cannot be parsed, has no last
 * segment, or the segment is `.` / `..` (which would otherwise make the
 * joined path point at or above the destination directory).
 *
 * - No `filename`: resolves with the base name.
 * - `filename` is an existing directory: resolves with `filename/<base name>`.
 * - Any other `filename` (including one that does not exist yet): resolves
 *   with `filename` unchanged.
 *
 * Because the base name always has the `unknown` fallback, the former
 * "Filename not given and cannot determine base name" rejection could never
 * trigger and has been removed.
 *
 * @param {string} uri - URI being downloaded.
 * @param {string} [filename] - Destination file or directory.
 * @returns {Promise<string>} The path to write to.
 */
function determineFilename(uri, filename) {
  return new Promise(function (resolve, reject) {
    var baseFilename = 'unknown';
    try {
      var pathname = _url["default"].parse(uri, true).pathname;
      var matchResult = pathname ? /[^/]+$/.exec(pathname) : null;
      if (matchResult && matchResult[0] !== '.' && matchResult[0] !== '..') {
        baseFilename = matchResult[0];
      }
    } catch (err) {
      // util.debuglog(section) returns the logger; the message goes to that
      // logger. Enable the output with NODE_DEBUG=webscrape.
      (0, _util.debuglog)('webscrape')("WARNING Unable to determine base filename for ".concat(uri, ", using \"unknown\""));
    }

    if (!filename) {
      // no filename, but we have a baseFilename
      return resolve(baseFilename);
    }

    // if the filename is actually a folder that already exists, then download
    // to the folder using the baseFilename
    _fs["default"].stat(filename, function (err, stats) {
      try {
        if (err || !stats.isDirectory()) {
          return resolve(filename); // just carry on using the filename
        }
        // we append the basefilename to the directory
        return resolve(_path["default"].join(filename, baseFilename));
      } catch (e) {
        return reject(e);
      }
    });
  });
}
// Public API of the module, exposed as the CommonJS `default` export.
var _default = {};
_default.betterRequest = betterRequest;
_default.constructError = constructError;
_default.constructResult = constructResult;
_default.constructOptionsWithJar = constructOptionsWithJar;
_default.determineFilename = determineFilename;
exports["default"] = _default;