webscrape
Version:
Scrape web pages. Uses and returns promises
215 lines (183 loc) • 6.94 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", {
value: true
});
exports["default"] = void 0;
var _request = _interopRequireDefault(require("request"));
var _fs = _interopRequireDefault(require("fs"));
var _zlib = _interopRequireDefault(require("zlib"));
var _lib = _interopRequireDefault(require("./lib"));
var _util = require("util");
/**
 * Babel interop helper: normalises a required module so callers can always
 * read its default export through the "default" property.
 *
 * @param {*} obj - value returned by require().
 * @returns {*} obj itself when it is flagged as an ES module (truthy
 *   __esModule), otherwise a fresh wrapper object { "default": obj }.
 */
function _interopRequireDefault(obj) {
  if (obj && obj.__esModule) {
    // Already an ES-module namespace: hand it back untouched.
    return obj;
  }
  var wrapped = { "default": obj };
  return wrapped;
}
/**
 * Babel helper: a `typeof` that also reports "symbol" for polyfilled Symbols.
 *
 * On the first call it checks whether the runtime has native symbols, rebinds
 * `_typeof` to the matching implementation, and then delegates to it, so the
 * feature detection runs only once.
 *
 * @param {*} obj - any value.
 * @returns {string} the type name of obj.
 */
function _typeof(obj) {
  "@babel/helpers - typeof";

  var symbolIsFunction = typeof Symbol === "function";
  var nativeSymbols = symbolIsFunction && typeof Symbol.iterator === "symbol";

  if (nativeSymbols) {
    // Native symbols: the built-in operator already answers correctly.
    _typeof = function _typeof(obj) {
      return typeof obj;
    };
  } else {
    // No native symbols: recognise instances of a polyfilled Symbol constructor.
    _typeof = function _typeof(obj) {
      var looksLikeSymbol = obj && typeof Symbol === "function" && obj.constructor === Symbol && obj !== Symbol.prototype;
      if (looksLikeSymbol) {
        return "symbol";
      }
      return typeof obj;
    };
  }

  return _typeof(obj);
}
// Debug logger created with util.debuglog for the 'webscrape' section.
var log = (0, _util.debuglog)('webscrape');
// Helpers taken from the default export of ./lib. Their implementations are
// not visible in this file; the notes below describe only how they are called here.
//   betterRequest(options, callback(err, resp, body)) - performs a request.
//   constructOptionsWithJar(uri, settings)            - builds request options.
//   constructError(options, resp, body)               - builds a rejection value.
//   constructResult(resp, body)                       - builds a resolution value.
//   determineFilename(uri, filename)                  - returns a promise of a download path.
var betterRequest = _lib["default"].betterRequest,
constructOptionsWithJar = _lib["default"].constructOptionsWithJar,
constructError = _lib["default"].constructError,
constructResult = _lib["default"].constructResult,
determineFilename = _lib["default"].determineFilename;
/**
 * Tells whether a status code is in the 2xx family.
 *
 * The code is converted with toString() and accepted only when the text is
 * exactly three characters long and begins with '2'.
 *
 * @param {number|string} statusCode - HTTP status code; falsy values are not OK.
 * @returns {boolean} true for a three-character code starting with '2'.
 */
function isOK(statusCode) {
  if (!statusCode) {
    return false;
  }
  var text = statusCode.toString();
  if (!text) {
    return false;
  }
  if (text.length !== 3) {
    return false;
  }
  return text.indexOf('2') === 0;
}
/**
 * Reads a stream to its end and resolves with everything it emitted, as text.
 *
 * Chunks are collected and decoded once at the end. When every chunk is a
 * Buffer they are concatenated before decoding as UTF-8, so a multi-byte
 * character split across two chunks is still decoded correctly. Otherwise the
 * chunks are joined as strings.
 *
 * @param {object} res - event emitter that fires 'data', 'end' and 'error'
 *   (used in this file with an HTTP response).
 * @returns {Promise<string>} resolves with the accumulated body ('' when no
 *   data was emitted); rejects with the error passed to the 'error' event.
 */
function extractBody(res) {
  return new Promise(function (ok, fail) {
    // Start from an empty list: accumulating onto an uninitialised variable
    // would prefix the result with the text "undefined".
    var chunks = [];
    res.on('data', function (chunk) {
      chunks.push(chunk);
    });
    res.on('end', function () {
      var onlyBuffers = chunks.every(function (chunk) {
        return Buffer.isBuffer(chunk);
      });
      ok(onlyBuffers ? Buffer.concat(chunks).toString('utf8') : chunks.join(''));
    });
    res.on('error', function (err) {
      return fail(err);
    });
  });
} // creates a closure instance of a scraper
/**
 * Creates a scraper: a set of request functions that share one cookie jar.
 *
 * Each call to Scraper() creates its own jar with request.jar(); that jar is
 * passed to constructOptionsWithJar for every request made through the
 * returned functions.
 *
 * @returns {{get: Function, post: Function, download: Function}}
 *   get(uri, {headers, query, agentOptions, indicies})
 *   post(uri, {headers, query, body, agentOptions, indicies})
 *   download(uri, {post, headers, query, agentOptions, filename, indicies})
 *   Each function returns a Promise.
 */
function Scraper() {
  var jar = _request["default"].jar();

  // Builds the request options for `uri`, always attaching this scraper's jar.
  var constructOptions = function constructOptions(uri) {
    var settings = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {};
    return constructOptionsWithJar(uri, {
      headers: settings.headers,
      query: settings.query,
      body: settings.body,
      method: settings.method,
      agentOptions: settings.agentOptions,
      indicies: settings.indicies,
      jar: jar
    });
  };

  // Shared implementation of get() and post(): performs the request and
  // settles with constructResult (2xx) or constructError (any other status).
  var fetchBody = function fetchBody(uri, settings) {
    return new Promise(function (resolve, reject) {
      // Built inside the executor so a throw here becomes a rejection.
      var options = constructOptions(uri, settings);
      betterRequest(options, function (err, resp, body) {
        if (err) {
          reject(err);
          return;
        }
        // NOTE(review): a falsy body (e.g. an empty string) is reported as a
        // missing body; kept as-is because callers may rely on it.
        if (!resp || !body) {
          reject(new Error('no response or body'));
          return;
        }
        var bodyStr = body instanceof Buffer ? body.toString('utf8') : body;
        if (isOK(resp.statusCode)) {
          resolve(constructResult(resp, bodyStr));
        } else {
          reject(constructError(options, resp, bodyStr));
        }
      });
    });
  };

  /**
   * Performs a GET request.
   * @param {string} uri
   * @param {object} [settings] - {headers, query, agentOptions, indicies}
   * @returns {Promise} resolves with constructResult(resp, body) on a 2xx
   *   status; rejects with constructError(...) otherwise, with the request
   *   error, or with Error('no response or body').
   */
  var get = function get(uri) {
    var settings = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {};
    return fetchBody(uri, {
      headers: settings.headers,
      query: settings.query,
      agentOptions: settings.agentOptions,
      indicies: settings.indicies,
      method: 'GET'
    });
  };

  /**
   * Performs a POST request.
   * @param {string} uri
   * @param {object} [settings] - {headers, query, body, agentOptions, indicies}
   * @returns {Promise} same settlement rules as get().
   */
  var post = function post(uri) {
    var settings = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {};
    return fetchBody(uri, {
      headers: settings.headers,
      query: settings.query,
      body: settings.body,
      agentOptions: settings.agentOptions,
      indicies: settings.indicies,
      method: 'POST'
    });
  };

  /**
   * Streams the response for `uri` into a file.
   *
   * The target path comes from determineFilename(uri, filename). When
   * `post` is an object the request is a POST with that object as body,
   * otherwise it is a GET. Responses whose content-encoding is gzip or
   * deflate are decompressed on the way to disk.
   *
   * @param {string} uri
   * @param {object} [settings] - {post, headers, query, agentOptions, filename, indicies}
   * @returns {Promise<string>} resolves with the download path once the file
   *   has been written; rejects with constructError(...) on a non-2xx status,
   *   or with the request / decompression / file-write error.
   */
  var download = function download(uri) {
    var settings = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {};
    var postBody = settings.post;
    var headers = settings.headers;
    var query = settings.query;
    var agentOptions = settings.agentOptions;
    var filename = settings.filename;
    var indicies = settings.indicies;
    return determineFilename(uri, filename).then(function (downloadpath) {
      log("DOWNLOAD ".concat(uri, " to ").concat(downloadpath)); // TODO: This mechanic is awkward and a band-aid. Find a better syntax

      var method = postBody && _typeof(postBody) === 'object' ? 'POST' : 'GET';
      var preOpts = {
        method: method,
        headers: headers,
        query: query,
        agentOptions: agentOptions,
        indicies: indicies
      };
      if (method === 'POST') {
        preOpts.body = postBody;
      } // normal operation resumes

      var options = constructOptions(uri, preOpts);
      var writeStream = _fs["default"].createWriteStream(downloadpath);
      var req = (0, _request["default"])(options);
      return new Promise(function (resolve, reject) {
        // Every failure path goes through here so the file handle is always
        // released instead of staying open after the promise has rejected.
        var abort = function abort(reason) {
          writeStream.destroy();
          reject(reason);
        };

        // adapted from http://nickfishman.com/post/49533681471/nodejs-http-requests-with-gzip-deflate-compression,
        // but this time this is clearly a use case for streams
        req.on('response', function (res) {
          var encoding = res.headers['content-encoding']; // TODO: I need to support more status codes

          if (!isOK(res.statusCode)) {
            extractBody(res).then(function (errorBody) {
              return abort(constructError(options, res, errorBody));
            })["catch"](function () {
              return abort(constructError(options, res, 'no-body-could-be-extracted'));
            });
            return;
          }

          var decoder = null;
          if (encoding === 'gzip') {
            decoder = _zlib["default"].createGunzip();
          } else if (encoding === 'deflate') {
            decoder = _zlib["default"].createInflate();
          }

          if (decoder) {
            // Without a listener a corrupt compressed body raises an unhandled
            // 'error' event and the promise would never settle.
            decoder.on('error', abort);
            res.pipe(decoder).pipe(writeStream);
          } else {
            res.pipe(writeStream);
          }
        });
        req.on('error', abort);
        writeStream.on('error', abort).on('finish', function () {
          writeStream.close();
          return resolve(downloadpath);
        });
      });
    });
  };

  return {
    get: get,
    post: post,
    download: download
  };
}
var _default = Scraper;
exports["default"] = _default;