UNPKG

webscrape

Version:

Scrape web pages. Uses and returns promises

215 lines (183 loc) 6.94 kB
"use strict";

Object.defineProperty(exports, "__esModule", {
  value: true
});
exports["default"] = void 0;

var _request = _interopRequireDefault(require("request"));

var _fs = _interopRequireDefault(require("fs"));

var _zlib = _interopRequireDefault(require("zlib"));

var _lib = _interopRequireDefault(require("./lib"));

var _util = require("util");

// Babel helper: wraps a plain CommonJS export so it can be read through `.default`.
function _interopRequireDefault(obj) {
  return obj && obj.__esModule ? obj : { "default": obj };
}

// Babel helper: `typeof` that also reports "symbol" for polyfilled Symbols.
function _typeof(obj) {
  "@babel/helpers - typeof";

  if (typeof Symbol === "function" && typeof Symbol.iterator === "symbol") {
    _typeof = function _typeof(obj) {
      return typeof obj;
    };
  } else {
    _typeof = function _typeof(obj) {
      return obj && typeof Symbol === "function" && obj.constructor === Symbol && obj !== Symbol.prototype ? "symbol" : typeof obj;
    };
  }

  return _typeof(obj);
}

var log = (0, _util.debuglog)('webscrape');
var betterRequest = _lib["default"].betterRequest,
    constructOptionsWithJar = _lib["default"].constructOptionsWithJar,
    constructError = _lib["default"].constructError,
    constructResult = _lib["default"].constructResult,
    determineFilename = _lib["default"].determineFilename;

/**
 * Tells whether an HTTP status code is a success code, i.e. its string form
 * is exactly three characters long and starts with '2'.
 *
 * @param {number|string} statusCode - status code to test; falsy values yield false.
 * @returns {boolean} true for 2xx codes, false otherwise.
 */
function isOK(statusCode) {
  var str = statusCode && statusCode.toString();
  return str && str.length === 3 && str.indexOf('2') === 0 || false;
}

/**
 * Reads a response stream to the end and resolves with its content as a
 * UTF-8 string.
 *
 * Chunks are collected as Buffers and decoded once at the end. The previous
 * implementation appended to an uninitialised variable, which prefixed every
 * body with the literal text "undefined", and decoded chunk by chunk, which
 * can break a multi-byte character that straddles two chunks.
 *
 * @param {object} res - readable stream emitting 'data', 'end' and 'error'.
 * @returns {Promise<string>} the full body ('' when no data was emitted);
 *   rejects with the stream's error.
 */
function extractBody(res) {
  return new Promise(function (ok, fail) {
    var chunks = [];
    res.on('data', function (chunk) {
      // Chunks are strings when an encoding was set on the stream.
      chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(String(chunk)));
    });
    res.on('end', function () {
      return ok(Buffer.concat(chunks).toString('utf8'));
    });
    res.on('error', function (err) {
      return fail(err);
    });
  });
}

/**
 * Builds the completion callback shared by `get` and `post`.
 *
 * The callback rejects on a transport error, on a missing response or falsy
 * body, and on a non-2xx status (with whatever `constructError` builds);
 * otherwise it resolves with whatever `constructResult` builds.
 *
 * @param {object} options - request options, forwarded to `constructError`.
 * @param {function} resolve - promise resolver.
 * @param {function} reject - promise rejecter.
 * @returns {function(Error, object, (Buffer|string))} callback for `betterRequest`.
 */
function settleFromResponse(options, resolve, reject) {
  return function (err, resp, body) {
    if (err) {
      return reject(err);
    }

    if (!resp || !body) {
      return reject(new Error('no response or body'));
    }

    var bodyStr = body instanceof Buffer ? body.toString('utf8') : body;

    if (!isOK(resp.statusCode)) {
      return reject(constructError(options, resp, bodyStr));
    }

    return resolve(constructResult(resp, bodyStr));
  };
}

// creates a closure instance of a scraper
/**
 * Creates a scraper whose `get`, `post` and `download` calls all pass the
 * same `request` jar to `constructOptionsWithJar`.
 *
 * @returns {{get: function, post: function, download: function}} the scraper API;
 *   every method returns a promise.
 */
function Scraper() {
  var jar = _request["default"].jar();

  // Forwards the recognised options plus this scraper's jar to the lib helper.
  var constructOptions = function constructOptions(uri) {
    var _ref = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {},
        headers = _ref.headers,
        query = _ref.query,
        body = _ref.body,
        method = _ref.method,
        agentOptions = _ref.agentOptions,
        indicies = _ref.indicies;

    return constructOptionsWithJar(uri, {
      headers: headers,
      query: query,
      body: body,
      method: method,
      agentOptions: agentOptions,
      indicies: indicies,
      jar: jar
    });
  };

  /**
   * Issues a GET request.
   *
   * @param {string} uri - target address.
   * @param {object} [opts] - optional `headers`, `query`, `agentOptions`, `indicies`.
   * @returns {Promise} resolves with the `constructResult` value on a 2xx
   *   response; rejects otherwise.
   */
  var get = function get(uri) {
    var _ref2 = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {},
        headers = _ref2.headers,
        query = _ref2.query,
        agentOptions = _ref2.agentOptions,
        indicies = _ref2.indicies;

    return new Promise(function (resolve, reject) {
      var options = constructOptions(uri, {
        headers: headers,
        query: query,
        agentOptions: agentOptions,
        indicies: indicies,
        method: 'GET'
      });
      return betterRequest(options, settleFromResponse(options, resolve, reject));
    });
  };

  /**
   * Issues a POST request.
   *
   * @param {string} uri - target address.
   * @param {object} [opts] - optional `headers`, `query`, `body`, `agentOptions`, `indicies`.
   * @returns {Promise} resolves with the `constructResult` value on a 2xx
   *   response; rejects otherwise.
   */
  var post = function post(uri) {
    var _ref3 = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {},
        headers = _ref3.headers,
        query = _ref3.query,
        body = _ref3.body,
        agentOptions = _ref3.agentOptions,
        indicies = _ref3.indicies;

    return new Promise(function (resolve, reject) {
      var options = constructOptions(uri, {
        headers: headers,
        query: query,
        body: body,
        agentOptions: agentOptions,
        indicies: indicies,
        method: 'POST'
      });
      return betterRequest(options, settleFromResponse(options, resolve, reject));
    });
  };

  /**
   * Streams a resource to a file, decompressing gzip/deflate responses.
   *
   * The request is a POST when the `post` option is an object (it becomes the
   * request body) and a GET otherwise.
   *
   * @param {string} uri - target address.
   * @param {object} [opts] - optional `post`, `headers`, `query`,
   *   `agentOptions`, `filename`, `indicies`.
   * @returns {Promise<string>} resolves with the path chosen by
   *   `determineFilename` once the file has been fully written; rejects on a
   *   non-2xx status, a request error, a decompression error or a write error.
   */
  var download = function download(uri) {
    var _ref4 = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {},
        postBody = _ref4.post,
        headers = _ref4.headers,
        query = _ref4.query,
        agentOptions = _ref4.agentOptions,
        filename = _ref4.filename,
        indicies = _ref4.indicies;

    return determineFilename(uri, filename).then(function (downloadpath) {
      log("DOWNLOAD ".concat(uri, " to ").concat(downloadpath));

      // TODO: This mechanic is awkward and a band-aid. Find a better syntax
      var method = postBody && _typeof(postBody) === 'object' ? 'POST' : 'GET';
      var preOpts = {
        method: method,
        headers: headers,
        query: query,
        agentOptions: agentOptions,
        indicies: indicies
      };

      if (method === 'POST') {
        preOpts.body = postBody;
      }

      // normal operation resumes
      var options = constructOptions(uri, preOpts);

      var writeStream = _fs["default"].createWriteStream(downloadpath);

      var req = (0, _request["default"])(options);
      return new Promise(function (resolve, reject) {
        // Releases the file descriptor before rejecting; previously the write
        // stream stayed open whenever the download failed before any piping.
        var abort = function abort(err) {
          writeStream.destroy();
          return reject(err);
        };

        // adapted from http://nickfishman.com/post/49533681471/nodejs-http-requests-with-gzip-deflate-compression,
        // but this time this is clearly a use case for streams
        req.on('response', function (res) {
          var encoding = res.headers['content-encoding'];

          // TODO: I need to support more status codes
          if (!isOK(res.statusCode)) {
            extractBody(res).then(function (errorBody) {
              return abort(constructError(options, res, errorBody));
            })["catch"](function () {
              return abort(constructError(options, res, 'no-body-could-be-extracted'));
            });
            return;
          }

          var decoder = null;

          if (encoding === 'gzip') {
            decoder = _zlib["default"].createGunzip();
          } else if (encoding === 'deflate') {
            decoder = _zlib["default"].createInflate();
          }

          if (decoder) {
            // pipe() does not forward errors, so a corrupt payload would
            // otherwise surface as an unhandled 'error' event.
            decoder.on('error', abort);
            res.pipe(decoder).pipe(writeStream);
          } else {
            res.pipe(writeStream);
          }
        });
        req.on('error', abort);
        writeStream.on('error', function (err) {
          return reject(err);
        }).on('finish', function () {
          writeStream.close();
          return resolve(downloadpath);
        });
      });
    });
  };

  return {
    get: get,
    post: post,
    download: download
  };
}

var _default = Scraper;
exports["default"] = _default;