
@warren-bank/node-request-cli


An extremely lightweight HTTP request client for the command-line. Supports: http, https, proxy, redirects, cookies, content-encoding, multipart/form-data, multi-threading, recursive website crawling and mirroring.
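As a rough illustration (not taken from the package documentation): the module shown below exports its entry point as "download", which consumes a parsed-options object keyed by the long CLI flag names. A minimal sketch, assuming the package entry point re-exports this module and that "--headers" is passed as a plain object of header fields:

const {download} = require('@warren-bank/node-request-cli')

// keys mirror the long CLI option names read by process_cli below
download({
  "--url":             "https://example.com/big.file",
  "--headers":         {"user-agent": "node-request-cli"},
  "--max-redirect":    10,
  "--max-concurrency": 4  // >= 2 splits a single URL into multi-threaded byte-range downloads
})
.then(() => console.log('all downloads complete'))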

299 lines (246 loc) 10.3 kB
const concurrency = require('./concurrency')
const fp_helper   = require('./filepath_helper')
const proxy       = require('./proxy')
const web_crawler = require('./web_crawler')

const {request}      = require('@warren-bank/node-request')
const {getCookieJar} = require('@warren-bank/node-request/lib/cookie_jar')

const fs        = require('fs')
const stream    = require('stream')
const parse_url = require('url').parse

// returns a Promise that resolves after all downloads are complete.
const process_cli = function(argv_vals){
  const options = {
    headers: argv_vals["--headers"],
    method:  argv_vals["--method"]
  }

  const config = {
    followRedirect: (! argv_vals["--no-follow-redirect"]),
    maxRedirects:   (  argv_vals["--max-redirect"] || 10),
    binary: true,
    stream: true
  }

  if (argv_vals["--no-validate-status-code"]) {
    config.validate_status_code = false
  }

  if (argv_vals["--load-cookies"]) {
    config.cookieJar = getCookieJar(argv_vals["--load-cookies"])
  }

  let urls = []

  if (argv_vals["--input-file"] && argv_vals["--input-file"].length) {
    urls = argv_vals["--input-file"]
      .map(url => url.trim().split(/[\t]+/, 2))                             // Array<url, output_filepath> from first 2x tab-separated columns; additional columns are ignored
      .filter(urldata => ((urldata[0].length) && (urldata[0][0] !== "#")))  // ignore empty lines, and lines that begin with "#"
  }

  if (argv_vals["--url"]) {
    urls.push(argv_vals["--url"])  // String<url>
  }

  const max_range_errors  = 10  // number of failed multi-threaded downloads of byte ranges before the whole operation is aborted
  let range_error_counter = 0

  // -----------------------------------------------------------------------------
  // download next URL; returns a Promise

  const download_next_url = function(is_concurrent){
    const urldata = urls.shift()
    const _url    = Array.isArray(urldata) ? urldata[0] : urldata

    const _options = Object.assign(
      {},
      options,
      ((typeof _url === 'string') ? parse_url(_url) : {}),
      ((_url instanceof Object)   ? _url            : {})  // includes 'range' header
    )

    proxy.addProxyAgent(argv_vals, _options)

    const _config = {...config}

    let output_filepath = fp_helper.get_output_filepath(argv_vals, urldata)

    const is_range = output_filepath && (typeof output_filepath === 'number') && (output_filepath > 0)  // file descriptor

    if (is_range) {
      // "response" is a Buffer
      _config.stream = false
    }

    if (!is_range && output_filepath && (output_filepath !== "-")) {
      if (fs.existsSync(output_filepath)) {
        if (argv_vals["--no-clobber"]) {
          // short-circuit HTTP request
          return Promise.resolve()
        }

        if (argv_vals["--continue"]) {
          // add 'Range' header to request
          const stats = fs.statSync(output_filepath, {throwIfNoEntry: false})

          if (stats && stats.isFile() && stats.size) {
            _options.headers          = options.headers ? {...options.headers} : {}
            _options.headers['range'] = `bytes=${stats.size}-`
          }
        }
      }
    }
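    // The promise chain below: issue the request, let the crawler middleware
    // inspect the response and enqueue any discovered links, then stream or
    // buffer the body to its output target (file, stdout, or byte range).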
    return request(_options, argv_vals["--post-data"], _config)
    .then(data => {
      // reset options that should only apply to the first request in a batch of downloads (ex: "--input-file" or "--recursive")
      if (!is_range && argv_vals["--post-data"]) {
        argv_vals["--post-data"] = null
        options.method           = null
      }
      return data
    })
    .then(({url, redirects, response} = {}) => {
      return web_crawler.apply_crawler_middleware(argv_vals, urls, {url, redirects, response, urldata})
    })
    .then(({url, redirects, response, new_output_filepath} = {}) => {
      if (new_output_filepath) {
        output_filepath = new_output_filepath
      }

      let abort = false

      if (url && argv_vals["--server-response"]) {
        console.log('url:', url)

        if (response.statusCode)
          console.log('status code:', response.statusCode)

        if (Array.isArray(redirects) && redirects.length)
          console.log('redirects:', JSON.stringify(redirects, null, 2))

        if (response.headers)
          console.log('headers:', JSON.stringify(response.headers, null, 2))
      }

      if (argv_vals["--dry-run"]) {
        abort = true
      }

      if (!abort && !output_filepath) {
        output_filepath = fp_helper.get_output_filepath(argv_vals, urldata, {url, redirects, response})

        if (!output_filepath) {
          abort = true
        }
      }

      if (!abort && !is_range && (output_filepath !== "-")) {
        if (fs.existsSync(output_filepath)) {
          if (argv_vals["--no-clobber"]) {
            abort = true
          }

          // guard: "_options.headers" is only defined when a 'Range' header was added above
          if (argv_vals["--continue"] && _options.headers && _options.headers['range'] && (response.statusCode !== 206)) {
            console.log(`url: ${url}\nfile: ${output_filepath}\nerror:\n  File is incomplete.\n  Request to --continue does not return partial content.\n`)
            abort = true
          }
        }
      }

      if (abort) {
        if (response instanceof stream.Readable) {
          response.destroy()
        }
        return
      }

      if (!is_range && (output_filepath !== "-")) {
        fp_helper.make_parent_directory(output_filepath)
      }

      return new Promise((resolve, reject) => {
        if (!is_range) {
          const output_stream = (output_filepath !== "-")
            ? fs.createWriteStream(output_filepath, {flags: (argv_vals["--continue"] ? 'a' : 'w')})
            : process.stdout

          if (response.headers && argv_vals["--save-headers"]) {
            output_stream.write(`${JSON.stringify(response.headers, null, 2)}\n\n`)
          }

          if (response instanceof stream.Readable) {
            response
              .pipe(output_stream)
              .on('finish', () => {
                resolve()
              })
              .on('error', (error) => {
                response.destroy()
                reject(error)
              })
          }
          else {
            // "response" is a utf8-encoded String
            output_stream.write(response.toString(), 'utf8', () => {
              // wait until data written to the output stream has been flushed to the file system
              output_stream.end()
              resolve()
            })
          }
        }
        else {
          // "response" is a Buffer
          concurrency.write_range_data(argv_vals, response, urldata, resolve, reject)
        }
      })
    })
    .then(() => {
      return (argv_vals["--wait"] && urls.length && !is_concurrent)
        ? new Promise((resolve, reject) => {
            let delay_ms = argv_vals["--wait"] * 1000

            if (argv_vals["--random-wait"]) {
              delay_ms = Math.floor(delay_ms * get_random_float_within_range(0.5, 1.5))
            }

            setTimeout(resolve, delay_ms)
          })
        : true
    })
    .catch((error) => {
      if (!is_range) {
        console.log(`url: ${_url}\n${error.statusCode ? `status code: ${error.statusCode}\n` : ''}${error.location ? `location: ${error.location}\n` : ''}error: ${error.message}\n`)
      }
      else {
        range_error_counter++

        if (range_error_counter < max_range_errors) {
          urls.unshift(urldata)
        }
        else {
          const error_msg = `ERROR: Too many failed attempts to download partial ranges.\n${max_range_errors} is the maximum allowed.\nAborting threaded download.`
          concurrency.abort_operation(argv_vals, error_msg, output_filepath)
        }
      }
    })
  }

  const get_random_float_within_range = (min, max) => Math.random() * (max - min) + min

  // -----------------------------------------------------------------------------
  // download URLs sequentially

  const process_download_queue_sequentially = async function(){
    while(urls.length){
      await download_next_url(false)
    }
  }

  // -----------------------------------------------------------------------------
  // download URLs concurrently

  let active_download_count = 0

  const process_download_queue_concurrently = function(cb){
    if (active_download_count >= argv_vals["--max-concurrency"]) return
    if (!urls.length && !active_download_count)                  return cb()
    if (!urls.length)                                            return

    active_download_count++

    download_next_url(true)
    .then(() => {
      active_download_count--
      process_download_queue_concurrently(cb)
    })

    if (active_download_count < argv_vals["--max-concurrency"])
      process_download_queue_concurrently(cb)
  }

  // -----------------------------------------------------------------------------
  // download URLs

  if (argv_vals["--max-concurrency"] && (argv_vals["--max-concurrency"] >= 2)) {
    return Promise.resolve()
    .then(() => {
      if (urls.length === 1) {
        return concurrency.get_url_ranges(argv_vals, urls[0], options, config)
      }
    })
    .then(({new_urls, file_descriptor} = {}) => {
      if (Array.isArray(new_urls) && new_urls.length) {
        urls = new_urls
      }

      return new Promise((resolve, reject) => {
        const cb = () => {
          if (file_descriptor) {
            fs.closeSync(file_descriptor)
          }
          resolve()
        }

        process_download_queue_concurrently(cb)
      })
    })
  }
  else {
    return process_download_queue_sequentially()
  }
}

// -----------------------------------------------------------------------------

module.exports = {request, download: process_cli}
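For reference, a minimal sketch of calling the underlying "@warren-bank/node-request" client directly, following the same request(options, post_data, config) calling convention seen in download_next_url above. The {url, redirects, response} resolution shape and the config keys are taken from this file; the URL and the logging are illustrative:

const {request} = require('@warren-bank/node-request')
const parse_url = require('url').parse

request(
  parse_url('https://example.com/'),  // request options: a parsed URL, optionally extended with headers/method
  null,                               // post data (null for a GET request)
  {followRedirect: true, maxRedirects: 10, binary: true, stream: false}
)
.then(({url, redirects, response}) => {
  console.log('final url:', url)  // the URL after any redirects
  console.log('status:', response.statusCode)
  // with {binary: true, stream: false}, "response" is a Buffer
})
.catch(error => console.error(error.statusCode || '', error.message))

Setting stream: true instead (as process_cli does by default) resolves with a readable stream, which lets the response body be piped to disk without buffering it in memory.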