UNPKG

jul11co-wdt

Version:

Jul11Co Web Download Tools

821 lines (704 loc) 24.5 kB
var util = require('util');
var fs = require('fs');
var path = require('path');
var urlutil = require('url');
var zlib = require('zlib');

var request = require('request');
var cheerio = require('cheerio');
var fse = require('fs-extra');
var async = require('async');
var mkdirp = require('mkdirp');

var mimetypes = require('./mimetypes');
var utils = require('./utils');

/**
 * Issue a GET request, buffer the whole response body in memory and hand it
 * back as a string, decompressing it first when the response declares a
 * `gzip` or `deflate` content encoding.
 *
 * @param {Object} options - Options passed straight to `request.get()`. When
 *   `options.progress` is a function it is called as
 *   `progress(bytes_received, content_length)` for every received chunk
 *   (`content_length` is 0 when the header is absent or unparsable).
 * @param {Function} callback - Called exactly once as `callback(err, res, body)`.
 *   On a request error only `err` is supplied.
 */
function requestWithEncoding(options, callback) {
  var cb_called = false;
  // Several event paths below can fire; make sure the caller hears only once.
  var finish = function(err, res, body) {
    if (cb_called) return;
    cb_called = true;
    callback(err, res, body);
  };

  try {
    var req = request.get(options);
    var content_length = 0;
    var content_downloaded = 0;

    req.on('response', function(res) {
      var chunks = [];

      if (res.headers['content-length']) {
        content_length = parseInt(res.headers['content-length'], 10) || 0;
      }

      res.on('data', function(chunk) {
        if (!chunk) return;
        chunks.push(chunk);
        content_downloaded += chunk.length;
        if (typeof options.progress == 'function') {
          options.progress(content_downloaded, content_length);
        }
      });

      // Without a listener an 'error' event on the response would be thrown.
      res.on('error', function(err) {
        finish(err);
      });

      res.on('end', function() {
        var buffer = Buffer.concat(chunks);
        var encoding = String(res.headers['content-encoding'] || '').toLowerCase();
        var deliverDecoded = function(err, decoded) {
          finish(err, res, decoded && decoded.toString());
        };

        if (encoding == 'gzip') {
          zlib.gunzip(buffer, deliverDecoded);
        } else if (encoding == 'deflate') {
          zlib.inflate(buffer, deliverDecoded);
        } else {
          finish(null, res, buffer.toString());
        }
      });
    });

    req.on('error', function(err) {
      finish(err);
    });
  } catch (e) {
    finish(e);
  }
}

/**
 * Request a URL with `json: true` and return the parsed body.
 *
 * @param {String|Object} options - A URL, or an options object for `request`.
 *   When an object is given and `options.json` is falsy, it is set to `true`
 *   on that same object.
 * @param {Function} callback - Called exactly once as
 *   `callback(err, json, res)`. If `request` throws synchronously only `err`
 *   is supplied.
 */
function requestJSON(options, callback) {
  if (typeof options == 'string') {
    options = { url: options };
  }

  var cb_called = false;
  // Forward all three values; the response used to be dropped here.
  var finish = function(err, json, res) {
    if (cb_called) return;
    cb_called = true;
    callback(err, json, res);
  };

  if (!options.json) {
    options.json = true;
  }

  try {
    request(options, function(err, res, json) {
      finish(err || null, json, res);
    });
  } catch (e) {
    finish(e);
  }
}
exports.downloadJSON = requestJSON; exports.downloadJson = requestJSON; function requestToFile(options, local_file, callback) { var request_url = ''; if (typeof options == 'string') { request_url = options; options = {}; options.url = request_url; } var cb_called = false; var finish = function(err, result) { if (!cb_called) { cb_called = true; callback(err, result); } } try { var req = request(options); var result = { headers: {}, content_type: '', content_length: 0, content_downloaded: 0, file: local_file }; var outFileStream = fs.createWriteStream(local_file); req.on('response', function (res) { result.headers = res.headers; if (res.headers['content-type']) { result.content_type = res.headers['content-type']; } if (res.headers['content-length']) { result.content_length = parseInt(res.headers['content-length']); } result.statusCode = res.statusCode; if (res.statusCode !== 200) { var error = new Error('Response status code: ' + res.statusCode); error.httpStatusCode = res.statusCode; return finish(error); } var encoding = res.headers['content-encoding']; if (encoding == 'gzip') { res.pipe(zlib.createGunzip()).pipe(outFileStream); } else if (encoding == 'deflate') { res.pipe(zlib.createInflate()).pipe(outFileStream); } else { res.pipe(outFileStream); } }); req.on('data', function(chunk) { if (chunk) { result.content_downloaded += chunk.length; if (typeof options.progress == 'function') { options.progress(result.content_downloaded, result.content_length); } } }); req.on('error', function(err) { finish(err); }); req.on('end', function() { if (result.headers['last-modified']) { try { var fd = fs.openSync(local_file, 'r'); if (fd >= 0) { fs.futimesSync(fd, new Date(), new Date(result.headers['last-modified'])); fs.closeSync(fd); } } catch (ex) { // console.log('Warning:', ex.message); } } finish(null, result); }); } catch(e) { return finish(e); } } // http://www.phaster.com/golden_hill_free_web/ghfw_connection_speed.shtml function computeDownloadSpeed(start_time, 
end_time, file_size) { // This function returns the speed in kB/s of the user's connection. speed = (Math.floor((((file_size) / ((end_time - start_time) / 1000)) / 1024) * 10) / 10); return speed; } // http://stackoverflow.com/questions/2901102/how-to-print-a-number-with // -commas-as-thousands-separators-in-javascript // function numberWithCommas(x) { // return x.toString().replace(/\B(?=(\d{3})+(?!\d))/g, ","); // } // options: // { // skip_if_exist: Boolean, // request_headers: Object, // key: value // request_timeout: Integer, // milliseconds // max_attempts: Integer, // backoff_delay: Integer, // no_rename: Boolean, // return_headers: Boolean, // // Callbacks // onProgress: function({url: String, file: String, timestamp: Date, // speed: Float, percentage: Float, current: Integer, total: Integer}), // onDownloadStart: function({url: String, local_file: String}), // onDownloadTimeout: function({url: String, local_file: String, attempts: Integer, max_attempts: Integer}), // onDownloadFailed: function(err, {url: String, local_file: String}), // onRename: function({old_file: String, new_file: String}), // onDownloadFinished: function({file: String, file_size: Integer, content_type: String, headers: Object}) // } exports.downloadFile = function(url, local_file, options, attempts, callback) { if (typeof options == 'function') { callback = options; options = {}; attempts = 0; } if (typeof attempts == 'function') { callback = attempts; attempts = 0; } if (options.skip_if_exist && utils.fileExists(local_file)) { if (options.debug) console.log('File exists: ' + utils.ellipsisMiddle(local_file)); return callback(null, { file: local_file }); } var output_dir = path.dirname(local_file); utils.ensureDirectoryExists(output_dir); var output_file_tmp = local_file + '.part'; var default_headers = { 'User-Agent': options.user_agent || 'request' }; var default_timeout = 20000; /* 20 seconds */ var request_options = { url: url, headers: options.download_headers || 
options.request_headers || default_headers, timeout: options.download_timeout || options.request_timeout || default_timeout }; if (typeof options.progress == 'function') { var start_time = (new Date()).getTime(); request_options.progress = function (current, total) { var current_time = (new Date()).getTime(); var current_speed = computeDownloadSpeed(start_time, current_time, current); var percentage = ((current/total)*100).toFixed(); options.progress({ url: url, file: local_file, timestamp: current_time, speed: current_speed, percentage: percentage, current: current, total: total }); }; } if (utils.fileExists(output_file_tmp)) { utils.removeFileSync(output_file_tmp); } if (typeof options.onDownloadStart == 'function') { options.onDownloadStart(err, {url: url, local_file: local_file}); } requestToFile(request_options, output_file_tmp, function (err, res) { if (err) { // if (options.debug) console.log(err); if (err.code == 'ECONNRESET') { if (utils.fileExists(output_file_tmp)) { utils.removeFileSync(output_file_tmp); } } if (err.code == 'ESOCKETTIMEDOUT' || err.code == 'ETIMEDOUT' || err.code == 'ECONNRESET' || err.code == 'ECONNREFUSED') { attempts++; var max_attempts = options.max_attempts || 5; var backoff_delay = (options.backoff_delay || 5000) * (attempts); // 5 seconds if (typeof options.onDownloadTimeout == 'function') { options.onDownloadTimeout(err, { url: url, local_file: local_file, attempts: attempts, max_attempts: max_attempts }); } if (attempts < max_attempts) { setTimeout(function() { if (typeof options.onDownloadRetry == 'function') { options.onDownloadRetry({ url: url, local_file: local_file, attempts: attempts, max_attempts: max_attempts }); } exports.downloadFile(url, local_file, options, attempts, callback); }, backoff_delay); return; } } if (typeof options.onDownloadFailed == 'function') { options.onDownloadFailed(err, {url: url, local_file: local_file}); } if (err.code == 404) { var error = new Error('File not found'); error.httpStatusCode = 
404; return callback(error); } else if (err.code) { var error = new Error('Download error'); error.httpStatusCode = err.code; return callback(error); } return callback(err); } if (!res.file) { if (typeof options.onDownloadFailed == 'function') { options.onDownloadFailed(err, {url: url, local_file: local_file}); } return callback(new Error('File download failed. Unknown error.')); } var result_file_size = utils.getFileSize(res.file); if (res.content_length && result_file_size && res.content_length != result_file_size) { var retry_downloading = options.retry_incomplete_download; if (typeof options.onDownloadIncomplete == 'function') { retry_downloading = options.onDownloadIncomplete(err, { url: url, local_file: local_file, file_size: res.content_length, incomplete_size: result_file_size }); } if (retry_downloading) { // retry downloading attempts++; var max_attempts = options.max_attempts || 5; var backoff_delay = (options.backoff_delay || 2000) * (attempts); if (attempts < max_attempts) { if (options.debug) console.log('Retry download file (attempt ' + attempts + '/' + max_attempts +'):', res.file); if (utils.fileExists(res.file)) { utils.removeFileSync(res.file); } setTimeout(function() { if (typeof options.onDownloadRetry == 'function') { options.onDownloadRetry({ url: url, local_file: local_file, attempts: attempts, max_attempts: max_attempts }); } exports.downloadFile(url, local_file, options, attempts, callback); }, backoff_delay); return; } } } if (utils.fileExists(res.file)) { fse.moveSync(res.file, local_file, {overwrite: true}); } var result_file = local_file; var result_content_type = res.content_type; if (!options.no_rename && typeof result_content_type != 'undefined' && utils.fileExists(result_file)) { var semicolon = result_content_type.indexOf(';'); if (semicolon > 0) { result_content_type = result_content_type.substring(0, semicolon); } var extensions = mimetypes.extensions(result_content_type); var extname = path.extname(result_file).toLowerCase(); 
if (extensions && extensions.length > 0 && extensions.indexOf(extname.replace('.','')) == -1) { var dirname = path.dirname(result_file); var new_file = path.join(dirname, path.basename(result_file, extname) + '.' + extensions[0]); if (typeof options.onRename == 'function') { options.onRename({ old_file: result_file, new_file: new_file }); } fs.renameSync(result_file, new_file); result_file = new_file; } } var result = { file: result_file, file_size: result_file_size, content_type: result_content_type } if (options.return_headers) { result.headers = res.headers; } if (typeof options.onDownloadFinished == 'function') { options.onDownloadFinished(result) } callback(null, result); }); } exports.downloadFiles = function(files, options, callback) { if (typeof options == 'function') { callback = options; options = {}; } if (typeof options.output_dir != 'undefined') { utils.ensureDirectoryExists(options.output_dir); } var getLocalFilePath = function(file_url) { var file_url_obj = urlutil.parse(file_url); var file_path = file_url_obj.host + file_url_obj.pathname; return (options.output_dir || '.') + '/' + file_path; } var cb_called = false; var finish = function(err, files) { if (!cb_called) { cb_called = true; callback(err, files); } } var max_download_threads = options.max_download_threads || 4; // limit number of concurrent downloads at a time async.eachLimit(files, max_download_threads, function(file_info, cb) { var file_url = ''; var local_file = ''; if (typeof file_info == 'string') { file_url = file_info; local_file = getLocalFilePath(file_url); } else if (typeof file_info == 'object') { file_url = file_info.url; if (!file_info.local_file) { local_file = getLocalFilePath(file_url); } else { local_file = file_info.local_file; } } if (options.skip_if_exist && utils.fileExists(local_file)) { if (options.debug) console.log('File exists: ' + utils.ellipsisMiddle(local_file)); return cb(); } exports.downloadFile(file_url, local_file, options, function(err, result) { if 
(err) { if (typeof file_info == 'object') { file_info.error = true; if (typeof err.code != 'undefined') { file_info.error_code = err.code; } } return cb(/*err*/); } cb(); }); }, function(err) { finish(err, files); }); } exports.downloadHtml = function(url, options, attempts, callback) { if (typeof options == 'function') { callback = options; options = {}; attempts = 0; } if (typeof attempts == 'function') { callback = attempts; attempts = 0; } var request_url = url; if (options.html_proxy && options.html_proxy != '') { request_url = options.html_proxy + '?url=' + encodeURIComponent(request_url); } var default_headers = { 'User-Agent': options.user_agent || 'request' }; var default_timeout = 20000; /* 20 seconds */ var request_options = { url: request_url, headers: options.download_headers || options.request_headers || default_headers, timeout: options.download_timeout || options.request_timeout || default_timeout }; requestWithEncoding(request_options, function(error, response, html) { if (error) { attempts++; if (error.code == "ESOCKETTIMEDOUT" || error.code == "ETIMEDOUT" || error.code == "ECONNRESET" || error.code == "ECONNREFUSED") { var max_attempts = options.max_attempts || 5; var backoff_delay = (options.backoff_delay || 5000) * attempts; // 5 seconds if (typeof options.onDownloadTimeout == 'function') { options.onDownloadTimeout(err, { url: url, attempts: attempts, max_attempts: max_attempts }); } if (attempts < max_attempts) { setTimeout(function() { if (typeof options.onDownloadRetry == 'function') { options.onDownloadRetry({ url: url, attempts: attempts, max_attempts: max_attempts }); } exports.downloadHtml(url, options, attempts, callback); }, backoff_delay); return; } } if (error.code) { error.httpStatusCode = error.code; } return callback(error); } if (response.statusCode != 200) { var error = new Error('Request failed with status code ' + response.statusCode); error.httpStatusCode = response.statusCode; return callback(error); } var content_type = 
response.headers['content-type']; if (content_type && content_type.indexOf('html') == -1) { var error = new Error('Not HTML page (' + content_type + ')'); error.httpStatusCode = response.statusCode; error.httpHeaders = response.headers; return callback(error); } return callback(null, { requested_url: url, resolved_url: response.request.href, html: html }, response); }); } exports.downloadHtml2 = function(url, options, callback) { return exports.downloadHtml(url, options, function(err, result, response) { if (err) return callback(err); if (!result || !result.html) return callback(null, null, response); callback(null, result.html, response); }); } exports.downloadPage = function(url, options, attempts, callback) { if (typeof options == 'function') { callback = options; options = {}; attempts = 0; } if (typeof attempts == 'function') { callback = attempts; attempts = 0; } var request_url = url; if (options.html_proxy && options.html_proxy != '') { request_url = options.html_proxy + '?url=' + encodeURIComponent(request_url); } var default_headers = { 'User-Agent': options.user_agent || 'request' }; var default_timeout = 20000; /* 20 seconds */ var request_options = { url: request_url, headers: options.download_headers || options.request_headers || default_headers, timeout: options.download_timeout || options.request_timeout || default_timeout }; requestWithEncoding(request_options, function(error, response, html) { if (error) { if (error.code == "ESOCKETTIMEDOUT" || error.code == "ETIMEDOUT" || error.code == "ECONNRESET" || error.code == "ECONNREFUSED") { attempts++; var max_attempts = options.max_attempts || 5; var backoff_delay = (options.backoff_delay || 5000) * attempts; // 5 seconds if (typeof options.onDownloadTimeout == 'function') { options.onDownloadTimeout(err, { url: url, attempts: attempts, max_attempts: max_attempts }); } if (attempts < max_attempts) { setTimeout(function() { if (typeof options.onDownloadRetry == 'function') { options.onDownloadRetry({ 
url: url, attempts: attempts, max_attempts: max_attempts }); } exports.downloadPage(url, options, attempts, callback); }, backoff_delay); return; } } if (error.code) { error.httpStatusCode = error.code; } return callback(error); } if (response.statusCode != 200) { var error = new Error('Request failed with status code ' + response.statusCode); error.httpStatusCode = response.statusCode; return callback(error); } var content_type = response.headers['content-type']; if (content_type && content_type.indexOf('html') == -1) { var error = new Error('Requested data is not HTML'); error.httpStatusCode = response.statusCode; error.httpContentType = content_type; return callback(error); } var page_url = response.request.href; if (options.html_proxy && options.html_proxy != '') { page_url = url; } var $ = cheerio.load(html); var page_base_url = null; if ($('head base').length) { page_base_url = $('head base').attr('href'); } // Fix links var page_host_url = utils.urlGetHost(page_url); var page_host_url_obj = urlutil.parse(page_host_url); var page_url_obj = urlutil.parse(page_base_url || page_url); $('body a').each(function(){ var link_href = $(this).attr('href'); if (!utils.isValidLink(link_href)) return; var link_url = link_href; link_url = link_url.replace('http:///', '/'); if (link_url.indexOf('//') == 0) { link_url = page_host_url_obj.protocol + link_url; } var link_url_obj = urlutil.parse(link_url); if (!link_url_obj.host) { if (link_url.indexOf('/') == 0) { link_url = urlutil.resolve(page_host_url_obj, link_url_obj); } else { link_url = urlutil.resolve(page_url_obj, link_url_obj); } } else { link_url = urlutil.format(link_url_obj); } $(this).attr('href', link_url); }); callback(null, { url: page_url, base_url: page_base_url, $: $, html: html }); }); } exports.downloadImage = function(image_src, options, callback) { var image_file = ''; if (typeof options.image_file != 'undefined') { image_file = options.image_file; } else { var image_src_obj = urlutil.parse(image_src); 
image_file = path.basename(image_src_obj.pathname); } if (typeof options.output_dir != 'undefined') { image_file = path.join(options.output_dir, image_file); } if (options.skip_if_exist && utils.fileExists(image_file)) { if (options.debug) console.log('File exists: ' + utils.ellipsisMiddle(image_file)); return callback(null, { file: image_file }); } exports.downloadFile(image_src, image_file, options, callback); } exports.downloadImages = function(images, options, callback) { if (typeof options == 'function') { callback = options; options = {}; } options = options || {}; callback = callback || function(err) {}; if (typeof options.output_dir != 'undefined') { utils.ensureDirectoryExists(options.output_dir); } var max_download_threads = options.max_download_threads || 4; // limit number of concurrent downloads at a time async.eachLimit(images, max_download_threads, function(image_info, cb) { var image_src = ''; var image_file = ''; if (typeof image_info == 'string') { image_src = image_info; var image_src_obj = urlutil.parse(image_src); image_file = path.basename(image_src_obj.pathname); } else if (typeof image_info == 'object') { image_src = image_info.image_src || image_info.src; image_file = image_info.image_file || image_info.file || path.basename(image_src); } var download_options = { image_file: image_file, output_dir: options.output_dir, skip_if_exist: options.skip_if_exist, return_headers: options.return_headers, no_rename: options.no_rename, request_headers: options.request_headers, request_timeout: options.request_timeout, verbose: options.verbose, debug: options.debug, retry_incomplete_download: options.retry_incomplete_download, // callbacks onProgress: options.progress || options.onProgress, onDownloadStart: options.onDownloadStart, onDownloadTimeout: options.onDownloadTimeout, onDownloadFailed: options.onDownloadFailed, onRename: options.onRename, onDownloadFinished: options.onDownloadFinished }; exports.downloadImage(image_src, download_options, 
function(err, result) { if (err) { if (typeof image_info == 'object') { image_info.error = true; if (typeof err.code != 'undefined') { image_info.error_code = err.code; } } return cb(/*err*/); } else { if (image_info.image_file) { image_info.image_file = path.basename(result.file); } else { image_info.file = path.basename(result.file); } } cb(); }); }, function(err) { callback(err, images); }); }