UNPKG

jul11co-wdt

Version:

Jul11Co Web Download Tools

970 lines (811 loc) 26.2 kB
var util = require('util');
var fs = require('fs');
var path = require('path');
var urlutil = require('url');

var async = require('async');
var archiver = require('archiver');
var mkdirp = require('mkdirp');
var jsonfile = require('jsonfile');
var cheerio = require('cheerio');

var downloader = require('./downloader');
var utils = require('./utils');

var EventEmitter = require('events').EventEmitter;

// Handlers shared by every Saver instance (each instance copies this list).
var page_handlers = [];
// if (utils.directoryExists(__dirname + '/handlers')) {
//   fs.readdirSync(__dirname + '/handlers').forEach(function(file) {
//     if (file.indexOf('.js') > 0) {
//       var handler = require("./handlers/" + file);
//       page_handlers.push(handler);
//     }
//   });
// }

// Saver: event-emitting page processor with a JSON-backed state object.
//
// options
// {
//   output_dir: String,          // state file defaults to <output_dir>/<state_file_name>
//   config_file: String,         // optional JSON config, loaded synchronously
//   state_file: String,          // explicit state file (takes precedence over output_dir)
//   state_file_name: String,     // default: 'saver.json'
//   save_state_on_exit: Boolean, // default: true (registers a process 'exit' listener)
//   debug: Boolean
// }
var Saver = function(options) {
  EventEmitter.call(this);

  options = options || {};

  this._output_dir = options.output_dir;
  this._config_file = options.config_file;
  this._state_file = options.state_file;
  this._state_file_name = options.state_file_name || 'saver.json';
  this._state = {};
  this._config = {};
  this._logs = [];
  this._page_handlers = page_handlers.slice(0);
  this._debug = options.debug;
  this._exited = false;

  if (typeof this._state_file != 'undefined') {
    this._state = (this.loadStateSync(this._state_file) || {});
  } else if (typeof this._output_dir != 'undefined') {
    this._state_file = path.join(this._output_dir, this._state_file_name);
    this._state = (this.loadStateSync(this._state_file) || {});
  }

  if (typeof this._config_file != 'undefined') {
    this._config = (this.loadConfigSync(this._config_file) || {});
  }

  this._save_state_on_exit = true;
  if (typeof options.save_state_on_exit != 'undefined') {
    this._save_state_on_exit = options.save_state_on_exit;
  }

  if (this._save_state_on_exit) {
    if (this._debug) {
      console.log('[Saver] State will be saved on exit to ' + this._state_file);
    }
    this._exit_handler = exitHandler.bind(this);
    process.on('exit', this._exit_handler);
  }
}

util.inherits(Saver, EventEmitter);

// Persists the state exactly once. Invoked with `this` bound to a Saver, either
// from the process 'exit' event or from Saver.prototype.exit().
function exitHandler(error) {
  if (this._exited) return;
  this._exited = true;

  // The state is written to _state_file, so that is what must be defined
  // (a Saver created with only `state_file` has no _output_dir).
  if (typeof this._state_file == 'undefined') return;

  var save_err = this.saveStateSync(this._state_file);
  if (this._debug) {
    if (save_err) {
      console.log('[Saver] State save error: ' + save_err.message);
    } else {
      console.log('[Saver] State saved to: ' + this._state_file);
    }
  }
}

// Emits 'before_exit', saves the state (once), then emits 'exit' with `err`.
Saver.prototype.exit = function(err) {
  this.emit('before_exit');
  if (this._save_state_on_exit) {
    process.removeListener('exit', this._exit_handler);
  }
  exitHandler.call(this, err);
  this.emit('exit', err);
}

// Prints a log entry and emits it as a 'log' event.
Saver.prototype.log = function(log) {
  console.log('[Saver] [LOG]', log);
  // this._logs.push(log);
  this.emit('log', log);
}

// Prints an error and emits it as an 'error' event.
Saver.prototype.error = function(error) {
  console.log('[Saver] [ERROR]', error);
  // this._logs.push('ERROR', error);
  // EventEmitter throws when 'error' is emitted without a listener; reporting
  // an error must not crash the process, so only emit when someone listens.
  if (this.listeners('error').length > 0) {
    this.emit('error', error);
  }
}

// =====
// config
// =====

// Reads a JSON config file. Returns the parsed object, or null when the file
// is missing or cannot be read/parsed.
Saver.prototype.loadConfigSync = function(config_file) {
  var config = null;
  if (utils.fileExists(config_file)) {
    try {
      config = jsonfile.readFileSync(config_file);
    } catch (e) {
      if (this._debug) console.log('[Saver] Load config error:', e.message);
    }
  }
  return config;
}

// =====
// state
// =====

Saver.prototype.getOutputDir = function() {
  return this._output_dir;
}

// Changes the output directory and re-points the state file into it.
Saver.prototype.setOutputDir = function(output_dir) {
  this._output_dir = output_dir;
  this._state_file = path.join(this._output_dir, this._state_file_name);
}

// Reads the state file. When it holds a non-null value, that value replaces the
// in-memory state. Returns the loaded state, or null when nothing was loaded.
Saver.prototype.loadStateSync = function(state_file) {
  if (this._debug) console.log('[Saver] Load state file:', state_file);
  var state = null;
  try {
    var stats = fs.statSync(state_file);
    if (stats.isFile()) {
      state = jsonfile.readFileSync(state_file);
    }
  } catch (e) {
    // A missing state file is the normal first-run case; anything else is
    // worth surfacing when debugging.
    if (this._debug && e.code != 'ENOENT') {
      console.log('[Saver] Load state error:', e.message);
    }
  }
  if (state != null) {
    this._state = state;
  }
  return state;
}

// Writes the in-memory state as indented JSON. Returns the error, or null.
Saver.prototype.saveStateSync = function(state_file) {
  var err = null;
  try {
    jsonfile.writeFileSync(state_file, this._state, { spaces: 2 });
  } catch (e) {
    err = e;
  }
  return err;
}

Saver.prototype.getState = function() {
  return this._state;
}

Saver.prototype.setStateData = function(key, value) {
  this._state[key] = value;
}

Saver.prototype.getStateData = function(key) {
  return this._state[key];
}

// True for values that can be merged property by property. `null` and Date
// both report typeof 'object' but must be assigned as whole values: null has
// no properties to write to, and a Date has no enumerable properties, so
// merging one Date into another would silently keep the old timestamp.
function isMergeableObject(value) {
  return value !== null && typeof value == 'object' && !(value instanceof Date);
}

// Recursively copies the properties of `update` into `original` (in place).
// Nested mergeable objects are merged; every other value is overwritten.
function updateObject(original, update, verbose) {
  if (!isMergeableObject(original) || !isMergeableObject(update)) return;

  for (var prop in update) {
    if (verbose) {
      console.log('Update prop "' + prop + '":',
        ' (' + typeof original[prop] + ' --> ' + typeof update[prop] + ')');
    }
    if (isMergeableObject(original[prop]) && isMergeableObject(update[prop])) {
      updateObject(original[prop], update[prop], verbose);
    } else {
      original[prop] = update[prop];
    }
  }
}

// Merges `update` into the state entry `key` when both are mergeable objects,
// otherwise replaces the entry. Optionally writes the state file afterwards.
Saver.prototype.updateStateData = function(key, update, save_to_file) {
  if (isMergeableObject(this._state[key]) && isMergeableObject(update)) {
    updateObject(this._state[key], update);
  } else {
    this._state[key] = update;
  }

  if (save_to_file && typeof this._state_file != 'undefined') {
    this.saveStateSync(this._state_file);
  }
}

// for array data only
// Appends `value` to the array stored under `key`, creating the array when the
// key is undefined. An existing non-array entry is left untouched.
Saver.prototype.pushStateData = function(key, value, save_to_file) {
  if (Object.prototype.toString.call(this._state[key]) === '[object Array]') {
    this._state[key].push(value);
  } else if (typeof this._state[key] == 'undefined') {
    this._state[key] = [value];
  }

  if (save_to_file && typeof this._state_file != 'undefined') {
    this.saveStateSync(this._state_file);
  }
}

Saver.prototype.deleteStateData = function(key, save_to_file) {
  if (typeof this._state[key] != 'undefined') {
    delete this._state[key];
  }

  if (save_to_file && typeof this._state_file != 'undefined') {
    this.saveStateSync(this._state_file);
  }
}

// options
// {
//   page_url: String,
//   output_dir: String
// }
// Processes options.page_url (when given) and then calls exit().
// NOTE(review): `callback` is accepted but never invoked; completion is only
// observable through the 'exit' event. Confirm whether callers expect a call.
Saver.prototype.start = function(options, callback) {
  var self = this;
  if (!options.page_url) {
    return self.exit();
  }
  self.processPage(options.page_url, options, function(err) {
    if (err) {
      console.log(err);
    }
    self.exit(err);
  });
}

// =====
// page processing
// =====

// handler
// {
//   name: String,
//   url_match: new RegExp(...), // DEPRECATED
//   match: function(link, options) {...},
//   dispatch: function($, page, options, callback) {...}
// }
Saver.prototype.addHandler = function(handler) {
  if (this._debug) console.log('[Saver] Add handler:', handler.name);
  this._page_handlers.push(handler);
}

// Returns a truthy value when the state entry for `link` is marked visited.
Saver.prototype.isVisited = function(link) {
  var state = this.getStateData(link);
  return (state && state.visited);
}

Saver.prototype.setVisited = function(link) {
  this.updateStateData(link, {
    visited: 1,
    last_visited: new Date()
  })
}

// link
// {
//   url: String,
//   cache_bypass: Boolean
// }
// Yields a page ({url, html, $, html_cached}) from the local HTML cache when a
// cached copy exists and no cache bypass is requested; otherwise downloads it.
Saver.prototype.getPage = function(link, options, callback) {
  var self = this;

  var local_file = getIndexHTMLFilePath({url: link.url}, options);
  var use_cache = !options.cache_bypass && !link.cache_bypass;

  if (use_cache && utils.fileExists(local_file)) {
    if (options.debug) console.log('[Saver] Cached: ' + local_file);

    // '<name>.html/index.html' --> '<name>.html' (drops the '/index.html' suffix)
    if (link.url.substring(link.url.length-16) == '.html/index.html') {
      link.url = link.url.substring(0, link.url.length-11);
      if (options.verbose || options.debug) console.log('[Saver] URL: ' + link.url);
    }

    var page_html = self.loadHtmlSync(local_file);
    if (page_html) {
      var page = { url: link.url };
      page.html_cached = true;
      page.html = page_html;
      page.$ = cheerio.load(page_html);
      return callback(null, page);
    }
    // empty cached file: fall through and download new HTML
  }

  return self.downloadPage(link.url, options, callback);
}

// link: String or following object
// {
//   url: String,
//   cache_bypass: Boolean
// }
// Fetches the page, marks the link visited, collects basic page info (title,
// base URL, output dir) and runs every matching handler in series.
// callback(err, page)
Saver.prototype.processPage = function(link, options, callback) {
  var self = this;

  if (self._debug || options.debug) console.log('[Saver] Process page: ', link);

  var is_link_object = (typeof link == 'object');
  var link_url = is_link_object ? link.url : link;
  var link_obj = { url: link_url };
  if (is_link_object) {
    link_obj.cache_bypass = link.cache_bypass;
  }

  // self.downloadPage(link, options, function(err, result) {
  self.getPage(link_obj, options, function(err, result) {
    // The link counts as visited whatever the outcome.
    self.setVisited(link_url);

    if (err) {
      return callback(err);
    }
    if (!result || !result.$) {
      return callback(new Error('Invalid HTML ($==null)'));
    }

    var $ = result.$;
    var page = {
      url: result.url,
      html: result.html,
      html_cached: result.html_cached
    };

    if (options.verbose || options.debug) console.log("[Saver] Visit page: " + page.url);

    if ($('head base').length) {
      page.base_url = $('head base').attr('href');
      if (options.debug) console.log('[Saver] Base URL:', page.base_url);
    }

    page.title = $('title').first().text();
    if (page.title) {
      page.title = page.title.replace(/(\r\n|\n|\r)/gm, '');
    }
    if (options.verbose || options.debug) console.log('[Saver] Page title:', page.title);

    var page_url_obj = urlutil.parse(page.url);
    // pathname can be null for URLs without a path; path.basename needs a string
    var output_dir_name = path.basename(page_url_obj.pathname || '');
    var output_dir = path.join((options.output_dir || '.'), output_dir_name);
    if (options.verbose || options.debug) console.log('[Saver] Output dir: ' + output_dir);

    page.output_dir = output_dir;

    var handlers = self._page_handlers.filter(function(handler) {
      return handler.match(page.url, options);
    });
    /*else if (page.url.match(self._page_handlers[i].url_match)) {
      handlers.push(self._page_handlers[i]);
    }*/

    if (handlers.length == 0 && (self._debug || options.debug)) {
      console.log('[Saver] No handler:', page.url);
    }

    async.eachSeries(handlers, function(handler, cb) {
      if (self._debug || options.debug) console.log('[Saver] Handler:', handler.name);
      handler.dispatch(self, $, page, options, function(err) {
        if (err) return cb(err);
        cb();
      });
    }, function(err) {
      if (err) {
        return callback(err);
      }
      callback(null, page);
    });
  });
}
// Processes `links` one after another, skipping links whose state entry is
// already done (unless options.force) or already visited. Waits
// options.page_delay ms (default 1000) after every page that was not served
// from the local cache.
// NOTE(review): when options.refresh is set nothing is queued, so no link is
// processed at all - confirm that this is the intended meaning of `refresh`.
Saver.prototype.processPages = function(links, options, callback) {
  var self = this;

  if (!links || links.length == 0) {
    return callback();
  }

  var process_queue = [];
  for (var i = 0; i < links.length; i++) {
    var link = links[i];
    if (!options.refresh) {
      var saved_data = self.getStateData(link);
      if (options.force || !saved_data || !saved_data.done) {
        process_queue.push(link);
      }
    }
  }

  if (process_queue.length == 0) {
    return callback();
  }

  var current = 0;
  var total = process_queue.length;

  async.eachSeries(process_queue, function(link, cb) {
    current++;
    var progress = "[Saver] [" + current + "/" + total + "] ";

    if (self.isVisited(link)) {
      if (options.verbose || options.debug) console.log(progress + "Visited: " + link);
      return cb();
    }

    if (options.verbose || options.debug) console.log(progress + "Visit page: " + link);

    self.processPage(link, options, function(err, page) {
      if (err) {
        if (self._debug || options.debug) {
          console.log('[Saver] Process page error: ' + link + ' ' + err.message);
        }
        return cb(err);
      }

      // Cached pages caused no network request, so no delay is needed.
      if (page && page.html_cached) {
        return cb();
      }

      var page_delay = options.page_delay ? parseInt(options.page_delay, 10) : 1000;
      if (isNaN(page_delay)) page_delay = 1000;
      return setTimeout(cb, page_delay); // delay
    });
  }, function(err) {
    if (err) {
      if (self._debug || options.debug) console.log('[Saver] Process pages error!');
      return callback(err);
    }
    callback();
  });
}

// =====
// download
// =====

// Thin wrappers that forward to the ./downloader module.

Saver.prototype.downloadHtml = function(url, options, callback) {
  downloader.downloadHtml(url, options, callback);
}

Saver.prototype.downloadPage = function(url, options, callback) {
  downloader.downloadPage(url, options, callback);
}

Saver.prototype.downloadFile = function(url, local_file, options, callback) {
  downloader.downloadFile(url, local_file, options, callback);
}

Saver.prototype.downloadImage = function(image_src, options, callback) {
  downloader.downloadImage(image_src, options, callback);
}

Saver.prototype.downloadImages = function(images, options, callback) {
  downloader.downloadImages(images, options, callback);
}

// Clones a single value: primitives, functions and RegExps are returned as-is,
// Dates are copied, arrays and other objects are cloned recursively.
function cloneValue(value) {
  if (!value || typeof value != 'object') return value;
  if (value instanceof Date) return new Date(value.getTime());
  if (value instanceof RegExp) return value;
  return cloneObject(value);
}

// Deep-clones `obj`. Arrays stay arrays (so `.length` and array methods keep
// working on cloned options such as blacklist/filters); any other object is
// copied into a plain object by enumerating its properties.
function cloneObject(obj) {
  if (Array.isArray(obj)) {
    return obj.map(cloneValue);
  }
  var clone = {};
  for (var prop in obj) {
    clone[prop] = cloneValue(obj[prop]);
  }
  return clone;
}

// Downloads `images` into page.output_dir and tracks progress in the state
// entry of page.url (done: false before, done: true after a successful run).
// `options` may be omitted: saveImages(page, images, callback).
Saver.prototype.saveImages = function(page, images, options, callback) {
  var self = this;
  if (typeof options == 'function') {
    callback = options;
    options = {};
  }

  // path.relative() throws on a non-string argument, which happened whenever
  // options.output_dir was not supplied.
  var base_dir = options.output_dir || '.';

  self.updateStateData(page.url, {
    output_dir: path.relative(base_dir, page.output_dir || base_dir),
    images: images,
    done: false,
    last_update: new Date()
  });

  // work on a copy so the caller's options are not modified
  var download_options = cloneObject(options);
  download_options.output_dir = page.output_dir;
  download_options.skip_if_exist = true;

  // download images
  self.downloadImages(images, download_options, function(err, images) {
    if (err) {
      return callback(err);
    }

    self.updateStateData(page.url, {
      images: images,
      done: true,
      last_update: new Date()
    });

    callback();
  });
}

// ===
// save
// ===

// Returns the file content as a UTF-8 string, or '' when the file is missing.
Saver.prototype.loadHtmlSync = function(input_file) {
  if (!utils.fileExists(input_file)) return '';
  return fs.readFileSync(input_file, 'utf8');
}

// Writes `html` to `output_file`, creating the parent directory first.
Saver.prototype.saveHtmlSync = function(output_file, html) {
  var output_dir = path.dirname(output_file);
  utils.ensureDirectoryExists(output_dir);

  fs.writeFileSync(output_file, html, 'utf8');
}

// Maps a page URL to its local HTML file:
//   <output_dir>[/<html_file_root>]/<host>/<pathname>/index[-<query>].html
// NOTE(review): the query string is used verbatim in the file name, so a query
// containing path separators produces nested paths - confirm this is acceptable.
function getIndexHTMLFilePath(page, options) {
  var output_dir = (options.output_dir || '.');

  var page_url_obj = urlutil.parse(page.url);
  // host/pathname are null for relative URLs; path.join needs strings
  var page_output_dir_path = path.join(page_url_obj.host || '', page_url_obj.pathname || '');

  var page_output_dir = options.html_file_root
    ? path.join(output_dir, options.html_file_root, page_output_dir_path)
    : path.join(output_dir, page_output_dir_path);

  var index_file_name = page_url_obj.query
    ? 'index-' + page_url_obj.query + '.html'
    : 'index.html';

  return path.resolve(page_output_dir, index_file_name);
}

// Saves the current document ($.html()) to the page's local HTML file.
Saver.prototype.saveHtmlFile = function($, page, options) {
  var html_file = getIndexHTMLFilePath(page, options);
  this.saveHtmlSync(html_file, $.html());
}

// Writes `text` to `output_file` (default encoding 'utf8'), creating the
// parent directory first.
Saver.prototype.saveTextSync = function(output_file, text, encoding) {
  var output_dir = path.dirname(output_file);
  utils.ensureDirectoryExists(output_dir);

  fs.writeFileSync(output_file, text, encoding || 'utf8');
}

// Returns the parsed JSON file, or null when it is missing or unreadable.
Saver.prototype.loadJsonSync = function(json_file) {
  var json_obj = null;
  try {
    var stats = fs.statSync(json_file);
    if (stats.isFile()) {
      json_obj = jsonfile.readFileSync(json_file);
    }
  } catch (e) {
    // best effort: a missing or invalid file yields null
  }
  return json_obj;
}

// Writes `object` as indented JSON, creating the parent directory first.
// Returns the error (after logging it), or null on success.
Saver.prototype.saveJsonSync = function(json_file, object/*, encoding*/) {
  var output_dir = path.dirname(json_file);
  utils.ensureDirectoryExists(output_dir);

  try {
    jsonfile.writeFileSync(json_file, object, { spaces: 2 });
  } catch (e) {
    console.log(e);
    return e;
  }
  return null;
}

// files: [
//   {path: String, name: String}
// ]
// Creates a zip archive at `output_file`. Entries without a `name` are stored
// under the base name of their path; entries without a `path` are skipped.
// The callback is invoked exactly once: with the error on failure, or without
// arguments once the output file has been fully written and closed.
Saver.prototype.createZipArchive = function(output_file, files, callback) {
  var finished = false;
  function finish(err) {
    if (finished) return;
    finished = true;
    if (err) {
      console.log('File zip error:', output_file);
      return callback(err);
    }
    console.log('File zipped:', output_file);
    callback();
  }

  var archive = archiver('zip');
  var output = fs.createWriteStream(output_file);

  // The archive's 'end' event only means the zip data was produced; the file
  // is complete once the write stream has closed.
  output.on('close', function() { finish(); });
  output.on('error', finish);

  archive.on('error', function(err) {
    finish(err);
    // pipe() does not end the destination on a source error; release the file
    output.destroy();
  });

  archive.pipe(output);

  files.forEach(function(file) {
    if (!file.path || file.path == '') return;
    var entry_name = (file.name && file.name != '') ? file.name : path.basename(file.path);
    archive.file(file.path, { name: entry_name });
  });

  archive.finalize();
}

// ===
// fix
// ===

// Parses the two URLs that page links are resolved against: the page host
// (for protocol-relative and root-relative URLs) and the page base URL
// (<base href> when present, otherwise the page URL).
function getPageUrlContext(page) {
  var page_host_url = utils.urlGetHost(page.url);
  return {
    host_url_obj: urlutil.parse(page_host_url),
    base_url_obj: urlutil.parse(page.base_url || page.url)
  };
}

// Turns a URL found in a page into an absolute URL.
// Returns {url: absolute URL string, parsed: the parsed (pre-resolution) URL}.
function resolvePageUrl(raw_url, context) {
  var url = raw_url;
  if (url.indexOf('//') == 0) {
    // protocol-relative: borrow the page's protocol
    url = context.host_url_obj.protocol + url;
  }

  var parsed = urlutil.parse(url);
  if (parsed.host) {
    url = urlutil.format(parsed);
  } else if (url.indexOf('/') == 0) {
    url = urlutil.resolve(context.host_url_obj, parsed);
  } else {
    url = urlutil.resolve(context.base_url_obj, parsed);
  }

  return { url: url, parsed: parsed };
}

// True when `text` contains at least one entry of `needles`.
function containsAny(text, needles) {
  for (var i = 0; i < needles.length; i++) {
    if (text.indexOf(needles[i]) >= 0) return true;
  }
  return false;
}

// Returns the absolute form of `url` relative to `page`. Invalid links are
// returned unchanged. options.link_editor(url), when given, post-processes
// the result.
Saver.prototype.fixLink = function(url, page, options) {
  options = options || {};

  var context = getPageUrlContext(page);

  if (!utils.isValidLink(url)) {
    return url;
  }

  var link_url = resolvePageUrl(url, context).url;
  if (typeof options.link_editor == 'function') {
    link_url = options.link_editor(link_url);
  }
  return link_url;
}

// Rewrites the href of every valid `<selector> a` link to its absolute form
// (optionally post-processed by options.link_editor).
Saver.prototype.fixLinks = function($, page, selector, options) {
  options = options || {};

  var context = getPageUrlContext(page);

  $('' + selector + ' a').each(function(){
    var link_href = $(this).attr('href');
    if (!utils.isValidLink(link_href)) return;

    var link_url = resolvePageUrl(link_href, context).url;
    if (typeof options.link_editor == 'function') {
      link_url = options.link_editor(link_url);
    }

    $(this).attr('href', link_url);
  });
}

// Rewrites the src of every `<selector> img` to its absolute form.
// Images without src and data: URLs are left alone.
Saver.prototype.fixImages = function($, page, selector, options) {
  options = options || {};

  var context = getPageUrlContext(page);

  $('' + selector + ' img').each(function(){
    var img_src = $(this).attr('src');
    if (!img_src || img_src == "") return;
    if (img_src.indexOf('data:') == 0) return;

    $(this).attr('src', resolvePageUrl(img_src, context).url);
  });
}

// ===
// get
// ===

// Returns `file_name`, or `<base>(<n>)<ext>` when that name was requested
// before. `file_names` is the caller-owned registry
// ([{file_name, current_index}]) and is updated in place.
function getUniqueFileName(file_names, file_name) {
  var result_file_name = file_name;

  var file_name_ext = path.extname(file_name);
  var file_name_base = path.basename(file_name, file_name_ext);

  var collision = false;
  for (var i = 0; i < file_names.length; i++) {
    var entry = file_names[i];
    if (file_name == entry.file_name) {
      collision = true;
      entry.current_index++;
      result_file_name = file_name_base + '(' + entry.current_index + ')' + file_name_ext;
    }
  }

  if (!collision) {
    file_names.push({
      file_name: file_name,
      current_index: 0
    });
  }

  return result_file_name;
}

Saver.prototype.getUniqueFileName = getUniqueFileName;

// Returns `file_path` when no such file exists, otherwise the first
// `<dir>/<base>(<n>)<ext>` (n = 1, 2, ...) that does not exist yet.
function getUniqueFilePath(file_path) {
  var result_file_dir = path.dirname(file_path);
  var result_file_path = file_path;

  var file_ext = path.extname(result_file_path);
  var file_name_base = path.basename(result_file_path, file_ext);

  var file_index = 0;
  while (utils.fileExists(result_file_path)) {
    file_index++;
    var file_name = file_name_base + '(' + file_index + ')' + file_ext;
    result_file_path = path.join(result_file_dir, file_name);
  }

  return result_file_path;
}

Saver.prototype.getUniqueFilePath = getUniqueFilePath;

// options
// {
//   blacklist: [String],
//   visited_links: [String],
//   filters: [String],
//   filter_host: String,
//   validator: function(link) {...},
//   exclude_visited_links: Boolean
// }
// Collects the absolute, de-duplicated URLs (fragment removed) of all valid
// `<selector> a` links, excluding the page itself, links on other hosts (when
// filter_host is set), visited links (when exclude_visited_links is set),
// blacklisted links, links matching none of the filters, and links rejected
// by the validator.
Saver.prototype.getLinks = function($, page, selector, options) {
  options = options || {};

  var self = this;

  var blacklist = options.blacklist || [];
  var visited_links = options.visited_links || [];
  var filters = options.filters || [];

  // An explicit visited_links list takes precedence over the saved state.
  var isVisited = function(link) {
    if (visited_links && visited_links.length) { // && Array.isArray(visited_links)
      return (visited_links.indexOf(link) >= 0);
    } else { // 'object' or 'undefined'
      return self.isVisited(link);
    }
  }

  var links = [];

  var context = getPageUrlContext(page);

  $('' + selector + ' a').each(function(){
    var link_href = $(this).attr('href');
    if (!utils.isValidLink(link_href)) return;

    // repair 'http:///path' (empty host) into a root-relative path
    var resolved = resolvePageUrl(link_href.replace('http:///', '/'), context);
    var link_url = resolved.url;
    // relative links live on the page's own host
    var link_url_host = resolved.parsed.host || context.host_url_obj.host;

    // filter_host
    if (typeof options.filter_host != 'undefined') {
      if (link_url_host != options.filter_host) return;
    }

    // $(this).attr('href', link_url);

    link_url = link_url.split('#')[0];
    if (link_url == page.url) return;

    // exclude visited link
    if (options.exclude_visited_links && isVisited(link_url)) return;

    // blacklist
    if (containsAny(link_url, blacklist)) return;

    // filters
    if (filters.length > 0 && !containsAny(link_url, filters)) return;

    if (links.indexOf(link_url) != -1) return;

    if (typeof options.validator == 'function') {
      if (options.validator(link_url)){
        links.push(link_url);
      }
    } else {
      links.push(link_url);
    }
  });

  return links;
}

// options
// {
//   blacklist: [String],
//   filters: [String]
// }
// Collects [{src, file, alt?}] for every distinct `<selector> img` (data: URLs
// excluded) that is not blacklisted and matches a filter (when filters are
// given). `file` is a file name that is unique within the returned list.
Saver.prototype.getImages = function($, page, selector, options) {
  options = options || {};

  var blacklist = options.blacklist || [];
  var filters = options.filters || [];

  var image_urls = [];
  var image_file_names = [];
  var images = [];

  var context = getPageUrlContext(page);

  $('' + selector + ' img').each(function(){
    var image_src = $(this).attr('src');
    var image_alt = $(this).attr('alt');

    if (!image_src || image_src == "") return;
    if (image_src.indexOf('data:') == 0) return;

    var resolved = resolvePageUrl(image_src, context);
    var image_url = resolved.url;

    // every URL is considered once, even when it ends up filtered out
    if (image_urls.indexOf(image_url) >= 0) return;
    image_urls.push(image_url);

    // blacklist
    if (containsAny(image_url, blacklist)) return;

    // filters
    if (filters.length > 0 && !containsAny(image_url, filters)) return;

    // pathname is null for sources such as '?id=1'; path.basename needs a string
    var image_file_name = path.basename(resolved.parsed.pathname || '');
    image_file_name = getUniqueFileName(image_file_names, image_file_name);

    var image_info = {
      src: image_url,
      file: image_file_name
    };
    if (image_alt && image_alt != '') image_info.alt = image_alt;

    images.push(image_info);
  });

  return images;
}

module.exports = Saver;