UNPKG

jul11co-siteripper

Version:

Jul11Co Website Ripper

388 lines (347 loc) 13 kB
#!/usr/bin/env node

/**
 * siterip - command line front end for the website ripper in ./lib/ripper.
 *
 * Parses the command line, creates/loads the `ripper.json` configuration
 * stored in the output directory and then runs exactly one action:
 * init, serve, fix-links, resume, update, test or (default) download.
 */

var async = require('async');
var path = require('path');
var urlutil = require('url');

var Ripper = require('./lib/ripper');
var utils = require('./lib/utils');

/**
 * Print the command line help to stdout.
 */
function printUsage() {
  console.log('Usage: siterip [OPTIONS...] <ARGS>');
  console.log('');
  console.log(' siterip --init <start_url> : Create configuration file (ripper.json will be created)');
  console.log(' siterip --download [start_url] : Download (start_url will be loaded from ripper.json by default)');
  console.log(' siterip --serve [output_dir] : Serve ripped contents');
  console.log(' siterip --resume [output_dir] : Resume');
  console.log(' siterip --update [output_dir] : Update (and check for incompleted links)');
  console.log(' siterip --fix-links [output_dir] : Fix links');
  console.log('');
  console.log('OPTIONS:');
  console.log(' --verbose : Verbose');
  console.log(' --force : Forcibly action');
  console.log('');
  console.log(' --no-images : Do not download images (default: download)');
  console.log(' --no-scripts : Do not download scripts (default: download)');
  console.log(' --no-stylesheets : Do not download stylesheets (default: download)');
  console.log('');
  console.log(' --max-depth=X : Specify max depth');
  console.log('');
  console.log(' --no-links-fix : Not fix links after downloading/resuming/updating');
  console.log(' --no-open : Do not open the browser when serving');
  console.log('');
}

if (process.argv.length < 3 || process.argv.indexOf('--help') >= 0) {
  printUsage();
  process.exit();
}

process.on('SIGINT', function() {
  console.log("\nCaught interrupt signal");
  process.exit();
});

// Command line switches that simply set one option to a fixed value:
// flag -> [option name, value].
var SIMPLE_FLAGS = {
  '--init': ['init', true],
  '--download': ['download', true],
  '--serve': ['serve', true],
  '--test': ['test', true],
  '--resume': ['resume', true],
  '--update': ['update', true],
  '--fix-links': ['fix_links', true],
  '--no-links-fix': ['no_links_fix', true],
  '--no-open': ['no_open', true],
  '--no-images': ['download_images', false],
  '--no-scripts': ['download_scripts', false],
  '--no-stylesheets': ['download_stylesheets', false],
  '--force': ['force', true],
  '--verbose': ['verbose', true]
};

var MAX_DEPTH_PREFIX = '--max-depth=';

var argv = [];    // positional arguments, in the order given
var options = {}; // parsed switches; later handed to the Ripper

for (var i = 2; i < process.argv.length; i++) {
  var arg = process.argv[i];

  if (Object.prototype.hasOwnProperty.call(SIMPLE_FLAGS, arg)) {
    options[SIMPLE_FLAGS[arg][0]] = SIMPLE_FLAGS[arg][1];
  } else if (arg.indexOf(MAX_DEPTH_PREFIX) == 0) {
    // Always parse as decimal; a non-numeric value is ignored.
    var max_depth = parseInt(arg.slice(MAX_DEPTH_PREFIX.length), 10);
    if (!isNaN(max_depth)) options.max_depth = max_depth;
  } else if (arg.indexOf('--') == 0) {
    console.log('Invalid options:', arg);
    process.exit(1); // unknown switch is a usage error
  } else {
    argv.push(arg);
  }
}

// Every asset type is downloaded unless explicitly disabled.
if (typeof options.download_images == 'undefined') {
  options.download_images = true;
}
if (typeof options.download_scripts == 'undefined') {
  options.download_scripts = true;
}
if (typeof options.download_stylesheets == 'undefined') {
  options.download_stylesheets = true;
}

// Positional arguments are either "<start_url> [output_dir]" or "[output_dir]".
var start_url = '';
var output_dir = './';

if (argv[0]) {
  if (/^((http|https):\/\/)/.test(argv[0])) {
    start_url = argv[0];
    output_dir = argv[1] || './';
  } else {
    output_dir = argv[0];
  }
}

options.output_dir = output_dir;
console.log('Output directory: ' + options.output_dir);

if (start_url != '') {
  options.page_url = start_url;
  console.log('Start URL: ' + start_url);
}

var config_file_path = path.join(output_dir, 'ripper.json');
var ripper_config = {};
var ripper; // created only for the actions that need a Ripper instance

/**
 * Return `value`, or `fallback` when `value` is undefined.
 */
function defaultTo(value, fallback) {
  return (typeof value == 'undefined') ? fallback : value;
}

/**
 * Fill the module-level `ripper_config` with the default configuration,
 * taking the start URL and the download switches from the command line.
 *
 * @returns {boolean} false (and nothing is changed) when ripper.json already
 *   exists in the output directory and --force was not given; true otherwise.
 */
function initRipperOptions() {
  if (!options.force && utils.fileExists(config_file_path)) {
    console.log('Warning: ripper.json exists. Append --force to overwrite it.');
    return false;
  }

  if (start_url != '') {
    ripper_config['start_link'] = start_url;
  }

  // links
  ripper_config['link_blacklist'] = [];
  ripper_config['link_filters'] = [];
  ripper_config['max_depth'] = defaultTo(options.max_depth, 1);

  // HTML output
  ripper_config['html_output_dir'] = 'html';
  ripper_config['download_linked_file'] = false;

  // scripts
  ripper_config['download_scripts'] = defaultTo(options.download_scripts, true);
  ripper_config['script_blacklist'] = [];
  ripper_config['script_filters'] = [];
  ripper_config['script_output_dir'] = 'scripts';

  // stylesheets
  ripper_config['download_stylesheets'] = defaultTo(options.download_stylesheets, true);
  ripper_config['stylesheet_blacklist'] = [];
  ripper_config['stylesheet_filters'] = [];
  ripper_config['stylesheet_output_dir'] = 'styles';

  // images
  ripper_config['download_images'] = defaultTo(options.download_images, true);
  ripper_config['image_blacklist'] = [];
  ripper_config['image_filters'] = [];
  ripper_config['image_output_dir'] = 'images';

  return true;
}

/**
 * Compute the absolute path of the local HTML file for `page_url`:
 * <output_dir>/<html_output_dir>/<host>/<pathname>/index[-<query>].html
 *
 * @param {string} page_url - URL of the page.
 * @param {Object} [options]
 * @param {string} [options.output_dir='.'] - root output directory.
 * @param {string} [options.html_output_dir='html'] - HTML sub directory.
 * @returns {string} absolute path of the index file.
 */
var getIndexFilePath = function(page_url, options) {
  options = options || {};

  var page_url_obj = urlutil.parse(page_url);
  // pathname can be null for URLs without a path component
  var pathname = page_url_obj.pathname || '/';
  var page_output_dir_path = path.join(page_url_obj.host || 'nohost', pathname);

  if (path.basename(pathname) == 'index.html') {
    // special case: ".../index.html" maps to its directory, not ".../index.html/"
    page_output_dir_path = path.dirname(page_output_dir_path);
  }

  var html_output_dir = path.join(options.output_dir || '.', options.html_output_dir || 'html');
  var page_output_dir = path.join(html_output_dir, page_output_dir_path);

  var index_name = page_url_obj.query
    ? 'index-' + page_url_obj.query + '.html'
    : 'index.html';

  return path.resolve(page_output_dir, index_name);
};

/**
 * Print `items` one per line, prefixed with their zero-based index.
 */
function printIndexedList(items) {
  items.forEach(function(item, idx) {
    console.log(' ' + idx + '. ' + item);
  });
}

/**
 * Run `ripper.test()` for `start_url` and print a summary of the returned page
 * (title, base URL, output dir and the lists of stylesheets, scripts, images
 * and links - or just the download links when the page has no title).
 *
 * @param {string} start_url
 * @param {string} output_dir
 * @param {Object} options
 * @param {function(Error=)} callback - called with the error, or with no
 *   arguments once the summary has been printed.
 */
function testRipper(start_url, output_dir, options, callback) {
  ripper.test(start_url, output_dir, options, function(err, page) {
    if (err) {
      return callback(err);
    }

    if (page) {
      console.log('URL:', page.url);

      if (!page.title && page.download_links) {
        console.log('Download links:', page.download_links.length);
        printIndexedList(page.download_links);
      } else {
        console.log('Title:', page.title);
        if (page.base_url) console.log('Base URL:', page.base_url);
        if (page.output_dir) console.log('Output dir:', page.output_dir);

        var sections = [
          ['Stylesheets:', page.stylesheets],
          ['Scripts:', page.scripts],
          ['Images:', page.images],
          ['Links:', page.links]
        ];
        sections.forEach(function(section) {
          if (!section[1]) return;
          console.log('');
          console.log(section[0], section[1].length);
          printIndexedList(section[1]);
        });
      }
    }

    console.log('');
    callback();
  });
}

if (options.init) {
  if (initRipperOptions()) {
    utils.saveToJsonFile(ripper_config, config_file_path);
    console.log('Ripper initialized with following configurations:');
    console.log(ripper_config);
  }
  process.exit();
} else if (options.serve) {
  if (utils.fileExists(config_file_path)) {
    console.log('Load ripper options from:', config_file_path);
    ripper_config = utils.loadFromJsonFile(config_file_path);
  }

  if (!ripper_config['start_link']) {
    console.log('Missing start_link');
    process.exit(1);
  }

  var index_file = getIndexFilePath(ripper_config['start_link'], options);

  // In serve mode the start URL comes from ripper.json, not the command line.
  console.log('Start URL: ' + ripper_config['start_link']);
  console.log('Index file: ' + index_file);

  var express = require('express');
  var app = express();

  var listen_port = 3333;
  app.set('port', process.env.PORT || listen_port);

  app.use(express.static(path.resolve(output_dir)));
  app.use(express.static(path.join(output_dir, 'html')));
  app.use(express.static(path.join(output_dir, 'scripts')));
  app.use(express.static(path.join(output_dir, 'styles')));
  app.use(express.static(path.join(output_dir, 'images')));

  var server = require('http').createServer(app);
  server.listen(app.get('port'), function() {
    console.log('Siterip is listening on http://127.0.0.1:' + app.get('port'));
    if (!options.no_open) {
      // URL of the index page relative to the served html directory; path
      // separators are normalised so the URL is also valid on Windows.
      var relative_index = path.relative(path.join(output_dir, 'html'), index_file)
        .split(path.sep).join('/');
      var index_path = 'http://127.0.0.1:' + app.get('port') + '/' + relative_index;
      require("opn")(index_path);
    }
  });
} else {
  if (utils.fileExists(config_file_path)) {
    console.log('Load ripper options from:', config_file_path);
    ripper_config = utils.loadFromJsonFile(config_file_path);
  } else if (initRipperOptions()) {
    utils.saveToJsonFile(ripper_config, config_file_path);
  }

  if (start_url == '' && ripper_config['start_link']) {
    // No URL on the command line: fall back to the configured one.
    start_url = ripper_config['start_link'];
    options.page_url = start_url;
    console.log('Start URL: ' + start_url);
  } else {
    // Remember the URL given on the command line for later runs.
    ripper_config['start_link'] = start_url;
    utils.saveToJsonFile(ripper_config, config_file_path);
  }

  // The config file (ripper.json) can have the following contents:
  // {
  //   start_link: String,
  //   link_blacklist: [String],
  //   link_filters: [String],
  //   max_depth: Number,
  //   download_linked_file: Boolean,
  //   download_scripts: Boolean,
  //   script_blacklist: [String],
  //   script_filters: [String],
  //   script_output_dir: String,
  //   download_stylesheets: Boolean,
  //   stylesheet_blacklist: [String],
  //   stylesheet_filters: [String],
  //   stylesheet_output_dir: String,
  //   download_images: Boolean,
  //   image_blacklist: [String],
  //   image_filters: [String],
  //   image_output_dir: String,
  //   html_proxy: String,
  //   html_output_dir: String
  // }

  options.config_file = path.join(options.output_dir, 'ripper.json');
  options.state_file = path.join(options.output_dir, 'ripper-state.json');

  ripper = new Ripper(options);

  ripper.on('error', function(err) {
    console.log('Ripper error:');
    console.log(err);
  });

  ripper.on('exit', function(err) {
    if (err) console.log(err);
    else console.log('Done.');
  });

  if (options.fix_links) {
    ripper.fix_links(output_dir, options);
  } else if (options.resume) {
    // NOTE(review): no_links_fix is forced on here (and for plain download
    // below) regardless of the command line, although the usage text says
    // links are fixed after downloading/resuming - confirm this is intended.
    options.no_links_fix = true;
    ripper.resume(output_dir, options);
  } else if (options.update) {
    ripper.update(output_dir, options);
  } else if (options.test && start_url) {
    testRipper(start_url, output_dir, options, function(err) {
      if (err) {
        console.log(err);
      }
      process.exit(err ? 1 : 0);
    });
  } else {
    options.no_links_fix = true;
    if (start_url != '') {
      ripper.download(start_url, output_dir, options);
    } else {
      console.log('Missing start URL');
      process.exitCode = 1;
    }
  }
}