// jul11co-siteripper
// Version:
// Jul11Co Website Ripper
// 388 lines (347 loc) • 13 kB
// JavaScript
var async = require('async');
var path = require('path');
var urlutil = require('url');
var Ripper = require('./lib/ripper');
var utils = require('./lib/utils');
/**
 * Print the command-line help (available commands and options) to stdout,
 * one console.log call per line.
 */
function printUsage() {
  var usage_lines = [
    'Usage: siterip [OPTIONS...] <ARGS>',
    '',
    ' siterip --init <start_url> : Create configuration file (ripper.json will be created)',
    ' siterip --download [start_url] : Download (start_url will be loaded from ripper.json by default)',
    ' siterip --serve [output_dir] : Serve ripped contents',
    ' siterip --resume [output_dir] : Resume',
    ' siterip --update [output_dir] : Update (and check for incompleted links)',
    ' siterip --fix-links [output_dir] : Fix links',
    // ' siterip --add-link <start_url> [output_dir] : Add link',
    '',
    'OPTIONS:',
    ' --verbose : Verbose',
    ' --force : Forcibly action',
    '',
    ' --no-images : Do not download images (default: download)',
    ' --no-scripts : Do not download scripts (default: download)',
    ' --no-stylesheets : Do not download stylesheets (default: download)',
    '',
    ' --max-depth=X : Specify max depth',
    '',
    ' --no-links-fix : Not fix links after downloading/resuming/updating',
    ''
  ];

  // Wrap console.log so each line is passed as the only argument.
  usage_lines.forEach(function(usage_line) {
    console.log(usage_line);
  });
}
if (process.argv.length < 3 || process.argv.indexOf('--help') >= 0) {
printUsage();
process.exit();
return;
}
process.on('SIGINT', function() {
console.log("\nCaught interrupt signal");
process.exit();
});
/**
 * Parse siterip command-line arguments into option flags and positional
 * arguments.
 *
 * Recognised flags set a key on the returned options object; `--max-depth=X`
 * sets `max_depth` when X parses as a base-10 integer. Any other argument
 * starting with `--` is reported as invalid and terminates the process.
 * Everything else is collected, in order, as a positional argument.
 *
 * @param {string[]} args - Arguments following the node executable and script path.
 * @returns {{argv: string[], options: Object}} Positional arguments and parsed options.
 */
function parseCommandLine(args) {
  // Flags that simply assign a fixed value to one option key.
  var flag_settings = {
    '--init': ['init', true],
    '--download': ['download', true],
    '--serve': ['serve', true],
    '--test': ['test', true],
    '--resume': ['resume', true],
    '--update': ['update', true],
    '--fix-links': ['fix_links', true],
    '--no-links-fix': ['no_links_fix', true],
    '--no-images': ['download_images', false],
    '--no-scripts': ['download_scripts', false],
    '--no-stylesheets': ['download_stylesheets', false],
    // Serve mode reads options.no_open; without this flag it could never be set.
    '--no-open': ['no_open', true],
    '--force': ['force', true],
    '--verbose': ['verbose', true]
  };
  var max_depth_prefix = '--max-depth=';

  var positional = [];
  var parsed = {};

  args.forEach(function(arg) {
    if (Object.prototype.hasOwnProperty.call(flag_settings, arg)) {
      var setting = flag_settings[arg];
      parsed[setting[0]] = setting[1];
    } else if (arg.indexOf(max_depth_prefix) === 0) {
      // Explicit radix so the value is always read as a decimal number.
      var max_depth = parseInt(arg.slice(max_depth_prefix.length), 10);
      if (!isNaN(max_depth)) parsed.max_depth = max_depth;
    } else if (arg.indexOf('--') === 0) {
      console.log('Invalid options:', arg);
      process.exit();
    } else {
      positional.push(arg);
    }
  });

  return { argv: positional, options: parsed };
}

var parsed_command_line = parseCommandLine(process.argv.slice(2));
var argv = parsed_command_line.argv;
var options = parsed_command_line.options;
// if (options.resume) {
// options.max_depth = 0;
// }
// if (typeof options.max_depth == 'undefined') {
// options.max_depth = -1;
// }
// Each asset type is downloaded by default; a value already present on
// options (set by the matching --no-* flag) is left untouched.
['download_images', 'download_scripts', 'download_stylesheets'].forEach(function(option_name) {
  if (typeof options[option_name] == 'undefined') {
    options[option_name] = true;
  }
});
// Positional arguments are either "<start_url> [output_dir]" or "[output_dir]".
// The output directory falls back to the current directory.
var start_url = '';
var output_dir = './';

if (argv[0]) {
  if (/^((http|https):\/\/)/.test(argv[0])) {
    // First argument is an http(s) URL; an optional second one is the output directory.
    start_url = argv[0];
    if (argv[1]) {
      output_dir = argv[1];
    }
  } else {
    output_dir = argv[0];
  }
}

options.output_dir = output_dir;
console.log('Output directory: ' + output_dir);

if (start_url) {
  options.page_url = start_url;
  console.log('Start URL: ' + start_url);
}
// Ripper configuration: populated by initRipperOptions() or loaded from ripper.json.
var ripper_config = {};

/**
 * Populate ripper_config with default settings, using values from the parsed
 * command-line options where they are defined.
 *
 * Does nothing and prints a warning when ripper.json already exists in
 * output_dir and options.force is not set.
 *
 * @returns {boolean} true when ripper_config was populated, false otherwise.
 */
function initRipperOptions() {
  if (!options.force) {
    if (utils.fileExists(path.join(output_dir, 'ripper.json'))) {
      console.log('Warning: ripper.json exists. Append --force to overwrite it.');
      return false;
    }
  }

  // Value from the command-line options when defined, otherwise the fallback.
  function optionOr(option_name, fallback) {
    return typeof options[option_name] != 'undefined' ? options[option_name] : fallback;
  }

  if (start_url != '') {
    ripper_config.start_link = start_url;
  }

  // links
  ripper_config.link_blacklist = [];
  ripper_config.link_filters = [];
  ripper_config.max_depth = optionOr('max_depth', 1);

  // HTML output
  ripper_config.html_output_dir = 'html';
  ripper_config.download_linked_file = false;

  // scripts
  ripper_config.download_scripts = optionOr('download_scripts', true);
  ripper_config.script_blacklist = [];
  ripper_config.script_filters = [];
  ripper_config.script_output_dir = 'scripts';

  // stylesheets
  ripper_config.download_stylesheets = optionOr('download_stylesheets', true);
  ripper_config.stylesheet_blacklist = [];
  ripper_config.stylesheet_filters = [];
  ripper_config.stylesheet_output_dir = 'styles';

  // images
  ripper_config.download_images = optionOr('download_images', true);
  ripper_config.image_blacklist = [];
  ripper_config.image_filters = [];
  ripper_config.image_output_dir = 'images';

  // proxy
  // ripper_config['html_proxy'] = '';

  return true;
}
/**
 * Compute the absolute path of the saved HTML index file for a page URL.
 *
 * The file is placed under <output_dir>/<html_output_dir>/<host>/<pathname>/
 * and is named index.html, or index-<query>.html when the URL has a query
 * string. A pathname whose last segment is already index.html maps to its
 * parent directory instead of a directory named index.html.
 *
 * @param {string} page_url - URL of the page.
 * @param {Object} [options]
 * @param {string} [options.output_dir] - Base output directory (default '.').
 * @param {string} [options.html_output_dir] - HTML sub-directory (default 'html').
 * @returns {string} Absolute path of the index file.
 */
var getIndexFilePath = function(page_url, options) {
  options = options || {};

  var page_url_obj = urlutil.parse(page_url);
  // url.parse() leaves pathname null when the input has no path component;
  // use the root path so the path helpers below never receive null.
  var pathname = page_url_obj.pathname || '/';

  var page_output_dir_path = path.join(page_url_obj.host || 'nohost', pathname);
  if (path.basename(pathname) == 'index.html') { // special case
    page_output_dir_path = path.dirname(page_output_dir_path);
  }

  var html_output_dir = path.join(options.output_dir || '.', options.html_output_dir || 'html');
  var page_output_dir = path.join(html_output_dir, page_output_dir_path);

  var index_name = page_url_obj.query
    ? 'index-' + page_url_obj.query + '.html'
    : 'index.html';
  return path.resolve(page_output_dir, index_name);
};
// Dispatch on the requested mode:
//   --init   writes a fresh ripper.json and exits;
//   --serve  serves an already ripped directory over HTTP;
//   anything else loads (or creates) ripper.json and runs the ripper.
if (options.init) {
  var result = initRipperOptions();
  if (result) {
    utils.saveToJsonFile(ripper_config, path.join(output_dir, 'ripper.json'));
    console.log('Ripper initialized with following configurations:');
    console.log(ripper_config);
  }
  process.exit();
} else if (options.serve) {
  if (utils.fileExists(path.join(output_dir, 'ripper.json'))) {
    console.log('Load ripper options from:', path.join(output_dir, 'ripper.json'));
    ripper_config = utils.loadFromJsonFile(path.join(output_dir, 'ripper.json'));
  }

  // Serving needs the start link to locate the index page to open.
  if (!ripper_config['start_link']) {
    console.log('Missing start_link');
    process.exit();
  }

  var index_file = getIndexFilePath(ripper_config['start_link'], options);

  // Report the link loaded from ripper.json: in serve mode start_url is empty
  // unless a URL was also passed on the command line.
  console.log('Start URL: ' + ripper_config['start_link']);
  console.log('Index file: ' + index_file);

  var express = require('express');
  var app = express();

  var listen_port = 3333;
  app.set('port', process.env.PORT || listen_port);

  // Static roots are tried in order: the output directory itself, then each
  // of the asset sub-directories.
  app.use(express.static(path.resolve(output_dir)));
  app.use(express.static(path.join(output_dir, 'html')));
  app.use(express.static(path.join(output_dir, 'scripts')));
  app.use(express.static(path.join(output_dir, 'styles')));
  app.use(express.static(path.join(output_dir, 'images')));
  // app.use(express.static(path.dirname(index_file)));

  // catch 404 and forward to error handler
  // app.use(function (req, res, next) {
  //   var err = new Error('Not Found');
  //   err.status = 404;
  //   next(err);
  // });

  var server = require('http').createServer(app);
  server.listen(app.get('port'), function() {
    console.log('Siterip is listening on http://127.0.0.1:' + app.get('port'));
    if (!options.no_open) {
      // Path of the index file relative to the html root, converted to URL
      // form: path.relative() uses the platform separator, which is a
      // backslash on Windows.
      var relative_index = path.relative(path.join(output_dir, 'html'), index_file)
        .split(path.sep)
        .join('/');
      var index_path = 'http://127.0.0.1:' + app.get('port');
      index_path = index_path + '/' + relative_index;
      require("opn")(index_path);
    }
  });
} else {
  // Load the existing configuration, or create and save a default one.
  if (utils.fileExists(path.join(output_dir, 'ripper.json'))) {
    console.log('Load ripper options from:', path.join(output_dir, 'ripper.json'));
    ripper_config = utils.loadFromJsonFile(path.join(output_dir, 'ripper.json'));
  } else {
    if (initRipperOptions()) {
      utils.saveToJsonFile(ripper_config, path.join(output_dir, 'ripper.json'));
    }
  }

  // Without a start URL on the command line, fall back to the configured one;
  // otherwise record the command-line value in ripper.json.
  if (start_url == '' && ripper_config['start_link']) {
    start_url = ripper_config['start_link'];
    options.page_url = start_url;
    console.log('Start URL: ' + start_url);
  } else {
    ripper_config['start_link'] = start_url;
    utils.saveToJsonFile(ripper_config, path.join(output_dir, 'ripper.json'));
  }

  // the config file can have following contents
  // {
  //   start_link: String,
  //   link_blacklist: [String],
  //   link_filters: [String],
  //   download_scripts: Boolean,
  //   script_blacklist: [String],
  //   script_filters: [String],
  //   script_output_dir: String,
  //   download_stylesheets: Boolean,
  //   stylesheet_blacklist: [String],
  //   stylesheet_filters: [String],
  //   stylesheet_output_dir: String,
  //   download_images: Boolean,
  //   image_blacklist: [String],
  //   image_filters: [String],
  //   image_output_dir: String,
  //   html_proxy: String,
  //   html_output_dir: String
  // }

  options.config_file = path.join(options.output_dir, 'ripper.json');
  options.state_file = path.join(options.output_dir, 'ripper-state.json');

  var ripper = new Ripper(options);

  ripper.on('error', function(err) {
    console.log('Ripper error:');
    console.log(err);
  });

  ripper.on('exit', function(err) {
    if (err) console.log(err);
    else console.log('Done.');
  });

  /**
   * Run ripper.test() for one URL and print a summary of the page it reports.
   *
   * @param {string} start_url - URL to test.
   * @param {string} output_dir - Output directory handed to the ripper.
   * @param {Object} options - Ripper options.
   * @param {function(Error=)} callback - Called with the error from
   *   ripper.test(), or with no arguments once the summary has been printed.
   */
  function testRipper(start_url, output_dir, options, callback) {
    ripper.test(start_url, output_dir, options, function(err, page) {
      if (err) {
        return callback(err);
      } else if (page) {
        if (!page.title && page.download_links) {
          // No title but download links present: list only the download links.
          console.log('URL:', page.url);
          console.log('Download links:', page.download_links.length);
          page.download_links.forEach(function(link, idx) {
            console.log(' ' + idx + '. ' + link);
          });
        } else {
          console.log('URL:', page.url);
          console.log('Title:', page.title);
          if (page.base_url) console.log('Base URL:', page.base_url);
          if (page.output_dir) console.log('Output dir:', page.output_dir);
          if (page.stylesheets) {
            console.log('');
            console.log('Stylesheets:', page.stylesheets.length);
            page.stylesheets.forEach(function(stylesheet, idx) {
              console.log(' ' + idx + '. ' + stylesheet);
            });
          }
          if (page.scripts) {
            console.log('');
            console.log('Scripts:', page.scripts.length);
            page.scripts.forEach(function(script, idx) {
              console.log(' ' + idx + '. ' + script);
            });
          }
          if (page.images) {
            console.log('');
            console.log('Images:', page.images.length);
            page.images.forEach(function(image, idx) {
              console.log(' ' + idx + '. ' + image);
            });
          }
          if (page.links) {
            console.log('');
            console.log('Links:', page.links.length);
            page.links.forEach(function(link, idx) {
              console.log(' ' + idx + '. ' + link);
            });
          }
        }
      }
      console.log('');
      callback();
    });
  }

  // The first matching mode wins; plain download is the default.
  if (options.fix_links) {
    ripper.fix_links(output_dir, options);
  } else if (options.resume) {
    options.no_links_fix = true;
    ripper.resume(output_dir, options);
  } else if (options.update) {
    ripper.update(output_dir, options);
  } else if (options.test && start_url) {
    testRipper(start_url, output_dir, options, function(err) {
      if (err) {
        console.log(err);
      }
      process.exit();
    });
  } else {
    options.no_links_fix = true;
    if (start_url != '') {
      ripper.download(start_url, output_dir, options);
    } else {
      console.log('Missing start URL');
    }
  }
}