libzim
Version:
Bindings to zimlib (read/write OpenZIM files)
448 lines (431 loc) • 12.7 kB
JavaScript
var zim = require('../');
var fs = require('fs');
var path = require('path');
/**
 * (Re-)implementation of the ZIM file dump tool, built on the node binding.
 *
 * @constructor
 * @param {string} fname - Handed unchanged to the `zim.File` constructor.
 * @param {boolean} [titleSort] - When truthy the initial position comes from
 *   `file.iteratorByTitle()`, otherwise from `file.iterator()`.
 */
var ZimDumper = module.exports = function ZimDumper(fname, titleSort) {
  var archive = new zim.File(fname);
  this.file = archive;
  // Verbose output is opt-in, see setVerbose().
  this.verbose = false;
  if (titleSort) {
    this.pos = archive.iteratorByTitle();
  } else {
    this.pos = archive.iterator();
  }
};
/**
 * Switch verbose output on or off.
 *
 * @param {*} [sw] - `undefined` (or omitted) enables verbose mode; any other
 *   value is coerced to a boolean.
 */
ZimDumper.prototype.setVerbose = function(sw) {
  if (sw === undefined) {
    this.verbose = true;
    return;
  }
  this.verbose = Boolean(sw);
};
/**
 * Output sink used by all print/list methods: forwards every argument
 * unchanged to `console.log`.
 */
ZimDumper.prototype.log = function() {
  var parts = Array.prototype.slice.call(arguments);
  console.log.apply(console, parts);
};
/**
 * Log general information about the opened file.
 *
 * The article count and the file header fields are always logged. In verbose
 * mode the namespaces and the article count of each namespace are logged too.
 */
ZimDumper.prototype.printInfo = function() {
  var self = this;
  var archive = this.file;
  var header = archive.getFileheader();
  var emit = function() {
    self.log.apply(self, arguments);
  };

  emit('count-articles:', archive.getCountArticles());

  if (this.verbose) {
    var namespaces = archive.getNamespaces();
    emit('namespaces:', namespaces);
    // Index based loop: works for anything with `length` and `[n]` access.
    for (var n = 0; n < namespaces.length; n++) {
      var name = namespaces[n];
      emit('namespace', name, 'size:', archive.getNamespaceCount(name));
    }
  }

  emit('uuid:', '' + header.getUuid());
  emit('article count:', header.getArticleCount());
  emit('mime list pos:', header.getMimeListPos());
  emit('url ptr pos:', header.getUrlPtrPos());
  emit('title idx pos:', header.getTitleIdxPos());
  emit('cluster count:', header.getClusterCount());
  emit('cluster ptr pos:', header.getClusterPtrPos());

  if (!header.hasChecksum()) {
    emit('no checksum');
  } else {
    emit('checksum pos:', header.getChecksumPos());
    emit('checksum:', archive.getChecksum());
  }

  // '-' stands for "not set".
  var mainPage = '-';
  if (header.hasMainPage()) {
    mainPage = header.getMainPage();
  }
  emit('main page:', mainPage);

  var layoutPage = '-';
  if (header.hasLayoutPage()) {
    layoutPage = header.getLayoutPage();
  }
  emit('layout page:', layoutPage);
};
/**
 * Log the begin and end offsets the file reports for one namespace.
 *
 * @param {string} ch - Namespace identifier, passed through to
 *   `getNamespaceBeginOffset()` / `getNamespaceEndOffset()`.
 */
ZimDumper.prototype.printNsInfo = function(ch) {
  this.log('namespace', ch);
  var lower = this.file.getNamespaceBeginOffset(ch);
  this.log('lower bound idx:', lower);
  var upper = this.file.getNamespaceEndOffset(ch);
  this.log('upper bound idx:', upper);
};
/**
 * Replace the current position with `file.iterator(idx)`.
 *
 * @param {number} idx - Index handed to `file.iterator()`.
 */
ZimDumper.prototype.locateArticle = function(idx) {
  var cursor = this.file.iterator(idx);
  this.pos = cursor;
};
/**
 * Move the current position to the result of a lookup in one namespace.
 *
 * @param {string} ns - Namespace passed to the lookup.
 * @param {string} expr - Search expression passed to the lookup.
 * @param {boolean} [title] - Truthy: use `file.findByTitle()`; otherwise
 *   `file.find()`.
 */
ZimDumper.prototype.findArticle = function(ns, expr, title) {
  var archive = this.file;
  this.pos = title ? archive.findByTitle(ns, expr) : archive.find(ns, expr);
};
/**
 * Move the current position to the result of `file.find(url)`.
 *
 * @param {string} url - Passed as the only argument to `file.find()`.
 */
ZimDumper.prototype.findArticleByUrl = function(url) {
  var cursor = this.file.find(url);
  this.pos = cursor;
};
/**
 * Log `getPage()` of the entry at the current position.
 * Does nothing when the position reports no further entry.
 */
ZimDumper.prototype.printPage = function() {
  if (!this.pos.hasNext()) {
    return;
  }
  var entry = this.pos.get();
  this.log(entry.getPage());
};
/**
 * Log the data of the entry at the current position, converted with
 * `toString('utf8')`. Does nothing when the position reports no further entry.
 */
ZimDumper.prototype.dumpArticle = function() {
  if (!this.pos.hasNext()) {
    return;
  }
  var payload = this.pos.get().getData();
  // '%s' keeps format characters inside the text from being interpreted.
  this.log('%s', payload.toString('utf8'));
};
/**
 * Decode and log the index data of the entry at the current position.
 *
 * Only entries whose namespace is 'X' are treated as index entries; for any
 * other entry 'no index article' is logged. The entry parameter is decoded
 * with `zim.ZIntStream.fromBuffer()`: the first value is a flag word, and for
 * each of the four categories whose flag bit is set three values follow
 * (data length, first idx, first pos). The entry data holds, per category,
 * a run of (idx, pos) pairs that are delta-decoded below.
 *
 * NOTE(review): unlike printPage()/dumpArticle() there is no hasNext() check
 * before this.pos.get() is used — confirm callers guarantee a valid position.
 *
 * @throws {Error} 'invalid index parameter data' when the parameter stream is
 *   too short, 'invalid index data' when a category would extend past the
 *   end of the entry data.
 */
ZimDumper.prototype.dumpIndex = function() {
if (this.pos.get().getNamespace() === 'X') {
var parameter = zim.ZIntStream.fromBuffer(this.pos.get().getParameter());
// ppos: read position in the decoded parameter values.
var ppos = 0;
// off: running byte offset into the entry data, advanced per category.
var off = 0;
// Read flags
var flags = parameter[ppos++];
// The bounds check happens after the read.
if (ppos > parameter.length) {
throw new Error('invalid index parameter data');
}
// Process categories
// `flag` is the bit mask for category `c` (1, 2, 4, 8).
for (var c = 0, flag = 1; c < 4; c++, flag <<= 1) {
if (!(flags & flag)) {
continue; // Category empty.
}
// Three parameter values per present category.
var len = parameter[ppos++];
var idx = parameter[ppos++];
var wpos = parameter[ppos++];
if (ppos > parameter.length) {
throw new Error('invalid index parameter data');
}
// Log the category header values taken from the parameter.
if (this.verbose) {
this.log('c' + c + '\tidx=' + idx + '\tpos=' + wpos);
} else {
this.log('c' + c + '\t' + idx + ';' + wpos);
}
// Prepare data stream.
var blob = this.pos.get().getData();
if (off + len > blob.size()) {
throw new Error('invalid index data');
}
// Decode only this category's slice [off, off + len) of the data.
var ins = zim.ZIntStream.fromBuffer(blob.data().slice(off, off + len));
var inspos = 0;
off += len;
// Delta state, reset for every category.
var lastidx = 0;
var lastpos = 0;
// Collects the decoded pairs for the single-line (non-verbose) output.
var str = '';
while (inspos < ins.length) {
// Values are consumed in (idx, pos) pairs.
idx = ins[inspos++];
wpos = ins[inspos++];
// Keep the raw values for the verbose "raw => decoded" output.
var oidx = idx;
var owpos = wpos;
if (idx === 0) {
// Raw idx 0: same idx as the previous pair, pos is relative to the last pos.
idx = lastidx;
lastpos = (wpos += lastpos);
} else {
// Otherwise idx is relative to the last idx and pos is taken as is.
lastidx = (idx += lastidx);
lastpos = wpos;
}
if (this.verbose) {
this.log(
'c' + c + '\tidx=' + oidx + ' => ' + idx +
'\tpos=' + owpos + ' => ' + wpos
);
} else {
str += '\t' + idx + ';' + wpos;
}
}
// Non-verbose: all decoded pairs of the category go out as one line.
if (!this.verbose) {
this.log(str);
}
}
} else {
this.log('no index article');
}
};
/**
 * Walk from the current position to the end and list every entry.
 *
 * @param {boolean} info - Truthy (and `listTable` falsy): use listArticle().
 * @param {boolean} listTable - Truthy: use listArticleT(); wins over `info`.
 * @param {boolean} extra - Forwarded to listArticle()/listArticleT().
 *
 * When both flags are falsy only `getUrl()` of each entry is logged.
 */
ZimDumper.prototype.listArticles = function(info, listTable, extra) {
  var cursor = this.pos;
  while (cursor.hasNext()) {
    var entry = cursor.next().value;
    if (listTable) {
      this.listArticleT(entry, extra);
      continue;
    }
    if (info) {
      this.listArticle(entry, extra);
      continue;
    }
    this.log(entry.getUrl());
  }
};
/**
 * Log a multi-line description of one article.
 *
 * Url, title, index, namespace and type are always logged. Redirects add the
 * redirect index; link targets and deleted entries add nothing. Every other
 * entry adds mime type and size, plus cluster details in verbose mode.
 *
 * @param {Object} article - Entry to describe; must provide `getDirent()`.
 * @param {boolean} [extra] - Truthy: also log the dirent parameter as hex
 *   bytes, followed by its ZIntStream-decoded values when it is longer than
 *   one byte.
 */
ZimDumper.prototype.listArticle = function(article, extra) {
  var entry = article.getDirent();

  this.log('url:', entry.getUrl());
  this.log('\ttitle: ', entry.getTitle());
  this.log('\tidx: ', article.getIndex());
  this.log('\tnamespace: ', entry.getNamespace());

  var kind;
  if (entry.isRedirect()) {
    kind = 'redirect';
  } else if (entry.isLinktarget()) {
    kind = 'linktarget';
  } else if (entry.isDeleted()) {
    kind = 'deleted';
  } else {
    kind = 'article';
  }
  this.log('\ttype: ', kind);

  if (entry.isRedirect()) {
    this.log('\tredirect index: ', entry.getRedirectIndex());
  } else if (!entry.isLinktarget() && !entry.isDeleted()) {
    // Only regular articles carry content details.
    this.log('\tmime-type: ', article.getMimeType());
    this.log('\tarticle size: ', article.getArticleSize());
    if (this.verbose) {
      var cluster = article.getCluster();
      this.log('\tcluster number: ', entry.getClusterNumber());
      this.log('\tcluster count: ', cluster.count());
      this.log('\tcluster size: ', cluster.size());
      var clusterOffset = this.file.getClusterOffset(entry.getClusterNumber());
      this.log('\tcluster offset: ', clusterOffset);
      this.log('\tblob number: ', entry.getBlobNumber());

      // Map the compression constant of the binding to a readable name.
      var compression = cluster.getCompression();
      var label;
      if (compression === zim.zimcompDefault) {
        label = 'default';
      } else if (compression === zim.zimcompNone) {
        label = 'none';
      } else if (compression === zim.zimcompZip) {
        label = 'zip';
      } else if (compression === zim.zimcompBzip2) {
        label = 'bzip2';
      } else if (compression === zim.zimcompLzma) {
        label = 'lzma';
      } else {
        label = 'unknown (' + cluster.getCompression() + ')';
      }
      this.log('\tcompression: ', label);
    }
  }

  if (!extra) {
    return;
  }
  var raw = entry.getParameter();
  var line = '\textra: ';
  var k;
  // Raw bytes in hex first ...
  for (k = 0; k < raw.length; k++) {
    line += raw[k].toString(16) + ' ';
  }
  line += ':';
  // ... then the decoded integer stream, if there is more than one byte.
  if (raw.length > 1) {
    var decoded = zim.ZIntStream.fromBuffer(raw);
    for (k = 0; k < decoded.length; k++) {
      line += '\t' + decoded[k];
    }
  }
  this.log(line);
};
/**
 * Log one article as a single tab-separated row.
 *
 * Columns: namespace, url, title, index and a type letter (R = redirect,
 * L = link target, D = deleted, A = anything else). Redirects append the
 * redirect index; type A rows append the dirent mime type and the article
 * size, plus cluster details in verbose mode.
 *
 * @param {Object} article - Entry to describe; must provide `getDirent()`.
 * @param {boolean} [extra] - Truthy: append the dirent parameter as hex
 *   bytes, followed by its ZIntStream-decoded values when it is longer than
 *   one byte.
 */
ZimDumper.prototype.listArticleT = function(article, extra) {
  var entry = article.getDirent();
  // Append one more column to a row.
  var tab = function(line, value) {
    return line + '\t' + value;
  };

  var row = tab(entry.getNamespace(), entry.getUrl());
  row = tab(row, entry.getTitle());
  row = tab(row, article.getIndex());

  var letter;
  if (entry.isRedirect()) {
    letter = 'R';
  } else if (entry.isLinktarget()) {
    letter = 'L';
  } else if (entry.isDeleted()) {
    letter = 'D';
  } else {
    letter = 'A';
  }
  row = tab(row, letter);

  if (entry.isRedirect()) {
    row = tab(row, entry.getRedirectIndex());
  } else if (!entry.isLinktarget() && !entry.isDeleted()) {
    row = tab(row, entry.getMimeType());
    row = tab(row, article.getArticleSize());
    if (this.verbose) {
      var cluster = article.getCluster();
      row = tab(row, entry.getClusterNumber());
      row = tab(row, cluster.count());
      row = tab(row, cluster.size());
      row = tab(row, this.file.getClusterOffset(entry.getClusterNumber()));
      row = tab(row, entry.getBlobNumber());
      row = tab(row, cluster.getCompression());
    }
  }

  if (extra) {
    var raw = entry.getParameter();
    var k;
    row += '\t';
    // Every hex byte is followed by a tab (so the list ends with one).
    for (k = 0; k < raw.length; k++) {
      row += raw[k].toString(16) + '\t';
    }
    if (raw.length > 1) {
      var decoded = zim.ZIntStream.fromBuffer(raw);
      for (k = 0; k < decoded.length; k++) {
        row = tab(row, decoded[k]);
      }
    }
  }

  this.log(row);
};
/**
 * Write the data of every entry from the current position onwards to disk.
 *
 * Files are stored as `<directory>/<namespace>/<title>`, where every '/' in
 * the title is replaced by '%2f' so that a title can never introduce extra
 * path components. Directories that already exist are reused.
 *
 * @param {string} directory - Output directory; created when missing.
 * @throws {Error} Any file system error other than "directory already
 *   exists" raised by `fs.mkdirSync` / `fs.writeFileSync`.
 */
ZimDumper.prototype.dumpFiles = function(directory) {
  // Octal 0777 written without the legacy literal, which is a syntax error
  // in strict mode code.
  var dirMode = parseInt('777', 8);

  // Create `dir` unless it already exists as a directory.
  var ensureDir = function(dir) {
    try {
      fs.mkdirSync(dir, dirMode);
    } catch (err) {
      if (err.code !== 'EEXIST' || !fs.statSync(dir).isDirectory()) {
        throw err;
      }
    }
  };

  ensureDir(directory);
  // Namespaces whose directory has been taken care of already.
  var seen = Object.create(null);
  for (var it = this.pos; it.hasNext();) {
    var article = it.next().value;
    var namespace = article.getNamespace();
    var nsDir = path.join(directory, namespace);
    if (!seen[namespace]) {
      ensureDir(nsDir);
      seen[namespace] = true;
    }
    // Global replace: a title may contain any number of slashes.
    var fileName = article.getTitle().replace(/\//g, '%2f');
    fs.writeFileSync(path.join(nsDir, fileName), article.getData().data());
  }
};
/**
 * Run `file.verify()` and log the outcome: 'checksum ok' for a truthy result,
 * 'no checksum' for a falsy one.
 *
 * NOTE(review): a falsy result is reported as "no checksum"; a checksum
 * mismatch is presumably signalled by verify() throwing — confirm against the
 * binding.
 */
ZimDumper.prototype.verifyChecksum = function() {
  var verdict = this.file.verify() ? 'checksum ok' : 'no checksum';
  this.log(verdict);
};
ZimDumper.main = function(args) {
var yargs = require('yargs');
var argv = (args ? yargs(args) : yargs)
.usage('Usage: $0 [options] zimfile')
.demand(1, 1, 'The name of a zimfile is required.')
.option('F', {
describe: 'Print fileinfo',
type: 'boolean',
alias: 'fileinfo',
})
.option('N', {
describe: 'Print info about given namespace',
type: 'string',
requiresArg: true,
nargs: 1,
alias: 'nsinfo',
})
.option('i', {
describe: 'Print info about articles',
type: 'boolean',
alias: 'info',
})
.option('d', {
describe: 'Print data of articles',
type: 'boolean',
alias: 'data',
})
.option('p', {
describe: 'Print page',
type: 'boolean',
alias: 'page',
})
.option('f', {
describe: 'Find article with given title',
type: 'string',
requiresArg: true,
nargs: 1,
alias: 'find',
})
.option('u', {
describe: 'Find article with given url',
type: 'string',
requiresArg: true,
nargs: 1,
alias: 'url',
})
.option('l', {
describe: 'List articles',
type: 'boolean',
alias: 'list',
})
.option('L', {
describe: 'List articles as table',
type: 'boolean',
alias: 'tableList',
})
.option('o', {
describe: 'Find article with given index',
type: 'number',
requiresArg: true,
nargs: 1,
alias: 'indexOffset',
})
.option('x', {
describe: 'Print extra parameters',
type: 'boolean',
alias: 'extra',
})
.option('n', {
describe: 'Specify namespace',
type: 'string',
requiresArg: true,
nargs: 1,
alias: 'ns',
default: 'A',
})
.option('D', {
describe: 'Dump all files into specified directory',
type: 'string',
requiresArg: true,
nargs: 1,
alias: 'dumpAll',
})
.option('v', {
describe: 'Verbose\n' +
'(print uncompressed length of articles when -i is set)\n' +
'(print namespaces with counts with -F)',
type: 'boolean',
alias: 'verbose',
})
.option('Z', {
describe: 'Dump index data',
type: 'boolean',
alias: 'zint',
})
.option('t', {
describe: 'Sort (and find) articles by title instead of url',
type: 'boolean',
alias: 'titleSort',
})
.option('C', {
describe: 'Verify checksum',
type: 'boolean',
alias: 'verifyChecksum',
})
.help('h').alias('h', 'help')
.example('$0 -F wikipedia.zim')
.example('$0 -l wikipedia.zim')
.example('$0 -f Auto -i wikipedia.zim')
.example('$0 -f Auto -d wikipedia.zim')
.example('$0 -f Auto -l wikipedia.zim')
.example('$0 -f Auto -l -i -v wikipedia.zim')
.example('$0 -o 123159 -l -i wikipedia.zim')
.argv;
var app = new ZimDumper(argv._[0], argv.titleSort);
app.setVerbose(argv.verbose);
// global info
if (argv.fileinfo) { app.printInfo(); }
// Namespace info
if (argv.nsinfo) { app.printNsInfo(argv.nsinfo); }
// Locate article
if (argv.indexOffset !== undefined) {
app.locateArticle(argv.indexOffset);
} else if (argv.find !== undefined) {
app.findArticle(argv.ns, argv.find, argv.titleSort);
} else if (argv.url !== undefined) {
app.findArticleByUrl(argv.url);
}
// Dump files
if (argv.dumpAll !== undefined) {
app.dumpFiles(argv.dumpAll);
}
// Print requested info
if (argv.data) {
app.dumpArticle();
} else if (argv.page) {
app.printPage();
} else if (argv.list || argv.tableList) {
app.listArticles(argv.info, argv.tableList, argv.extra);
} else if (argv.info) {
app.listArticle(argv.extra);
} else if (argv.zint) {
app.dumpIndex();
}
if (argv.verifyChecksum) {
app.verifyChecksum();
}
};
if (require.main === module) {
ZimDumper.main();
}