// mwoffliner — MediaWiki ZIM scraper
// Version: (not specified)
// 190 lines • 7.27 kB
// JavaScript
import * as pathParser from 'path';
import * as urlParser from 'url';
import { existsSync } from 'fs';
import * as domino from 'domino';
import * as logger from './Logger.js';
import { getStringsForLang } from './util/index.js';
import WebURLDirector from './util/builders/url/web.director.js';
import MediaWiki from './MediaWiki.js';
/**
 * Configuration and bookkeeping for a single dump of a MediaWiki site.
 *
 * An instance stores the parsed `--format` flags, the raw options object and
 * the MediaWiki metadata, and offers helpers that derive the output file
 * name, the ZIM tag list and the list of stylesheets to fetch.
 */
export class Dump {
  /** Custom processor handed to the constructor; stored untouched. */
  customProcessor;
  /** True when the format string contains `nopic`. */
  nopic;
  /** True when the format string contains `novid`. */
  novid;
  /** True when the format string contains `nopdf`. */
  nopdf;
  /** True when the format string contains `nodet`. */
  nodet;
  /** Options object handed to the constructor; stored untouched. */
  opts;
  /** Result of `getStringsForLang()` for the wiki language (fallback `en`). */
  strings;
  /** MediaWiki metadata object handed to the constructor. */
  mwMetaData;
  /** Declared here but never assigned inside this class. */
  outFile;
  /** Declared here but never assigned inside this class. */
  mediaQueue;
  /**
   * Tells whether `articleId` is the main page of the wiki.
   *
   * @param {string} articleId - Article identifier to compare.
   * @returns {boolean} True when it strictly equals `mwMetaData.mainPage`.
   */
  isMainPage = (articleId) => this.mwMetaData.mainPage === articleId;
  /** Counters, all starting at zero; this class itself never updates them. */
  status = {
    files: {
      success: 0,
      fail: 0,
    },
    articles: {
      success: 0,
      fail: 0,
    },
    redirects: {
      written: 0,
    },
  };
  /** Text after the first `:` of the format (undefined when there is none). */
  formatFlavour;

  /**
   * @param {string} format - Format spec, `<flags>[:<flavour>]`. The flags
   *   part is searched for the substrings `nopic`, `novid`, `nopdf`, `nodet`.
   * @param {object} opts - Dump options (read lazily by the other methods).
   * @param {object} mwMetaData - MediaWiki metadata; `langIso2` is read here.
   * @param {*} customProcessor - Stored as-is on the instance.
   */
  constructor(format, opts, mwMetaData, customProcessor) {
    this.mwMetaData = mwMetaData;
    this.opts = opts;
    this.customProcessor = customProcessor;

    const [formatStr, formatFlavour] = format.split(':');
    this.nopic = formatStr.includes('nopic');
    this.novid = formatStr.includes('novid');
    this.nopdf = formatStr.includes('nopdf');
    this.nodet = formatStr.includes('nodet');
    this.formatFlavour = formatFlavour;

    /* Get language specific strings */
    this.strings = getStringsForLang(mwMetaData.langIso2 || 'en', 'en');
  }

  /**
   * Builds the flavour part of the file name.
   *
   * An explicit flavour (given after `:` in the format) wins. Otherwise at
   * most one of `nopic`, `nopdf`, `novid` is used (in that precedence, and
   * `novid` only when `nodet` is not set), followed by `nodet` when set.
   *
   * @returns {string} Flavour parts joined with `_`; may be empty.
   */
  computeFlavour() {
    if (typeof this.formatFlavour === 'string') {
      return this.formatFlavour;
    }

    const parts = [];
    if (this.nopic) {
      parts.push('nopic');
    } else if (this.nopdf) {
      parts.push('nopdf');
    } else if (this.novid && !this.nodet) {
      parts.push('novid');
    }
    if (this.nodet) {
      parts.push('nodet');
    }
    return parts.join('_');
  }

  /**
   * Builds the output file name without extension.
   *
   * Layout: `<prefix>[_<selection>][_<flavour>][_<date>]`, where `<prefix>`
   * is `opts.filenamePrefix` when set, otherwise `<creator>_<lang>` with the
   * first letter of the creator lower-cased.
   *
   * @param {boolean} [withoutSelection] - Skip the selection part.
   * @param {boolean} [withoutFlavour] - Skip the flavour part.
   * @param {boolean} [withoutDate] - Skip the `opts.filenameDate` part.
   * @returns {string} The file name radical.
   */
  computeFilenameRadical(withoutSelection, withoutFlavour, withoutDate) {
    const { filenamePrefix, articleList } = this.opts;
    let radical;

    if (filenamePrefix) {
      radical = filenamePrefix;
    } else {
      const { creator } = this.mwMetaData;
      radical = `${creator.charAt(0).toLowerCase()}${creator.slice(1)}_`;

      /* Prefer the ISO 639-3 code when it appears as a label of the web host name */
      const hostParts = urlParser.parse(this.mwMetaData.webUrl).hostname.split('.');
      const { langIso2, langIso3 } = this.mwMetaData;
      radical += hostParts.includes(langIso3) ? langIso3 : langIso2;
    }

    /* The selection part is only added to automatically generated prefixes */
    if (!withoutSelection && !filenamePrefix) {
      if (articleList) {
        const selection = pathParser
          .basename(articleList)
          .toLowerCase()
          .replace(/\.\w{3}$/, '')
          .replace(/[\,\s]/g, '_')
          .slice(0, 50);
        radical += `_${selection}`;
      } else {
        radical += '_all';
      }
    }

    if (!withoutFlavour) {
      const flavour = this.computeFlavour();
      if (flavour) {
        radical += `_${flavour}`;
      }
    }

    if (!withoutDate) {
      radical += `_${this.opts.filenameDate}`;
    }
    return radical;
  }

  /**
   * Guards against re-creating an existing ZIM file when `opts.resume` is set.
   *
   * @throws {Error} 'TODO: IMPLEMENT RESUME' when resuming is requested and
   *   the target ZIM file already exists (resuming is not implemented).
   */
  checkResume() {
    if (!this.opts.resume) {
      return;
    }
    const zimPath = this.computeZimRootPath();
    if (existsSync(zimPath)) {
      logger.log(`${zimPath} is already done, skip dumping & ZIM file generation`);
      throw new Error('TODO: IMPLEMENT RESUME');
    }
  }

  /**
   * Builds the `;`-separated ZIM tag list.
   *
   * Starts from `opts.tags`, then appends (skipping tags already present) the
   * host name radical, an optional `_category:` tag for well-known sites, the
   * format related tags and the `_ftindex` tag. Empty tags are dropped.
   *
   * @returns {string} Tags joined with `;`.
   */
  computeZimTags() {
    /* Split tags in a list */
    const tags = (this.opts.tags || '').split(';');

    /* Add tag and avoid duplicates */
    const addTagWithoutDuplicate = (newTag) => {
      if (!tags.includes(newTag)) {
        tags.push(newTag);
      }
    };

    /* Add Mediawiki hostname radical as a tag. `hostname` (not `host`) is
       used so that an explicit port never leaks into the tag. */
    const hostParts = urlParser.parse(this.mwMetaData.baseUrl).hostname.split('.');
    const hostRadical = hostParts.length > 1 ? hostParts[hostParts.length - 2] : hostParts[hostParts.length - 1];
    const hostTag = hostRadical.toLowerCase();
    addTagWithoutDuplicate(hostTag);

    /* Famous Web sites have their own hidden category */
    if (/^(gutenberg|phet|psiram|stack_exchange|ted|vikidia|wikibooks|wikinews|wikipedia|wikiquote|wikisource|wikiversity|wikivoyage|wiktionary)$/.test(hostTag)) {
      addTagWithoutDuplicate('_category:' + hostTag);
    }

    /* Add --format tags */
    if (this.nopic) {
      addTagWithoutDuplicate('_pictures:no');
      addTagWithoutDuplicate('_videos:no');
    } else if (this.novid) {
      addTagWithoutDuplicate('_pictures:yes');
      addTagWithoutDuplicate('_videos:no');
    }
    if (this.nodet) {
      addTagWithoutDuplicate('_videos:no');
      addTagWithoutDuplicate('_details:no');
    } else {
      addTagWithoutDuplicate('_details:yes');
    }

    /* Add proper _ftindex tag */
    addTagWithoutDuplicate('_ftindex:' + (this.opts.withoutZimFullTextIndex ? 'no' : 'yes'));

    /* Remove empty tags */
    return tags.filter((tag) => tag).join(';');
  }

  /**
   * Builds the full path of the ZIM file inside `opts.outputDirectory`.
   *
   * A directory starting with `/` is used as given, any other one is resolved
   * against the current working directory. A `/` is inserted between the
   * directory and the file name when the directory does not already end with
   * one, so `/tmp/out` and `/tmp/out/` both yield `/tmp/out/<name>.zim`.
   *
   * @returns {string} Path of the ZIM file.
   */
  computeZimRootPath() {
    const { outputDirectory } = this.opts;
    const directory = outputDirectory[0] === '/' ? outputDirectory : pathParser.resolve(process.cwd(), outputDirectory);
    const separator = directory.endsWith('/') ? '' : '/';
    return `${directory}${separator}${this.computeFilenameRadical()}.zim`;
  }

  /**
   * Collects the stylesheets needed to render the wiki offline.
   *
   * Downloads `mwMetaData.webUrl`, parses it and keeps every
   * `<link rel="stylesheet">` whose `href` is set and does not start with
   * `data`. The raw URL of `Mediawiki:offline.css` is appended when
   * `downloader.canGetUrl()` reports it as reachable.
   *
   * NOTE: the returned array mixes `<link>` DOM elements (from the page) with
   * the value returned by `buildArticleRawURL()` (presumably a URL string —
   * confirm against WebURLDirector); callers must handle both.
   *
   * @param {object} downloader - Must provide `downloadContent(url, 'data')`
   *   resolving to `{ content }` and `canGetUrl(url)`.
   * @returns {Promise<Array>} Link elements followed by the optional URL.
   */
  async getRelevantStylesheetUrls(downloader) {
    // TODO: consider moving to Downloader
    /* Load main page to see which CSS files are needed */
    const { content } = await downloader.downloadContent(this.mwMetaData.webUrl, 'data');
    const doc = domino.createDocument(content.toString());

    /* Go through all CSS links */
    const sheetUrls = Array.from(doc.getElementsByTagName('link')).filter((link) => {
      const href = link.getAttribute('href');
      return link.getAttribute('rel') === 'stylesheet' && href && !href.startsWith('data');
    });

    /* Push Mediawiki:Offline.css (at the end) */
    // TODO: Weak URL (might fail in a number of cases where the wiki path is not like on Wikipedia)
    const webUrlDirector = new WebURLDirector(MediaWiki.webUrl.href);
    const offlineCssUrl = webUrlDirector.buildArticleRawURL('Mediawiki:offline.css');
    if (await downloader.canGetUrl(offlineCssUrl)) {
      sheetUrls.push(offlineCssUrl);
    }
    return sheetUrls.filter((entry) => entry);
  }
}
//# sourceMappingURL=Dump.js.map