// mwoffliner
// Version:
// MediaWiki ZIM scraper
// 507 lines • 24.4 kB
// JavaScript
// tslint:disable-next-line: no-reference
/// <reference path="./types.d.ts" />
/* ********************************** */
/* MODULE VARIABLE SECTION ********** */
/* ********************************** */
import fs, { readFileSync } from 'fs';
import os from 'os';
import pmap from 'p-map';
import sharp from 'sharp';
import domino from 'domino';
import rimraf from 'rimraf';
import urlParser from 'url';
import semver from 'semver';
import * as path from 'path';
import * as QueryStringParser from 'querystring';
import { ZimArticle, ZimCreator } from '@openzim/libzim';
import { checkApiAvailability } from './util/mw-api.js';
import { MAX_CPU_CORES, MIN_IMAGE_THRESHOLD_ARTICLELIST_PAGE, downloadAndSaveModule, genCanonicalLink, genHeaderCSSLink, genHeaderScript, getAndProcessStylesheets, getDumps, getMediaBase, getRelativeFilePath, getSizeFromUrl, isValidEmail, makeArticleImageTile, makeArticleListItem, mkdirPromise, sanitizeString, saveStaticFiles, importPolyfillModules, extractArticleList, getTmpDirectory, validateMetadata, } from './util/index.js';
import S3 from './S3.js';
import RedisStore from './RedisStore.js';
import * as logger from './Logger.js';
import { Dump } from './Dump.js';
import { config } from './config.js';
import MediaWiki from './MediaWiki.js';
import Downloader from './Downloader.js';
import { getArticleIds } from './util/mw-api.js';
import { articleListHomeTemplate } from './Templates.js';
import { downloadFiles, saveArticles } from './util/saveArticles.js';
import { getCategoriesForArticles, trimUnmirroredPages } from './util/categories.js';
import { fileURLToPath } from 'url';
import ApiURLDirector from './util/builders/url/api.director.js';
import urlHelper from './util/url.helper.js';
// ES modules do not provide __filename/__dirname, so derive them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Parsed package.json of this package (one directory above this file); read once at load time.
const packageJsonText = readFileSync(path.resolve(__dirname, '..', 'package.json'), { encoding: 'utf8' });
const packageJSON = JSON.parse(packageJsonText);
async function execute(argv) {
/* ********************************* */
/* CUSTOM VARIABLE SECTION ********* */
/* ********************************* */
// Pull every supported option out of argv. Underscore-prefixed names are raw
// values that are normalised further down before being used.
const {
    speed: _speed,
    adminEmail,
    verbose,
    minifyHtml,
    keepEmptyParagraphs,
    mwUrl,
    mwWikiPath,
    mwActionApiPath,
    mwRestApiPath,
    mwModulePath,
    mwDomain,
    mwUsername,
    mwPassword,
    requestTimeout,
    customMainPage,
    customZimTitle,
    customZimDescription,
    customZimLongDescription,
    customZimTags,
    customZimLanguage,
    withoutZimFullTextIndex,
    webp,
    format,
    filenamePrefix,
    resume,
    publisher: _publisher,
    outputDirectory: _outputDirectory,
    addNamespaces: _addNamespaces,
    customZimFavicon,
    optimisationCacheUrl,
    customFlavour,
    forceRender,
} = argv;
// These two are coerced to strings below, hence `let`.
let { articleList, articleListToIgnore } = argv;
logger.setVerboseLevel(verbose || 'log'); // Default log level is 'log'
logger.log(`Starting mwoffliner v${packageJSON.version}...`);
// TODO: Move it to a sanitize method
if (articleList) {
    articleList = String(articleList);
}
if (articleListToIgnore) {
    articleListToIgnore = String(articleListToIgnore);
}
const publisher = _publisher || config.defaults.publisher;
// TODO: Move it to a sanitize method
/* The admin email ends up in the HTTP user-agent string, so it must be valid */
if (!isValidEmail(adminEmail)) {
    throw new Error(`Admin email [${adminEmail}] is not valid`);
}
// TODO: Move it to a sanitize method
/* Number of parallel requests. To secure stability and avoid HTTP
429 errors, no more than MAX_CPU_CORES can be considered */
// A missing/zero speed means "factor 1". The factor is converted exactly once
// and must be finite: the coercing global isNaN() let Infinity through, which
// would have produced an unbounded number of parallel requests.
const speedFactor = _speed ? Number(_speed) : 1;
if (!Number.isFinite(speedFactor)) {
    throw new Error('speed is not a number, please give a number value to --speed');
}
const cpuCount = Math.min(os.cpus().length, MAX_CPU_CORES);
// Never drop below one parallel request, whatever the factor.
const speed = Math.max(1, Math.round(cpuCount * speedFactor));
/* Check Node.js version against the range declared in package.json (warning only) */
const nodeVersionSatisfiesPackage = semver.satisfies(process.version, packageJSON.engines.node);
if (!nodeVersionSatisfiesPackage) {
    logger.warn(`***********\n\n\tCurrent node version is [${process.version}]. We recommend [${packageJSON.engines.node}]\n\n***********`);
}
/* Instantiate custom flavour module */
logger.info(`Using custom flavour: ${customFlavour || 'no'}`);
let customProcessor = null;
if (customFlavour) {
    // import() resolves to a module namespace object, which is never
    // constructible; the processor class is its default export.
    const flavourModule = await import(customFlavour);
    const FlavourProcessor = flavourModule.default;
    if (typeof FlavourProcessor !== 'function') {
        throw new Error(`Custom flavour module [${customFlavour}] does not default-export a class`);
    }
    customProcessor = new FlavourProcessor();
}
let s3Obj;
// Check for S3 creds
if (optimisationCacheUrl) {
    // Decompose the url with path and other S3 creds (carried in the query string)
    const s3UrlObj = urlParser.parse(optimisationCacheUrl);
    const queryReader = QueryStringParser.parse(s3UrlObj.query);
    const s3Url = `${s3UrlObj.protocol || 'https:'}//${s3UrlObj.host || ''}${s3UrlObj.pathname || ''}`;
    // requestTimeout is multiplied by 1000 here; when it is unset the product
    // is NaN and the configured default is used instead.
    s3Obj = new S3(s3Url, queryReader, requestTimeout * 1000 || config.defaults.requestTimeout, argv.insecure);
    await s3Obj.initialise();
    logger.log('Successfully logged in S3');
}
// Extract S3 obj to pass to downloader class (empty object when no cache is configured)
const s3 = s3Obj || {};
/* Wikipedia/... URL; Normalize by adding trailing / as necessary */
// Hand the connection settings to the shared MediaWiki object. Properties are
// assigned one after another in the order listed here.
Object.assign(MediaWiki, {
    base: mwUrl,
    getCategories: Boolean(argv.getCategories),
    wikiPath: mwWikiPath,
    actionApiPath: mwActionApiPath,
    restApiPath: mwRestApiPath,
    modulePathOpt: mwModulePath,
    domain: mwDomain,
    password: mwPassword,
    username: mwUsername,
});
/* Download helpers; TODO: Merge with something else / expand this. */
const downloaderOptions = {
    // The admin email is embedded in the user-agent string.
    uaString: `${config.userAgent} (${adminEmail})`,
    speed,
    // Falls back to the configured default when requestTimeout is unset (NaN product).
    reqTimeout: requestTimeout * 1000 || config.defaults.requestTimeout,
    optimisationCacheUrl,
    s3,
    webp,
    insecure: argv.insecure,
};
const downloader = new Downloader(downloaderOptions);
/* perform login */
await MediaWiki.login(downloader);
/* Get MediaWiki Info; nothing can proceed without it, so failures are logged and rethrown */
let mwMetaData;
try {
    mwMetaData = await MediaWiki.getMwMetaData(downloader);
}
catch (metaDataError) {
    logger.error('FATAL - Failed to get MediaWiki Metadata');
    throw metaDataError;
}
// Mandatory ZIM metadata. Custom CLI values take precedence over what the
// wiki itself reports. These are validated here, before any dump is started.
const metaDataRequiredKeys = {
    Creator: mwMetaData.creator,
    Description: customZimDescription || mwMetaData.subTitle,
    Language: customZimLanguage || mwMetaData.langIso3,
    Publisher: publisher,
    Title: customZimTitle || mwMetaData.title,
    'Illustration_48x48@1': await getIllustrationMetadata(downloader),
};
validateMetadata(metaDataRequiredKeys);
// Sanitizing main page: an explicit custom page wins; otherwise use the
// wiki's own main page, unless an article list was given (then it is empty).
let mainPage;
if (customMainPage) {
    mainPage = customMainPage;
    const mainPageUrl = MediaWiki.webUrl + encodeURIComponent(mainPage);
    const mainPageIsReachable = await checkApiAvailability(downloader, mainPageUrl);
    if (!mainPageIsReachable) {
        throw new Error(`customMainPage doesn't return 200 status code for url ${mainPageUrl}`);
    }
}
else {
    mainPage = articleList ? '' : mwMetaData.mainPage;
}
// The wiki's main page is the article id handed to MediaWiki for its API checks.
MediaWiki.apiCheckArticleId = mwMetaData.mainPage;
// Capability probes, run one after another; only the mobile API result is kept locally.
await MediaWiki.hasCoordinates(downloader);
await MediaWiki.hasWikimediaDesktopApi(downloader);
const hasWikimediaMobileApi = await MediaWiki.hasWikimediaMobileApi(downloader);
await MediaWiki.hasRestApi(downloader);
await MediaWiki.hasVisualEditorApi(downloader);
// Connect to Redis and grab the stores used throughout the scrape.
RedisStore.setOptions(argv.redis || config.defaults.redisPath);
await RedisStore.connect();
const { articleDetailXId, filesToDownloadXPath, filesToRetryXPath, redirectsXId } = RedisStore;
// Output directory: absolute paths are used as-is, anything else (including
// the default 'out') is resolved against the current working directory.
const outputDirectory = path.isAbsolute(_outputDirectory || '') ? _outputDirectory : path.join(process.cwd(), _outputDirectory || 'out');
await mkdirPromise(outputDirectory);
logger.log(`Using output directory ${outputDirectory}`);
// Temporary directory
const tmpDirectory = await getTmpDirectory();
logger.log(`Using temporary directory ${tmpDirectory}`);
// 'exit' listeners must do their work synchronously, so this is a plain
// (non-async) function doing a synchronous delete.
process.on('exit', (code) => {
    logger.log(`Exiting with code [${code}]`);
    logger.log(`Deleting temporary directory [${tmpDirectory}]`);
    rimraf.sync(tmpDirectory);
});
// Shared signal handling: close Redis, then exit with the given code.
// The exit happens in `finally` so a failing close cannot leave the process hanging.
const closeAndExit = async (signalName, exitCode) => {
    logger.log(signalName);
    try {
        await RedisStore.close();
    }
    catch (err) {
        logger.error(`Failed to close Redis on ${signalName}`, err);
    }
    finally {
        process.exit(exitCode);
    }
};
process.on('SIGTERM', () => {
    void closeAndExit('SIGTERM', 128 + 15);
});
process.on('SIGINT', () => {
    void closeAndExit('SIGINT', 128 + 2);
});
/* *********************************** */
/*       SYSTEM VARIABLE SECTION       */
/* *********************************** */
const dumpFormats = getDumps(format);
// Comma-separated list of namespace numbers, e.g. "1,2,3".
const addNamespaces = _addNamespaces
    ? String(_addNamespaces)
        .split(',')
        .map((namespace) => Number(namespace))
    : [];
/* ********************************* */
/* GET CONTENT ********************* */
/* ********************************* */
let articleListToIgnoreLines;
if (articleListToIgnore) {
    try {
        articleListToIgnoreLines = await extractArticleList(articleListToIgnore, downloader);
        logger.info(`ArticleListToIgnore has [${articleListToIgnoreLines.length}] items`);
    }
    catch (err) {
        logger.error(`Failed to read articleListToIgnore from [${articleListToIgnore}]`, err);
        throw err;
    }
}
let articleListLines;
if (articleList) {
    try {
        articleListLines = await extractArticleList(articleList, downloader);
        if (articleListToIgnore) {
            // Set lookup keeps the filtering linear; Array#includes inside
            // filter() was quadratic for large lists.
            const ignoredTitles = new Set(articleListToIgnoreLines);
            articleListLines = articleListLines.filter((title) => !ignoredTitles.has(title));
        }
        logger.info(`ArticleList has [${articleListLines.length}] items`);
    }
    catch (err) {
        logger.error(`Failed to read articleList from [${articleList}]`, err);
        throw err;
    }
}
await MediaWiki.getNamespaces(addNamespaces, downloader);
logger.info('Getting article ids');
// `stime` is reused by doDump() for its own timing, hence `let`.
let stime = Date.now();
await getArticleIds(downloader, mainPage, articleList ? articleListLines : null, articleListToIgnore ? articleListToIgnoreLines : null);
logger.log(`Got ArticleIDs in ${(Date.now() - stime) / 1000} seconds`);
if (MediaWiki.getCategories) {
    await getCategoriesForArticles(articleDetailXId, downloader);
    while ((await trimUnmirroredPages(downloader)) > 0) {
        // Remove unmirrored pages, categories, subCategories
        // trimUnmirroredPages returns number of modified articles
    }
    // while ((await simplifyGraph(downloader, redisStore)).deletedNodes !== 0) {
    //   // keep simplifying graph
    // }
    // await trimUnmirroredPages(downloader); // TODO: improve simplify graph to remove the need for a second trim
}
// Year-month stamp (YYYY-MM) handed to every dump as filenameDate.
const filenameDate = new Date().toISOString().slice(0, 7);
// Getting total number of articles from Redis
logger.log(`Total articles found in Redis: ${await articleDetailXId.len()}`);
// One Dump per requested format, processed one after another.
const dumps = [];
for (const dumpFormat of dumpFormats) {
    const dump = new Dump(dumpFormat, {
        tmpDir: tmpDirectory,
        username: mwUsername,
        password: mwPassword,
        outputDirectory,
        mainPage,
        filenamePrefix,
        articleList,
        publisher,
        customZimDescription,
        customZimLongDescription,
        customZimTags,
        customZimTitle,
        customZimLanguage,
        withoutZimFullTextIndex,
        resume,
        minifyHtml,
        keepEmptyParagraphs,
        tags: customZimTags,
        filenameDate,
    }, { ...mwMetaData, mainPage }, customProcessor);
    dumps.push(dump);
    logger.log('Doing dump');
    // checkResume() signals "skip this dump" by throwing; surface the reason
    // instead of discarding it so unexpected failures stay visible.
    let shouldSkip = false;
    try {
        dump.checkResume();
    }
    catch (err) {
        shouldSkip = true;
        logger.info(`Resume check asked to skip this dump: ${err && err.message ? err.message : err}`);
    }
    if (shouldSkip) {
        logger.log('Skipping dump');
    }
    else {
        // Errors propagate to the caller unchanged (the former catch only
        // held a leftover `debugger` statement before rethrowing).
        await doDump(dump);
        await filesToDownloadXPath.flush();
        logger.log('Finished dump');
    }
}
logger.log('Closing HTTP agents...');
logger.log('All dumping(s) finished with success.');
/**
 * Produces the ZIM file for a single dump: metadata, stylesheets, favicon,
 * main page, articles, static files, JS/CSS modules, media files and
 * redirects, in that order, then finalises the archive.
 *
 * @param dump - the Dump instance describing what to write and where.
 */
async function doDump(dump) {
    const outZim = path.resolve(dump.opts.outputDirectory, `${dump.computeFilenameRadical()}.zim`);
    logger.log(`Writing zim to [${outZim}]`);
    dump.outFile = outZim;

    // Required keys first, then the per-dump values; LongDescription only when provided.
    const metadata = {
        ...metaDataRequiredKeys,
        Tags: dump.computeZimTags(),
        Name: dump.computeFilenameRadical(false, true, true),
        Flavour: dump.computeFlavour(),
    };
    if (dump.opts.customZimLongDescription) {
        metadata.LongDescription = `${dump.opts.customZimLongDescription}`;
    }
    validateMetadata(metadata);

    const creatorOptions = {
        fileName: outZim,
        // An empty language is passed when the full-text index was opted out of.
        fullTextIndexLanguage: dump.opts.withoutZimFullTextIndex ? '' : dump.mwMetaData.langIso3,
        welcome: dump.opts.mainPage || 'index',
        compression: 'zstd',
    };
    const zimCreator = new ZimCreator(creatorOptions, metadata);

    // Record which scraper version produced the file.
    zimCreator.addArticle(new ZimArticle({
        ns: 'M',
        data: `mwoffliner ${packageJSON.version}`,
        url: 'Scraper',
    }));

    logger.info('Finding stylesheets to download');
    const stylesheetsToGet = await dump.getRelevantStylesheetUrls(downloader);
    logger.log(`Found [${stylesheetsToGet.length}] stylesheets to download`);
    logger.log('Downloading stylesheets and populating media queue');
    const { finalCss } = await getAndProcessStylesheets(downloader, stylesheetsToGet);
    logger.log('Downloaded stylesheets');
    const styleArticle = new ZimArticle({ url: `${config.output.dirs.mediawiki}/style.css`, data: finalCss, ns: '-' });
    zimCreator.addArticle(styleArticle);

    await saveFavicon(zimCreator, metaDataRequiredKeys['Illustration_48x48@1']);
    await getThumbnailsData();

    logger.log('Getting Main Page');
    await getMainPage(dump, zimCreator);

    logger.log('Getting articles');
    stime = Date.now();
    const savedArticles = await saveArticles(zimCreator, downloader, dump, hasWikimediaMobileApi, forceRender);
    const { jsModuleDependencies, cssModuleDependencies, staticFilesList } = savedArticles;
    logger.log(`Fetching Articles finished in ${(Date.now() - stime) / 1000} seconds`);
    logger.log(`Found [${jsModuleDependencies.size}] js module dependencies`);
    logger.log(`Found [${cssModuleDependencies.size}] style module dependencies`);

    logger.info('Copying Static Resource Files');
    await saveStaticFiles(staticFilesList, zimCreator);

    // Snapshot both dependency collections as arrays before anything else runs.
    const dependencyGroups = [
        { type: 'js', moduleList: Array.from(jsModuleDependencies) },
        { type: 'css', moduleList: Array.from(cssModuleDependencies) },
    ];

    if (downloader.webp) {
        logger.log('Downloading polyfill module');
        await importPolyfillModules(downloader, zimCreator);
    }

    logger.log('Downloading module dependencies');
    // Both groups are started together; within a group, downloads are capped at downloader.speed.
    const groupDownloads = dependencyGroups.map(({ type, moduleList }) => pmap(
        moduleList,
        (oneModule) => downloadAndSaveModule(zimCreator, downloader, dump, oneModule, type),
        { concurrency: downloader.speed },
    ));
    await Promise.all(groupDownloads);

    await downloadFiles(filesToDownloadXPath, filesToRetryXPath, zimCreator, dump, downloader);

    logger.log('Writing Article Redirects');
    await writeArticleRedirects(downloader, dump, zimCreator);

    logger.log('Finishing Zim Creation');
    await zimCreator.finalise();
    logger.log('Summary of scrape actions:', JSON.stringify(dump.status, null, '\t'));
}
/* ********************************* */
/* FUNCTIONS *********************** */
/* ********************************* */
/**
 * Writes one redirect entry into the ZIM for every stored redirect whose id
 * differs from its target, counting each in dump.status.redirects.written.
 *
 * @param downloader - only its `speed` is used, passed to iterateItems.
 * @param dump - dump whose redirect counter is incremented.
 * @param zimCreator - receives the redirect articles.
 */
async function writeArticleRedirects(downloader, dump, zimCreator) {
    const writeBatch = async (redirects) => {
        Object.entries(redirects).forEach(([redirectId, { targetId }]) => {
            // A redirect pointing at itself is skipped.
            if (redirectId === targetId) {
                return;
            }
            zimCreator.addArticle(new ZimArticle({
                url: redirectId,
                shouldIndex: true,
                data: '',
                ns: 'A',
                mimeType: 'text/html',
                // We fake a title, by just removing the underscores
                title: String(redirectId).replace(/_/g, ' '),
                redirectUrl: targetId,
            }));
            dump.status.redirects.written += 1;
        });
    };
    await redirectsXId.iterateItems(downloader.speed, writeBatch);
}
/**
 * Produces the PNG used as ZIM illustration (at most 48x48, never enlarged).
 *
 * The source image is the custom favicon when one was given (remote URL or
 * local file), otherwise the logo advertised by the wiki's site info.
 *
 * @param downloader - used for the remote favicon, the site info and the logo.
 * @returns Promise resolving to the PNG data produced by sharp.
 * @throws Error when the favicon cannot be fetched/read/processed, or when
 *         the wiki exposes no logo and no custom favicon was given.
 */
async function getIllustrationMetadata(downloader) {
    // Single sharp pipeline shared by the favicon and site-logo paths.
    const toIllustration = (imageContent) => sharp(imageContent).resize(48, 48, { fit: sharp.fit.inside, withoutEnlargement: true }).png().toBuffer();
    if (customZimFavicon) {
        // Only a real http(s) URL counts as remote; a plain substring test
        // would misclassify local paths that merely contain "http".
        const faviconIsRemote = /^https?:\/\//i.test(customZimFavicon);
        let content;
        if (faviconIsRemote) {
            logger.log(`Downloading remote zim favicon from [${customZimFavicon}]`);
            try {
                const response = await downloader.request({ url: customZimFavicon, method: 'GET', ...downloader.arrayBufferRequestOptions });
                content = response.data;
            }
            catch (err) {
                throw new Error(`Failed to download custom zim favicon from [${customZimFavicon}]`, { cause: err });
            }
        }
        else {
            try {
                content = fs.readFileSync(customZimFavicon);
            }
            catch (err) {
                throw new Error(`Failed to read custom zim favicon from [${customZimFavicon}]`, { cause: err });
            }
        }
        try {
            // `await` is required here: without it a rejected sharp promise
            // escapes the try block and the catch below never runs.
            return await toIllustration(content);
        }
        catch (err) {
            throw new Error('Failed to read or process IllustrationMetadata using sharp', { cause: err });
        }
    }
    const apiUrlDirector = new ApiURLDirector(MediaWiki.actionApiUrl.href);
    const body = await downloader.getJSON(apiUrlDirector.buildSiteInfoURL());
    const entries = body.query.general;
    if (!entries.logo) {
        throw new Error(`********\nNo site Logo Url. Expected a string, but got [${entries.logo}].\n\nPlease try specifying a customZimFavicon (--customZimFavicon=./path/to/your/file.ico)\n********`);
    }
    // A logo URL without its own protocol gets the wiki base URL's protocol prepended.
    const parsedUrl = urlParser.parse(entries.logo);
    const logoUrl = parsedUrl.protocol ? entries.logo : MediaWiki.baseUrl.protocol + entries.logo;
    const { content } = await downloader.downloadContent(logoUrl, 'image');
    return toIllustration(content);
}
/**
 * Adds the illustration as the `favicon` entry (namespace '-') of the ZIM.
 *
 * @param zimCreator - receives the favicon article.
 * @param data - PNG content of the favicon.
 * @returns whatever zimCreator.addArticle resolves to.
 * @throws Error('Failed to save favicon') when creating or adding the article fails.
 */
async function saveFavicon(zimCreator, data) {
    logger.log('Saving favicon.png...');
    try {
        const article = new ZimArticle({ url: 'favicon', mimeType: 'image/png', data, ns: '-' });
        // `await` keeps an asynchronous addArticle failure inside this try
        // block; a bare `return` would let the rejection bypass the catch.
        return await zimCreator.addArticle(article);
    }
    catch (err) {
        throw new Error('Failed to save favicon', { cause: err });
    }
}
/**
 * Adds the `index` entry of the ZIM: a redirect to the configured main page
 * when there is one, otherwise a generated page listing the selected articles.
 *
 * @param dump - supplies wiki metadata and options for the generated page.
 * @param zimCreator - receives the resulting article.
 * @returns the result of zimCreator.addArticle (wrapped in a promise for the generated page).
 */
function getMainPage(dump, zimCreator) {
    // Generated landing page: image tiles when enough articles have a
    // thumbnail, a plain list of all collected articles otherwise.
    const buildArticleListPage = async () => {
        logger.log('Creating main page...');
        const cssMobile = genHeaderCSSLink(config, 'mobile_main_page', dump.mwMetaData.mainPage);
        const cssStyle = genHeaderCSSLink(config, 'style', dump.mwMetaData.mainPage);
        const jsImagesLoaded = genHeaderScript(config, 'images_loaded.min', dump.mwMetaData.mainPage);
        const jsMasonry = genHeaderScript(config, 'masonry.min', dump.mwMetaData.mainPage);
        const jsArticleList = genHeaderScript(config, 'article_list_home', dump.mwMetaData.mainPage);
        const canonical = genCanonicalLink(config, dump.mwMetaData.webUrl, dump.mwMetaData.mainPage);
        // Everything is injected right before the template's closing head tag.
        const headInjection = `${cssMobile}\n${cssStyle}\n${jsImagesLoaded}\n${jsMasonry}\n${jsArticleList}\n${canonical}\n\n</head>`;
        const doc = domino.createDocument(articleListHomeTemplate.replace('</head>', headInjection));
        // NOTE(review): the wiki title wins over customZimTitle here, while the
        // ZIM "Title" metadata prefers customZimTitle — confirm this is intended.
        doc.querySelector('title').innerHTML = sanitizeString(dump.mwMetaData.title) || sanitizeString(dump.opts.customZimTitle);
        const withImages = [];
        const everyArticle = [];
        for (const articleId of articleListLines) {
            const detail = await articleDetailXId.get(articleId);
            if (!detail) {
                continue;
            }
            everyArticle.push(detail);
            if (!(detail.thumbnail && detail.internalThumbnailUrl)) {
                continue;
            }
            withImages.push(detail);
            // Stop scanning once 100 illustrated articles were collected.
            if (withImages.length >= 100) {
                break;
            }
        }
        const useTiles = withImages.length > MIN_IMAGE_THRESHOLD_ARTICLELIST_PAGE;
        if (useTiles) {
            const tilesHtml = withImages.map((entry) => makeArticleImageTile(dump, entry)).join('\n');
            doc.body.innerHTML = `<div id='container'><div id='content'>${tilesHtml}</div></div>`;
        }
        else {
            const listHtml = everyArticle.map((entry) => makeArticleListItem(dump, entry)).join('\n');
            doc.body.innerHTML = `<ul id='list'>${listHtml}</ul>`;
        }
        /* Write the static html file */
        return zimCreator.addArticle(new ZimArticle({
            url: 'index',
            data: doc.documentElement.outerHTML,
            ns: 'A',
            mimeType: 'text/html',
            title: 'Main Page',
        }));
    };
    // `index` simply redirects to the configured main page.
    const buildRedirect = () => {
        logger.log(`Create main page redirection from [index] to [${'A/' + mainPage}]`);
        return zimCreator.addArticle(new ZimArticle({
            url: 'index',
            shouldIndex: true,
            data: '',
            ns: 'A',
            mimeType: 'text/html',
            title: mainPage,
            redirectUrl: mainPage,
        }));
    };
    if (mainPage) {
        return buildRedirect();
    }
    return buildArticleListPage();
}
/**
 * Looks up the stored detail record of one article.
 *
 * @param articleId - key of the article in the article-detail store.
 * @returns Promise of whatever the store holds for that id.
 */
async function fetchArticleDetail(articleId) {
    const storedDetail = await articleDetailXId.get(articleId);
    return storedDetail;
}
/**
 * Points an article at a 500px rendition of its thumbnail: rewrites the width
 * segment of the thumbnail URL, queues that file for download and stores the
 * updated article detail (with `internalThumbnailUrl` set).
 *
 * @param articleDetail - detail record; must carry `thumbnail.source`. Mutated in place.
 * @param articleId - key under which the detail is stored.
 */
async function updateArticleThumbnail(articleDetail, articleId) {
    const { source: originalUrl } = articleDetail.thumbnail;
    const { width: oldWidth } = getSizeFromUrl(originalUrl);
    // The width appears either as "/<w>px-" or "-<w>px-"; each form is replaced once.
    const suitableResUrl = originalUrl
        .replace(`/${oldWidth}px-`, '/500px-')
        .replace(`-${oldWidth}px-`, '-500px-');
    const { mult, width } = getSizeFromUrl(suitableResUrl);
    const mediaPath = getMediaBase(suitableResUrl, false);
    articleDetail.internalThumbnailUrl = getRelativeFilePath('Main_Page', getMediaBase(suitableResUrl, true), 'I');
    const queueDownload = filesToDownloadXPath.set(mediaPath, { url: urlHelper.serializeUrl(suitableResUrl), mult, width, kind: 'image' });
    const storeDetail = articleDetailXId.set(articleId, articleDetail);
    await Promise.all([queueDownload, storeDetail]);
}
/**
 * Prepares thumbnails for the generated article-list main page.
 *
 * Does nothing when a custom main page is set, when no article list was
 * given, or when the list is not longer than
 * MIN_IMAGE_THRESHOLD_ARTICLELIST_PAGE. Otherwise walks the list in order
 * and updates the thumbnail of each article that has one, stopping after
 * 100 successful updates. A failure on one article is logged and skipped.
 */
async function getThumbnailsData() {
    if (customMainPage || !articleList || articleListLines.length <= MIN_IMAGE_THRESHOLD_ARTICLELIST_PAGE) {
        return;
    }
    logger.log('Updating article thumbnails for articles');
    // Upper bound on articles whose thumbnail gets prepared.
    const maxArticlesWithImages = 100;
    let articlesWithImages = 0;
    for (const articleId of articleListLines) {
        if (articlesWithImages >= maxArticlesWithImages) {
            break;
        }
        try {
            const articleDetail = await fetchArticleDetail(articleId);
            if (!articleDetail || !articleDetail.thumbnail) {
                continue;
            }
            await updateArticleThumbnail(articleDetail, articleId);
            articlesWithImages++;
        }
        catch (err) {
            // Best effort: keep going, but pass the error along so the cause is not lost.
            logger.warn(`Failed to parse thumbnail for [${articleId}], skipping...`, err);
        }
    }
}
MediaWiki.reset();
RedisStore.close();
return dumps;
}
export { execute };
//# sourceMappingURL=mwoffliner.lib.js.map