// mwoffliner
// Version:
// MediaWiki ZIM scraper
// 506 lines • 24.5 kB
// JavaScript
// eslint-disable-next-line @typescript-eslint/triple-slash-reference
/// <reference path="./types.d.ts" />
/* ********************************** */
/* MODULE VARIABLE SECTION ********** */
/* ********************************** */
import fs, { readFileSync } from 'fs';
import os from 'os';
import pmap from 'p-map';
import sharp from 'sharp';
import domino from 'domino';
import { rimraf } from 'rimraf';
import { fileURLToPath } from 'url';
import semver from 'semver';
import * as path from 'path';
import { Blob, Compression, Creator, StringItem } from '@openzim/libzim';
import { checkApiAvailability, getArticleIds } from './util/mw-api.js';
import { MAX_CPU_CORES, MIN_IMAGE_THRESHOLD_ARTICLELIST_PAGE, downloadAndSaveModule, genCanonicalLink, genHeaderCSSLink, genHeaderScript, getDumps, getMediaBase, getRelativeFilePath, getSizeFromUrl, isValidEmail, makeArticleImageTile, makeArticleListItem, mkdirPromise, sanitizeString, saveStaticFiles, addWebpJsScripts, extractArticleList, getTmpDirectory, validateMetadata, truncateUtf8Bytes, } from './util/index.js';
import S3 from './S3.js';
import RedisStore from './RedisStore.js';
import * as logger from './Logger.js';
import { Dump } from './Dump.js';
import { config } from './config.js';
import MediaWiki from './MediaWiki.js';
import Downloader from './Downloader.js';
import RenderingContext from './renderers/rendering.context.js';
import { articleListHomeTemplate } from './Templates.js';
import { downloadFiles, saveArticles } from './util/saveArticles.js';
import { getCategoriesForArticles, trimUnmirroredPages } from './util/categories.js';
import ApiURLDirector from './util/builders/url/api.director.js';
import urlHelper from './util/url.helper.js';
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const packageJSON = JSON.parse(readFileSync(path.join(__dirname, '../package.json'), 'utf8'));
/**
 * Runs one complete scrape: validates the options, configures the shared
 * MediaWiki / Downloader / RedisStore objects, collects the article IDs and
 * then builds one ZIM file per requested dump format.
 *
 * @param {object} argv - Parsed options; the keys read here are listed in the
 *   destructuring below (plus `getCategories`, `insecure` and `redis`, which
 *   are read directly from `argv` further down).
 * @returns {Promise<Array>} The `Dump` instances created, one per dump format.
 * @throws {Error} If the admin email is invalid, `speed` is not numeric, or any
 *   later step (login, metadata, article lists, ZIM creation) fails.
 */
async function execute(argv) {
/* ********************************* */
/* CUSTOM VARIABLE SECTION ********* */
/* ********************************* */
// Options renamed with a leading `_` are raw inputs; the normalized value is
// stored under the un-prefixed name further down.
const { speed: _speed, adminEmail, verbose, minifyHtml, keepEmptyParagraphs, mwUrl, mwWikiPath, mwIndexPhpPath, mwActionApiPath, mwRestApiPath, mwModulePath, mwDomain, mwUsername, mwPassword, requestTimeout, customMainPage, customZimTitle, customZimDescription, customZimLongDescription, customZimTags, customZimLanguage, withoutZimFullTextIndex, webp, format, filenamePrefix, resume, publisher: _publisher, outputDirectory: _outputDirectory, addNamespaces: _addNamespaces, customZimFavicon, optimisationCacheUrl, customFlavour, forceRender, } = argv;
// These two are reassigned below (coerced to strings), hence `let`.
let { articleList, articleListToIgnore } = argv;
logger.setVerboseLevel(verbose ? verbose : 'log'); // Default log level is 'log'
logger.log(`Starting mwoffliner v${packageJSON.version}...`);
// TODO: Move it to the sanitize method
// Coerce the list options to strings when they are set.
if (articleList)
articleList = String(articleList);
if (articleListToIgnore)
articleListToIgnore = String(articleListToIgnore);
// Fall back to the configured default publisher when none was given.
const publisher = _publisher || config.defaults.publisher;
// TODO: Move it to the sanitize method
/* The admin email becomes part of the HTTP user-agent string (see Downloader.init below) */
if (!isValidEmail(adminEmail)) {
throw new Error(`Admin email [${adminEmail}] is not valid`);
}
// TODO: Move it to the sanitize method
/* Number of parallel requests. To secure stability and avoid HTTP
429 errors, no more than MAX_CPU_CORES can be considered */
// The global isNaN() coerces its argument, so a numeric string such as "2" is accepted.
if (_speed && isNaN(_speed)) {
throw new Error('speed is not a number, please give a number value to --speed');
}
const cpuCount = Math.min(os.cpus().length, MAX_CPU_CORES);
// `_speed` acts as a multiplier on the (capped) CPU count; the result is never below 1.
const speed = Math.max(1, Math.round(cpuCount * (_speed || 1)));
/* Check Node.js version */
// A mismatch with package.json `engines.node` only produces a warning; the run continues.
const nodeVersionSatisfiesPackage = semver.satisfies(process.version, packageJSON.engines.node);
if (!nodeVersionSatisfiesPackage) {
logger.warn(`***********\n\n\tCurrent node version is [${process.version}]. We recommend [${packageJSON.engines.node}]\n\n***********`);
}
/* Instantiate custom flavour module */
logger.info(`Using custom flavour: ${customFlavour || 'no'}`);
let customProcessor = null;
if (customFlavour) {
  // `import()` resolves to a module namespace object, which can never be used
  // with `new`. The flavour class is the module's default export (for a
  // CommonJS module, `default` is its `module.exports`).
  const flavourModule = await import(customFlavour);
  const CustomFlavour = flavourModule.default;
  if (typeof CustomFlavour !== 'function') {
    throw new TypeError(`Custom flavour module [${customFlavour}] must default-export a class`);
  }
  customProcessor = new CustomFlavour();
}
let s3Obj;
// Check for S3 creds
if (optimisationCacheUrl) {
  // Decompose the url with path and other S3 creds: the credentials travel in
  // the query string, everything before it is the endpoint/bucket URL.
  const s3UrlObj = new URL(optimisationCacheUrl);
  const s3Url = (s3UrlObj.protocol || 'https:') + '//' + (s3UrlObj.host || '') + (s3UrlObj.pathname || '');
  s3Obj = new S3(s3Url, s3UrlObj.searchParams, requestTimeout * 1000 || config.defaults.requestTimeout, argv.insecure);
  // A failed initialisation rejects here and aborts the run.
  await s3Obj.initialise();
  logger.log('Successfully logged in S3');
}
// Extract S3 obj to pass to downloader class (an empty object when no cache is configured)
const s3 = s3Obj ? s3Obj : {};
/* Wikipedia/... URL: point the shared MediaWiki object at the target wiki.
   NOTE(review): an earlier comment mentioned trailing-slash normalization; none
   happens in this file - presumably done inside the MediaWiki setters, confirm. */
MediaWiki.base = mwUrl;
MediaWiki.getCategories = !!argv.getCategories;
MediaWiki.wikiPath = mwWikiPath;
MediaWiki.indexPhpPath = mwIndexPhpPath;
MediaWiki.actionApiPath = mwActionApiPath;
MediaWiki.restApiPath = mwRestApiPath;
MediaWiki.modulePathOpt = mwModulePath;
MediaWiki.domain = mwDomain;
MediaWiki.password = mwPassword;
MediaWiki.username = mwUsername;
/* Download helpers; TODO: Merge with something else / expand this. */
Downloader.init = {
// The admin email is appended to the configured user agent.
uaString: `${config.userAgent} (${adminEmail})`,
speed,
// `requestTimeout` is multiplied by 1000 (presumably seconds -> ms); a missing/zero value falls back to the default.
reqTimeout: requestTimeout * 1000 || config.defaults.requestTimeout,
optimisationCacheUrl,
s3,
webp,
insecure: argv.insecure,
};
/* perform login */
await MediaWiki.login();
/* Get MediaWiki Info */
let mwMetaData;
try {
mwMetaData = await MediaWiki.getMwMetaData();
}
catch (err) {
// Log for context, then let the original error abort the run.
logger.error('FATAL - Failed to get MediaWiki Metadata');
throw err;
}
// Metadata required for every ZIM; custom values take precedence over what the wiki reports.
const metaDataRequiredKeys = {
Creator: mwMetaData.creator,
Description: customZimDescription || mwMetaData.subTitle,
Language: customZimLanguage || mwMetaData.langIso3,
Publisher: publisher,
Title: customZimTitle || mwMetaData.title,
// Date part (YYYY-MM-DD) of the current UTC timestamp.
Date: new Date().toISOString().split('T')[0],
'Illustration_48x48@1': await getIllustrationMetadata(),
};
validateMetadata(metaDataRequiredKeys);
// Sanitizing main page: with an article list there is no wiki main page (an
// index page is generated instead) unless a custom main page is given.
let mainPage = articleList ? '' : mwMetaData.mainPage;
if (customMainPage) {
mainPage = customMainPage;
// MediaWiki.webUrl is coerced to a string by the concatenation.
const mainPageUrl = MediaWiki.webUrl + encodeURIComponent(mainPage);
if (!(await checkApiAvailability(mainPageUrl))) {
throw new Error(`customMainPage doesn't return 200 status code for url ${mainPageUrl}`);
}
}
// The wiki's own main page is handed to MediaWiki before the capability probes
// below run (presumably it is the article they test against - confirm in MediaWiki).
MediaWiki.apiCheckArticleId = mwMetaData.mainPage;
await MediaWiki.hasCoordinates();
await MediaWiki.hasWikimediaDesktopApi();
const hasWikimediaMobileApi = await MediaWiki.hasWikimediaMobileApi();
await MediaWiki.hasRestApi();
await MediaWiki.hasVisualEditorApi();
await MediaWiki.hasActionParseApi();
await MediaWiki.hasModuleApi();
await RenderingContext.createRenderers(forceRender, hasWikimediaMobileApi);
// Connect to Redis and grab the stores used by the rest of this function.
RedisStore.setOptions(argv.redis || config.defaults.redisPath);
await RedisStore.connect();
const { articleDetailXId, filesToDownloadXPath, filesToRetryXPath, redirectsXId } = RedisStore;
// Output directory: an absolute path is used as given; anything else
// (defaulting to 'out') is resolved against the current working directory.
const outputDirectory = path.isAbsolute(_outputDirectory || '')
  ? _outputDirectory
  : path.join(process.cwd(), _outputDirectory || 'out');
await mkdirPromise(outputDirectory);
logger.log(`Using output directory ${outputDirectory}`);
// Temporary directory: scratch space for this run, deleted when the process exits.
const tmpDirectory = await getTmpDirectory();
logger.log(`Using temporary directory ${tmpDirectory}`);
process.on('exit', async (code) => {
  logger.log(`Exiting with code [${code}]`);
  logger.log(`Deleting temporary directory [${tmpDirectory}]`);
  rimraf.sync(tmpDirectory);
});
// Termination signals: close Redis first, then exit with 128 + signal number.
for (const [signalName, signalNumber] of [['SIGTERM', 15], ['SIGINT', 2]]) {
  process.on(signalName, async () => {
    logger.log(signalName);
    await RedisStore.close();
    process.exit(128 + signalNumber);
  });
}
/* *********************************** */
/*       SYSTEM VARIABLE SECTION       */
/* *********************************** */
const dumpFormats = getDumps(format);
// Extra namespaces arrive as a comma-separated list of namespace numbers.
const addNamespaces = _addNamespaces
  ? String(_addNamespaces)
      .split(',')
      .map((a) => Number(a))
  : [];
/* ********************************* */
/* GET CONTENT ********************* */
/* ********************************* */
// Titles to exclude; left undefined when no ignore list was given.
let articleListToIgnoreLines;
if (articleListToIgnore) {
  try {
    articleListToIgnoreLines = await extractArticleList(articleListToIgnore);
    logger.info(`ArticleListToIgnore has [${articleListToIgnoreLines.length}] items`);
  } catch (err) {
    logger.error(`Failed to read articleListToIgnore from [${articleListToIgnore}]`, err);
    throw err;
  }
}
// Titles to scrape; left undefined when no article list was given.
let articleListLines;
if (articleList) {
  try {
    articleListLines = await extractArticleList(articleList);
    if (articleListToIgnore) {
      // A Set keeps the exclusion linear; Array.includes inside filter() was
      // quadratic in the list sizes. Both use SameValueZero comparison.
      const ignoredTitles = new Set(articleListToIgnoreLines);
      articleListLines = articleListLines.filter((title) => !ignoredTitles.has(title));
    }
    logger.info(`ArticleList has [${articleListLines.length}] items`);
  } catch (err) {
    logger.error(`Failed to read articleList from [${articleList}]`, err);
    throw err;
  }
}
await MediaWiki.getNamespaces(addNamespaces);
logger.info('Getting article ids');
// `stime` is reassigned by doDump() to time the article fetch, hence `let`.
let stime = Date.now();
await getArticleIds(mainPage, articleList ? articleListLines : null, articleListToIgnore ? articleListToIgnoreLines : null);
logger.log(`Got ArticleIDs in ${(Date.now() - stime) / 1000} seconds`);
if (MediaWiki.getCategories) {
  await getCategoriesForArticles(articleDetailXId);
  while ((await trimUnmirroredPages()) > 0) {
    // Remove unmirrored pages, categories, subCategories
    // trimUnmirroredPages returns number of modified articles
  }
  // TODO: reintroduce graph simplification (simplifyGraph) once it no longer
  // requires a second trimUnmirroredPages pass afterwards.
}
// Year and month (YYYY-MM) of the current UTC date, handed to every Dump.
const filenameDate = new Date().toISOString().slice(0, 7);
// Getting total number of articles from Redis
logger.log(`Total articles found in Redis: ${await articleDetailXId.len()}`);
const dumps = [];
// One ZIM is produced per dump format, strictly one after the other.
for (const dumpFormat of dumpFormats) {
  const dump = new Dump(
    dumpFormat,
    {
      tmpDir: tmpDirectory,
      username: mwUsername,
      password: mwPassword,
      outputDirectory,
      mainPage,
      filenamePrefix,
      articleList,
      publisher,
      customZimDescription,
      customZimLongDescription,
      customZimTags,
      customZimTitle,
      customZimLanguage,
      withoutZimFullTextIndex,
      resume,
      minifyHtml,
      keepEmptyParagraphs,
      // Same value as `customZimTags` above; both keys are passed through unchanged.
      tags: customZimTags,
      filenameDate,
    },
    { ...mwMetaData, mainPage },
    customProcessor,
  );
  dumps.push(dump);
  logger.log('Doing dump');
  let shouldSkip = false;
  try {
    dump.checkResume();
  } catch (err) {
    // Any exception from checkResume() means "skip this dump". Record the
    // reason so that an unexpected failure is not silently mistaken for a
    // regular resume.
    shouldSkip = true;
    logger.info(`Dump will be skipped: ${err instanceof Error ? err.message : String(err)}`);
  }
  if (shouldSkip) {
    logger.log('Skipping dump');
  } else {
    await doDump(dump);
    // Reset per-dump state before the next format is processed.
    await filesToDownloadXPath.flush();
    Downloader.cssDependenceUrls = {};
    logger.log('Finished dump');
  }
}
logger.log('Closing HTTP agents...');
logger.log('All dumping(s) finished with success.');
/**
 * Builds the ZIM file for one dump: opens a libzim Creator, writes metadata
 * and the favicon, saves all articles with their static files, module
 * dependencies and media, then redirects and the main page, and finally
 * closes the ZIM.
 *
 * Reads `metaDataRequiredKeys`, `packageJSON`, the Redis stores and `stime`
 * from the enclosing scope.
 *
 * @param {object} dump - The Dump instance describing this output; its
 *   `outFile` is set here and its `opts`, `mwMetaData` and `status` are read.
 * @returns {Promise<void>}
 */
async function doDump(dump) {
const outZim = path.resolve(dump.opts.outputDirectory, dump.computeFilenameRadical() + '.zim');
logger.log(`Writing ZIM to [${outZim}]`);
dump.outFile = outZim;
// Required keys first, then the per-dump values; LongDescription is only
// present when a custom one was supplied.
const metadata = {
...metaDataRequiredKeys,
Tags: dump.computeZimTags(),
Name: dump.computeFilenameRadical(false, true, true),
Flavour: dump.computeFlavour(),
Scraper: `mwoffliner ${packageJSON.version}`,
Source: MediaWiki.webUrl.hostname,
...(dump.opts.customZimLongDescription ? { LongDescription: `${dump.opts.customZimLongDescription}` } : {}),
};
validateMetadata(metadata);
const zimCreator = new Creator().configCompression(Compression.Zstd);
// Full-text indexing is on unless explicitly disabled; it is configured with the wiki language.
if (!dump.opts.withoutZimFullTextIndex) {
zimCreator.configIndexing(true, dump.mwMetaData.langIso3);
}
zimCreator.startZimCreation(outZim);
// Without a main page the generated 'index' page is the entry point.
zimCreator.setMainPath(dump.opts.mainPage ? dump.opts.mainPage : 'index');
// Helper function to transform a Buffer into a libzim ContentProvider
const createBufferContentProvider = (buffer) => {
let dataSent = false;
return {
size: buffer.length,
// The first call hands over the whole buffer; every later call returns an
// empty Blob (presumably libzim's end-of-data signal - confirm).
feed: () => {
if (!dataSent) {
dataSent = true;
return new Blob(buffer);
}
return new Blob();
},
};
};
// Buffer values (the illustration) go through a content provider; keys
// starting with 'Illustration_' are tagged as PNG.
Object.entries(metadata).forEach(([key, value]) => {
zimCreator.addMetadata(key, Buffer.isBuffer(value) ? createBufferContentProvider(value) : value, key.startsWith('Illustration_') ? 'image/png' : undefined);
});
await saveFavicon(zimCreator, metaDataRequiredKeys['Illustration_48x48@1']);
if (Downloader.webp) {
logger.log('Adding webp polyfilling JS scripts');
await addWebpJsScripts(zimCreator);
}
await getThumbnailsData();
// Dry run: builds the main page without writing it to the ZIM.
logger.log('Checking Main Page rendering');
await getMainPage(dump, true, zimCreator);
logger.log('Getting articles');
// Reuses the `stime` variable of the enclosing function as a stopwatch.
stime = Date.now();
const { jsModuleDependencies, cssModuleDependencies, staticFilesList } = await saveArticles(zimCreator, dump);
logger.log(`Fetching Articles finished in ${(Date.now() - stime) / 1000} seconds`);
logger.log(`Found [${jsModuleDependencies.size}] js module dependencies`);
logger.log(`Found [${cssModuleDependencies.size}] style module dependencies`);
logger.info('Copying Static Resource Files');
await saveStaticFiles(staticFilesList, zimCreator);
const allDependenciesWithType = [
{ type: 'js', moduleList: Array.from(jsModuleDependencies) },
{ type: 'css', moduleList: Array.from(cssModuleDependencies) },
];
logger.log('Downloading module dependencies');
// JS and CSS modules are fetched in parallel; each list is limited to Downloader.speed concurrent downloads.
await Promise.all(allDependenciesWithType.map(({ type, moduleList }) => {
return pmap(moduleList, (oneModule) => {
return downloadAndSaveModule(zimCreator, oneModule, type);
}, { concurrency: Downloader.speed });
}));
await downloadFiles(filesToDownloadXPath, filesToRetryXPath, zimCreator, dump);
logger.log('Writing Article Redirects');
await writeArticleRedirects(dump, zimCreator);
// Second pass, this time actually writing the main page (or its redirect).
logger.log('Writing Main Page to the ZIM');
await getMainPage(dump, false, zimCreator);
logger.log('Finishing ZIM Creation');
await zimCreator.finishZimCreation();
logger.log('Summary of scrape actions:', JSON.stringify(dump.status, null, '\t'));
logger.log(`ZIM is ready at [${outZim}]`);
}
/* ********************************* */
/* FUNCTIONS *********************** */
/* ********************************* */
/**
 * Adds every stored redirect to the ZIM, skipping (with a warning) redirects
 * that collide with an existing article, point to themselves, or whose target
 * is not a known article.
 *
 * @param {object} dump - Dump whose `status.redirects.written` counter is incremented.
 * @param {object} zimCreator - libzim creator receiving the redirections.
 * @returns {Promise<void>}
 */
async function writeArticleRedirects(dump, zimCreator) {
  const isKnownArticle = (id) => RedisStore.articleDetailXId.exists(id);

  const writeBatch = async (batch) => {
    for (const [redirectId, redirect] of Object.entries(batch)) {
      const { targetId } = redirect;
      if (await isKnownArticle(redirectId)) {
        logger.warn(`Skipping redirect of '${redirectId}' because it already exists as an article`);
      } else if (redirectId === targetId) {
        logger.warn(`Skipping redirect of '${redirectId}' to self`);
      } else if (!(await isKnownArticle(targetId))) {
        logger.warn(`Skipping redirect of '${redirectId}' to '${targetId}' because target is not a known article`);
      } else {
        // We fake a title, by just removing the underscores
        const fakeTitle = truncateUtf8Bytes(String(redirectId).replace(/_/g, ' '), 245);
        zimCreator.addRedirection(redirectId, fakeTitle, targetId, { FRONT_ARTICLE: 1 });
        dump.status.redirects.written += 1;
      }
    }
  };

  await redirectsXId.iterateItems(Downloader.speed, writeBatch);
}
/**
 * Produces the PNG used as the ZIM illustration, scaled to fit inside 48x48
 * without enlargement.
 *
 * The image comes from `customZimFavicon` when set (an http(s) URL is
 * downloaded, anything else is read as a local file); otherwise the wiki's
 * site logo reported by the siteinfo API is downloaded.
 *
 * @returns {Promise<Buffer>} PNG data of the illustration.
 * @throws {Error} If the custom favicon cannot be downloaded, read or
 *   processed, or if the wiki reports no logo.
 */
async function getIllustrationMetadata() {
  // Shared sharp pipeline for both sources.
  const toIllustrationPng = (imageData) => sharp(imageData).resize(48, 48, { fit: sharp.fit.inside, withoutEnlargement: true }).png().toBuffer();

  if (customZimFavicon) {
    // Only an explicit http(s) URL is remote. A plain substring match would
    // also treat local paths such as "./http-logo.png" as remote.
    const faviconIsRemote = /^https?:\/\//i.test(customZimFavicon);
    let content;
    if (faviconIsRemote) {
      logger.log(`Downloading remote ZIM favicon from [${customZimFavicon}]`);
      try {
        const response = await Downloader.request({ url: customZimFavicon, method: 'GET', ...Downloader.arrayBufferRequestOptions });
        content = response.data;
      } catch (err) {
        throw new Error(`Failed to download custom ZIM favicon from [${customZimFavicon}]`, { cause: err });
      }
    } else {
      try {
        content = fs.readFileSync(customZimFavicon);
      } catch (err) {
        throw new Error(`Failed to read custom ZIM favicon from [${customZimFavicon}]`, { cause: err });
      }
    }
    try {
      // The await is required: returning the bare promise would let a sharp
      // rejection bypass this catch.
      return await toIllustrationPng(content);
    } catch (err) {
      throw new Error('Failed to read or process IllustrationMetadata using sharp', { cause: err });
    }
  }

  const apiUrlDirector = new ApiURLDirector(MediaWiki.actionApiUrl.href);
  const body = await Downloader.getJSON(apiUrlDirector.buildSiteInfoURL());
  const entries = body.query.general;
  if (!entries.logo) {
    throw new Error(`********\nNo site Logo Url. Expected a string, but got [${entries.logo}].\n\nPlease try specifying a customZimFavicon (--customZimFavicon=./path/to/your/file.ico)\n********`);
  }
  // Resolve against the wiki base URL so that protocol-relative ("//host/x.png")
  // and relative ("/static/x.png") logos become absolute. A URL parsed with a
  // base always has a protocol, so testing `protocol` cannot detect these cases.
  const logoUrl = new URL(entries.logo, MediaWiki.baseUrl).href;
  const { content } = await Downloader.downloadContent(logoUrl, 'image');
  return toIllustrationPng(content);
}
/**
 * Stores the illustration as the `favicon` item of the ZIM.
 *
 * @param {object} zimCreator - libzim creator the item is added to.
 * @param {Buffer|string} data - PNG content of the favicon.
 * @returns {Promise<*>} Whatever `zimCreator.addItem()` resolves to.
 * @throws {Error} 'Failed to save favicon' (original error attached as `cause`)
 *   when the item cannot be created or added.
 */
async function saveFavicon(zimCreator, data) {
  logger.log('Saving favicon.png...');
  try {
    const faviconItem = new StringItem('favicon', 'image/png', null, { FRONT_ARTICLE: 0 }, data);
    // The await keeps an asynchronous addItem() failure inside this try block;
    // returning the bare promise would bypass the catch.
    return await zimCreator.addItem(faviconItem);
  } catch (err) {
    throw new Error('Failed to save favicon', { cause: err });
  }
}
/**
 * Provides the ZIM entry page. When a main page is configured, `index` simply
 * redirects to it; otherwise an article-list home page is generated (image
 * tiles when enough articles have thumbnails, a plain list otherwise).
 *
 * @param {object} dump - Dump providing `mwMetaData` and `opts`.
 * @param {boolean} dryrun - When true, everything is built but nothing is written to the ZIM.
 * @param {object} zimCreator - libzim creator receiving the item or redirection.
 * @returns {Promise<*>|undefined} The `addItem()` result for a generated page
 *   (wrapped in a promise), otherwise undefined.
 */
function getMainPage(dump, dryrun, zimCreator) {
  const buildArticleListPage = async () => {
    const pageId = dump.mwMetaData.mainPage;
    const headerTags = [
      genHeaderCSSLink(config, 'mobile_main_page', pageId),
      genHeaderCSSLink(config, 'style', pageId),
      genHeaderScript(config, 'images_loaded.min', pageId),
      genHeaderScript(config, 'masonry.min', pageId),
      genHeaderScript(config, 'article_list_home', pageId),
      genCanonicalLink(config, dump.mwMetaData.webUrl, pageId),
    ];
    // Every tag is followed by a newline, then a blank line precedes </head>.
    const headInjection = headerTags.map((tag) => `${tag}\n`).join('') + '\n</head>';
    const doc = domino.createDocument(articleListHomeTemplate.replace('</head>', headInjection));
    doc.querySelector('title').innerHTML = sanitizeString(dump.mwMetaData.title) || sanitizeString(dump.opts.customZimTitle);

    // Collect the known articles; stop once 100 illustrated ones are found.
    const listedArticles = [];
    const illustratedArticles = [];
    for (const articleId of articleListLines) {
      const detail = await articleDetailXId.get(articleId);
      if (!detail) {
        continue;
      }
      listedArticles.push(detail);
      if (!(detail.thumbnail && detail.internalThumbnailUrl)) {
        continue;
      }
      illustratedArticles.push(detail);
      if (illustratedArticles.length >= 100) {
        break;
      }
    }

    const useImageTiles = illustratedArticles.length > MIN_IMAGE_THRESHOLD_ARTICLELIST_PAGE;
    if (useImageTiles) {
      const tiles = illustratedArticles.map((article) => makeArticleImageTile(dump, article)).join('\n');
      doc.body.innerHTML = `<div id='container'><div id='content'>${tiles}</div></div>`;
    } else {
      const listItems = listedArticles.map((article) => makeArticleListItem(dump, article)).join('\n');
      doc.body.innerHTML = `<ul id='list'>${listItems}</ul>`;
    }

    /* Write the static html file */
    if (dryrun) {
      return undefined;
    }
    const indexItem = new StringItem('index', 'text/html', 'Main Page', {}, doc.documentElement.outerHTML);
    return zimCreator.addItem(indexItem);
  };

  if (!mainPage) {
    return buildArticleListPage();
  }
  // A configured main page only needs the `index` redirect.
  if (!dryrun) {
    logger.log(`Create main page redirection from [index] to [${mainPage}]`);
    zimCreator.addRedirection('index', '', mainPage, { FRONT_ARTICLE: 1 });
  }
  return undefined;
}
/**
 * Looks up the stored detail record of one article.
 *
 * @param {string} articleId - Key of the article in the article detail store.
 * @returns {Promise<*>} Whatever the store holds for that key.
 */
async function fetchArticleDetail(articleId) {
  const storedDetail = await articleDetailXId.get(articleId);
  return storedDetail;
}
/**
 * Switches an article's thumbnail to its 500px rendition: rewrites the width
 * in the thumbnail URL, records the resulting file path on the article detail
 * (relative to 'Main_Page'), queues the image for download and saves the
 * updated detail.
 *
 * @param {object} articleDetail - Detail record with a `thumbnail.source` URL; mutated in place.
 * @param {string} articleId - Key under which the detail is stored.
 * @returns {Promise<void>}
 */
async function updateArticleThumbnail(articleDetail, articleId) {
  const { source: thumbnailSource } = articleDetail.thumbnail;
  const { width: currentWidth } = getSizeFromUrl(thumbnailSource);
  // The width can follow either a slash or a dash in the URL.
  const resizedUrl = thumbnailSource
    .replace(`/${currentWidth}px-`, '/500px-')
    .replace(`-${currentWidth}px-`, '-500px-');
  const { mult, width } = getSizeFromUrl(resizedUrl);
  const downloadKey = getMediaBase(resizedUrl, false);
  articleDetail.internalThumbnailUrl = getRelativeFilePath('Main_Page', getMediaBase(resizedUrl, true));
  const downloadEntry = { url: urlHelper.serializeUrl(resizedUrl), mult, width, kind: 'image' };
  await Promise.all([
    filesToDownloadXPath.set(downloadKey, downloadEntry),
    articleDetailXId.set(articleId, articleDetail),
  ]);
}
/**
 * Prepares thumbnails for the generated article-list home page: walks the
 * article list in order and updates the thumbnail of each article that has
 * one, stopping after 100 successful updates.
 *
 * Does nothing when a custom main page is set, when there is no article list,
 * or when the list is not longer than MIN_IMAGE_THRESHOLD_ARTICLELIST_PAGE.
 * A failure on one article is logged and skipped.
 *
 * @returns {Promise<void>}
 */
async function getThumbnailsData() {
  if (customMainPage || !articleList || articleListLines.length <= MIN_IMAGE_THRESHOLD_ARTICLELIST_PAGE) {
    return;
  }
  logger.log('Updating article thumbnails for articles');
  let updatedThumbnails = 0;
  for (const articleId of articleListLines) {
    if (updatedThumbnails >= 100) {
      break;
    }
    try {
      const detail = await fetchArticleDetail(articleId);
      if (detail && detail.thumbnail) {
        await updateArticleThumbnail(detail, articleId);
        updatedThumbnails += 1;
      }
    } catch {
      logger.warn(`Failed to parse thumbnail for [${articleId}], skipping...`);
    }
  }
}
MediaWiki.reset();
RedisStore.close();
return dumps;
}
export { execute };
//# sourceMappingURL=mwoffliner.lib.js.map