mwoffliner
Version:
MediaWiki ZIM scraper
805 lines • 37.3 kB
JavaScript
import * as backoff from 'backoff';
import { config } from './config.js';
import { contains, normalizeMwResponse, DB_ERROR, WEAK_ETAG_REGEX, stripHttpFromUrl, isBitmapImageMimeType, isWebpCandidateImageMimeType } from './util/index.js';
import deepmerge from 'deepmerge';
import * as domino from 'domino';
import { default as imagemin } from 'imagemin';
import imageminAdvPng from 'imagemin-advpng';
import axios, { AxiosError } from 'axios';
import { default as imageminPngquant } from 'imagemin-pngquant';
import imageminGifsicle from 'imagemin-gifsicle';
import imageminJpegoptim from 'imagemin-jpegoptim';
import imageminWebp from 'imagemin-webp';
import sharp from 'sharp';
import { fileTypeFromBuffer } from 'file-type';
import { HttpCookieAgent, HttpsCookieAgent } from 'http-cookie-agent/http';
import { CookieJar } from 'tough-cookie';
import * as logger from './Logger.js';
import MediaWiki from './MediaWiki.js';
import ApiURLDirector from './util/builders/url/api.director.js';
import urlHelper from './util/url.helper.js';
import { findFirstMatchingRule, renderDownloadError } from './error.manager.js';
import RedisStore from './RedisStore.js';
// Image optimisation presets, looked up first by output flavour ('default' or 'webp')
// and then by the MIME type of the source image.
const pngDefaultPreset = {
  plugins: [imageminPngquant({ speed: 3, strip: true, dithering: 0 }), imageminAdvPng({ optimizationLevel: 4, iterations: 5 })],
};
const jpegDefaultPreset = {
  plugins: [imageminJpegoptim({ max: 60, stripAll: true })],
};
const gifDefaultPreset = {
  plugins: [imageminGifsicle({ optimizationLevel: 3, colors: 64 })],
};
// PNG and JPEG each get their own WebP plugin instance, configured identically.
const pngWebpPreset = {
  plugins: [imageminWebp({ quality: 50, method: 6 })],
};
const jpegWebpPreset = {
  plugins: [imageminWebp({ quality: 50, method: 6 })],
};
const imageminOptions = new Map([
  [
    'default',
    new Map([
      ['image/png', pngDefaultPreset],
      ['image/jpeg', jpegDefaultPreset],
      ['image/gif', gifDefaultPreset],
    ]),
  ],
  [
    'webp',
    new Map([
      ['image/png', pngWebpPreset],
      ['image/jpeg', jpegWebpPreset],
    ]),
  ],
]);
/**
 * Error describing a failed download, carrying the details of the HTTP exchange.
 *
 * @property {string} urlCalled - URL that was requested.
 * @property {number} httpReturnCode - HTTP status code that was received.
 * @property {string} responseContentType - content-type of the response.
 * @property {*} responseData - body of the response.
 */
export class DownloadError extends Error {
  urlCalled;
  httpReturnCode;
  responseContentType;
  responseData;

  constructor(message, urlCalled, httpReturnCode, responseContentType, responseData) {
    super(message);
    Object.assign(this, { urlCalled, httpReturnCode, responseContentType, responseData });
    this.name = 'DownloadError';
    const canCaptureStack = 'captureStackTrace' in Error;
    if (canCaptureStack) {
      // Keep the DownloadError constructor frame out of the recorded stack trace
      Error.captureStackTrace(this, DownloadError);
    }
  }
}
/**
 * Downloader is a class providing content retrieval functionalities for both Mediawiki and S3 remote instances.
 *
 * It is meant to be used through the shared instance returned by `getInstance()` and has to be
 * configured with the `init` setter before any request is issued.
 */
class Downloader {
  /** Shared instance, lazily created by getInstance(). */
  static instance;

  /**
   * Return the shared Downloader, creating it on first call.
   * @returns {Downloader}
   */
  static getInstance() {
    if (!Downloader.instance) {
      Downloader.instance = new Downloader();
    }
    return Downloader.instance;
  }

  // Configuration and state; everything below is (re)assigned by the `init` setter and cleared by reset().
  _speed;
  cssDependenceUrls = {};
  _webp = false;
  _requestTimeout;
  _basicRequestOptions;
  _arrayBufferRequestOptions;
  _jsonRequestOptions;
  _streamRequestOptions;
  // Filled once by getModuleDependencies() and then reused for every article
  wikimediaMobileJsDependenciesList = [];
  wikimediaMobileStyleDependenciesList = [];
  uaString;
  backoffOptions;
  optimisationCacheUrl;
  s3;
  _apiUrlDirector;
  cookierJar;
  articleUrlDirector;
  mainPageUrlDirector;
  insecure = false;

  get speed() {
    return this._speed;
  }

  get webp() {
    return this._webp;
  }

  get requestTimeout() {
    return this._requestTimeout;
  }

  get basicRequestOptions() {
    return this._basicRequestOptions;
  }

  get arrayBufferRequestOptions() {
    return this._arrayBufferRequestOptions;
  }

  get jsonRequestOptions() {
    return this._jsonRequestOptions;
  }

  get streamRequestOptions() {
    return this._streamRequestOptions;
  }

  get apiUrlDirector() {
    return this._apiUrlDirector;
  }

  /**
   * Configure the downloader. Assigning to `init` first resets all state, then builds the retry
   * policy and the axios option presets (basic, arraybuffer, json, stream).
   *
   * `backoffOptions`, when given, is spread last and therefore overrides the default retry policy.
   */
  set init({ uaString, speed, reqTimeout, optimisationCacheUrl, s3, webp, backoffOptions, insecure }) {
    this.reset();
    this.uaString = uaString;
    this._speed = speed;
    this._requestTimeout = reqTimeout;
    this.optimisationCacheUrl = optimisationCacheUrl;
    this._webp = webp;
    this.s3 = s3;
    this._apiUrlDirector = new ApiURLDirector(MediaWiki.actionApiUrl.href);
    this.insecure = insecure;
    this.cookierJar = new CookieJar();
    this.backoffOptions = {
      // retry up to 10 times, with a minimum of 1sec, maximum of 1min
      // this means we retry for up to about 6 mins, which is supposed
      // to be sufficient for backend to recover from transient errors
      strategy: new backoff.ExponentialStrategy({ initialDelay: 1000, maxDelay: 60000 }),
      failAfter: 10,
      retryIf: (err) => {
        const requestedUrl = err.urlCalled || err.config?.url || 'unknown';
        if (err instanceof AxiosError && err.code && !['ERR_BAD_REQUEST', 'ERR_BAD_RESPONSE'].includes(err.code)) {
          logger.log(`Retrying ${requestedUrl} URL due to ${err.code} error`);
          return true; // retry all connection issues
        }
        if (err.responseData?.error?.code === 'maxlag') {
          logger.log(`Mediawiki server is lagging ${err.responseData?.error?.lag}s; retrying in few seconds`);
          return true; // note that we do not honor Retry-After header value because it is not possible in current code architecture
        }
        const httpReturnCode = err.response?.status || err.httpReturnCode;
        if ([429, 500, 502, 503, 504, 524].includes(httpReturnCode)) {
          logger.log(`Retrying ${requestedUrl} URL due to HTTP ${httpReturnCode} error`);
          return true; // retry these HTTP status codes
        }
        if (
          err.responseData?.error?.code &&
          ![
            'missingtitle',
            'readapidenied',
            'permissiondenied',
            'internal_api_error_MediaWiki\\Revision\\BadRevisionException',
            'internal_api_error_Wikimedia\\Assert\\UnreachableException',
            'internal_api_error_Wikimedia\\Assert\\InvariantException',
            'internal_api_error_Wikimedia\\Parsoid\\Core\\ResourceLimitExceededException',
          ].includes(err.responseData?.error?.code)
        ) {
          logger.log(`Retrying ${requestedUrl} URL due to ${err.responseData?.error?.code} Mediawiki error`);
          return true; // retry every Mediawiki error code which is not in the list above
        }
        return false; // don't retry other errors
      },
      backoffHandler: (number, delay) => {
        logger.info(`[backoff] #${number} after ${delay} ms`);
      },
      ...backoffOptions,
    };
    this._basicRequestOptions = {
      // HTTP agent pools with 'keepAlive' to reuse TCP connections, so it's faster
      // Set cookie jar and use special Http(s)CookieAgent so that cookies are automatically intercepted and persisted across calls
      httpAgent: new HttpCookieAgent({ cookies: { jar: this.cookierJar }, keepAlive: true }),
      // rejectUnauthorized: false disables TLS certificate verification
      httpsAgent: new HttpsCookieAgent({ cookies: { jar: this.cookierJar }, keepAlive: true, rejectUnauthorized: !this.insecure }),
      timeout: this.requestTimeout,
      headers: {
        // Use the base domain of the wiki being scraped as the Referer header, so that we can
        // successfully scrape WMF map tiles.
        Referer: MediaWiki.baseUrl.href,
        'cache-control': 'public, max-stale=86400',
        'user-agent': this.uaString,
      },
      validateStatus(status) {
        return (status >= 200 && status < 300) || status === 304;
      },
    };
    this._arrayBufferRequestOptions = {
      ...this.basicRequestOptions,
      responseType: 'arraybuffer',
      method: 'GET',
    };
    this._jsonRequestOptions = {
      ...this.basicRequestOptions,
      headers: {
        ...this.basicRequestOptions.headers,
        accept: 'application/json',
        'accept-encoding': 'gzip, deflate',
      },
      responseType: 'json',
      method: 'GET',
    };
    this._streamRequestOptions = {
      ...this.basicRequestOptions,
      headers: {
        ...this.basicRequestOptions.headers,
        accept: 'application/octet-stream',
        'accept-encoding': 'gzip, deflate',
      },
      responseType: 'stream',
      method: 'GET',
    };
  }

  /** Clear configuration and cached state back to their initial values (the cookie jar is left as is). */
  reset() {
    this.uaString = undefined;
    this._speed = undefined;
    this._requestTimeout = undefined;
    this.optimisationCacheUrl = undefined;
    this._webp = false;
    this.s3 = undefined;
    this._apiUrlDirector = undefined;
    this.insecure = false;
    this.backoffOptions = undefined;
    this._basicRequestOptions = undefined;
    this._arrayBufferRequestOptions = undefined;
    this._jsonRequestOptions = undefined;
    this._streamRequestOptions = undefined;
    this.cssDependenceUrls = {};
    this.wikimediaMobileJsDependenciesList = [];
    this.wikimediaMobileStyleDependenciesList = [];
    this.articleUrlDirector = undefined;
    this.mainPageUrlDirector = undefined;
  }

  /**
   * Map a renderer instance to the matching URL director exposed by MediaWiki.
   * The lookup is based on the renderer's constructor name.
   * @throws {Error} when the renderer class is not one of the known ones.
   */
  getUrlDirector(renderer) {
    switch (renderer.constructor.name) {
      case 'WikimediaDesktopRenderer':
        return MediaWiki.wikimediaDesktopUrlDirector;
      case 'VisualEditorRenderer':
        return MediaWiki.visualEditorUrlDirector;
      case 'WikimediaMobileRenderer':
        return MediaWiki.wikimediaMobileUrlDirector;
      case 'RestApiRenderer':
        return MediaWiki.restApiUrlDirector;
      case 'ActionParseRenderer':
        return MediaWiki.actionParseUrlDirector;
      /* istanbul ignore next */
      default:
        throw new Error(`Unknown renderer ${renderer.constructor.name}`);
    }
  }

  /** Remember which URL directors to use for regular articles and for the main page. */
  setUrlsDirectors(mainPageRenderer, articlesRenderer) {
    this.articleUrlDirector = this.getUrlDirector(articlesRenderer);
    this.mainPageUrlDirector = this.getUrlDirector(mainPageRenderer);
  }

  getArticleUrl(articleId, articleUrlOpts = {}) {
    return this.articleUrlDirector.buildArticleURL(articleId, articleUrlOpts);
  }

  getMainPageUrl(articleId) {
    return this.mainPageUrlDirector.buildArticleURL(articleId);
  }

  /** Strip what WEAK_ETAG_REGEX matches from an ETag; falsy values are returned untouched. */
  removeEtagWeakPrefix(etag) {
    return etag && etag.replace(WEAK_ETAG_REGEX, '');
  }

  querySiteInfo() {
    return this.getJSON(this.apiUrlDirector.buildSiteInfoURL());
  }

  /**
   * Fetch details of the given articles, following API continuation until the query is complete.
   * Every batch is merged into one result; sub-categories are only resolved on the last batch.
   */
  async getArticleDetailsIds(articleIds, shouldGetThumbnail = false) {
    let continuation;
    let finalProcessedResp;
    while (true) {
      const queryOpts = {
        ...(await this.getArticleQueryOpts(shouldGetThumbnail, true)),
        titles: articleIds.join('|'),
        ...((await MediaWiki.hasCoordinates()) ? { colimit: 'max' } : {}),
        ...(MediaWiki.getCategories
          ? {
              cllimit: 'max',
              clshow: '!hidden',
            }
          : {}),
        ...continuation,
      };
      const reqUrl = this.apiUrlDirector.buildQueryURL(queryOpts);
      const resp = await this.getJSON(reqUrl);
      Downloader.handleMWWarningsAndErrors(resp);
      let processedResponse = resp.query?.pages ? normalizeMwResponse(resp.query) : {};
      const isLastBatch = !resp.continue;
      if (isLastBatch && MediaWiki.getCategories) {
        processedResponse = await this.setArticleSubCategories(processedResponse);
      }
      finalProcessedResp = finalProcessedResp === undefined ? processedResponse : deepmerge(finalProcessedResp, processedResponse);
      if (isLastBatch) {
        break;
      }
      continuation = resp.continue;
    }
    return finalProcessedResp;
  }

  /**
   * Fetch details of one page of articles of a namespace (allpages generator, raw continuation mode).
   * Property continuations (coordinates, categories, pageimages, redirects) are followed here;
   * the generator continuation is handed back to the caller as `gapContinue`.
   * @returns {Promise<{articleDetails: object, gapContinue: (string|null)}>}
   */
  async getArticleDetailsNS(ns, gapcontinue = '') {
    let queryContinuation;
    let finalProcessedResp;
    let gCont = null;
    while (true) {
      const queryOpts = {
        ...(await this.getArticleQueryOpts()),
        ...((await MediaWiki.hasCoordinates()) ? { colimit: 'max' } : {}),
        ...(MediaWiki.getCategories
          ? {
              cllimit: 'max',
              clshow: '!hidden',
            }
          : {}),
        rawcontinue: 'true',
        generator: 'allpages',
        gapfilterredir: 'nonredirects',
        gaplimit: 'max',
        gapnamespace: String(ns),
        gapcontinue,
      };
      if (queryContinuation) {
        queryOpts.cocontinue = queryContinuation?.coordinates?.cocontinue ?? queryOpts.cocontinue;
        queryOpts.clcontinue = queryContinuation?.categories?.clcontinue ?? queryOpts.clcontinue;
        queryOpts.picontinue = queryContinuation?.pageimages?.picontinue ?? queryOpts.picontinue;
        queryOpts.rdcontinue = queryContinuation?.redirects?.rdcontinue ?? queryOpts.rdcontinue;
      }
      const reqUrl = this.apiUrlDirector.buildQueryURL(queryOpts);
      const resp = await this.getJSON(reqUrl);
      Downloader.handleMWWarningsAndErrors(resp);
      let processedResponse = normalizeMwResponse(resp.query);
      gCont = resp['query-continue']?.allpages?.gapcontinue ?? gCont;
      // The query is complete once only the generator ('allpages') continuation remains
      const queryComplete = Object.keys(resp['query-continue'] || {}).filter((key) => key !== 'allpages').length === 0;
      if (queryComplete && MediaWiki.getCategories) {
        processedResponse = await this.setArticleSubCategories(processedResponse);
      }
      finalProcessedResp = finalProcessedResp === undefined ? processedResponse : deepmerge(finalProcessedResp, processedResponse);
      if (queryComplete) {
        break;
      }
      queryContinuation = resp['query-continue'];
    }
    return {
      articleDetails: finalProcessedResp,
      gapContinue: gCont,
    };
  }

  async getLogEvents(letype, articleId) {
    const logEventsData = await this.getJSON(this.apiUrlDirector.buildLogEventsQuery(letype, articleId));
    return logEventsData.query?.logevents;
  }

  /**
   * Download and render one article.
   *
   * Download failures (AxiosError or DownloadError) matching a known error rule are turned into a
   * placeholder page and accounted for in `dump.status.articles`; any other error is rethrown.
   * @throws {Error} on unknown errors, on download errors matching no rule, or when too many
   * articles hard-failed.
   */
  async getArticle(articleId, articleDetailXId, articleRenderer, articleUrl, dump, articleDetail) {
    logger.info(`Getting article [${articleId}] from ${articleUrl}`);
    try {
      const { data, moduleDependencies, redirects, displayTitle, articleSubtitle, bodyCssClass, htmlCssClass } = await articleRenderer.download({
        articleId,
        articleUrl,
        articleDetail,
      });
      // Cope with the fact that the page we are fetching might have been moved and replaced by a redirect
      // In such a case, the download above is expected to follow the redirect so that we have proper original
      // content at original article path, but we probably need to add a redirect since new article location
      // probably did not exist when listing articles (might have existed if the move occurred during article
      // listing). The redirect we add is hence in the "opposite" direction than usual, i.e. it will redirect
      // from new location to original location. Note that only ActionParse API gives proper redirects info.
      for (const redirect of redirects ?? []) {
        if (!(await RedisStore.articleDetailXId.exists(redirect.to)) && !(await RedisStore.redirectsXId.exists(redirect.to))) {
          // Awaited so that a store failure is reported here instead of becoming an unhandled rejection
          await RedisStore.redirectsXId.set(redirect.to, { targetId: redirect.from, title: redirect.to, fragment: '' });
        }
      }
      return await articleRenderer.render({
        data,
        moduleDependencies,
        articleId,
        articleDetailXId,
        articleDetail,
        displayTitle,
        articleSubtitle,
        bodyCssClass,
        htmlCssClass,
        dump,
      });
    } catch (err) {
      let downloadErrorContext;
      if (err instanceof AxiosError) {
        // Neither the request config nor the content-type header is guaranteed to be present
        const contentTypeHeader = err.response?.headers?.['content-type'];
        downloadErrorContext = {
          errorCode: err.code,
          urlCalled: err.config?.url,
          httpReturnCode: err.status,
          responseContentType: contentTypeHeader == null ? null : String(contentTypeHeader),
          responseData: err.response?.data,
        };
      } else if (err instanceof DownloadError) {
        downloadErrorContext = {
          errorCode: null,
          urlCalled: err.urlCalled,
          httpReturnCode: err.httpReturnCode,
          responseContentType: err.responseContentType,
          responseData: err.responseData,
        };
      }
      if (!downloadErrorContext) {
        throw err;
      }
      logger.warn(
        `Article ${articleId} failed to download from '${downloadErrorContext.urlCalled}' with ` +
          `'${downloadErrorContext.errorCode}' error code, ` +
          `'${downloadErrorContext.httpReturnCode}' HTTP return code ` +
          `and '${downloadErrorContext.responseContentType}' content-type ` +
          `returned instead:\n${JSON.stringify(downloadErrorContext.responseData)}`,
      );
      const errorRule = findFirstMatchingRule(downloadErrorContext);
      if (errorRule === null) {
        logger.error('This is a fatal download error, aborting');
        throw err;
      }
      if (errorRule.isHardFailure) {
        logger.log(`This is a hard ${errorRule.detailsMessageKey} error which will be replaced by a placeholder`);
        dump.status.articles.hardFail += 1;
        dump.status.articles.hardFailedArticleIds.push(articleId);
        if (dump.maxHardFailedArticles > 0 && dump.status.articles.hardFail > dump.maxHardFailedArticles) {
          throw new Error('Too many articles failed to download');
        }
      } else {
        logger.log(`This is a soft ${errorRule.detailsMessageKey} error which will be replaced by a placeholder`);
        dump.status.articles.softFail += 1;
        dump.status.articles.softFailedArticleIds.push(articleId);
      }
      // Remove article from list so that we stop creating links to this placeholder
      await RedisStore.articleDetailXId.delete(articleId);
      const articleTitle = articleId.replace(/_/g, ' ');
      const errorPlaceholderHtml = renderDownloadError(errorRule, dump, articleId, articleTitle);
      return [
        {
          articleId,
          displayTitle: articleTitle,
          html: errorPlaceholderHtml,
          imageDependencies: [],
          videoDependencies: [],
          mediaDependencies: [],
          moduleDependencies: [],
          staticFiles: config.output.downloadErrorResources,
          subtitles: [],
        },
      ];
    }
  }

  /**
   * GET a JSON document, with retries driven by the backoff policy.
   * Rejects with the last error once the policy gives up.
   */
  async getJSON(_url) {
    const url = urlHelper.deserializeUrl(_url);
    return new Promise((resolve, reject) => {
      this.backoffCall(this.getJSONCb, url, 'json', (err, val) => {
        if (err) {
          const httpStatus = err.response?.status || err.httpReturnCode;
          logger.info(`Failed to get [${url}] [status=${httpStatus}]`);
          reject(err);
        } else {
          resolve(val);
        }
      });
    });
  }

  /**
   * Issue an axios request on top of the basic request options.
   * Headers of `config` are merged over the basic headers, and the request is aborted once
   * `requestTimeout` milliseconds have elapsed.
   */
  async request(config) {
    return axios.request({
      ...this._basicRequestOptions,
      ...config,
      headers: {
        ...this._basicRequestOptions.headers,
        ...config?.headers,
      },
      signal: AbortSignal.timeout(this.requestTimeout),
    });
  }

  async get(url, config) {
    return this.request({ url, method: 'GET', ...config });
  }

  async post(url, data, config) {
    return this.request({ url, data, method: 'POST', ...config });
  }

  /**
   * Download a resource and resolve with `{ contentType, content }`.
   * Protocol-relative URLs get the protocol of the wiki base URL.
   * @param {string} _url - (serialized) URL of the resource.
   * @param {string} kind - kind of content; 'image' enables optimisation and S3 caching.
   * @param {boolean} retry - whether the backoff retry policy applies.
   * @throws {Error} when `_url` is empty, or the download error once it is definitive.
   */
  async downloadContent(_url, kind, retry = true) {
    if (!_url) {
      throw new Error(`Parameter [${_url}] is not a valid url`);
    }
    let url = urlHelper.deserializeUrl(_url);
    if (url.startsWith('//')) {
      url = `${MediaWiki.baseUrl.protocol}${url}`;
    }
    try {
      // The promise has to be awaited here, otherwise its rejection bypasses the catch block below
      return await new Promise((resolve, reject) => {
        const cb = (err, val) => {
          if (err) {
            reject(err);
          } else {
            resolve(val);
          }
        };
        if (retry) {
          this.backoffCall(this.getContentCb, url, kind, cb);
        } else {
          this.getContentCb(url, kind, cb);
        }
      });
    } catch (err) {
      const httpStatus = err?.response?.status;
      logger.warn(`Failed to get [${url}] [status=${httpStatus}]`);
      throw err;
    }
  }

  /** Tell whether a GET on the URL succeeds (any failure yields false). */
  async canGetUrl(url) {
    try {
      await this.request({ url, method: 'GET', ...this.arrayBufferRequestOptions });
      return true;
    } catch {
      return false;
    }
  }

  /**
   * Log warnings and errors found in a MediaWiki query response.
   * @throws {Error} when the error code is DB_ERROR; other errors are only logged.
   */
  static handleMWWarningsAndErrors(resp) {
    if (resp.warnings) {
      logger.warn(`Got warning from MW Query ${JSON.stringify(resp.warnings, null, '\t')}`);
    }
    if (resp.error?.code === DB_ERROR) {
      throw new Error(`Got error from MW Query ${JSON.stringify(resp.error, null, '\t')}`);
    }
    if (resp.error) {
      logger.log(`Got error from MW Query ${JSON.stringify(resp.error, null, '\t')}`);
    }
  }

  /** Build the base options of an article details query, extending `MediaWiki.queryOpts`. */
  async getArticleQueryOpts(includePageimages = false, followRedirects = false) {
    const prop = `${includePageimages ? '|pageimages' : ''}${(await MediaWiki.hasCoordinates()) ? '|coordinates' : ''}${MediaWiki.getCategories ? '|categories' : ''}`;
    return {
      ...MediaWiki.queryOpts,
      prop: MediaWiki.queryOpts.prop.concat(prop),
      formatversion: '2',
      redirects: followRedirects ? true : undefined,
    };
  }

  /** Attach `subCategories` to every article detail living in namespace 14 (mutates and returns the input). */
  async setArticleSubCategories(articleDetails) {
    logger.info('Getting subCategories');
    for (const [articleId, articleDetail] of Object.entries(articleDetails)) {
      const isCategoryArticle = articleDetail.ns === 14;
      if (isCategoryArticle) {
        const categoryMembers = await this.getSubCategories(articleId);
        articleDetails[articleId].subCategories = categoryMembers.slice();
      }
    }
    return articleDetails;
  }

  /**
   * Node-style callback flavour of a JSON GET, as needed by backoffCall().
   * A response body holding an `error` member is reported as a DownloadError.
   */
  getJSONCb = (url, kind, handler) => {
    logger.info(`Getting JSON from [${url}]`);
    this.request({ url, method: 'GET', ...this.jsonRequestOptions })
      .then((val) => {
        if (val.data.error) {
          // The content-type header might be missing, do not fail while reporting the API error
          const contentTypeHeader = val.headers?.['content-type'];
          const responseContentType = contentTypeHeader == null ? undefined : String(contentTypeHeader);
          handler(new DownloadError(`Error returned while calling API`, url, val.status, responseContentType, val.data));
        } else {
          handler(null, val.data);
        }
      })
      .catch((err) => handler(err));
  };

  /** Detect the MIME type from the bytes; null when unknown or when detected as XML (might be SVG). */
  async getImageMimeType(data) {
    const fileType = await fileTypeFromBuffer(data);
    if (!fileType || fileType.mime === 'application/xml') {
      // 'application/xml' is known to be wrong, might be SVG
      return null;
    }
    return fileType.mime;
  }

  /**
   * Optimise an image on a best-effort basis: whenever an optimisation step fails, fall back to
   * a weaker one and ultimately to the untouched input. Non-bitmap content is returned as is.
   * @param {{data: Buffer}} input
   * @returns {Promise<{data: Buffer}>}
   */
  async getCompressedBody(input) {
    const contentType = await this.getImageMimeType(input.data);
    if (!isBitmapImageMimeType(contentType)) {
      return { data: input.data };
    }
    const optimiseWithDefaults = () => imagemin.buffer(input.data, imageminOptions.get('default').get(contentType)).catch(() => input.data);
    if (!(this.webp && isWebpCandidateImageMimeType(contentType))) {
      return { data: await optimiseWithDefaults() };
    }
    const webpOptions = imageminOptions.get('webp').get(contentType);
    try {
      return { data: await imagemin.buffer(input.data, webpOptions) };
    } catch (err) {
      if (/Unsupported color conversion request/.test(err?.stderr)) {
        // Retry the WebP conversion once the image is converted to sRGB
        try {
          const srgbData = await sharp(input.data).toColorspace('srgb').toBuffer();
          return { data: await imagemin.buffer(srgbData, webpOptions) };
        } catch {
          return { data: input.data };
        }
      }
      return { data: await optimiseWithDefaults() };
    }
  }

  /**
   * Node-style callback flavour of a content download, as needed by backoffCall().
   * Images go through the S3 cache when an optimisation cache is configured, and are
   * compressed locally otherwise.
   */
  getContentCb = async (url, kind, handler) => {
    logger.info(`Downloading [${url}]`);
    try {
      if (this.optimisationCacheUrl && kind === 'image') {
        await this.downloadImage(url, handler);
      } else {
        const resp = await this.request({ url, method: 'GET', ...this.arrayBufferRequestOptions });
        // If content is an image, we might benefit from compressing it
        const content = kind === 'image' ? (await this.getCompressedBody({ data: resp.data })).data : resp.data;
        // compute content-type from content, since getCompressedBody might have modified it
        const contentType = kind === 'image' ? (await this.getImageMimeType(content)) || resp.headers['content-type'] : resp.headers['content-type'];
        handler(null, {
          contentType,
          content,
        });
      }
    } catch (err) {
      try {
        this.errHandler(err, url, handler);
      } catch {
        handler(err);
      }
    }
  };

  /**
   * Download an image through the S3 optimisation cache and report the outcome through `handler`.
   *
   * The cached blob, if any, is validated against upstream with a conditional request (ETag);
   * otherwise the upstream image is compressed, uploaded to the cache (when it has an ETag)
   * and returned.
   */
  async downloadImage(url, handler) {
    try {
      const cacheVersion = this.webp ? 'webp' : '1';
      // Check first if we have an entry in the (object storage) cache for this URL
      const s3Resp = await this.s3.downloadBlob(stripHttpFromUrl(url), cacheVersion);
      // 'Versioning' of image is made via HTTP ETag. We should
      // check if we have the proper version by requesting proper
      // ETag from upstream MediaWiki.
      // The conditional header is set on a per-request copy of the headers: the preset headers
      // object is shared by all requests and must not keep the ETag of one given image.
      const headers = { ...this.arrayBufferRequestOptions.headers };
      if (s3Resp?.Metadata?.etag) {
        headers['If-None-Match'] = this.removeEtagWeakPrefix(s3Resp.Metadata.etag);
      }
      const mwResp = await this.request({ url, method: 'GET', ...this.arrayBufferRequestOptions, headers });
      // Most of the images, after having been uploaded once to the
      // cache, will always have 304 status, until modified. If cache
      // is up to date, return cached image. We always have an s3
      // response when mwResp is 304, since this can only happen
      // when we have an eTag coming from s3.
      if (mwResp.status === 304) {
        // Proceed with image
        const data = await this.streamToBuffer(s3Resp.Body);
        const contentType = await this.getImageMimeType(data);
        logger.info(`Using S3-cached image for ${url} (contentType: ${contentType})`);
        handler(null, {
          contentType,
          content: data,
        });
        return;
      }
      // Destroy the Readable so that socket is freed and returned to the pool
      if (s3Resp?.Body) {
        s3Resp.Body.destroy();
      }
      // Compress content because image blob comes from upstream MediaWiki
      const compressedData = (await this.getCompressedBody({ data: mwResp.data })).data;
      // Check for the ETag and upload to cache
      const etag = this.removeEtagWeakPrefix(mwResp.headers.etag);
      if (etag) {
        await this.s3.uploadBlob(stripHttpFromUrl(url), compressedData, etag, cacheVersion);
      }
      // get contentType from image, with fallback to response headers should the image be unsupported at all (e.g. SVG)
      const contentType = (await this.getImageMimeType(compressedData)) || mwResp.headers['content-type'];
      if (s3Resp) {
        logger.info(`Using image downloaded from upstream for ${url} (S3-cached image is outdated, contentType: ${contentType})`);
      } else {
        logger.info(`Using image downloaded from upstream for ${url} (no S3-cached image found, contentType: ${contentType})`);
      }
      // Proceed with image
      handler(null, {
        contentType,
        content: compressedData,
      });
    } catch (err) {
      this.errHandler(err, url, handler);
    }
  }

  /** Log a download error and forward it to the callback. */
  errHandler(err, url, handler) {
    logger.info(`Error while downloading content for ${url} due to ${err} ; might be retried`);
    handler(err);
  }

  /** Return all members (having a title) of a category, following `cmcontinue` recursively. */
  async getSubCategories(articleId, continueStr = '') {
    const apiUrlDirector = new ApiURLDirector(MediaWiki.actionApiUrl.href);
    const { query, continue: cont } = await this.getJSON(apiUrlDirector.buildSubCategoriesURL(articleId, continueStr));
    const items = query.categorymembers.filter((a) => a && a.title);
    if (cont && cont.cmcontinue) {
      const nextItems = await this.getSubCategories(articleId, cont.cmcontinue);
      return items.concat(nextItems);
    }
    return items;
  }

  /**
   * Run `handler(url, kind, callback)` under the retry policy held in `backoffOptions`.
   * NOTE(review): the strategy object is shared by all calls and reset here, so concurrent
   * calls presumably influence each other's delays; to be confirmed.
   */
  backoffCall(handler, url, kind, callback) {
    this.backoffOptions.strategy.reset(); // reset delay to initial one at each call
    const call = backoff.call(handler, url, kind, callback);
    call.setStrategy(this.backoffOptions.strategy);
    call.retryIf(this.backoffOptions.retryIf);
    call.failAfter(this.backoffOptions.failAfter);
    call.on('backoff', this.backoffOptions.backoffHandler);
    call.start();
  }

  /**
   * Retrieve the JS/CSS module dependencies and the JS config vars of an article.
   * Wikimedia mobile dependencies are fetched only once and then appended to every result.
   * @returns {Promise<{jsConfigVars: string, jsDependenciesList: string[], styleDependenciesList: string[]}>}
   * @throws {Error} on API level errors other than a missing or forbidden article, or when
   * mobile modules cannot be retrieved.
   */
  async getModuleDependencies(title) {
    const genericJsModules = config.output.mw.js;
    const genericCssModules = config.output.mw.css;
    const apiUrlDirector = new ApiURLDirector(MediaWiki.actionApiUrl.href);
    const articleApiUrl = apiUrlDirector.buildArticleApiURL(title);
    const articleData = await this.getJSON(articleApiUrl);
    if (articleData.error) {
      const errorMessage = `Unable to retrieve js/css dependencies for article '${title}': ${articleData.error.code}`;
      logger.error(errorMessage);
      /* If article is missing (for example because it just has been deleted) or access is denied */
      if (articleData.error.code === 'missingtitle' || articleData.error.code === 'permissiondenied') {
        return { jsConfigVars: '', jsDependenciesList: [], styleDependenciesList: [] };
      }
      /* Something went wrong in modules retrieval at app level (no HTTP error) */
      throw new Error(errorMessage);
    }
    const {
      parse: { modules, modulescripts, modulestyles, headhtml },
    } = articleData;
    const jsDependenciesList = genericJsModules.concat(modules, modulescripts).filter((a) => a);
    const styleDependenciesList = []
      .concat(modules, modulestyles, genericCssModules)
      .filter((a) => a)
      .filter((oneStyleDep) => !contains(config.filters.blackListCssModules, oneStyleDep));
    logger.info(`Js dependencies of ${title} : ${jsDependenciesList}`);
    logger.info(`Css dependencies of ${title} : ${styleDependenciesList}`);
    const jsConfigVars = Downloader.extractJsConfigVars(headhtml);
    // Download mobile page dependencies only once
    if ((await MediaWiki.hasWikimediaMobileApi()) && this.wikimediaMobileJsDependenciesList.length === 0 && this.wikimediaMobileStyleDependenciesList.length === 0) {
      try {
        // TODO: An arbitrary title can be placed since all Wikimedia wikis have the same mobile offline resources
        const mobileModulesData = await this.getJSON(`${MediaWiki.mobileModulePath}Test`);
        mobileModulesData.forEach((module) => {
          if (module.includes('javascript')) {
            this.wikimediaMobileJsDependenciesList.push(module);
          } else if (module.includes('css')) {
            this.wikimediaMobileStyleDependenciesList.push(module);
          }
        });
      } catch (err) {
        throw new Error(`Error getting mobile modules ${err.message}`, { cause: err });
      }
    }
    return {
      jsConfigVars,
      jsDependenciesList: jsDependenciesList.concat(this.wikimediaMobileJsDependenciesList),
      styleDependenciesList: styleDependenciesList.concat(this.wikimediaMobileStyleDependenciesList),
    };
  }

  // Solution to handle aws js sdk v3 from https://github.com/aws/aws-sdk-js-v3/issues/1877
  /** Collect all chunks of a readable stream into one Buffer. */
  async streamToBuffer(stream) {
    return new Promise((resolve, reject) => {
      const chunks = [];
      stream.on('data', (chunk) => chunks.push(chunk));
      stream.on('error', reject);
      stream.on('end', () => resolve(Buffer.concat(chunks)));
    });
  }

  /**
   * Extract the JS config vars from the page header returned by the API, as a JS snippet.
   * When several script tags qualify, the last one wins.
   * @param {string} headhtml - head HTML of the page.
   * @returns {string} the snippet, or an empty string when no script tag qualifies.
   */
  static extractJsConfigVars(headhtml) {
    let jsConfigVars = '';
    // Saving, as a js module, the jsconfigvars that are set in the header of a wikipedia page
    // the script below extracts the config with a regex executed on the page header returned from the api
    const scriptTags = domino.createDocument(`${headhtml}</body></html>`).getElementsByTagName('script');
    // No 'g' flag on purpose: a global regex keeps its lastIndex between calls, which makes
    // matching of the next script tag start in the middle of its text
    const regex = /mw\.config\.set\(\{.*?\}\);/m;
    for (let i = 0; i < scriptTags.length; i += 1) {
      const scriptText = scriptTags[i].text;
      if (scriptText.includes('mw.config.set')) {
        jsConfigVars = scriptText.match(regex)?.[0] ?? '';
        jsConfigVars = `(window.RLQ=window.RLQ||[]).push(function() {${jsConfigVars}});`;
      } else if (scriptText.includes('RLCONF') || scriptText.includes('RLSTATE') || scriptText.includes('RLPAGEMODULES')) {
        jsConfigVars = scriptText;
      }
    }
    jsConfigVars = jsConfigVars.replace('nosuchaction', 'view'); // to replace the wgAction config that is set to 'nosuchaction' from api but should be 'view'
    return jsConfigVars;
  }
}
// Named export of the class itself, under the DownloaderClass name.
export { Downloader as DownloaderClass };
// Default export: the shared instance obtained from Downloader.getInstance().
const dl = Downloader.getInstance();
export default dl;
//# sourceMappingURL=Downloader.js.map