UNPKG

mwoffliner

Version:
805 lines 37.3 kB
import * as backoff from 'backoff';
import { config } from './config.js';
import { contains, normalizeMwResponse, DB_ERROR, WEAK_ETAG_REGEX, stripHttpFromUrl, isBitmapImageMimeType, isWebpCandidateImageMimeType } from './util/index.js';
import deepmerge from 'deepmerge';
import * as domino from 'domino';
import { default as imagemin } from 'imagemin';
import imageminAdvPng from 'imagemin-advpng';
import axios, { AxiosError } from 'axios';
import { default as imageminPngquant } from 'imagemin-pngquant';
import imageminGifsicle from 'imagemin-gifsicle';
import imageminJpegoptim from 'imagemin-jpegoptim';
import imageminWebp from 'imagemin-webp';
import sharp from 'sharp';
import { fileTypeFromBuffer } from 'file-type';
import { HttpCookieAgent, HttpsCookieAgent } from 'http-cookie-agent/http';
import { CookieJar } from 'tough-cookie';
import * as logger from './Logger.js';
import MediaWiki from './MediaWiki.js';
import ApiURLDirector from './util/builders/url/api.director.js';
import urlHelper from './util/url.helper.js';
import { findFirstMatchingRule, renderDownloadError } from './error.manager.js';
import RedisStore from './RedisStore.js';

// imagemin plugin sets, keyed first by output flavour ('default' or 'webp')
// and then by the MIME type detected from the image bytes.
const imageminOptions = new Map();
imageminOptions.set('default', new Map());
imageminOptions.set('webp', new Map());
imageminOptions.get('default').set('image/png', {
    plugins: [imageminPngquant({ speed: 3, strip: true, dithering: 0 }), imageminAdvPng({ optimizationLevel: 4, iterations: 5 })],
});
imageminOptions.get('default').set('image/jpeg', {
    plugins: [imageminJpegoptim({ max: 60, stripAll: true })],
});
imageminOptions.get('default').set('image/gif', {
    plugins: [imageminGifsicle({ optimizationLevel: 3, colors: 64 })],
});
imageminOptions.get('webp').set('image/png', {
    plugins: [imageminWebp({ quality: 50, method: 6 })],
});
imageminOptions.get('webp').set('image/jpeg', {
    plugins: [imageminWebp({ quality: 50, method: 6 })],
});

/**
 * Error raised when an API call returned a payload which itself reports an error.
 * Carries the request/response context so that retry and error-matching logic
 * can inspect it.
 */
export class DownloadError extends Error {
    urlCalled;
    httpReturnCode;
    responseContentType;
    responseData;
    /**
     * @param {string} message - Human readable description.
     * @param {string} urlCalled - URL that was requested.
     * @param {number} httpReturnCode - HTTP status of the response.
     * @param {string|null} responseContentType - Content-Type header of the response, if any.
     * @param {*} responseData - Response body.
     */
    constructor(message, urlCalled, httpReturnCode, responseContentType, responseData) {
        super(message);
        this.name = 'DownloadError';
        this.urlCalled = urlCalled;
        this.httpReturnCode = httpReturnCode;
        this.responseContentType = responseContentType;
        this.responseData = responseData;
        if ('captureStackTrace' in Error) {
            // Avoid DownloadError itself in the stack trace
            Error.captureStackTrace(this, DownloadError);
        }
    }
}

/**
 * Downloader is a class providing content retrieval functionalities for both Mediawiki and S3 remote instances.
 */
class Downloader {
    static instance;
    /**
     * Return the shared Downloader, creating it on first use.
     * @returns {Downloader}
     */
    static getInstance() {
        if (!Downloader.instance) {
            Downloader.instance = new Downloader();
        }
        return Downloader.instance;
    }
    _speed;
    cssDependenceUrls = {};
    _webp = false;
    _requestTimeout;
    _basicRequestOptions;
    _arrayBufferRequestOptions;
    _jsonRequestOptions;
    _streamRequestOptions;
    wikimediaMobileJsDependenciesList = [];
    wikimediaMobileStyleDependenciesList = [];
    uaString;
    backoffOptions;
    optimisationCacheUrl;
    s3;
    _apiUrlDirector;
    cookierJar;
    articleUrlDirector;
    mainPageUrlDirector;
    insecure = false;
    get speed() {
        return this._speed;
    }
    get webp() {
        return this._webp;
    }
    get requestTimeout() {
        return this._requestTimeout;
    }
    get basicRequestOptions() {
        return this._basicRequestOptions;
    }
    get arrayBufferRequestOptions() {
        return this._arrayBufferRequestOptions;
    }
    get jsonRequestOptions() {
        return this._jsonRequestOptions;
    }
    get streamRequestOptions() {
        return this._streamRequestOptions;
    }
    get apiUrlDirector() {
        return this._apiUrlDirector;
    }
    /**
     * (Re)configure the downloader. Assigning to `init` first resets all state, then
     * builds the retry options and the request option presets.
     *
     * @param {object} options
     * @param {string} options.uaString - Value sent as the `user-agent` header.
     * @param {*} options.speed - Stored and exposed through the `speed` getter.
     * @param {number} options.reqTimeout - Request timeout, used for both the axios `timeout` and the abort signal.
     * @param {*} options.optimisationCacheUrl - When set, images are fetched through the S3 cache path.
     * @param {object} options.s3 - Object exposing `downloadBlob` / `uploadBlob`.
     * @param {boolean} options.webp - Whether eligible images are converted to WebP.
     * @param {object} [options.backoffOptions] - Overrides merged on top of the default retry options.
     * @param {boolean} options.insecure - When true, TLS certificates are not verified.
     */
    set init({ uaString, speed, reqTimeout, optimisationCacheUrl, s3, webp, backoffOptions, insecure }) {
        this.reset();
        this.uaString = uaString;
        this._speed = speed;
        this._requestTimeout = reqTimeout;
        this.optimisationCacheUrl = optimisationCacheUrl;
        this._webp = webp;
        this.s3 = s3;
        this._apiUrlDirector = new ApiURLDirector(MediaWiki.actionApiUrl.href);
        this.insecure = insecure;
        this.cookierJar = new CookieJar();
        this.backoffOptions = {
            // retry up to 10 times, with a minimum of 1sec, maximum of 1min
            // this means we retry for up to about 6 mins, which is supposed
            // to be sufficient for backend to recover from transient errors
            strategy: new backoff.ExponentialStrategy({ initialDelay: 1000, maxDelay: 60000 }),
            failAfter: 10,
            retryIf: (err) => {
                const requestedUrl = err.urlCalled || err.config?.url || 'unknown';
                if (err instanceof AxiosError && err.code && !['ERR_BAD_REQUEST', 'ERR_BAD_RESPONSE'].includes(err.code)) {
                    logger.log(`Retrying ${requestedUrl} URL due to ${err.code} error`);
                    return true; // retry all connection issues
                }
                if (err.responseData?.error?.code === 'maxlag') {
                    logger.log(`Mediawiki server is lagging ${err.responseData?.error?.lag}s; retrying in few seconds`);
                    return true; // note that we do not honor Retry-After header value because it is not possible in current code architecture
                }
                const httpReturnCode = err.response?.status || err.httpReturnCode;
                if ([429, 500, 502, 503, 504, 524].includes(httpReturnCode)) {
                    logger.log(`Retrying ${requestedUrl} URL due to HTTP ${httpReturnCode} error`);
                    return true; // retry these HTTP status codes
                }
                if (err.responseData?.error?.code &&
                    ![
                        'missingtitle',
                        'readapidenied',
                        'permissiondenied',
                        'internal_api_error_MediaWiki\\Revision\\BadRevisionException',
                        'internal_api_error_Wikimedia\\Assert\\UnreachableException',
                        'internal_api_error_Wikimedia\\Assert\\InvariantException',
                        'internal_api_error_Wikimedia\\Parsoid\\Core\\ResourceLimitExceededException',
                    ].includes(err.responseData?.error?.code)) {
                    logger.log(`Retrying ${requestedUrl} URL due to ${err.responseData?.error?.code} Mediawiki error`);
                    return true; // retry these Mediawiki codes which are known to be transient
                }
                return false; // don't retry other errors
            },
            backoffHandler: (number, delay) => {
                logger.info(`[backoff] #${number} after ${delay} ms`);
            },
            ...backoffOptions,
        };
        this._basicRequestOptions = {
            // HTTP agent pools with 'keepAlive' to reuse TCP connections, so it's faster
            // Set cookie jar and use special Http(s)CookieAgent so that cookies are automatically intercepted and persisted across calls
            httpAgent: new HttpCookieAgent({ cookies: { jar: this.cookierJar }, keepAlive: true }),
            httpsAgent: new HttpsCookieAgent({ cookies: { jar: this.cookierJar }, keepAlive: true, rejectUnauthorized: !this.insecure }), // rejectUnauthorized: false disables TLS certificate verification
            timeout: this.requestTimeout,
            headers: {
                // Use the base domain of the wiki being scraped as the Referer header, so that we can
                // successfully scrap WMF map tiles.
                Referer: MediaWiki.baseUrl.href,
                'cache-control': 'public, max-stale=86400',
                'user-agent': this.uaString,
            },
            validateStatus(status) {
                return (status >= 200 && status < 300) || status === 304;
            },
        };
        // NOTE: the spread is shallow, so this preset shares its `headers` object with
        // `_basicRequestOptions`; never mutate it, build per-request headers instead.
        this._arrayBufferRequestOptions = {
            ...this.basicRequestOptions,
            responseType: 'arraybuffer',
            method: 'GET',
        };
        this._jsonRequestOptions = {
            ...this.basicRequestOptions,
            headers: {
                ...this.basicRequestOptions.headers,
                accept: 'application/json',
                'accept-encoding': 'gzip, deflate',
            },
            responseType: 'json',
            method: 'GET',
        };
        this._streamRequestOptions = {
            ...this.basicRequestOptions,
            headers: {
                ...this.basicRequestOptions.headers,
                accept: 'application/octet-stream',
                'accept-encoding': 'gzip, deflate',
            },
            responseType: 'stream',
            method: 'GET',
        };
    }
    /**
     * Restore every configurable field to its initial (unconfigured) value.
     */
    reset() {
        this.uaString = undefined;
        this._speed = undefined;
        this._requestTimeout = undefined;
        this.optimisationCacheUrl = undefined;
        this._webp = false;
        this.s3 = undefined;
        this._apiUrlDirector = undefined;
        this.insecure = false;
        // Drop the cookie jar as well so that no cookie survives a reset
        this.cookierJar = undefined;
        this.backoffOptions = undefined;
        this._basicRequestOptions = undefined;
        this._arrayBufferRequestOptions = undefined;
        this._jsonRequestOptions = undefined;
        this._streamRequestOptions = undefined;
        this.cssDependenceUrls = {};
        this.wikimediaMobileJsDependenciesList = [];
        this.wikimediaMobileStyleDependenciesList = [];
        this.articleUrlDirector = undefined;
        this.mainPageUrlDirector = undefined;
    }
    /**
     * Pick the MediaWiki URL director matching a renderer, based on the renderer class name.
     * @param {object} renderer - Renderer instance.
     * @returns {object} The matching URL director held by MediaWiki.
     * @throws {Error} When the renderer class name is not one of the known renderers.
     */
    getUrlDirector(renderer) {
        switch (renderer.constructor.name) {
            case 'WikimediaDesktopRenderer':
                return MediaWiki.wikimediaDesktopUrlDirector;
            case 'VisualEditorRenderer':
                return MediaWiki.visualEditorUrlDirector;
            case 'WikimediaMobileRenderer':
                return MediaWiki.wikimediaMobileUrlDirector;
            case 'RestApiRenderer':
                return MediaWiki.restApiUrlDirector;
            case 'ActionParseRenderer':
                return MediaWiki.actionParseUrlDirector;
            /* istanbul ignore next */
            default:
                throw new Error(`Unknown renderer ${renderer.constructor.name}`);
        }
    }
    /**
     * Store the URL directors to use for regular articles and for the main page.
     * @param {object} mainPageRenderer - Renderer used for the main page.
     * @param {object} articlesRenderer - Renderer used for articles.
     */
    setUrlsDirectors(mainPageRenderer, articlesRenderer) {
        this.articleUrlDirector = this.getUrlDirector(articlesRenderer);
        this.mainPageUrlDirector = this.getUrlDirector(mainPageRenderer);
    }
    /**
     * Build the download URL of an article with the article URL director.
     * @param {string} articleId
     * @param {object} [articleUrlOpts={}] - Options forwarded to `buildArticleURL`.
     */
    getArticleUrl(articleId, articleUrlOpts = {}) {
        return this.articleUrlDirector.buildArticleURL(articleId, articleUrlOpts);
    }
    /**
     * Build the download URL of the main page with the main page URL director.
     * @param {string} articleId
     */
    getMainPageUrl(articleId) {
        return this.mainPageUrlDirector.buildArticleURL(articleId);
    }
    /**
     * Remove whatever WEAK_ETAG_REGEX matches from an ETag.
     * @param {string|undefined} etag
     * @returns {string|undefined} The cleaned ETag, or the falsy input unchanged.
     */
    removeEtagWeakPrefix(etag) {
        return etag && etag.replace(WEAK_ETAG_REGEX, '');
    }
    /**
     * Fetch the site info JSON from the action API.
     * @returns {Promise<object>}
     */
    querySiteInfo() {
        return this.getJSON(this.apiUrlDirector.buildSiteInfoURL());
    }
    /**
     * Query details for a list of article titles, following `continue` tokens until
     * the API reports completion and deep-merging every partial response.
     *
     * @param {string[]} articleIds - Titles, joined with `|` in the query.
     * @param {boolean} [shouldGetThumbnail=false] - Whether to request the `pageimages` prop.
     * @returns {Promise<object>} Merged, normalized article details.
     */
    async getArticleDetailsIds(articleIds, shouldGetThumbnail = false) {
        let continuation;
        let finalProcessedResp;
        while (true) {
            const queryOpts = {
                ...(await this.getArticleQueryOpts(shouldGetThumbnail, true)),
                titles: articleIds.join('|'),
                ...((await MediaWiki.hasCoordinates()) ? { colimit: 'max' } : {}),
                ...(MediaWiki.getCategories
                    ? {
                        cllimit: 'max',
                        clshow: '!hidden',
                    }
                    : {}),
                ...continuation,
            };
            const reqUrl = this.apiUrlDirector.buildQueryURL(queryOpts);
            const resp = await this.getJSON(reqUrl);
            Downloader.handleMWWarningsAndErrors(resp);
            let processedResponse = resp.query?.pages ? normalizeMwResponse(resp.query) : {};
            if (resp.continue) {
                continuation = resp.continue;
                finalProcessedResp = finalProcessedResp === undefined ? processedResponse : deepmerge(finalProcessedResp, processedResponse);
            }
            else {
                // Sub-categories are only resolved once, on the last page of results
                if (MediaWiki.getCategories) {
                    processedResponse = await this.setArticleSubCategories(processedResponse);
                }
                finalProcessedResp = finalProcessedResp === undefined ? processedResponse : deepmerge(finalProcessedResp, processedResponse);
                break;
            }
        }
        return finalProcessedResp;
    }
    /**
     * Query details for one batch of non-redirect pages of a namespace (`allpages` generator),
     * following the per-prop raw continuation tokens until only the generator
     * continuation is left.
     *
     * @param {number|string} ns - Namespace to list.
     * @param {string} [gapcontinue=''] - Generator continuation from a previous call.
     * @returns {Promise<{articleDetails: object, gapContinue: (string|null)}>} Merged details and
     *   the last generator continuation seen (null when none was returned).
     */
    async getArticleDetailsNS(ns, gapcontinue = '') {
        let queryContinuation;
        let finalProcessedResp;
        let gCont = null;
        while (true) {
            const queryOpts = {
                ...(await this.getArticleQueryOpts()),
                ...((await MediaWiki.hasCoordinates()) ? { colimit: 'max' } : {}),
                ...(MediaWiki.getCategories
                    ? {
                        cllimit: 'max',
                        clshow: '!hidden',
                    }
                    : {}),
                rawcontinue: 'true',
                generator: 'allpages',
                gapfilterredir: 'nonredirects',
                gaplimit: 'max',
                gapnamespace: String(ns),
                gapcontinue,
            };
            if (queryContinuation) {
                queryOpts.cocontinue = queryContinuation?.coordinates?.cocontinue ?? queryOpts.cocontinue;
                queryOpts.clcontinue = queryContinuation?.categories?.clcontinue ?? queryOpts.clcontinue;
                queryOpts.picontinue = queryContinuation?.pageimages?.picontinue ?? queryOpts.picontinue;
                queryOpts.rdcontinue = queryContinuation?.redirects?.rdcontinue ?? queryOpts.rdcontinue;
            }
            const reqUrl = this.apiUrlDirector.buildQueryURL(queryOpts);
            const resp = await this.getJSON(reqUrl);
            Downloader.handleMWWarningsAndErrors(resp);
            let processedResponse = normalizeMwResponse(resp.query);
            gCont = resp['query-continue']?.allpages?.gapcontinue ?? gCont;
            // The batch is complete once no continuation other than the generator's own is pending
            const queryComplete = Object.keys(resp['query-continue'] || {}).filter((key) => key !== 'allpages').length === 0;
            if (!queryComplete) {
                queryContinuation = resp['query-continue'];
                finalProcessedResp = finalProcessedResp === undefined ? processedResponse : deepmerge(finalProcessedResp, processedResponse);
            }
            else {
                if (MediaWiki.getCategories) {
                    processedResponse = await this.setArticleSubCategories(processedResponse);
                }
                finalProcessedResp = finalProcessedResp === undefined ? processedResponse : deepmerge(finalProcessedResp, processedResponse);
                break;
            }
        }
        return {
            articleDetails: finalProcessedResp,
            gapContinue: gCont,
        };
    }
    /**
     * Fetch log events of a given type for an article.
     * @param {string} letype
     * @param {string} articleId
     * @returns {Promise<Array|undefined>} `query.logevents` of the response, if present.
     */
    async getLogEvents(letype, articleId) {
        const logEventsData = await this.getJSON(this.apiUrlDirector.buildLogEventsQuery(letype, articleId));
        return logEventsData.query?.logevents;
    }
    /**
     * Download and render one article. Download failures matching a known error rule
     * are replaced by a placeholder page and counted in `dump.status.articles`.
     *
     * @param {string} articleId
     * @param {object} articleDetailXId - Forwarded to the renderer.
     * @param {object} articleRenderer - Object exposing `download` and `render`.
     * @param {string} articleUrl
     * @param {object} dump - Dump state; its `status.articles` counters are updated on failure.
     * @param {object} articleDetail
     * @returns {Promise<Array>} Rendered article(s), or a single placeholder entry.
     * @throws Rethrows errors which are neither AxiosError nor DownloadError, download errors
     *   matching no rule, and an Error when too many articles hard-failed.
     */
    async getArticle(articleId, articleDetailXId, articleRenderer, articleUrl, dump, articleDetail) {
        logger.info(`Getting article [${articleId}] from ${articleUrl}`);
        try {
            const { data, moduleDependencies, redirects, displayTitle, articleSubtitle, bodyCssClass, htmlCssClass } = await articleRenderer.download({
                articleId,
                articleUrl,
                articleDetail,
            });
            // Cope with the fact that the page we are fetching might have been moved and replaced by a redirect
            // In such a case, the download above is expected to follow the redirect so that we have proper original
            // content at original article path, but we probably need to add a redirect since new article location
            // probably did not exist when listing articles (might have existed if the move occurred during article
            // listing). The redirect we add is hence in the "opposite" direction than usual, i.e. it will redirect
            // from new location to original location. Note that only ActionParse API gives proper redirects info.
            for (const redirect of redirects) {
                if (!(await RedisStore.articleDetailXId.exists(redirect.to)) && !(await RedisStore.redirectsXId.exists(redirect.to))) {
                    RedisStore.redirectsXId.set(redirect.to, { targetId: redirect.from, title: redirect.to, fragment: '' });
                }
            }
            return await articleRenderer.render({
                data,
                moduleDependencies,
                articleId,
                articleDetailXId,
                articleDetail,
                displayTitle,
                articleSubtitle,
                bodyCssClass,
                htmlCssClass,
                dump,
            });
        }
        catch (err) {
            let downloadErrorContext;
            if (err instanceof AxiosError) {
                // Every access is null-safe: a failure while building the context would
                // otherwise replace the real download error with an unrelated TypeError.
                downloadErrorContext = {
                    errorCode: err.code,
                    urlCalled: err.config?.url,
                    httpReturnCode: err.response?.status ?? err.status,
                    responseContentType: err.response?.headers?.['content-type']?.toString() ?? null,
                    responseData: err.response?.data,
                };
            }
            else if (err instanceof DownloadError) {
                downloadErrorContext = {
                    errorCode: null,
                    urlCalled: err.urlCalled,
                    httpReturnCode: err.httpReturnCode,
                    responseContentType: err.responseContentType,
                    responseData: err.responseData,
                };
            }
            if (!downloadErrorContext) {
                throw err;
            }
            logger.warn(`Article ${articleId} failed to download from '${downloadErrorContext.urlCalled}' with ` +
                `'${downloadErrorContext.errorCode}' error code, ` +
                `'${downloadErrorContext.httpReturnCode}' HTTP return code ` +
                `and '${downloadErrorContext.responseContentType}' content-type ` +
                `returned instead:\n${JSON.stringify(downloadErrorContext.responseData)}`);
            const errorRule = findFirstMatchingRule(downloadErrorContext);
            if (errorRule === null) {
                logger.error('This is a fatal download error, aborting');
                throw err;
            }
            if (errorRule.isHardFailure) {
                logger.log(`This is a hard ${errorRule.detailsMessageKey} error which will be replaced by a placeholder`);
                dump.status.articles.hardFail += 1;
                dump.status.articles.hardFailedArticleIds.push(articleId);
                if (dump.maxHardFailedArticles > 0 && dump.status.articles.hardFail > dump.maxHardFailedArticles) {
                    throw new Error('Too many articles failed to download');
                }
            }
            else {
                logger.log(`This is a soft ${errorRule.detailsMessageKey} error which will be replaced by a placeholder`);
                dump.status.articles.softFail += 1;
                dump.status.articles.softFailedArticleIds.push(articleId);
            }
            RedisStore.articleDetailXId.delete(articleId); // Remove article from list so that we stop creating links to this placeholder
            const articleTitle = articleId.replace(/_/g, ' ');
            const errorPlaceholderHtml = renderDownloadError(errorRule, dump, articleId, articleTitle);
            return [
                {
                    articleId,
                    displayTitle: articleTitle,
                    html: errorPlaceholderHtml,
                    imageDependencies: [],
                    videoDependencies: [],
                    mediaDependencies: [],
                    moduleDependencies: [],
                    staticFiles: config.output.downloadErrorResources,
                    subtitles: [],
                },
            ];
        }
    }
    /**
     * GET a URL and resolve with its JSON body, retrying according to the backoff options.
     * @param {string} _url - URL, passed through `urlHelper.deserializeUrl` first.
     * @returns {Promise<object>}
     */
    async getJSON(_url) {
        const url = urlHelper.deserializeUrl(_url);
        return new Promise((resolve, reject) => {
            this.backoffCall(this.getJSONCb, url, 'json', (err, val) => {
                if (err) {
                    const httpStatus = (err.response && err.response.status) || err.httpReturnCode;
                    logger.info(`Failed to get [${url}] [status=${httpStatus}]`);
                    reject(err);
                }
                else {
                    resolve(val);
                }
            });
        });
    }
    /**
     * Issue an axios request on top of the basic request options. Headers of `config` are
     * merged over the basic headers and the request is aborted after `requestTimeout` ms.
     * @param {object} config - axios request config.
     * @returns {Promise<object>} axios response.
     */
    async request(config) {
        return axios.request({
            ...this._basicRequestOptions,
            ...config,
            headers: {
                ...this._basicRequestOptions.headers,
                ...config?.headers,
            },
            signal: AbortSignal.timeout(this.requestTimeout),
        });
    }
    /**
     * Shorthand for a GET `request`.
     * @param {string} url
     * @param {object} [config] - Extra axios request config.
     */
    async get(url, config) {
        return this.request({ url, method: 'GET', ...config });
    }
    /**
     * Shorthand for a POST `request`.
     * @param {string} url
     * @param {*} data - Request body.
     * @param {object} [config] - Extra axios request config.
     */
    async post(url, data, config) {
        return this.request({ url, data, method: 'POST', ...config });
    }
    /**
     * Download arbitrary content, optionally with retries.
     *
     * @param {string} _url - URL to fetch; protocol-relative URLs get the wiki protocol.
     * @param {string} kind - Content kind; `'image'` enables compression / S3 caching.
     * @param {boolean} [retry=true] - Whether to go through the backoff logic.
     * @returns {Promise<{contentType: string, content: *}>}
     * @throws {Error} When `_url` is falsy, or the underlying download error.
     */
    async downloadContent(_url, kind, retry = true) {
        if (!_url) {
            throw new Error(`Parameter [${_url}] is not a valid url`);
        }
        let url = urlHelper.deserializeUrl(_url);
        if (url.startsWith('//')) {
            url = `${MediaWiki.baseUrl.protocol}${url}`;
        }
        try {
            // `await` is required here: without it a rejection bypasses the catch
            // block below and the failure is never logged.
            return await new Promise((resolve, reject) => {
                const cb = (err, val) => {
                    if (err) {
                        reject(err);
                    }
                    else {
                        resolve(val);
                    }
                };
                if (retry) {
                    this.backoffCall(this.getContentCb, url, kind, cb);
                }
                else {
                    this.getContentCb(url, kind, cb);
                }
            });
        }
        catch (err) {
            const httpStatus = err?.response?.status;
            logger.warn(`Failed to get [${url}] [status=${httpStatus}]`);
            throw err;
        }
    }
    /**
     * Tell whether a GET on the URL succeeds (no retry).
     * @param {string} url
     * @returns {Promise<boolean>}
     */
    async canGetUrl(url) {
        try {
            await this.request({ url, method: 'GET', ...this.arrayBufferRequestOptions });
            return true;
        }
        catch {
            return false;
        }
    }
    /**
     * Log warnings and errors found in a MediaWiki query response.
     * @param {object} resp - Parsed API response.
     * @throws {Error} When the error code equals DB_ERROR.
     */
    static handleMWWarningsAndErrors(resp) {
        if (resp.warnings)
            logger.warn(`Got warning from MW Query ${JSON.stringify(resp.warnings, null, '\t')}`);
        if (resp.error?.code === DB_ERROR)
            throw new Error(`Got error from MW Query ${JSON.stringify(resp.error, null, '\t')}`);
        if (resp.error)
            logger.log(`Got error from MW Query ${JSON.stringify(resp.error, null, '\t')}`);
    }
    /**
     * Build the base options of an article details query.
     * @param {boolean} [includePageimages=false] - Add the `pageimages` prop.
     * @param {boolean} [followRedirects=false] - Set `redirects: true`.
     * @returns {Promise<object>} `MediaWiki.queryOpts` extended with `prop`, `formatversion` and `redirects`.
     */
    async getArticleQueryOpts(includePageimages = false, followRedirects = false) {
        const prop = `${includePageimages ? '|pageimages' : ''}${(await MediaWiki.hasCoordinates()) ? '|coordinates' : ''}${MediaWiki.getCategories ? '|categories' : ''}`;
        return {
            ...MediaWiki.queryOpts,
            prop: MediaWiki.queryOpts.prop.concat(prop),
            formatversion: '2',
            redirects: followRedirects ? true : undefined,
        };
    }
    /**
     * Attach a `subCategories` list to every entry whose namespace is 14.
     * The input object is modified in place and returned.
     * @param {object} articleDetails - Details keyed by article id.
     * @returns {Promise<object>} The same `articleDetails` object.
     */
    async setArticleSubCategories(articleDetails) {
        logger.info('Getting subCategories');
        for (const [articleId, articleDetail] of Object.entries(articleDetails)) {
            const isCategoryArticle = articleDetail.ns === 14;
            if (isCategoryArticle) {
                const categoryMembers = await this.getSubCategories(articleId);
                articleDetails[articleId].subCategories = categoryMembers.slice();
            }
        }
        return articleDetails;
    }
    /**
     * Node-style worker used by the backoff logic to fetch JSON. A payload carrying an
     * `error` member is reported as a DownloadError so that `retryIf` can inspect it.
     * @param {string} url
     * @param {string} kind - Unused, present to share the signature of `getContentCb`.
     * @param {Function} handler - `(err, data)` callback.
     */
    getJSONCb = (url, kind, handler) => {
        logger.info(`Getting JSON from [${url}]`);
        this.request({ url, method: 'GET', ...this.jsonRequestOptions })
            .then((val) => {
            if (val.data.error) {
                // The content-type header may be absent; do not let that hide the API error
                handler(new DownloadError(`Error returned while calling API`, url, val.status, val.headers['content-type']?.toString() ?? null, val.data));
            }
            else {
                handler(null, val.data);
            }
        })
            .catch((err) => handler(err));
    };
    /**
     * Detect the MIME type of an image from its bytes.
     * @param {*} data - Image content.
     * @returns {Promise<string|null>} The detected MIME type, or null when unknown or
     *   detected as `application/xml`.
     */
    async getImageMimeType(data) {
        const fileType = await fileTypeFromBuffer(data);
        if (fileType && fileType.mime === 'application/xml') {
            // File type is known to be wrong, might be SVG
            return null;
        }
        return fileType ? fileType.mime : null;
    }
    /**
     * Compress a bitmap image, converting it to WebP when enabled and eligible.
     * Compression is best effort: on failure the original bytes are returned.
     * @param {{data: *}} input - Image content.
     * @returns {Promise<{data: *}>} Compressed (or original) content.
     */
    async getCompressedBody(input) {
        const contentType = await this.getImageMimeType(input.data);
        if (isBitmapImageMimeType(contentType)) {
            if (this.webp && isWebpCandidateImageMimeType(contentType)) {
                return {
                    data: await imagemin
                        .buffer(input.data, imageminOptions.get('webp').get(contentType))
                        .catch(async (err) => {
                        if (/Unsupported color conversion request/.test(err.stderr)) {
                            // Retry the WebP conversion after converting the image to sRGB
                            return imagemin
                                .buffer(await sharp(input.data).toColorspace('srgb').toBuffer(), imageminOptions.get('webp').get(contentType))
                                .catch(() => {
                                return input.data;
                            });
                        }
                        else {
                            // Fall back to the regular (non WebP) optimisation
                            return imagemin.buffer(input.data, imageminOptions.get('default').get(contentType)).catch(() => {
                                return input.data;
                            });
                        }
                    }),
                };
            }
            else {
                return {
                    data: await imagemin.buffer(input.data, imageminOptions.get('default').get(contentType)).catch(() => {
                        return input.data;
                    }),
                };
            }
        }
        return {
            data: input.data,
        };
    }
    /**
     * Node-style worker used by the backoff logic to fetch arbitrary content.
     * Images go through the S3 cache when `optimisationCacheUrl` is set, otherwise
     * they are downloaded and compressed directly.
     * @param {string} url
     * @param {string} kind - `'image'` enables image specific handling.
     * @param {Function} handler - `(err, {contentType, content})` callback.
     */
    getContentCb = async (url, kind, handler) => {
        logger.info(`Downloading [${url}]`);
        try {
            if (this.optimisationCacheUrl && kind === 'image') {
                await this.downloadImage(url, handler);
            }
            else {
                const resp = await this.request({ url, method: 'GET', ...this.arrayBufferRequestOptions });
                // If content is an image, we might benefit from compressing it
                const content = kind === 'image' ? (await this.getCompressedBody({ data: resp.data })).data : resp.data;
                // compute content-type from content, since getCompressedBody might have modified it
                const contentType = kind === 'image' ? (await this.getImageMimeType(content)) || resp.headers['content-type'] : resp.headers['content-type'];
                handler(null, {
                    contentType,
                    content,
                });
            }
        }
        catch (err) {
            try {
                this.errHandler(err, url, handler);
            }
            catch {
                handler(err);
            }
        }
    };
    /**
     * Fetch an image through the S3 cache: reuse the cached blob when upstream answers 304
     * to the cached ETag, otherwise download, compress and (when an ETag is available)
     * upload the fresh image to the cache.
     * @param {string} url
     * @param {Function} handler - `(err, {contentType, content})` callback.
     */
    async downloadImage(url, handler) {
        let s3Resp;
        try {
            // Check first if we have an entry in the (object storage) cache for this URL
            s3Resp = await this.s3.downloadBlob(stripHttpFromUrl(url), this.webp ? 'webp' : '1');
            // 'Versioning' of image is made via HTTP ETag. We should
            // check if we have the proper version by requesting proper
            // ETag from upstream MediaWiki.
            // The conditional header is set on a per-request copy: the preset headers
            // object is shared by all requests and must not carry one image's ETag.
            const cachedEtag = s3Resp?.Metadata?.etag;
            const requestOptions = cachedEtag
                ? {
                    ...this.arrayBufferRequestOptions,
                    headers: {
                        ...this.arrayBufferRequestOptions.headers,
                        'If-None-Match': this.removeEtagWeakPrefix(cachedEtag),
                    },
                }
                : this.arrayBufferRequestOptions;
            const mwResp = await this.request({ url, method: 'GET', ...requestOptions });
            // Most of the images, after having been uploaded once to the
            // cache, will always have 304 status, until modified. If cache
            // is up to date, return cached image. We always have an s3
            // response when mwResp is 304, since this can only happen
            // when we have an eTag coming from s3.
            if (mwResp.status === 304) {
                // Proceed with image
                const data = await this.streamToBuffer(s3Resp.Body);
                const contentType = await this.getImageMimeType(data);
                logger.info(`Using S3-cached image for ${url} (contentType: ${contentType})`);
                handler(null, {
                    contentType,
                    content: data,
                });
                return;
            }
            // Destroy the Readable so that socket is freed and returned to the pool
            if (s3Resp?.Body) {
                s3Resp.Body.destroy();
            }
            // Compress content because image blob comes from upstream MediaWiki
            const compressedData = (await this.getCompressedBody({ data: mwResp.data })).data;
            // Check for the ETag and upload to cache
            const etag = this.removeEtagWeakPrefix(mwResp.headers.etag);
            if (etag) {
                await this.s3.uploadBlob(stripHttpFromUrl(url), compressedData, etag, this.webp ? 'webp' : '1');
            }
            // get contentType from image, with fallback to response headers should the image be unsupported at all (e.g. SVG)
            const contentType = (await this.getImageMimeType(compressedData)) || mwResp.headers['content-type'];
            if (s3Resp) {
                logger.info(`Using image downloaded from upstream for ${url} (S3-cached image is outdated, contentType: ${contentType})`);
            }
            else {
                logger.info(`Using image downloaded from upstream for ${url} (no S3-cached image found, contentType: ${contentType})`);
            }
            // Proceed with image
            handler(null, {
                contentType,
                content: compressedData,
            });
        }
        catch (err) {
            // Release the cached blob stream as well when the upstream request failed
            s3Resp?.Body?.destroy?.();
            this.errHandler(err, url, handler);
        }
    }
    /**
     * Log a download failure and forward it to the callback.
     * @param {*} err
     * @param {string} url
     * @param {Function} handler - `(err)` callback.
     */
    errHandler(err, url, handler) {
        logger.info(`Error while downloading content for ${url} due to ${err} ; might be retried`);
        handler(err);
    }
    /**
     * List the category members of an article, recursively following `cmcontinue`.
     * @param {string} articleId
     * @param {string} [continueStr=''] - Continuation token.
     * @returns {Promise<Array>} Members having a title.
     */
    async getSubCategories(articleId, continueStr = '') {
        const apiUrlDirector = new ApiURLDirector(MediaWiki.actionApiUrl.href);
        const { query, continue: cont } = await this.getJSON(apiUrlDirector.buildSubCategoriesURL(articleId, continueStr));
        const items = query.categorymembers.filter((a) => a && a.title);
        if (cont && cont.cmcontinue) {
            const nextItems = await this.getSubCategories(articleId, cont.cmcontinue);
            return items.concat(nextItems);
        }
        else {
            return items;
        }
    }
    /**
     * Run a node-style worker through the `backoff` library with the configured
     * strategy, retry predicate and attempt limit.
     * NOTE(review): the strategy instance is shared by all calls and reset here, which
     * looks like it affects the delays of calls already in flight; confirm this is intended.
     * @param {Function} handler - Worker invoked as `handler(url, kind, cb)`.
     * @param {string} url
     * @param {string} kind
     * @param {Function} callback - `(err, val)` final callback.
     */
    backoffCall(handler, url, kind, callback) {
        this.backoffOptions.strategy.reset(); // reset delay to initial one at each call
        const call = backoff.call(handler, url, kind, callback);
        call.setStrategy(this.backoffOptions.strategy);
        call.retryIf(this.backoffOptions.retryIf);
        call.failAfter(this.backoffOptions.failAfter);
        call.on('backoff', this.backoffOptions.backoffHandler);
        call.start();
    }
    /**
     * Retrieve the JS / CSS module dependencies and the JS config vars of an article.
     * Mobile module dependencies are fetched once and then reused.
     * @param {string} title
     * @returns {Promise<{jsConfigVars: string, jsDependenciesList: Array, styleDependenciesList: Array}>}
     * @throws {Error} When the API reports an error other than `missingtitle` /
     *   `permissiondenied`, or when the mobile modules cannot be fetched.
     */
    async getModuleDependencies(title) {
        const genericJsModules = config.output.mw.js;
        const genericCssModules = config.output.mw.css;
        const apiUrlDirector = new ApiURLDirector(MediaWiki.actionApiUrl.href);
        const articleApiUrl = apiUrlDirector.buildArticleApiURL(title);
        const articleData = await this.getJSON(articleApiUrl);
        if (articleData.error) {
            const errorMessage = `Unable to retrieve js/css dependencies for article '${title}': ${articleData.error.code}`;
            logger.error(errorMessage);
            /* If article is missing (for example because it just has been deleted) or access is denied */
            if (articleData.error.code === 'missingtitle' || articleData.error.code === 'permissiondenied') {
                return { jsConfigVars: '', jsDependenciesList: [], styleDependenciesList: [] };
            }
            /* Something went wrong in modules retrieval at app level (no HTTP error) */
            throw new Error(errorMessage);
        }
        const { parse: { modules, modulescripts, modulestyles, headhtml }, } = articleData;
        const jsDependenciesList = genericJsModules.concat(modules, modulescripts).filter((a) => a);
        const styleDependenciesList = []
            .concat(modules, modulestyles, genericCssModules)
            .filter((a) => a)
            .filter((oneStyleDep) => !contains(config.filters.blackListCssModules, oneStyleDep));
        logger.info(`Js dependencies of ${title} : ${jsDependenciesList}`);
        logger.info(`Css dependencies of ${title} : ${styleDependenciesList}`);
        const jsConfigVars = Downloader.extractJsConfigVars(headhtml);
        // Download mobile page dependencies only once
        if ((await MediaWiki.hasWikimediaMobileApi()) && this.wikimediaMobileJsDependenciesList.length === 0 && this.wikimediaMobileStyleDependenciesList.length === 0) {
            try {
                // TODO: An arbitrary title can be placed since all Wikimedia wikis have the same mobile offline resources
                const mobileModulesData = await this.getJSON(`${MediaWiki.mobileModulePath}Test`);
                mobileModulesData.forEach((module) => {
                    if (module.includes('javascript')) {
                        this.wikimediaMobileJsDependenciesList.push(module);
                    }
                    else if (module.includes('css')) {
                        this.wikimediaMobileStyleDependenciesList.push(module);
                    }
                });
            }
            catch (err) {
                // Keep the original error reachable through `cause`
                throw new Error(`Error getting mobile modules ${err.message}`, { cause: err });
            }
        }
        return {
            jsConfigVars,
            jsDependenciesList: jsDependenciesList.concat(this.wikimediaMobileJsDependenciesList),
            styleDependenciesList: styleDependenciesList.concat(this.wikimediaMobileStyleDependenciesList),
        };
    }
    /**
     * Collect a readable stream into a single Buffer.
     * Solution to handle aws js sdk v3 from https://github.com/aws/aws-sdk-js-v3/issues/1877
     * @param {object} stream - Stream emitting `data`, `error` and `end` events.
     * @returns {Promise<Buffer>}
     */
    async streamToBuffer(stream) {
        return new Promise((resolve, reject) => {
            const chunks = [];
            stream.on('data', (chunk) => chunks.push(chunk));
            stream.on('error', reject);
            stream.on('end', () => resolve(Buffer.concat(chunks)));
        });
    }
    /**
     * Extract the JS config vars from the head HTML returned by the API.
     * The last `<script>` tag containing either an `mw.config.set({...});` call or one of
     * the RLCONF / RLSTATE / RLPAGEMODULES markers wins.
     * @param {string} headhtml - Head HTML of the page.
     * @returns {string} JS snippet, or an empty string when nothing was found.
     */
    static extractJsConfigVars(headhtml) {
        let jsConfigVars = '';
        // Saving, as a js module, the jsconfigvars that are set in the header of a wikipedia page
        // the script below extracts the config with a regex executed on the page header returned from the api
        const scriptTags = domino.createDocument(`${headhtml}</body></html>`).getElementsByTagName('script');
        // Deliberately not global: a /g regex keeps `lastIndex` between calls, which made
        // matching of a second script tag start from a stale offset.
        const regex = /mw\.config\.set\(\{.*?\}\);/m;
        for (let i = 0; i < scriptTags.length; i += 1) {
            const scriptText = scriptTags[i].text;
            if (scriptText.includes('mw.config.set')) {
                // The substring test above does not guarantee a full regex match
                const configCall = scriptText.match(regex)?.[0] ?? '';
                jsConfigVars = `(window.RLQ=window.RLQ||[]).push(function() {${configCall}});`;
            }
            else if (scriptText.includes('RLCONF') || scriptText.includes('RLSTATE') || scriptText.includes('RLPAGEMODULES')) {
                jsConfigVars = scriptText;
            }
        }
        jsConfigVars = jsConfigVars.replace('nosuchaction', 'view'); // to replace the wgAction config that is set to 'nosuchaction' from api but should be 'view'
        return jsConfigVars;
    }
}
export { Downloader as DownloaderClass };
const dl = Downloader.getInstance();
export default dl;
//# sourceMappingURL=Downloader.js.map