UNPKG

mwoffliner

Version:
651 lines 29.1 kB
import * as backoff from 'backoff';
import { config } from './config.js';
import { contains } from './util/index.js';
import deepmerge from 'deepmerge';
import * as domino from 'domino';
import { default as imagemin } from 'imagemin';
import imageminAdvPng from 'imagemin-advpng';
import axios from 'axios';
import { default as imageminPngquant } from 'imagemin-pngquant';
import imageminGifsicle from 'imagemin-gifsicle';
import imageminJpegoptim from 'imagemin-jpegoptim';
import imageminWebp from 'imagemin-webp';
import sharp from 'sharp';
import http from 'http';
import https from 'https';
import { fileTypeFromBuffer } from 'file-type';
import { normalizeMwResponse, DB_ERROR, WEAK_ETAG_REGEX, stripHttpFromUrl, isBitmapImageMimeType, isWebpCandidateImageMimeType } from './util/index.js';
import * as logger from './Logger.js';
import MediaWiki from './MediaWiki.js';
import ApiURLDirector from './util/builders/url/api.director.js';
import urlHelper from './util/url.helper.js';

// imagemin plugin sets, keyed first by output flavour ('default' keeps the
// source format, 'webp' converts to WebP) and then by the source MIME type.
const imageminOptions = new Map();
imageminOptions.set('default', new Map());
imageminOptions.set('webp', new Map());
imageminOptions.get('default').set('image/png', {
  plugins: [imageminPngquant({ speed: 3, strip: true, dithering: 0 }), imageminAdvPng({ optimizationLevel: 4, iterations: 5 })],
});
imageminOptions.get('default').set('image/jpeg', {
  plugins: [imageminJpegoptim({ max: 60, stripAll: true })],
});
imageminOptions.get('default').set('image/gif', {
  plugins: [imageminGifsicle({ optimizationLevel: 3, colors: 64 })],
});
imageminOptions.get('webp').set('image/png', {
  plugins: [imageminWebp({ quality: 50, method: 6 })],
});
imageminOptions.get('webp').set('image/jpeg', {
  plugins: [imageminWebp({ quality: 50, method: 6 })],
});

/**
 * Downloader is a class providing content retrieval functionalities for both Mediawiki and S3 remote instances.
 */
class Downloader {
  loginCookie = '';
  speed;
  cssDependenceUrls = {};
  webp = false;
  requestTimeout;
  basicRequestOptions;
  arrayBufferRequestOptions;
  jsonRequestOptions;
  streamRequestOptions;
  wikimediaMobileJsDependenciesList = [];
  wikimediaMobileStyleDependenciesList = [];
  uaString;
  activeRequests = 0;
  maxActiveRequests = 1;
  backoffOptions;
  optimisationCacheUrl;
  s3;
  apiUrlDirector;
  articleUrlDirector;
  mainPageUrlDirector;
  insecure = false;

  /**
   * @param {object} options
   * @param {string} options.uaString - Value sent as the `user-agent` header.
   * @param {number} options.speed - Multiplier; `maxActiveRequests` starts at `speed * 10`.
   * @param {number} options.reqTimeout - Request timeout, used both as the axios `timeout` and for the abort signal.
   * @param {string} [options.optimisationCacheUrl] - When set, images are fetched through `downloadImage` (S3 cache path).
   * @param {object} [options.s3] - Object exposing `downloadBlob` / `uploadBlob`.
   * @param {boolean} [options.webp] - Whether eligible bitmap images are converted to WebP.
   * @param {object} [options.backoffOptions] - Overrides merged over the default backoff settings.
   * @param {boolean} [options.insecure] - When true, TLS certificates are not verified.
   */
  constructor({ uaString, speed, reqTimeout, optimisationCacheUrl, s3, webp, backoffOptions, insecure }) {
    this.uaString = uaString;
    this.speed = speed;
    this.maxActiveRequests = speed * 10;
    this.requestTimeout = reqTimeout;
    this.loginCookie = '';
    this.optimisationCacheUrl = optimisationCacheUrl;
    this.webp = webp;
    this.s3 = s3;
    this.apiUrlDirector = new ApiURLDirector(MediaWiki.actionApiUrl.href);
    this.insecure = insecure;
    this.backoffOptions = {
      strategy: new backoff.ExponentialStrategy(),
      failAfter: 7,
      // Retry on client-side timeouts and on any status other than 400/403/404.
      retryIf: (err) => err.code === 'ECONNABORTED' || ![400, 403, 404].includes(err.response?.status),
      backoffHandler: (number, delay) => {
        logger.info(`[backoff] #${number} after ${delay} ms`);
      },
      ...backoffOptions,
    };
    this.basicRequestOptions = {
      // HTTP agent pools with 'keepAlive' to reuse TCP connections, so it's faster
      httpAgent: new http.Agent({ keepAlive: true }),
      httpsAgent: new https.Agent({ keepAlive: true, rejectUnauthorized: !this.insecure }),
      timeout: this.requestTimeout,
      headers: {
        'cache-control': 'public, max-stale=86400',
        'user-agent': this.uaString,
      },
      // 304 is accepted so that conditional (ETag) requests do not reject.
      validateStatus(status) {
        return (status >= 200 && status < 300) || status === 304;
      },
    };
    // NOTE: this spread is shallow, so `headers` here is the very same object
    // as `basicRequestOptions.headers`. Never mutate it; copy it per request.
    this.arrayBufferRequestOptions = {
      ...this.basicRequestOptions,
      responseType: 'arraybuffer',
      method: 'GET',
    };
    this.jsonRequestOptions = {
      ...this.basicRequestOptions,
      headers: {
        ...this.basicRequestOptions.headers,
        accept: 'application/json',
        'accept-encoding': 'gzip, deflate',
      },
      responseType: 'json',
      method: 'GET',
    };
    this.streamRequestOptions = {
      ...this.basicRequestOptions,
      headers: {
        ...this.basicRequestOptions.headers,
        accept: 'application/octet-stream',
        'accept-encoding': 'gzip, deflate',
      },
      responseType: 'stream',
      method: 'GET',
    };
  }

  /**
   * Maps a renderer instance to the matching MediaWiki URL director, based on
   * the renderer's constructor name.
   *
   * @param {object} renderer
   * @returns {object|undefined} The URL director, or `undefined` for an unknown renderer.
   */
  getUrlDirector(renderer) {
    switch (renderer.constructor.name) {
      case 'WikimediaDesktopRenderer':
        return MediaWiki.wikimediaDesktopUrlDirector;
      case 'VisualEditorRenderer':
        return MediaWiki.visualEditorUrlDirector;
      case 'WikimediaMobileRenderer':
        return MediaWiki.wikimediaMobileUrlDirector;
      case 'RestApiRenderer':
        return MediaWiki.restApiUrlDirector;
    }
  }

  /**
   * Sets the article and main-page URL directors, unless they are already set.
   *
   * @param {object} mainPageRenderer
   * @param {object} articlesRenderer
   */
  setUrlsDirectors(mainPageRenderer, articlesRenderer) {
    if (!this.articleUrlDirector) {
      this.articleUrlDirector = this.getUrlDirector(articlesRenderer);
    }
    if (!this.mainPageUrlDirector) {
      this.mainPageUrlDirector = this.getUrlDirector(mainPageRenderer);
    }
  }

  /** @returns {string} URL built by the article URL director for `articleId`. */
  getArticleUrl(articleId) {
    return this.articleUrlDirector.buildArticleURL(articleId);
  }

  /** @returns {string} URL built by the main-page URL director for `articleId`. */
  getMainPageUrl(articleId) {
    return this.mainPageUrlDirector.buildArticleURL(articleId);
  }

  /**
   * Removes whatever `WEAK_ETAG_REGEX` matches from an ETag.
   * Falsy input is returned unchanged.
   */
  removeEtagWeakPrefix(etag) {
    return etag && etag.replace(WEAK_ETAG_REGEX, '');
  }

  /** Fetches the site-info query URL as JSON. */
  query() {
    return this.getJSON(this.apiUrlDirector.buildSiteInfoQueryURL());
  }

  /**
   * Fetches details for the given article ids, following `continue` tokens
   * and deep-merging every batch into a single result.
   *
   * @param {string[]} articleIds - Joined with `|` into the `titles` parameter.
   * @param {boolean} [shouldGetThumbnail=false] - Adds `pageimages` to the requested props.
   * @returns {Promise<object>} Merged, normalized response.
   */
  async getArticleDetailsIds(articleIds, shouldGetThumbnail = false) {
    let continuation;
    let finalProcessedResp;
    while (true) {
      const queryOpts = {
        ...(await this.getArticleQueryOpts(shouldGetThumbnail, true)),
        titles: articleIds.join('|'),
        ...((await MediaWiki.hasCoordinates(this)) ? { colimit: 'max' } : {}),
        ...(MediaWiki.getCategories
          ? {
              cllimit: 'max',
              clshow: '!hidden',
            }
          : {}),
        ...(continuation || {}),
      };
      const reqUrl = this.apiUrlDirector.buildQueryURL(queryOpts);
      const resp = await this.getJSON(reqUrl);
      Downloader.handleMWWarningsAndErrors(resp);
      let processedResponse = resp.query ? normalizeMwResponse(resp.query) : {};
      const isLastBatch = !resp.continue;
      // Sub-categories are only resolved on the final batch.
      if (isLastBatch && MediaWiki.getCategories) {
        processedResponse = await this.setArticleSubCategories(processedResponse);
      }
      finalProcessedResp = finalProcessedResp === undefined ? processedResponse : deepmerge(finalProcessedResp, processedResponse);
      if (isLastBatch) {
        break;
      }
      continuation = resp.continue;
    }
    return finalProcessedResp;
  }

  /**
   * Fetches details for one `allpages` batch of a namespace, following the
   * non-`allpages` `query-continue` tokens until the batch is complete.
   *
   * @param {number|string} ns - Namespace, sent as `gapnamespace`.
   * @param {string} [gapcontinue=''] - `allpages` continuation token to start from.
   * @returns {Promise<{articleDetails: object, gapContinue: (string|null)}>}
   *   `gapContinue` is the last `allpages.gapcontinue` token seen, or `null`.
   */
  async getArticleDetailsNS(ns, gapcontinue = '') {
    let queryContinuation;
    let finalProcessedResp;
    let gCont = null;
    while (true) {
      const queryOpts = {
        ...(await this.getArticleQueryOpts()),
        ...((await MediaWiki.hasCoordinates(this)) ? { colimit: 'max' } : {}),
        ...(MediaWiki.getCategories
          ? {
              cllimit: 'max',
              clshow: '!hidden',
            }
          : {}),
        rawcontinue: 'true',
        generator: 'allpages',
        gapfilterredir: 'nonredirects',
        gaplimit: 'max',
        gapnamespace: String(ns),
        gapcontinue,
      };
      if (queryContinuation) {
        queryOpts.cocontinue = queryContinuation?.coordinates?.cocontinue ?? queryOpts.cocontinue;
        queryOpts.clcontinue = queryContinuation?.categories?.clcontinue ?? queryOpts.clcontinue;
        queryOpts.picontinue = queryContinuation?.pageimages?.picontinue ?? queryOpts.picontinue;
        queryOpts.rdcontinue = queryContinuation?.redirects?.rdcontinue ?? queryOpts.rdcontinue;
      }
      const reqUrl = this.apiUrlDirector.buildQueryURL(queryOpts);
      const resp = await this.getJSON(reqUrl);
      Downloader.handleMWWarningsAndErrors(resp);
      let processedResponse = normalizeMwResponse(resp.query);
      gCont = resp['query-continue']?.allpages?.gapcontinue ?? gCont;
      // The batch is complete once only the `allpages` token (if any) is left.
      const queryComplete = Object.keys(resp['query-continue'] || {}).filter((key) => key !== 'allpages').length === 0;
      if (queryComplete && MediaWiki.getCategories) {
        processedResponse = await this.setArticleSubCategories(processedResponse);
      }
      finalProcessedResp = finalProcessedResp === undefined ? processedResponse : deepmerge(finalProcessedResp, processedResponse);
      if (queryComplete) {
        break;
      }
      queryContinuation = resp['query-continue'];
    }
    return {
      articleDetails: finalProcessedResp,
      gapContinue: gCont,
    };
  }

  /**
   * Downloads the article JSON from `articleUrl` and hands it to the renderer.
   *
   * @returns {Promise<*>} Whatever `articleRenderer.render` returns.
   * @throws The `error` member of the response, as-is, when the response has one.
   */
  async getArticle(webp, _moduleDependencies, articleId, articleDetailXId, articleRenderer, articleUrl, dump, articleDetail, isMainPage) {
    logger.info(`Getting article [${articleId}] from ${articleUrl}`);
    const data = await this.getJSON(articleUrl);
    if (data.error) {
      throw data.error;
    }
    return articleRenderer.render({
      data,
      webp,
      _moduleDependencies,
      articleId,
      articleDetailXId,
      articleDetail,
      isMainPage,
      dump,
    });
  }

  /**
   * Fetches a URL as JSON with backoff retries, holding one request slot
   * until the call settles.
   *
   * @param {string} _url
   * @returns {Promise<*>} The response body.
   */
  async getJSON(_url) {
    const url = urlHelper.deserializeUrl(_url);
    await this.claimRequest();
    return new Promise((resolve, reject) => {
      this.backoffCall(this.getJSONCb, url, 'json', (err, val) => {
        this.releaseRequest();
        if (err) {
          const httpStatus = err.response && err.response.status;
          logger.warn(`Failed to get [${url}] [status=${httpStatus}]`);
          reject(err);
        } else {
          resolve(val);
        }
      });
    });
  }

  /**
   * Performs an axios request with the Referer and login cookie added, and
   * remembers any `set-cookie` header for subsequent requests.
   *
   * @param {object} config - axios request configuration.
   * @returns {Promise<object>} The axios response.
   */
  async request(config) {
    const resp = await axios.request({
      ...config,
      headers: {
        // Use the base domain of the wiki being scraped as the Referer header, so that we can
        // successfully scrap WMF map tiles.
        Referer: MediaWiki.baseUrl.href,
        // Set loginCookie if present (might be dynamic, so we need to override it at every call)
        cookie: this.loginCookie,
        ...config.headers,
      },
      signal: AbortSignal.timeout(this.requestTimeout),
    });
    // Store cookie if needed, so that we can pass it to next requests
    if (resp.headers['set-cookie']) {
      this.loginCookie = resp.headers['set-cookie'].join(';');
    }
    return resp;
  }

  /**
   * Downloads arbitrary content, optionally with backoff retries.
   *
   * @param {string} _url
   * @param {string} kind - Content kind; `'image'` enables compression / cache handling.
   * @param {boolean} [retry=true] - Whether to go through the backoff mechanism.
   * @returns {Promise<{contentType: string, content: *}>}
   * @throws {Error} When `_url` is falsy, or when the download fails.
   */
  async downloadContent(_url, kind, retry = true) {
    if (!_url) {
      throw new Error(`Parameter [${_url}] is not a valid url`);
    }
    const url = urlHelper.deserializeUrl(_url);
    await this.claimRequest();
    try {
      // `await` is required here: without it the `finally` below would free the
      // request slot before the download finishes, and the `catch` would never
      // observe a rejection.
      return await new Promise((resolve, reject) => {
        const cb = (err, val) => {
          if (err) {
            reject(err);
          } else {
            resolve(val);
          }
        };
        if (retry) {
          this.backoffCall(this.getContentCb, url, kind, cb);
        } else {
          this.getContentCb(url, kind, cb);
        }
      });
    } catch (err) {
      const httpStatus = err?.response?.status;
      logger.warn(`Failed to get [${url}] [status=${httpStatus}]`);
      throw err;
    } finally {
      this.releaseRequest();
    }
  }

  /**
   * @param {string} url
   * @returns {Promise<boolean>} `true` when a GET on `url` succeeds, `false` on any error.
   */
  async canGetUrl(url) {
    try {
      await this.request({ url, method: 'GET', ...this.arrayBufferRequestOptions });
      return true;
    } catch (err) {
      return false;
    }
  }

  /**
   * Logs warnings and errors found in a MediaWiki query response.
   *
   * @param {object} resp
   * @throws {Error} When `resp.error.code` equals `DB_ERROR`.
   */
  static handleMWWarningsAndErrors(resp) {
    if (resp.warnings) {
      logger.warn(`Got warning from MW Query ${JSON.stringify(resp.warnings, null, '\t')}`);
    }
    if (resp.error?.code === DB_ERROR) {
      throw new Error(`Got error from MW Query ${JSON.stringify(resp.error, null, '\t')}`);
    }
    if (resp.error) {
      // Log the error itself (this used to print `resp.warnings` by mistake).
      logger.log(`Got error from MW Query ${JSON.stringify(resp.error, null, '\t')}`);
    }
  }

  /**
   * Builds the base query options for article-detail requests.
   *
   * @param {boolean} [includePageimages=false] - Adds `pageimages` to `prop`.
   * @param {boolean} [redirects=false] - Sets `redirects: true` (otherwise `undefined`).
   * @returns {Promise<object>}
   */
  async getArticleQueryOpts(includePageimages = false, redirects = false) {
    const validNamespaceIds = MediaWiki.namespacesToMirror.map((ns) => MediaWiki.namespaces[ns].num);
    const prop = `${includePageimages ? '|pageimages' : ''}${(await MediaWiki.hasCoordinates(this)) ? '|coordinates' : ''}${MediaWiki.getCategories ? '|categories' : ''}`;
    return {
      ...MediaWiki.queryOpts,
      prop: MediaWiki.queryOpts.prop.concat(prop),
      rdnamespace: validNamespaceIds.join('|'),
      formatversion: '2',
      redirects: redirects ? true : undefined,
    };
  }

  /**
   * Adds a `subCategories` array to every entry whose `ns` is 14.
   * Mutates and returns `articleDetails`.
   *
   * @param {object} articleDetails - Map of article id to article detail.
   * @returns {Promise<object>}
   */
  async setArticleSubCategories(articleDetails) {
    logger.info('Getting subCategories');
    for (const [articleId, articleDetail] of Object.entries(articleDetails)) {
      const isCategoryArticle = articleDetail.ns === 14;
      if (isCategoryArticle) {
        const categoryMembers = await this.getSubCategories(articleId);
        articleDetails[articleId].subCategories = categoryMembers.slice();
      }
    }
    return articleDetails;
  }

  /**
   * Waits (polling every 200 ms) until a request slot is free, then takes it.
   *
   * @returns {Promise<null>}
   */
  async claimRequest() {
    while (this.activeRequests >= this.maxActiveRequests) {
      await new Promise((resolve) => {
        setTimeout(resolve, 200);
      });
    }
    this.activeRequests += 1;
    return null;
  }

  /**
   * Gives back a request slot taken with `claimRequest`.
   *
   * @returns {Promise<null>}
   */
  async releaseRequest() {
    this.activeRequests -= 1;
    return null;
  }

  /**
   * Node-style callback adapter used with `backoffCall` for JSON requests.
   *
   * NOTE(review): `handler` is also passed as the rejection callback of
   * `.then`, so a failed request calls `handler(err)` directly and the
   * `.catch` below only runs if `handler` itself throws. The 429 slow-down in
   * there is therefore not reached for ordinary HTTP failures. Kept as-is
   * because routing every failure into that branch would never call `handler`
   * for statuses other than 429/404.
   */
  getJSONCb = (url, kind, handler) => {
    logger.info(`Getting JSON from [${url}]`);
    this.request({ url, method: 'GET', ...this.jsonRequestOptions })
      .then((a) => handler(null, a.data), handler)
      .catch((err) => {
        try {
          if (err.response && err.response.status === 429) {
            logger.log('Received a [status=429], slowing down');
            const newMaxActiveRequests = Math.max(this.maxActiveRequests - 1, 1);
            logger.log(`Setting maxActiveRequests from [${this.maxActiveRequests}] to [${newMaxActiveRequests}]`);
            this.maxActiveRequests = newMaxActiveRequests;
            return this.getJSONCb(url, kind, handler);
          } else if (err.response && err.response.status === 404) {
            handler(err);
          }
        } catch (a) {
          logger.log('ERR', err);
          handler(err);
        }
      });
  };

  /**
   * @param {Buffer|Uint8Array|ArrayBuffer} data
   * @returns {Promise<string|null>} MIME type detected from the bytes, or `null` when unknown.
   */
  async getImageMimeType(data) {
    const fileType = await fileTypeFromBuffer(data);
    return fileType ? fileType.mime : null;
  }

  /**
   * Compresses a bitmap image, converting it to WebP when enabled and the
   * type is a WebP candidate. Non-bitmap input is returned untouched, and a
   * failing optimiser falls back to the original bytes.
   *
   * @param {{data: Buffer}} input
   * @returns {Promise<{data: Buffer}>}
   */
  async getCompressedBody(input) {
    const contentType = await this.getImageMimeType(input.data);
    if (!isBitmapImageMimeType(contentType)) {
      return {
        data: input.data,
      };
    }
    const keepOriginal = () => input.data;
    if (this.webp && isWebpCandidateImageMimeType(contentType)) {
      return {
        data: await imagemin.buffer(input.data, imageminOptions.get('webp').get(contentType)).catch(async (err) => {
          if (/Unsupported color conversion request/.test(err.stderr)) {
            // Convert to sRGB first, then retry the WebP conversion.
            return imagemin.buffer(await sharp(input.data).toColorspace('srgb').toBuffer(), imageminOptions.get('webp').get(contentType)).catch(keepOriginal);
          }
          // Any other WebP failure: fall back to the same-format optimiser.
          return imagemin.buffer(input.data, imageminOptions.get('default').get(contentType)).catch(keepOriginal);
        }),
      };
    }
    return {
      data: await imagemin.buffer(input.data, imageminOptions.get('default').get(contentType)).catch(keepOriginal),
    };
  }

  /**
   * Node-style callback adapter used with `backoffCall` for generic content.
   * Images go through `downloadImage` when an optimisation cache is
   * configured; otherwise the content is downloaded (and compressed if it is
   * an image) directly.
   */
  getContentCb = async (url, kind, handler) => {
    logger.info(`Downloading [${url}]`);
    try {
      if (this.optimisationCacheUrl && kind === 'image') {
        // Not awaited: downloadImage reports success and failure via `handler`.
        this.downloadImage(url, handler);
      } else {
        const resp = await this.request({ url, method: 'GET', ...this.arrayBufferRequestOptions });
        // If content is an image, we might benefit from compressing it
        const content = kind === 'image' ? (await this.getCompressedBody({ data: resp.data })).data : resp.data;
        // compute content-type from content, since getCompressedBody might have modified it
        const contentType = kind === 'image' ? (await this.getImageMimeType(content)) || resp.headers['content-type'] : resp.headers['content-type'];
        handler(null, {
          contentType,
          content,
        });
      }
    } catch (err) {
      try {
        this.errHandler(err, url, handler);
      } catch (a) {
        handler(err);
      }
    }
  };

  /**
   * Downloads an image through the S3 cache: re-validates the cached copy
   * against upstream with its ETag, serves the cached blob on 304, and
   * otherwise compresses the upstream image and uploads it to the cache.
   * The result (or error) is delivered through `handler`.
   *
   * @param {string} url
   * @param {Function} handler - Node-style callback `(err, {contentType, content})`.
   * @returns {Promise<void>}
   */
  async downloadImage(url, handler) {
    try {
      // Check first if we have an entry in the (object storage) cache for this URL
      const s3Resp = await this.s3.downloadBlob(stripHttpFromUrl(url), this.webp ? 'webp' : '1');

      // Per-request copy of the headers: the shared `arrayBufferRequestOptions`
      // must not be mutated, otherwise this image's ETag would be sent with
      // every later request.
      const requestOptions = {
        ...this.arrayBufferRequestOptions,
        headers: { ...this.arrayBufferRequestOptions.headers },
      };
      // 'Versioning' of image is made via HTTP ETag. We should
      // check if we have the proper version by requesting proper
      // ETag from upstream MediaWiki.
      if (s3Resp?.Metadata?.etag) {
        requestOptions.headers['If-None-Match'] = this.removeEtagWeakPrefix(s3Resp.Metadata.etag);
      }
      const mwResp = await this.request({ url, method: 'GET', ...requestOptions });

      // Most of the images, after having been uploaded once to the
      // cache, will always have 304 status, until modified. If cache
      // is up to date, return cached image. We always have an s3
      // response when mwResp is 304, since this can only happen
      // when we have an eTag coming from s3.
      if (mwResp.status === 304) {
        // Proceed with image
        const data = await this.streamToBuffer(s3Resp.Body);
        const contentType = await this.getImageMimeType(data);
        logger.info(`Using S3-cached image for ${url} (contentType: ${contentType})`);
        handler(null, {
          contentType,
          content: data,
        });
        return;
      }

      // Destroy the Readable so that socket is freed and returned to the pool
      if (s3Resp?.Body) {
        s3Resp.Body.destroy();
      }

      // Compress content because image blob comes from upstream MediaWiki
      const compressedData = (await this.getCompressedBody({ data: mwResp.data })).data;

      // Check for the ETag and upload to cache
      const etag = this.removeEtagWeakPrefix(mwResp.headers.etag);
      if (etag) {
        await this.s3.uploadBlob(stripHttpFromUrl(url), compressedData, etag, this.webp ? 'webp' : '1');
      }

      // get contentType from image, with fallback to response headers should the image be unsupported at all (e.g. SVG)
      const contentType = (await this.getImageMimeType(compressedData)) || mwResp.headers['content-type'];
      if (s3Resp) {
        logger.info(`Using image downloaded from upstream for ${url} (S3-cached image is outdated, contentType: ${contentType})`);
      } else {
        logger.info(`Using image downloaded from upstream for ${url} (no S3-cached image found, contentType: ${contentType})`);
      }

      // Proceed with image
      handler(null, {
        contentType,
        content: compressedData,
      });
    } catch (err) {
      this.errHandler(err, url, handler);
    }
  }

  /**
   * Common failure path for content downloads: lowers `maxActiveRequests`
   * (never below 1) on HTTP 429, logs, and forwards the error to `handler`.
   */
  errHandler(err, url, handler) {
    if (err.response && err.response.status === 429) {
      logger.log('Received a [status=429], slowing down');
      const newMaxActiveRequests = Math.max(this.maxActiveRequests - 1, 1);
      logger.log(`Setting maxActiveRequests from [${this.maxActiveRequests}] to [${newMaxActiveRequests}]`);
      this.maxActiveRequests = newMaxActiveRequests;
    }
    logger.log(`Not able to download content for ${url} due to ${err}`);
    handler(err);
  }

  /**
   * Fetches all category members of `articleId`, following `cmcontinue`.
   *
   * @param {string} articleId
   * @param {string} [continueStr=''] - Continuation token.
   * @returns {Promise<object[]>} Members that have a `title`.
   */
  async getSubCategories(articleId, continueStr = '') {
    const apiUrlDirector = new ApiURLDirector(MediaWiki.actionApiUrl.href);
    const { query, continue: cont } = await this.getJSON(apiUrlDirector.buildSubCategoriesURL(articleId, continueStr));
    const items = query.categorymembers.filter((a) => a && a.title);
    if (cont && cont.cmcontinue) {
      const nextItems = await this.getSubCategories(articleId, cont.cmcontinue);
      return items.concat(nextItems);
    } else {
      return items;
    }
  }

  /**
   * Runs `handler(url, kind, cb)` through the `backoff` library, configured
   * from `this.backoffOptions`; `callback` receives the final outcome.
   */
  backoffCall(handler, url, kind, callback) {
    const call = backoff.call(handler, url, kind, callback);
    call.setStrategy(this.backoffOptions.strategy);
    call.retryIf(this.backoffOptions.retryIf);
    call.failAfter(this.backoffOptions.failAfter);
    call.on('backoff', this.backoffOptions.backoffHandler);
    call.start();
  }

  /**
   * Retrieves the JS/CSS module dependencies and the JS config vars of an
   * article, plus (fetched once and memoised on the instance) the Wikimedia
   * mobile modules when that API is available.
   *
   * @param {string} title
   * @returns {Promise<{jsConfigVars: string, jsDependenciesList: string[], styleDependenciesList: string[]}>}
   * @throws {Error} On an API-level error other than `missingtitle`, or when
   *   the mobile modules cannot be fetched.
   */
  async getModuleDependencies(title) {
    const genericJsModules = config.output.mw.js;
    const genericCssModules = config.output.mw.css;
    /* These vars will store the list of js and css dependencies for
       the article we are downloading. */
    let jsConfigVars = '';
    let jsDependenciesList = [];
    let styleDependenciesList = [];
    const apiUrlDirector = new ApiURLDirector(MediaWiki.actionApiUrl.href);
    const articleApiUrl = apiUrlDirector.buildArticleApiURL(title);
    const articleData = await this.getJSON(articleApiUrl);
    if (articleData.error) {
      const errorMessage = `Unable to retrieve js/css dependencies for article '${title}': ${articleData.error.code}`;
      logger.error(errorMessage);
      /* If article is missing (for example because it just has been deleted) */
      if (articleData.error.code === 'missingtitle') {
        return { jsConfigVars, jsDependenciesList, styleDependenciesList };
      }
      /* Something went wrong in modules retrieval at app level (no HTTP error) */
      throw new Error(errorMessage);
    }
    const {
      parse: { modules, modulescripts, modulestyles, headhtml },
    } = articleData;
    jsDependenciesList = genericJsModules.concat(modules, modulescripts).filter((a) => a);
    styleDependenciesList = [].concat(modules, modulestyles, genericCssModules).filter((a) => a);
    styleDependenciesList = styleDependenciesList.filter((oneStyleDep) => !contains(config.filters.blackListCssModules, oneStyleDep));
    logger.info(`Js dependencies of ${title} : ${jsDependenciesList}`);
    logger.info(`Css dependencies of ${title} : ${styleDependenciesList}`);
    // Saving, as a js module, the jsconfigvars that are set in the header of a wikipedia page
    // the script below extracts the config with a regex executed on the page header returned from the api
    const scriptTags = domino.createDocument(`${headhtml}</body></html>`).getElementsByTagName('script');
    // Deliberately NOT global: a /g regex keeps `lastIndex` between `exec`
    // calls, which made matching on a second script tag start mid-string.
    const regex = /mw\.config\.set\(\{.*?\}\);/m;
    // eslint-disable-next-line @typescript-eslint/prefer-for-of
    for (let i = 0; i < scriptTags.length; i += 1) {
      const scriptText = scriptTags[i].text;
      if (scriptText.includes('mw.config.set')) {
        // `exec` returns null when the statement does not fit the pattern.
        jsConfigVars = regex.exec(scriptText)?.[0] ?? '';
        jsConfigVars = `(window.RLQ=window.RLQ||[]).push(function() {${jsConfigVars}});`;
      } else if (scriptText.includes('RLCONF') || scriptText.includes('RLSTATE') || scriptText.includes('RLPAGEMODULES')) {
        jsConfigVars = scriptText;
      }
    }
    jsConfigVars = jsConfigVars.replace('nosuchaction', 'view'); // to replace the wgAction config that is set to 'nosuchaction' from api but should be 'view'
    // Download mobile page dependencies only once
    if ((await MediaWiki.hasWikimediaMobileApi(this)) && this.wikimediaMobileJsDependenciesList.length === 0 && this.wikimediaMobileStyleDependenciesList.length === 0) {
      try {
        // TODO: An arbitrary title can be placed since all Wikimedia wikis have the same mobile offline resources
        const mobileModulesData = await this.getJSON(`${MediaWiki.mobileModulePath}Test`);
        for (const module of mobileModulesData) {
          if (module.includes('javascript')) {
            this.wikimediaMobileJsDependenciesList.push(module);
          } else if (module.includes('css')) {
            this.wikimediaMobileStyleDependenciesList.push(module);
          }
        }
      } catch (err) {
        throw new Error(`Error getting mobile modules ${err.message}`, { cause: err });
      }
    }
    return {
      jsConfigVars,
      jsDependenciesList: jsDependenciesList.concat(this.wikimediaMobileJsDependenciesList),
      styleDependenciesList: styleDependenciesList.concat(this.wikimediaMobileStyleDependenciesList),
    };
  }

  // Solution to handle aws js sdk v3 from https://github.com/aws/aws-sdk-js-v3/issues/1877
  /**
   * Collects a readable stream into a single Buffer.
   *
   * @param {import('stream').Readable} stream
   * @returns {Promise<Buffer>}
   */
  async streamToBuffer(stream) {
    return new Promise((resolve, reject) => {
      const chunks = [];
      stream.on('data', (chunk) => chunks.push(chunk));
      stream.on('error', reject);
      stream.on('end', () => resolve(Buffer.concat(chunks)));
    });
  }
}

export default Downloader;
//# sourceMappingURL=Downloader.js.map