UNPKG

mwoffliner

Version:

MediaWiki ZIM scraper

github.com/openzim/mwoffliner

openzim/mwoffliner

437 lines • 23.7 kB

JavaScript

import * as logger from '../Logger.js'; import Downloader from '../Downloader.js'; import RedisStore from '../RedisStore.js'; import { StringItem } from '@openzim/libzim'; import pmap from 'p-map'; import * as domino from 'domino'; import Timer from './Timer.js'; import { jsPath } from './index.js'; import { config } from '../config.js'; import { getSizeFromUrl, parseRetryAfterHeader } from './misc.js'; import { FILES_DOWNLOAD_FAILURE_MINIMUM_FOR_CHECK, FILES_DOWNLOAD_FAILURE_TRESHOLD_PER_TEN_THOUSAND, MAX_FILE_DOWNLOAD_RETRIES } from './const.js'; import urlHelper from './url.helper.js'; import RenderingContext from '../renderers/rendering.context.js'; import { fileDownloadMutex, zimCreatorMutex } from '../mutex.js'; import { truncateUtf8Bytes } from './misc.js'; import { isMainPage } from './articles.js'; import RedisQueue from './RedisQueue.js'; // Maximum delay between file download attempts on a given host // Default upload.wikimedia.org Retry-After value is 11 seconds const MAXIMUM_FILE_DOWNLOAD_DELAY = 20000; export async function downloadFiles(fileStore, zimCreator, dump) { let prevPercentProgress; // create structure + Redis queue to requests hosts in a responsible manner const hosts = new Map(); const filesTotal = await RedisStore.filesToDownloadXPath.len(); await RedisStore.filesToDownloadXPath.iterateItems(1, async (filesToDownload) => { for (const [path, { url, mult, width, kind }] of Object.entries(filesToDownload)) { const hostname = new URL(urlHelper.deserializeUrl(url)).hostname; if (!hosts.has(hostname)) { const filesToDownload = new RedisQueue(RedisStore.client, `${hostname}-files`); RedisStore.filesQueues.push(filesToDownload); filesToDownload.flush(); hosts.set(hostname, { filesToDownload, requestInterval: 10, // initial request interval is 10 ms downloadSuccess: 0, downloadFailure: 0, downloadsComplete: false, }); } hosts.get(hostname).filesToDownload.push({ path: path, url: url, mult: mult, width: width, kind: kind, downloadAttempts: 0, }); } }); await RedisStore.filesToDownloadXPath.flush(); /** * Return next file ready to download or wait if all hosts need to make a pause. * Return null when there is no more file to download. */ async function getNextFileToDownload() { const startPolling = Date.now(); // loop until we've found a file to download or list is empty while (true) { // ensure we are not in a dead loop forever (1 hour is way too much, but this is a safety net anyway) if (startPolling + 1000 * 60 * 60 < Date.now()) { logger.warn('No file to download for more than 1 hour, exiting the loop'); for (const hostData of hosts.values()) { if (hostData.downloadsComplete) { continue; } while (await hostData.filesToDownload.pop()) { dump.status.files.fail += 1; if (dump.status.files.fail > FILES_DOWNLOAD_FAILURE_MINIMUM_FOR_CHECK && (dump.status.files.fail * 10000) / filesTotal > FILES_DOWNLOAD_FAILURE_TRESHOLD_PER_TEN_THOUSAND) { throw new Error(`Too many files failed to download: [${dump.status.files.fail}/${filesTotal}]`); } } } return null; } // check if all donwloads have completed and exit const hostValues = Array.from(hosts.values()); const completedHosts = hostValues.reduce((buf, host) => { return host.downloadsComplete ? buf + 1 : 0; }, 0); if (completedHosts == hostValues.length) { return null; } for (const [hostname, hostData] of hosts.entries()) { // check conditions which leads to ignore current host if (hostData.downloadsComplete || (hostData.notBeforeDate && hostData.notBeforeDate > Date.now()) || (hostData.lastRequestDate && hostData.lastRequestDate + hostData.requestInterval > Date.now())) { continue; } // grab next item from Redis queue const fileToDownload = await hostData.filesToDownload.pop(); if (!fileToDownload) { hostData.downloadsComplete = true; continue; } // modify lastRequestDate immediately so that all workers are aware hostData.lastRequestDate = Date.now(); return { fileToDownload, hostData, hostname }; } // pause few milliseconds, no host has something to process (just to not burn CPU) await new Promise((resolve) => { setTimeout(resolve, 10); }); } } /** * Really try to download one file, handling potential errors received to slow down host, stop scraper, ignore failed file * @param fileToDownload information about the file to download * @param hostname hostname to process * @param hostData data about the host to process * @param workerId ID of worker currently processing this download */ async function workerDownloadFile(fileToDownload, hostname, hostData, workerId) { if ((dump.status.files.success + dump.status.files.fail) % (10 * Downloader.speed) === 0) { const percentProgress = (((dump.status.files.success + dump.status.files.fail) / filesTotal) * 100).toFixed(1); if (percentProgress !== prevPercentProgress) { prevPercentProgress = percentProgress; logger.log(`Progress downloading files [${dump.status.files.success + dump.status.files.fail}/${filesTotal}] [${percentProgress}%]`); } if (dump.status.files.fail > FILES_DOWNLOAD_FAILURE_MINIMUM_FOR_CHECK && (dump.status.files.fail * 10000) / filesTotal > FILES_DOWNLOAD_FAILURE_TRESHOLD_PER_TEN_THOUSAND) { throw new Error(`Too many files failed to download: [${dump.status.files.fail}/${filesTotal}]`); } } fileToDownload.downloadAttempts += 1; logger.info(`Worker ${workerId} downloading ${urlHelper.deserializeUrl(fileToDownload.url)} (${fileToDownload.kind})`); await Downloader.downloadContent(fileToDownload.url, fileToDownload.kind, false) .then(async (resp) => { if (resp && resp.content && resp.contentType) { // { FRONT_ARTICLE: 0 } is here very important, should we retrieve HTML we want to be sure the libzim will // not consider it for title index const item = new StringItem(fileToDownload.path, resp.contentType, null, { FRONT_ARTICLE: 0 }, resp.content); await zimCreatorMutex.runExclusive(() => zimCreator.addItem(item)); dump.status.files.success += 1; hostData.downloadSuccess += 1; } else { throw new Error(`Bad response received: ${resp}`); } }) .catch(async (err) => { if (fileToDownload.downloadAttempts > MAX_FILE_DOWNLOAD_RETRIES || (err.response && err.response.status == 404)) { logger.warn(`Error downloading file [${urlHelper.deserializeUrl(fileToDownload.url)}] [status=${err.response?.status}], skipping`); dump.status.files.fail += 1; hostData.downloadFailure += 1; } else { if (err.response) { const retryAfterHeader = err.response.headers['retry-after']?.toString(); if (retryAfterHeader) { const retryDate = parseRetryAfterHeader(retryAfterHeader); if (retryDate) { if (retryDate > Date.now() + MAXIMUM_FILE_DOWNLOAD_DELAY) { logger.log(`Received a [Retry-After=${retryAfterHeader}] on ${hostname} but this is too far away, ignoring`); } else { hostData.notBeforeDate = retryDate; logger.log(`Received a [Retry-After=${retryAfterHeader}], pausing down ${hostname} until ${hostData.notBeforeDate}`); } } else { logger.warn(`Received a [Retry-After=${retryAfterHeader}] from ${hostname} but failed to interpret it`); } } } // slow down except for wikimedia thumbnails whose server is known to be lying (see https://github.com/openzim/mwoffliner/issues/2572) if (err.response && [429, 503, 524].includes(err.response.status) && !urlHelper.deserializeUrl(fileToDownload.url).match(/^https?:\/\/upload\.wikimedia\.org\/.*\/thumb\//)) { hostData.requestInterval = Math.min(MAXIMUM_FILE_DOWNLOAD_DELAY, hostData.requestInterval * 1.2); // 1.2 is arbitrary value to progressively slow requests to host down logger.log(`Received a [status=${err.response.status}], slowing down ${hostname} to ${hostData.requestInterval}ms interval`); } await hostData.filesToDownload.push(fileToDownload); } }); } await pmap(Array.from({ length: Downloader.speed }, (_, i) => i), async (workerId) => { while (true) { // get next file to download in a Mutex (we do not want two workers trying to get next file at same time // since we need to take into account limits per hostname, so this getNextFileToDownload will update // data about load per host) const nextFileData = await fileDownloadMutex.runExclusive(getNextFileToDownload); if (!nextFileData) break; const { fileToDownload, hostname, hostData } = nextFileData; await workerDownloadFile(fileToDownload, hostname, hostData, workerId); } }, { concurrency: Downloader.speed }); logger.log(`Done with downloading ${filesTotal} files: ${dump.status.files.success} success, ${dump.status.files.fail} fail: `, JSON.stringify(Object.fromEntries([...hosts].map(([hostname, hostData]) => [hostname, { success: hostData.downloadSuccess, fail: hostData.downloadFailure }])), null, '\t')); } async function getAllArticlesToKeep(articleDetailXId, dump, mainPageRenderer, articlesRenderer) { await articleDetailXId.iterateItems(Downloader.speed, async (articleKeyValuePairs) => { for (const [articleId, articleDetail] of Object.entries(articleKeyValuePairs)) { let rets; try { const mainPage = isMainPage(articleId); const renderer = mainPage ? mainPageRenderer : articlesRenderer; const leadSectionId = dump.nodet && !articleDetail.contentmodel && !articleDetail.missing ? config.filters.leadSectionId : ''; const articleUrl = mainPage ? Downloader.getMainPageUrl(articleId) : Downloader.getArticleUrl(articleId, { sectionId: leadSectionId }); rets = await Downloader.getArticle(articleId, articleDetailXId, renderer, articleUrl, dump, articleDetail); for (const { articleId, html } of rets) { if (!html) { continue; } const doc = domino.createDocument(html); if (!mainPage && !(await dump.customProcessor.shouldKeepArticle(articleId, doc))) { articleDetailXId.delete(articleId); } } } catch (err) { logger.warn(`Error downloading article [${articleId}], skipping`, err); articleDetailXId.delete(articleId); } } }); } function flattenPromises(promisArr) { return [ // first articleId promisArr[0][0], // promise resolving to first error or void (async () => { const resolved = await Promise.all(promisArr.map((p) => p[1])); return resolved.find((err) => err); })(), ]; } /* * Parse fetched article HTML, store files * and dependencies and save in Zim */ async function saveArticle(zimCreator, finalHTML, mediaDependencies, imageDependencies, videoDependencies, subtitles, articleId, articleTitle) { try { const filesToDownload = {}; if (subtitles?.length > 0) { subtitles.forEach((s) => { filesToDownload[s.path] = { url: s.url, kind: 'subtitle' }; }); } if (mediaDependencies && mediaDependencies.length) { const existingVals = await RedisStore.filesToDownloadXPath.getMany(mediaDependencies.map((dep) => dep.path)); for (const dep of mediaDependencies) { const { mult, width } = getSizeFromUrl(dep.url); const existingVal = existingVals[dep.path]; const currentDepIsHigherRes = !existingVal || existingVal.width < (width || 10e6) || existingVal.mult < (mult || 1); if (currentDepIsHigherRes) { filesToDownload[dep.path] = { url: urlHelper.serializeUrl(dep.url), kind: 'media', mult, width, }; } } } if (imageDependencies && imageDependencies.length) { const existingVals = await RedisStore.filesToDownloadXPath.getMany(imageDependencies.map((dep) => dep.path)); for (const dep of imageDependencies) { const { mult, width } = getSizeFromUrl(dep.url); const existingVal = existingVals[dep.path]; const currentDepIsHigherRes = !existingVal || existingVal.width < (width || 10e6) || existingVal.mult < (mult || 1); if (currentDepIsHigherRes) { filesToDownload[dep.path] = { url: urlHelper.serializeUrl(dep.url), kind: 'image', mult, width, }; } } } if (videoDependencies && videoDependencies.length) { const existingVals = await RedisStore.filesToDownloadXPath.getMany(videoDependencies.map((dep) => dep.path)); for (const dep of videoDependencies) { const { mult, width } = getSizeFromUrl(dep.url); const existingVal = existingVals[dep.path]; const currentDepIsHigherRes = !existingVal || existingVal.width < (width || 10e6) || existingVal.mult < (mult || 1); if (currentDepIsHigherRes) { filesToDownload[dep.path] = { url: urlHelper.serializeUrl(dep.url), kind: 'video', mult, width, }; } } } await RedisStore.filesToDownloadXPath.setMany(filesToDownload); const zimArticle = new StringItem(articleId, 'text/html', truncateUtf8Bytes(articleTitle, 245), { FRONT_ARTICLE: 1 }, finalHTML); await zimCreatorMutex.runExclusive(() => zimCreator.addItem(zimArticle)); return null; } catch (err) { return err; } } /* * Fetch Articles */ export async function saveArticles(zimCreator, dump) { const jsModuleDependencies = new Set(); const cssModuleDependencies = new Set(); const staticFilesList = new Set(); let jsConfigVars = ''; let prevPercentProgress; const { articleDetailXId } = RedisStore; const articlesTotal = await articleDetailXId.len(); // number of articles allowed to fail is the greater of 5 or 0.001% (1 per 100k) articles dump.maxHardFailedArticles = Math.max(5, Math.floor(articlesTotal / 100000)); if (dump.customProcessor?.shouldKeepArticle) { await getAllArticlesToKeep(articleDetailXId, dump, RenderingContext.mainPageRenderer, RenderingContext.articlesRenderer); } const stages = ['Download Article and dependencies', 'Parse and Save to ZIM', 'Await left-over promises']; // depending on which renderer we use, we can have up to 2 requests to make to download article details // each request we need to make can be retried 10 times. Each retry attempttakes at most requestTimeout + retry interval // retry interval is an exponentional value from 1 to 60s // we assume rest of processing is "fast" and takes at most 1 minute const timeout = 2 * (Downloader.requestTimeout * 10 + (1 + 2 + 4 + 8 + 16 + 32 + 60 * 4) * 1000) + 60000; await articleDetailXId.iterateItems(Downloader.speed, (articleKeyValuePairs, runningWorkers) => { // eslint-disable-next-line no-async-promise-executor return new Promise(async (resolve, reject) => { /* * timer to detect freezes */ let curStage = 0; let curArticle = ''; const timer = new Timer(() => { const errorMessage = `Worker timed out after ${timeout} ms at ${stages[curStage]} ${curArticle}`; logger.error(errorMessage); reject(new Error(errorMessage)); }, timeout); logger.info(`Worker processing batch of article ids [${logger.logifyArray(Object.keys(articleKeyValuePairs))}] - ${runningWorkers} worker(s) running`); const parsePromiseQueue = []; for (const [articleId, articleDetail] of Object.entries(articleKeyValuePairs)) { timer.reset(); curStage = 0; curArticle = articleId; const promises = []; let rets; try { const mainPage = isMainPage(articleId); const renderer = mainPage ? RenderingContext.mainPageRenderer : RenderingContext.articlesRenderer; const leadSectionId = dump.nodet && !articleDetail.contentmodel && !articleDetail.missing ? config.filters.leadSectionId : ''; const articleUrl = mainPage ? Downloader.getMainPageUrl(articleId) : Downloader.getArticleUrl(articleId, { sectionId: leadSectionId }); rets = await Downloader.getArticle(articleId, articleDetailXId, renderer, articleUrl, dump, articleDetail); curStage += 1; for (const { articleId, displayTitle: articleTitle, html: finalHTML, mediaDependencies, imageDependencies, videoDependencies, moduleDependencies, staticFiles, subtitles, } of rets) { if (!finalHTML) { logger.warn(`No HTML returned for article [${articleId}], skipping`); continue; } for (const dep of moduleDependencies.jsDependenciesList || []) { jsModuleDependencies.add(dep); } for (const dep of moduleDependencies.styleDependenciesList || []) { cssModuleDependencies.add(dep); } for (const file of staticFiles) { staticFilesList.add(file); } jsConfigVars = moduleDependencies.jsConfigVars || ''; /* * getModuleDependencies and downloader.getArticle are * network heavy while parsing and saving is I/O. * To parse and download simultaniously, we don't await on save, * but instead cache the promise in a queue and check it later */ promises.push([articleId, saveArticle(zimCreator, finalHTML, mediaDependencies, imageDependencies, videoDependencies, subtitles, articleId, articleTitle)]); } } catch (err) { logger.error(`Error downloading/rendering article ${articleId}`); reject(err); return; } curStage += 1; if (parsePromiseQueue.length) { const [articleId, parsePromise] = parsePromiseQueue.shift(); curArticle = articleId; /* * in normal circumstances, where downloading is slower than * saving, this promise will always be resolved here already */ const err = await parsePromise; if (err) { console.log(err); logger.error(`Error parsing article ${articleId}`); timer.clear(); reject(err); return; } dump.status.articles.success += 1; } if (promises.length) { parsePromiseQueue.push(flattenPromises(promises)); } if ((dump.status.articles.success + dump.status.articles.hardFail + dump.status.articles.softFail) % 10 === 0) { const percentProgress = (((dump.status.articles.success + dump.status.articles.hardFail + dump.status.articles.softFail) / articlesTotal) * 100).toFixed(1); if (percentProgress !== prevPercentProgress) { prevPercentProgress = percentProgress; logger.log(`Progress downloading articles [${dump.status.articles.success + dump.status.articles.hardFail + dump.status.articles.softFail}/${articlesTotal}] [${percentProgress}%]`); } } } /* * clear up potentially still pending promises */ curStage += 1; if (parsePromiseQueue.length) { const [articleId, parsePromise] = flattenPromises(parsePromiseQueue); curArticle = articleId; const err = await parsePromise; if (err) { timer.clear(); reject(err); return; } dump.status.articles.success += parsePromiseQueue.length; } timer.clear(); resolve(); }); }); logger.log(`Done with downloading a total of [${articlesTotal}] articles`); if (jsConfigVars) { const jsConfigVarArticle = new StringItem(jsPath('jsConfigVars', config.output.dirs.mediawiki), 'application/javascript', null, { FRONT_ARTICLE: 0 }, jsConfigVars); await zimCreatorMutex.runExclusive(() => zimCreator.addItem(jsConfigVarArticle)); } return { staticFilesList, jsModuleDependencies, cssModuleDependencies, }; } //# sourceMappingURL=saveArticles.js.map