mwoffliner
Version:
MediaWiki ZIM scraper
173 lines • 7.83 kB
JavaScript
import urlParser from 'url';
import * as pathParser from 'path';
import async from 'async';
import * as logger from '../Logger.js';
import RedisStore from '../RedisStore.js';
import { getFullUrl, jsPath, cssPath } from './index.js';
import { config } from '../config.js';
import MediaWiki from '../MediaWiki.js';
import { ZimArticle } from '@openzim/libzim';
import fs from 'fs';
import { DO_PROPAGATION, ALL_READY_FUNCTION, WEBP_HANDLER_URL, LOAD_PHP, RULE_TO_REDIRECT } from './const.js';
import * as path from 'path';
import { fileURLToPath } from 'url';
import urlHelper from './url.helper.js';
// ES modules do not provide __filename/__dirname, so both are derived from import.meta.url.
// __dirname is used below to locate the bundled webp-hero scripts relative to this file.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
/**
 * Downloads every stylesheet in `links`, concatenates them into one CSS string and registers
 * the resources they reference through `url(...)` for later download.
 *
 * For each stylesheet the `url(...)` targets (except `data:` URIs) are rewritten to a bare file
 * name and queued once in `RedisStore.filesToDownloadXPath` under `config.output.dirs.mediawiki`.
 * A stylesheet that fails to download or process is logged and skipped; it never aborts the run.
 *
 * @param downloader - provides `downloadContent(url, kind)`, `speed` (used as queue concurrency)
 *   and `cssDependenceUrls` (object used to de-duplicate dependency URLs).
 * @param links - stylesheet URLs (strings) and/or `link` DOM nodes exposing `getAttribute`.
 * @returns a promise resolving to `{ finalCss }` once the queue has drained.
 */
export async function getAndProcessStylesheets(downloader, links) {
    let finalCss = '';
    const { filesToDownloadXPath } = RedisStore;
    const stylesheetQueue = async.queue(async (link, finished) => {
        /* link might be a 'link' DOM node or an URL */
        const isLinkNode = link !== null && typeof link === 'object';
        let cssUrl = link;
        try {
            if (isLinkNode) {
                cssUrl = getFullUrl(link.getAttribute('href'), MediaWiki.baseUrl);
            }
            const linkMedia = isLinkNode ? link.getAttribute('media') : null;
            if (cssUrl && !cssUrl.match('^data')) {
                /* Fresh regexp per task: it carries the 'g' flag and is therefore stateful */
                const cssUrlRegexp = new RegExp('url\\([\'"]{0,1}(.+?)[\'"]{0,1}\\)', 'gi');
                logger.info(`Downloading CSS from ${decodeURI(cssUrl)}`);
                const { content } = await downloader.downloadContent(cssUrl, 'css');
                const body = content.toString();
                let rewrittenCss = `\n/* start ${cssUrl} */\n\n`;
                rewrittenCss += linkMedia ? `@media ${linkMedia} {\n` : '\n';
                rewrittenCss += `${body}\n`;
                rewrittenCss += linkMedia ? `} /* @media ${linkMedia} */\n` : '\n';
                rewrittenCss += `\n/* end ${cssUrl} */\n`;
                /* Downloading CSS dependencies */
                let match;
                // tslint:disable-next-line:no-conditional-assignment
                while ((match = cssUrlRegexp.exec(body))) {
                    let url = match[1];
                    /* Avoid 'data', so no URL dependency */
                    if (url.match('^data')) {
                        continue;
                    }
                    const filePathname = urlParser.parse(url, false, true).pathname;
                    if (!filePathname) {
                        logger.warn(`Skipping CSS [url(${url})] because the pathname could not be found [${filePathname}]`);
                        continue;
                    }
                    /* presumably drops a '-<size>x' style suffix from the file name - TODO confirm */
                    const filename = pathParser.basename(filePathname).replace(/-.*x./, '.');
                    /* Rewrite the CSS; a replacer function keeps '$' in the file name literal */
                    rewrittenCss = rewrittenCss.replace(url, () => filename);
                    /* Need a rewrite if url doesn't include protocol */
                    url = getFullUrl(url, cssUrl);
                    url = url.includes('%') ? url : encodeURI(url);
                    /* Download CSS dependency, but avoid duplicate calls */
                    const alreadyQueued = Object.prototype.hasOwnProperty.call(downloader.cssDependenceUrls, url);
                    if (!alreadyQueued && filename) {
                        downloader.cssDependenceUrls[url] = true;
                        filesToDownloadXPath.set(config.output.dirs.mediawiki + '/' + filename, { url: urlHelper.serializeUrl(url), namespace: '-', kind: 'media' });
                    }
                }
                finalCss += rewrittenCss;
            }
        }
        catch (err) {
            logger.warn(`Failed to get CSS from [${cssUrl}]`, err);
        }
        finally {
            /*
             * Signal completion on every path (including skipped links). The 'async' library
             * does not hand a callback to native async workers, so only call it when present.
             */
            if (typeof finished === 'function') {
                finished();
            }
        }
    }, Number(downloader.speed));
    stylesheetQueue.push(links);
    return new Promise((resolve) => {
        stylesheetQueue.drain(resolve);
    }).then(() => {
        return {
            finalCss,
        };
    });
}
/**
 * Downloads one ResourceLoader module (script or style) and stores it as a ZIM article.
 *
 * @param zimCreator - receives the resulting article through `addArticle`.
 * @param downloader - provides `downloadContent(url, kind)`.
 * @param dump - not used by this function; kept for interface compatibility.
 * @param module - module name, or a protocol-relative URL when it contains
 *   `javascript/mobile` or `css/mobile`.
 * @param type - `'js'` or `'css'`; selects the `only=` API parameter and the article path helper.
 * @throws if the download fails, if the startup module cannot be patched, or if saving fails.
 */
export async function downloadAndSaveModule(zimCreator, downloader, dump, module, type) {
    // Applies every (regex -> replacement) pair in insertion order, each on the previous result.
    const replaceCodeByRegex = (sourceText, replaceMap) => {
        let text = sourceText;
        for (const [regEx, textToReplace] of replaceMap) {
            text = text.replace(regEx, textToReplace);
        }
        return text;
    };
    // the function hackStartupModule changes startup script by returning true for all modules so that load.php is not called.
    // it also removes requestIdleCallback as in our case window is idle after all script tags are called but those script tags
    // will require the functions which would have been loaded by doPropagation.
    function hackStartUpModule(jsCode) {
        if ((!ALL_READY_FUNCTION.test(jsCode) || !DO_PROPAGATION.test(jsCode)) && !LOAD_PHP.test(jsCode)) {
            throw new Error('unable to hack startup module');
        }
        return replaceCodeByRegex(jsCode, new Map([
            [DO_PROPAGATION, 'doPropagation();'],
            [ALL_READY_FUNCTION, 'function allReady( modules ) { return true;'],
            [LOAD_PHP, 'script.src ="";'],
        ]));
    }
    let apiParameterOnly;
    let moduleApiUrl;
    if (type === 'js') {
        apiParameterOnly = 'scripts';
    }
    else if (type === 'css') {
        apiParameterOnly = 'styles';
    }
    if (!module.includes('javascript/mobile') && !module.includes('css/mobile')) {
        moduleApiUrl = encodeURI(`${MediaWiki.modulePath}debug=true&lang=en&modules=${module}&only=${apiParameterOnly}&skin=vector&version=&*`);
    }
    else {
        // Mobile modules are given as protocol-relative URLs and are fetched directly.
        moduleApiUrl = encodeURI(`https:${module}`);
    }
    logger.info(`Getting [${type}] module [${moduleApiUrl}]`);
    const { content } = await downloader.downloadContent(moduleApiUrl, 'module');
    let text = content.toString();
    if (type === 'js') {
        // Some scripts must be patched before they are stored.
        switch (module) {
            case 'startup':
                text = hackStartUpModule(text);
                break;
            case 'mediawiki.page.ready':
                text = replaceCodeByRegex(text, new Map([[RULE_TO_REDIRECT, 'false']]));
                break;
        }
    }
    try {
        let articleId;
        const pathFunctions = {
            js: jsPath,
            css: cssPath,
        };
        const pathFunction = pathFunctions[type];
        if (pathFunction) {
            articleId = pathFunction(module, config.output.dirs.mediawiki);
        }
        const article = new ZimArticle({ url: articleId, data: text, ns: '-' });
        zimCreator.addArticle(article);
        logger.info(`Saved module [${module}]`);
    }
    catch (e) {
        logger.error(`Failed to get module with url [${moduleApiUrl}]\nYou may need to specify a custom --mwModulePath`, e);
        throw e;
    }
}
// URLs should be kept the same, as Kiwix JS relies on them.
/**
 * Adds the WebP polyfill scripts to the ZIM.
 *
 * Two scripts are read from the local `webp-hero` package (resolved relative to this file) and a
 * third one, the WebP handler, is fetched from `WEBP_HANDLER_URL`. Each is stored in namespace
 * `-` under the URL returned by `jsPath(<name>)`.
 *
 * @param downloader - provides `request(options)` and `arrayBufferRequestOptions`.
 * @param zimCreator - receives the articles through `addArticle`.
 * @throws Error when the WebP handler cannot be downloaded; reading the local files may also throw.
 */
export async function importPolyfillModules(downloader, zimCreator) {
    // Every polyfill is stored the same way: a '-' namespace article holding the script source.
    const addScript = (url, data) => {
        zimCreator.addArticle(new ZimArticle({ url, data, ns: '-' }));
    };
    const bundledScripts = [
        { name: 'webpHeroPolyfill', path: path.join(__dirname, '../../node_modules/webp-hero/dist-cjs/polyfills.js') },
        { name: 'webpHeroBundle', path: path.join(__dirname, '../../node_modules/webp-hero/dist-cjs/webp-hero.bundle.js') },
    ];
    for (const { name, path: scriptPath } of bundledScripts) {
        addScript(jsPath(name), fs.readFileSync(scriptPath, 'utf8'));
    }
    // Start the request outside the try block so that only its outcome gets wrapped below.
    const pendingResponse = downloader.request({ url: WEBP_HANDLER_URL, method: 'GET', ...downloader.arrayBufferRequestOptions });
    let handlerSource;
    try {
        const response = await pendingResponse;
        handlerSource = response.data;
    }
    catch (err) {
        throw new Error(`Failed to download webpHandler from [${WEBP_HANDLER_URL}]: ${err}`);
    }
    addScript(jsPath('webpHandler'), handlerSource);
}
//# sourceMappingURL=dump.js.map