// mwoffliner
// Version:
// MediaWiki ZIM scraper
// 442 lines • 17.8 kB
// JavaScript
import * as pathParser from 'path';
import * as logger from './Logger.js';
import * as util from './util/index.js';
import * as domino from 'domino';
import qs from 'querystring';
import semver from 'semver';
import basicURLDirector from './util/builders/url/basic.director.js';
import BaseURLDirector from './util/builders/url/base.director.js';
import ApiURLDirector from './util/builders/url/api.director.js';
import WikimediaDesktopURLDirector from './util/builders/url/desktop.director.js';
import WikimediaMobileURLDirector from './util/builders/url/mobile.director.js';
import VisualEditorURLDirector from './util/builders/url/visual-editor.director.js';
import RestApiURLDirector from './util/builders/url/rest-api.director.js';
import { checkApiAvailability } from './util/mw-api.js';
import { BLACKLISTED_NS } from './util/const.js';
/**
 * Singleton describing the MediaWiki site being processed: configured paths
 * and credentials, the URLs derived from them, cached API-availability
 * probes, the namespace table and the site metadata.
 *
 * The `base` setter must be called before any other URL-related setter or
 * method, because it creates the URL director everything else relies on.
 */
class MediaWiki {
  /** Shared instance, created lazily by getInstance(). */
  static instance;

  /**
   * Return the shared MediaWiki instance, creating it on the first call.
   * @returns {MediaWiki}
   */
  static getInstance() {
    MediaWiki.instance ??= new MediaWiki();
    return MediaWiki.instance;
  }

  metaData;
  baseUrl;
  getCategories;
  namespaces = {};
  namespacesToMirror = [];
  apiCheckArticleId;
  queryOpts;
  urlDirector;
  #wikiPath;
  #actionApiPath;
  #modulePathOpt;
  #restApiPath;
  #username;
  #password;
  #domain;
  wikimediaDesktopUrlDirector;
  wikimediaMobileUrlDirector;
  visualEditorUrlDirector;
  restApiUrlDirector;
  visualEditorApiUrl;
  actionApiUrl;
  restApiUrl;
  webUrl;
  wikimediaDesktopApiUrl;
  wikimediaMobileApiUrl;
  modulePath; // only for reading
  mobileModulePath;
  #apiUrlDirector;
  #hasWikimediaDesktopApi;
  #hasWikimediaMobileApi;
  #hasVisualEditorApi;
  #hasRestApi;
  #hasCoordinates;

  /** User name used by login(). */
  set username(value) {
    this.#username = value;
  }

  /** Password used by login(). */
  set password(value) {
    this.#password = value;
  }

  /**
   * Override the action API path and rebuild the URLs derived from it.
   * Falsy values are ignored. Requires `base` to have been set.
   */
  set actionApiPath(value) {
    if (value) {
      this.#actionApiPath = value;
      this.actionApiUrl = this.urlDirector.buildURL(this.#actionApiPath);
      this.setVisualEditorURL();
    }
  }

  /**
   * Override the REST API path and rebuild the REST API URL.
   * Falsy values are ignored. Requires `base` to have been set.
   */
  set restApiPath(value) {
    if (value) {
      this.#restApiPath = value;
      this.setRestApiURL();
    }
  }

  /** Authentication domain used by login(). */
  set domain(value) {
    this.#domain = value;
  }

  /**
   * Override the wiki (article) path and rebuild the web URL.
   * Falsy values are ignored. Requires `base` to have been set.
   */
  set wikiPath(value) {
    if (value) {
      this.#wikiPath = value;
      this.webUrl = this.urlDirector.buildURL(this.#wikiPath);
    }
  }

  /**
   * Set the base URL of the wiki and (re)build every URL derived from it
   * using the currently configured paths. Falsy values are ignored.
   */
  set base(value) {
    if (value) {
      this.baseUrl = basicURLDirector.buildMediawikiBaseURL(value);
      this.urlDirector = new BaseURLDirector(this.baseUrl.href);
      this.webUrl = this.urlDirector.buildURL(this.#wikiPath);
      this.actionApiUrl = this.urlDirector.buildURL(this.#actionApiPath);
      this.setWikimediaDesktopApiUrl();
      this.setWikimediaMobileApiUrl();
      this.setRestApiURL();
      this.setVisualEditorURL();
      this.setModuleURL();
      this.setMobileModuleUrl();
    }
  }

  /**
   * Override the module path option (anything but `undefined` is stored) and
   * rebuild the module URL when a URL director exists. Logs an error when a
   * truthy value is supplied before `base` has been set.
   */
  set modulePathOpt(value) {
    if (value !== undefined) {
      this.#modulePathOpt = value;
    }
    if (this.urlDirector) {
      this.setModuleURL();
    } else if (value) {
      logger.error('Base url director should be specified first');
    }
  }

  /**
   * (Re)initialise credentials, paths, namespaces, query options and the
   * cached API-availability flags to their default values.
   */
  initializeMediaWikiDefaults() {
    this.#domain = '';
    this.#username = '';
    this.#password = '';
    this.getCategories = false;
    this.#actionApiPath = '/w/api.php';
    this.#restApiPath = '/w/rest.php';
    this.#wikiPath = '/wiki/';
    this.#modulePathOpt = '/w/load.php';
    this.namespaces = {};
    this.namespacesToMirror = [];
    this.apiCheckArticleId = 'MediaWiki:Sidebar';
    this.queryOpts = {
      action: 'query',
      format: 'json',
      prop: 'redirects|revisions',
      rdlimit: 'max',
      rdnamespace: 0,
      redirects: false,
      formatversion: '2',
    };
    // null means "not probed yet"; the has*() methods fill these in lazily.
    this.#hasWikimediaDesktopApi = null;
    this.#hasWikimediaMobileApi = null;
    this.#hasVisualEditorApi = null;
    this.#hasRestApi = null;
    this.#hasCoordinates = null;
  }

  constructor() {
    this.initializeMediaWikiDefaults();
  }

  /**
   * Probe (once) whether the Wikimedia desktop API answers for the check
   * article; later calls return the cached result.
   * @param downloader - passed through to checkApiAvailability
   * @returns {Promise<*>} result of checkApiAvailability
   */
  async hasWikimediaDesktopApi(downloader) {
    if (this.#hasWikimediaDesktopApi === null) {
      this.wikimediaDesktopUrlDirector = new WikimediaDesktopURLDirector(this.wikimediaDesktopApiUrl.href);
      const checkUrl = this.wikimediaDesktopUrlDirector.buildArticleURL(this.apiCheckArticleId);
      this.#hasWikimediaDesktopApi = await checkApiAvailability(downloader, checkUrl);
      logger.log('Checked for WikimediaDesktopApi at', checkUrl, '-- result is: ', this.#hasWikimediaDesktopApi);
    }
    return this.#hasWikimediaDesktopApi;
  }

  /**
   * Probe (once) whether the Wikimedia mobile API answers for the check
   * article; later calls return the cached result.
   * @param downloader - passed through to checkApiAvailability
   * @returns {Promise<*>} result of checkApiAvailability
   */
  async hasWikimediaMobileApi(downloader) {
    if (this.#hasWikimediaMobileApi === null) {
      this.wikimediaMobileUrlDirector = new WikimediaMobileURLDirector(this.wikimediaMobileApiUrl.href);
      const checkUrl = this.wikimediaMobileUrlDirector.buildArticleURL(this.apiCheckArticleId);
      this.#hasWikimediaMobileApi = await checkApiAvailability(downloader, checkUrl);
      logger.log('Checked for WikimediaMobileApi at', checkUrl, '-- result is: ', this.#hasWikimediaMobileApi);
    }
    return this.#hasWikimediaMobileApi;
  }

  /**
   * Probe (once) whether the VisualEditor API answers for the check article,
   * restricting accepted responses to the director's valid MIME types; later
   * calls return the cached result.
   * @param downloader - passed through to checkApiAvailability
   * @returns {Promise<*>} result of checkApiAvailability
   */
  async hasVisualEditorApi(downloader) {
    if (this.#hasVisualEditorApi === null) {
      this.visualEditorUrlDirector = new VisualEditorURLDirector(this.visualEditorApiUrl.href);
      const checkUrl = this.visualEditorUrlDirector.buildArticleURL(this.apiCheckArticleId);
      this.#hasVisualEditorApi = await checkApiAvailability(downloader, checkUrl, this.visualEditorUrlDirector.validMimeTypes);
      logger.log('Checked for VisualEditorApi at', checkUrl, '-- result is: ', this.#hasVisualEditorApi);
    }
    return this.#hasVisualEditorApi;
  }

  /**
   * Probe (once) whether the REST API answers for the check article; later
   * calls return the cached result.
   * @param downloader - passed through to checkApiAvailability
   * @returns {Promise<*>} result of checkApiAvailability
   */
  async hasRestApi(downloader) {
    if (this.#hasRestApi === null) {
      this.restApiUrlDirector = new RestApiURLDirector(this.restApiUrl.href);
      const checkUrl = this.restApiUrlDirector.buildArticleURL(this.apiCheckArticleId);
      this.#hasRestApi = await checkApiAvailability(downloader, checkUrl);
      logger.log('Checked for RestApi at', checkUrl, '-- result is: ', this.#hasRestApi);
    }
    return this.#hasRestApi;
  }

  /**
   * Determine (once) whether the wiki supports the `coordinates` property by
   * issuing a query that requests it and looking for a warning mentioning
   * "coordinates" in the response. Later calls return the cached result.
   * @param downloader - object exposing `getJSON(url)`
   * @returns {Promise<boolean>}
   */
  async hasCoordinates(downloader) {
    if (this.#hasCoordinates !== null) {
      return this.#hasCoordinates;
    }
    const validNamespaceIds = this.namespacesToMirror.map((ns) => this.namespaces[ns].num);
    const reqOpts = {
      ...this.queryOpts,
      prop: this.queryOpts.prop + '|coordinates',
      rdnamespace: validNamespaceIds,
    };
    const resp = await downloader.getJSON(this.#apiUrlDirector.buildQueryURL(reqOpts));
    const isCoordinateWarning = JSON.stringify(resp?.warnings?.query ?? '').includes('coordinates');
    if (isCoordinateWarning) {
      logger.log('Coordinates not available on this wiki');
      this.#hasCoordinates = false;
    } else {
      logger.log('Coordinates available on this wiki');
      this.#hasCoordinates = true;
    }
    return this.#hasCoordinates;
  }

  /** Rebuild the Wikimedia desktop API URL from the URL director. */
  setWikimediaDesktopApiUrl() {
    this.wikimediaDesktopApiUrl = this.urlDirector.buildWikimediaDesktopApiUrl();
  }

  /** Rebuild the Wikimedia mobile API URL from the URL director. */
  setWikimediaMobileApiUrl() {
    this.wikimediaMobileApiUrl = this.urlDirector.buildWikimediaMobileApiUrl();
  }

  /** Rebuild the REST API URL from the configured REST API path. */
  setRestApiURL() {
    this.restApiUrl = this.urlDirector.buildRestApiUrl(this.#restApiPath);
  }

  /**
   * Recreate the action-API URL director from the current action API URL and
   * rebuild the VisualEditor API URL from it.
   */
  setVisualEditorURL() {
    this.#apiUrlDirector = new ApiURLDirector(this.actionApiUrl.href);
    this.visualEditorApiUrl = this.#apiUrlDirector.buildVisualEditorURL();
  }

  /** Rebuild the module URL from the configured module path option. */
  setModuleURL() {
    this.modulePath = this.urlDirector.buildModuleURL(this.#modulePathOpt);
  }

  /** Rebuild the mobile module URL from the URL director. */
  setMobileModuleUrl() {
    this.mobileModulePath = this.urlDirector.buildMobileModuleURL();
  }

  /**
   * Log in through the action API when both a user name and a password are
   * configured; otherwise do nothing. On success the response cookies are
   * stored on `downloader.loginCookie`.
   * @param downloader - object exposing `downloadContent`, `request` and
   *   `arrayBufferRequestOptions`
   * @throws {Error} 'Login Failed' when the API does not report success
   */
  async login(downloader) {
    if (!this.#username || !this.#password) {
      return;
    }

    let url = this.actionApiUrl.href + '?';
    // Add domain if configured (encoded so it cannot break the query string)
    if (this.#domain) {
      url = `${url}lgdomain=${encodeURIComponent(this.#domain)}&`;
    }

    // Getting token to login.
    const { content } = await downloader.downloadContent(url + 'action=query&meta=tokens&type=login&format=json&formatversion=2', 'data');

    const loginParams = {
      action: 'login',
      format: 'json',
      lgname: this.#username,
      lgpassword: this.#password,
      lgtoken: JSON.parse(content.toString()).query.tokens.logintoken,
    };
    // lgdomain is a parameter of action=login, so it has to travel with the
    // login request itself to have any effect.
    if (this.#domain) {
      loginParams.lgdomain = this.#domain;
    }

    // Logging in
    const resp = await downloader.request({
      url: this.actionApiUrl.href,
      ...downloader.arrayBufferRequestOptions,
      data: qs.stringify(loginParams),
      headers: {
        'Content-Type': 'application/x-www-form-urlencoded',
      },
      method: 'POST',
    });
    if (resp.data.login.result !== 'Success') {
      throw new Error('Login Failed');
    }
    downloader.loginCookie = resp.headers['set-cookie'].join(';');
  }

  /**
   * Fetch namespaces and namespace aliases from the API and register them in
   * `this.namespaces` under their local and (when available) canonical
   * names, in both lower- and upper-case-first spellings. Names of content
   * namespaces that are not blacklisted are appended to
   * `this.namespacesToMirror`.
   * @param addNamespaces - namespace ids to treat as content in addition to
   *   the ones flagged by the wiki
   * @param downloader - object exposing `getJSON(url)`
   */
  async getNamespaces(addNamespaces, downloader) {
    const url = this.#apiUrlDirector.buildNamespacesURL();
    const json = await downloader.getJSON(url);
    for (const type of ['namespaces', 'namespacealiases']) {
      const isNamespace = type === 'namespaces';
      for (const entry of Object.values(json.query[type])) {
        const name = isNamespace ? entry.name : entry.alias;
        const num = entry.id;
        const allowedSubpages = 'subpages' in entry;
        const isContent = isNamespace ? !!(entry.content || util.contains(addNamespaces, num)) : !!(entry.content !== undefined || util.contains(addNamespaces, num));
        const isBlacklisted = BLACKLISTED_NS.includes(name);
        const canonical = entry.canonical || '';
        const details = { num, allowedSubpages, isContent };
        /* Namespaces in local language */
        this.namespaces[util.lcFirst(name)] = details;
        this.namespaces[util.ucFirst(name)] = details;
        /* Namespaces in English (if available) */
        if (canonical) {
          this.namespaces[util.lcFirst(canonical)] = details;
          this.namespaces[util.ucFirst(canonical)] = details;
        }
        /* Is content to mirror */
        if (isContent && !isBlacklisted) {
          this.namespacesToMirror.push(name);
        }
      }
    }
  }

  /**
   * Derive a page title from a link target.
   * @param {string} href - link target, resolved against `this.baseUrl`
   * @returns {string|null} the title, or null when the href does not look
   *   like a local page (e.g. an interwiki link) or cannot be parsed
   */
  extractPageTitleFromHref(href) {
    try {
      const pathname = new URL(href, this.baseUrl).pathname;
      // Local relative URL
      if (href.startsWith('./')) {
        return util.decodeURIComponent(pathname.slice(1));
      }
      // Absolute path
      const webPath = this.webUrl.pathname;
      if (pathname.startsWith(webPath)) {
        return util.decodeURIComponent(pathname.slice(webPath.length));
      }
      // Paginated link: drop the last dot-suffix, keep the last two segments
      const isPaginated = /\/[0-9]+(\.|$)/.test(href);
      if (isPaginated) {
        const withoutDotHtml = href.split('.').slice(0, -1).join('.');
        return withoutDotHtml.split('/').slice(-2).join('/');
      }
      const parsedHref = pathParser.parse(href);
      if (parsedHref.dir.includes('../')) {
        return parsedHref.name;
      }
      return null; /* Interwiki link? -- return null */
    } catch (error) {
      logger.warn(`Unable to parse href ${href}`);
      return null;
    }
  }

  /**
   * Find a suitable name to use for ZIM (content) creator.
   * Heuristic: use the first label of the hostname unless
   * - the second label is a Wikimedia project name, OR
   * - the second label is longer than the first one,
   * in which case the second label is used. The first letter is upper-cased.
   * @returns {string}
   */
  getCreatorName() {
    const hostParts = this.baseUrl.hostname.split('.');
    let creator = hostParts[0];
    if (hostParts.length > 1) {
      const wmProjects = new Set(['wikipedia', 'wikisource', 'wikibooks', 'wikiquote', 'wikivoyage', 'wikiversity', 'wikinews', 'wiktionary']);
      if (wmProjects.has(hostParts[1]) || hostParts[0].length < hostParts[1].length) {
        creator = hostParts[1]; // Name of the wikimedia project
      }
    }
    return creator.charAt(0).toUpperCase() + creator.slice(1);
  }

  /**
   * Download the page at `this.webUrl` and work out the text direction:
   * first from a `"pageLanguageDir"` value embedded in the page, then from
   * the `dir` attribute of `#mw-content-text`, falling back to 'ltr'.
   * @param downloader - object exposing `downloadContent(url, kind)`
   * @returns {Promise<'ltr'|'rtl'>}
   */
  async getTextDirection(downloader) {
    logger.log('Getting text direction...');
    const { content } = await downloader.downloadContent(this.webUrl.href, 'data');
    const body = content.toString();
    const doc = domino.createDocument(body);
    const contentNode = doc.getElementById('mw-content-text');
    const parts = /"pageLanguageDir":"(.*?)"/.exec(body);
    let isLtr = true;
    if (parts && parts[1]) {
      isLtr = parts[1] === 'ltr';
    } else if (contentNode) {
      isLtr = contentNode.getAttribute('dir') === 'ltr';
    } else {
      logger.log('Unable to get the language direction, fallback to ltr');
    }
    const textDir = isLtr ? 'ltr' : 'rtl';
    logger.log(`Text direction is [${textDir}]`);
    return textDir;
  }

  /**
   * Query general site information and derive the main page title, the site
   * name and the language codes.
   * @param downloader - object exposing `query()`
   * @returns {Promise<{mainPage: string, siteName: *, langIso2: string, langIso3: string}>}
   * @throws {Error} when the generator string is missing, cannot be parsed
   *   as a version, or is older than the minimal supported version
   */
  async getSiteInfo(downloader) {
    logger.log('Getting site info...');
    const body = await downloader.query();
    const entries = body.query.general;

    // Checking mediawiki version. semver.coerce() yields null for a missing
    // or unparseable generator, so guard before reading `.raw`.
    const mwMinimalVersion = '1.27';
    const mwVersion = semver.coerce(entries.generator)?.raw;
    if (!mwVersion || !semver.satisfies(mwVersion, `>=${mwMinimalVersion}`)) {
      throw new Error(`Mediawiki version ${mwVersion ?? entries.generator} not supported should be >=${mwMinimalVersion}`);
    }

    const mainPage = entries.mainpage.replace(/ /g, '_');
    const siteName = entries.sitename;

    // Gather languages codes (and remove the 'dialect' part)
    const fallbackCodes = (entries.fallback ?? []).map((e) => e.code);
    const langs = [entries.lang, ...fallbackCodes].map((code) => code.replace(/-.*/, ''));

    const langPairs = await Promise.all(
      langs.map(async (lang) => {
        try {
          return [lang, await util.getIso3(lang)];
        } catch (err) {
          // No ISO 639-3 code known: reuse the short code as is.
          return [lang, lang];
        }
      }),
    );
    const [langIso2, langIso3] = langPairs[0] ?? ['en', 'eng'];

    return {
      mainPage,
      siteName,
      langIso2,
      langIso3,
    };
  }

  /**
   * Download the page at `this.webUrl` and return the inner HTML of its
   * `#siteSub` element, or an empty string when that element is absent.
   * @param downloader - object exposing `downloadContent(url, kind)`
   * @returns {Promise<string>}
   */
  async getSubTitle(downloader) {
    logger.log('Getting sub-title...');
    const { content } = await downloader.downloadContent(this.webUrl.href, 'data');
    const doc = domino.createDocument(content.toString());
    const subTitleNode = doc.getElementById('siteSub');
    return subTitleNode ? subTitleNode.innerHTML : '';
  }

  /**
   * Collect the site metadata (URLs, paths, text direction, languages,
   * title, sub-title, creator, main page). The result is cached on
   * `this.metaData` and returned as is by subsequent calls.
   * @param downloader - forwarded to the individual getters
   * @returns {Promise<object>}
   */
  async getMwMetaData(downloader) {
    if (this.metaData) {
      return this.metaData;
    }
    const creator = this.getCreatorName() || 'Kiwix';
    const [textDir, { langIso2, langIso3, mainPage, siteName }, subTitle] = await Promise.all([
      this.getTextDirection(downloader),
      this.getSiteInfo(downloader),
      this.getSubTitle(downloader),
    ]);
    const mwMetaData = {
      webUrl: this.webUrl.href,
      actionApiUrl: this.actionApiUrl.href,
      restApiUrl: this.restApiUrl.href,
      modulePathOpt: this.#modulePathOpt,
      modulePath: this.modulePath,
      mobileModulePath: this.mobileModulePath,
      webUrlPath: this.webUrl.pathname,
      wikiPath: this.#wikiPath,
      baseUrl: this.baseUrl.href,
      actionApiPath: this.#actionApiPath,
      restApiPath: this.#restApiPath,
      domain: this.#domain,
      textDir,
      langIso2,
      langIso3,
      title: siteName,
      subTitle,
      creator,
      mainPage,
    };
    this.metaData = mwMetaData;
    return mwMetaData;
  }

  /**
   * Restore the default configuration and drop the cached metadata so that
   * the next getMwMetaData() call recomputes it instead of returning values
   * gathered before the reset.
   */
  reset() {
    this.initializeMediaWikiDefaults();
    this.metaData = undefined;
  }
}
// Module-wide shared MediaWiki instance; every importer receives this same object.
const sharedMediaWiki = MediaWiki.getInstance();
export { sharedMediaWiki as default };
//# sourceMappingURL=MediaWiki.js.map