UNPKG

mwoffliner

Version:
236 lines 9.79 kB
import { migrateChildren, getMediaBase, getFullUrl, getRelativeFilePath, encodeArticleIdForZimHtmlUrl } from './misc.js'; import MediaWiki from '../MediaWiki.js'; import RedisStore from '../RedisStore.js'; import DU from '../DOMUtils.js'; import * as logger from '../Logger.js'; function rewriteUrlNoArticleCheck(articleId, dump, linkNode, mediaDependencies) { const classList = (linkNode.getAttribute('class') || '').split(' ').filter((cssClass) => cssClass); const rel = (linkNode.getAttribute('rel') || '').split(' ').filter((rel) => rel); let href = linkNode.getAttribute('href') || ''; let hrefProtocol; // Always keep selflinks if (classList.includes('mw-selflink') && !href) { return null; } // Always remove redlinks if (classList.includes('new')) { migrateChildren(linkNode, linkNode.parentNode, linkNode); linkNode.parentNode.removeChild(linkNode); return null; } const extractScheme = function (href) { const match = href.match(/^([a-zA-Z][a-zA-Z\d+\-.]*):/); return match ? match[1] : null; }; try { hrefProtocol = extractScheme(href); } catch { return null; } if (hrefProtocol && !hrefProtocol.includes('http')) { // e.g. geo:11111,11111 return null; } if (rel.includes('mwo:NoRewrite')) { return null; } if (!hrefProtocol && href.slice(0, 2) === '//') { href = `${MediaWiki.webUrl.protocol}${href}`; linkNode.setAttribute('href', href); hrefProtocol = MediaWiki.webUrl.protocol; } if (!rel.length && linkNode.getAttribute('resource')) { rel.push('mw:MediaLink'); } if (hrefProtocol && hrefProtocol.includes('http') && !rel.length) { rel.push('mw:ExtLink'); } if (!href) { DU.deleteNode(linkNode); return null; } if (href.substring(0, 1) === '#') { return null; } /* Deal with custom geo. URL replacement, for example: * http://maps.wikivoyage-ev.org/w/poimap2.php?lat=44.5044943&lon=34.1969633&zoom=15&layer=M&lang=ru&name=%D0%9C%D0%B0%D1%81%D1%81%D0%B0%D0%BD%D0%B4%D1%80%D0%B0 * http://tools.wmflabs.org/geohack/geohack.php?language=fr&pagename=Tour_Eiffel&params=48.85825_N_2.2945_E_type:landmark_region:fr */ if (!rel.includes('mw:WikiLink')) { let lat; let lon; if (/poimap2\.php/i.test(href)) { const hrefQuery = new URL(href, MediaWiki.baseUrl); lat = parseFloat(hrefQuery.searchParams.get('lat')); lon = parseFloat(hrefQuery.searchParams.get('lon')); } else if (/geohack\.php/i.test(href)) { const params = new URL(href, MediaWiki.baseUrl).searchParams.get('params'); if (params) { // see https://bitbucket.org/magnusmanske/geohack/src/public_html/geo_param.php const pieces = params.toUpperCase().split('_'); const semiPieces = pieces.length > 0 ? pieces[0].split(';') : undefined; if (semiPieces && semiPieces.length === 2) { ; [lat, lon] = semiPieces; } else { const factors = [1, 60, 3600]; let offs = 0; const deg = (hemiHash) => { let out = 0; let hemiSign = 0; for (let i = 0; i < 4 && i + offs < pieces.length; i += 1) { const v = pieces[i + offs]; hemiSign = hemiHash[v]; if (hemiSign) { offs = i + 1; break; } out += +v / factors[i]; } return out * hemiSign; }; lat = deg({ N: 1, S: -1 }); lon = deg({ E: 1, W: -1, O: 1 }); } } } else if (/Special:Map/i.test(href)) { const parts = href.split('/'); lat = parts[4]; lon = parts[5]; } else if (rel.includes('mw:MediaLink') || classList.includes('internal')) { const shouldScrape = (href.includes('.pdf') && !dump.nopdf) || ((href.includes('.ogg') || href.includes('.oga')) && !dump.nopic && !dump.novid && !dump.nodet); if (shouldScrape) { try { const mediaUrl = getFullUrl(href, MediaWiki.baseUrl); const newHref = getRelativeFilePath(articleId, getMediaBase(mediaUrl, true)); linkNode.setAttribute('href', newHref); if (mediaDependencies) { mediaDependencies.push(mediaUrl); } } catch (err) { logger.warn('Error parsing url:', err); DU.deleteNode(linkNode); } } else { linkNode.outerHTML = linkNode.innerHTML; } return null; } if (!isNaN(lat) && !isNaN(lon)) { href = `geo:${lat},${lon}`; linkNode.setAttribute('href', href); return null; } } if (!(href.startsWith(MediaWiki.webUrl.href) && !classList.includes('external')) && !classList.includes('mirror-link')) { /* Add 'external' class to interwiki links */ if (!classList.includes('external') && (rel.includes('mw:ExtLink') || rel.includes('mw:WikiLink/Interwiki') || classList.includes('extiw'))) { DU.appendToAttr(linkNode, 'class', 'external'); classList.push('external'); } /* Rewrite external links starting with // */ if (classList.includes('external') || rel.includes('nofollow')) { if (href.substring(0, 1) === '/') { linkNode.setAttribute('href', getFullUrl(href, MediaWiki.baseUrl)); } else if (href.substring(0, 2) === './') { migrateChildren(linkNode, linkNode.parentNode, linkNode); linkNode.parentNode.removeChild(linkNode); } return null; } if (rel.length && !rel.includes('mw:WikiLink') && !rel.includes('mw:referencedBy')) { return null; } } const title = MediaWiki.extractPageTitleFromHref(href); if (title) { const localAnchor = href.lastIndexOf('#') === -1 ? '' : href.substr(href.lastIndexOf('#')); linkNode.setAttribute('href', encodeArticleIdForZimHtmlUrl(title) + localAnchor); return title; } // Rewrite any urls still remaining DU.appendToAttr(linkNode, 'class', 'external'); linkNode.setAttribute('href', getFullUrl(href, MediaWiki.baseUrl)); return null; } async function checkIfArticlesMirrored(articleTitles, articleDetailXId) { const mirrored = []; const unmirrored = []; if (!articleTitles.length) { return [mirrored, unmirrored]; } const articlesMirrored = await articleDetailXId.existsMany(articleTitles); for (const articleTitle of articleTitles) { if (articlesMirrored[articleTitle]) { mirrored.push(articleTitle); } else { unmirrored.push(articleTitle); } } return [mirrored, unmirrored]; } export async function rewriteUrls(articleId, dump, linkNodes) { const mediaDependencies = []; /* * key: article title * value: Array of linkNodes linking to article */ const wikilinkMappings = {}; for (const linkNode of linkNodes) { const articleLink = rewriteUrlNoArticleCheck(articleId, dump, linkNode, mediaDependencies); if (articleLink) { if (Array.isArray(wikilinkMappings[articleLink])) { wikilinkMappings[articleLink].push(linkNode); } else { wikilinkMappings[articleLink] = [linkNode]; } } } const [, unmirroredTitles] = await checkIfArticlesMirrored(Object.keys(wikilinkMappings), RedisStore.articleDetailXId); if (unmirroredTitles.length) { const articlesRedirected = await RedisStore.redirectsXId.existsMany(unmirroredTitles); for (const articleTitle of unmirroredTitles) { const redirect = articlesRedirected[articleTitle]; if (!redirect) { wikilinkMappings[articleTitle].forEach((linkNode) => { migrateChildren(linkNode, linkNode.parentNode, linkNode); linkNode.parentNode.removeChild(linkNode); }); delete wikilinkMappings[articleTitle]; } } } if (articleId.includes('/')) { const slashesInUrl = articleId.split('/').length - 1; const upStr = slashesInUrl ? '../'.repeat(slashesInUrl) : './'; Object.values(wikilinkMappings).forEach((linkNodes) => { for (const linkNode of linkNodes) { const href = linkNode.getAttribute('href'); linkNode.setAttribute('href', `${upStr}${href}`); } }); } return { mediaDependencies }; } export async function rewriteUrlsOfDoc(parsoidDoc, articleId, dump) { /* Go through all links */ const as = parsoidDoc.getElementsByTagName('a'); const areas = parsoidDoc.getElementsByTagName('area'); const linkNodes = Array.prototype.slice.call(as).concat(Array.prototype.slice.call(areas)); const ret = await rewriteUrls(articleId, dump, linkNodes); return { ...ret, doc: parsoidDoc, }; } //# sourceMappingURL=rewriteUrls.js.map